diff --git a/packages/kernel/linux/package.mk b/packages/kernel/linux/package.mk index e00829143..f48ce4a9f 100644 --- a/packages/kernel/linux/package.mk +++ b/packages/kernel/linux/package.mk @@ -4,7 +4,7 @@ PKG_NAME="linux" PKG_LICENSE="GPL" -PKG_VERSION="6.0.7" +PKG_VERSION="6.0.11" PKG_URL="https://www.kernel.org/pub/linux/kernel/v6.x/${PKG_NAME}-${PKG_VERSION}.tar.xz" PKG_SITE="http://www.kernel.org" PKG_DEPENDS_HOST="ccache:host rsync:host openssl:host" diff --git a/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch b/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch index c0c976eb9..2de168ec2 100644 --- a/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch +++ b/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch @@ -1,7 +1,43 @@ -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index 11ecf09aadc86..98aa5a478719c 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig +diff -rupN linux.orig/arch/arm/include/asm/thread_info.h linux/arch/arm/include/asm/thread_info.h +--- linux.orig/arch/arm/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 +@@ -62,6 +62,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + __u32 cpu; /* cpu */ + __u32 cpu_domain; /* cpu domain */ + struct cpu_context_save cpu_context; /* cpu context */ +@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(stru + #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ + #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ + #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 9 + + #define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(stru + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) + + /* Checks for any syscall work in entry-common.S */ +@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(stru + /* + * Change these and you break ASM code in entry-common.S + */ +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) + +diff -rupN linux.orig/arch/arm/Kconfig linux/arch/arm/Kconfig +--- linux.orig/arch/arm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -33,6 +33,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW @@ -35,48 +71,9 @@ index 11ecf09aadc86..98aa5a478719c 100644 select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select THREAD_INFO_IN_TASK -diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h -index aecc403b28804..1b56e56f8f415 100644 ---- a/arch/arm/include/asm/thread_info.h -+++ b/arch/arm/include/asm/thread_info.h -@@ -62,6 +62,7 @@ struct cpu_context_save { - struct thread_info { - unsigned long flags; /* low level flags */ - int preempt_count; /* 0 => preemptable, <0 => bug */ -+ int preempt_lazy_count; /* 0 => preemptable, <0 
=> bug */ - __u32 cpu; /* cpu */ - __u32 cpu_domain; /* cpu domain */ - struct cpu_context_save cpu_context; /* cpu context */ -@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ - #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ - #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ -+#define TIF_NEED_RESCHED_LAZY 9 - - #define TIF_USING_IWMMXT 17 - #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) - #define _TIF_SECCOMP (1 << TIF_SECCOMP) - #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) - #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) - - /* Checks for any syscall work in entry-common.S */ -@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - /* - * Change these and you break ASM code in entry-common.S - */ --#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ -+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ -+ _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NOTIFY_SIGNAL) - -diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c -index 2c8d76fd7c662..c3bdec7d2df9c 100644 ---- a/arch/arm/kernel/asm-offsets.c -+++ b/arch/arm/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm/kernel/asm-offsets.c linux/arch/arm/kernel/asm-offsets.c +--- linux.orig/arch/arm/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -43,6 +43,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); @@ -85,11 +82,10 @@ index 2c8d76fd7c662..c3bdec7d2df9c 100644 DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); -diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S -index c39303e5c2347..cfb4660e9feab 100644 ---- a/arch/arm/kernel/entry-armv.S -+++ b/arch/arm/kernel/entry-armv.S -@@ -222,11 +222,18 @@ ENDPROC(__dabt_svc) +diff -rupN linux.orig/arch/arm/kernel/entry-armv.S linux/arch/arm/kernel/entry-armv.S +--- linux.orig/arch/arm/kernel/entry-armv.S 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/entry-armv.S 2022-12-04 10:40:26.676034147 -0500 +@@ -222,11 +222,18 @@ __irq_svc: #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -110,7 +106,7 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif svc_exit r5, irq = 1 @ return from exception -@@ -241,8 +248,14 @@ ENDPROC(__irq_svc) +@@ -241,8 +248,14 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED @@ -126,11 +122,10 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif __und_fault: -diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c -index ea128e32e8ca8..3671a4214d6f4 100644 ---- a/arch/arm/kernel/signal.c -+++ b/arch/arm/kernel/signal.c -@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) +diff -rupN linux.orig/arch/arm/kernel/signal.c linux/arch/arm/kernel/signal.c +--- linux.orig/arch/arm/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/signal.c 
2022-12-04 10:40:26.676034147 -0500 +@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, un */ trace_hardirqs_off(); do { @@ -140,11 +135,10 @@ index ea128e32e8ca8..3671a4214d6f4 100644 schedule(); } else { if (unlikely(!user_mode(regs))) -diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c -index 46cccd6bf705a..480a1976a9dce 100644 ---- a/arch/arm/mm/fault.c -+++ b/arch/arm/mm/fault.c -@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +diff -rupN linux.orig/arch/arm/mm/fault.c linux/arch/arm/mm/fault.c +--- linux.orig/arch/arm/mm/fault.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/mm/fault.c 2022-12-04 10:40:26.676034147 -0500 +@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); @@ -154,7 +148,7 @@ index 46cccd6bf705a..480a1976a9dce 100644 if (user_mode(regs)) goto bad_area; -@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -164,31 +158,10 @@ index 46cccd6bf705a..480a1976a9dce 100644 do_bad_area(addr, fsr, regs); return 0; } -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 3795eb5ba1cdd..6922949e61b71 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -93,6 +93,7 @@ config ARM64 - select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 - select ARCH_SUPPORTS_NUMA_BALANCING - select ARCH_SUPPORTS_PAGE_TABLE_CHECK -+ select ARCH_SUPPORTS_RT - select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT - select ARCH_WANT_DEFAULT_BPF_JIT - select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -@@ -200,6 +201,7 @@ config ARM64 - select HAVE_PERF_USER_STACK_DUMP - select HAVE_PREEMPT_DYNAMIC_KEY - select HAVE_REGS_AND_STACK_ACCESS_API -+ select HAVE_PREEMPT_LAZY - select HAVE_POSIX_CPU_TIMERS_TASK_WORK - select HAVE_FUNCTION_ARG_ACCESS_API - select MMU_GATHER_RCU_TABLE_FREE -diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h -index 0159b625cc7f0..a5486918e5eeb 100644 ---- a/arch/arm64/include/asm/preempt.h -+++ b/arch/arm64/include/asm/preempt.h -@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_and_test(void) +diff -rupN linux.orig/arch/arm64/include/asm/preempt.h linux/arch/arm64/include/asm/preempt.h +--- linux.orig/arch/arm64/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_a * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. 
*/ @@ -226,10 +199,9 @@ index 0159b625cc7f0..a5486918e5eeb 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h -index 848739c15de82..4b7148fd5551f 100644 ---- a/arch/arm64/include/asm/thread_info.h -+++ b/arch/arm64/include/asm/thread_info.h +diff -rupN linux.orig/arch/arm64/include/asm/thread_info.h linux/arch/arm64/include/asm/thread_info.h +--- linux.orig/arch/arm64/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -26,6 +26,7 @@ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ @@ -238,7 +210,7 @@ index 848739c15de82..4b7148fd5551f 100644 union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { -@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_str #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ @@ -246,7 +218,7 @@ index 848739c15de82..4b7148fd5551f 100644 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ -@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_str #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) @@ -258,7 +230,7 @@ index 848739c15de82..4b7148fd5551f 100644 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) -@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_str _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) @@ -267,10 +239,28 @@ index 848739c15de82..4b7148fd5551f 100644 #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ -diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c -index 1197e7679882e..e74c0415f67ea 100644 ---- a/arch/arm64/kernel/asm-offsets.c -+++ b/arch/arm64/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm64/Kconfig linux/arch/arm64/Kconfig +--- linux.orig/arch/arm64/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -93,6 +93,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_RT + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -200,6 +201,7 @@ config ARM64 + select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_DYNAMIC_KEY + select HAVE_REGS_AND_STACK_ACCESS_API ++ select HAVE_PREEMPT_LAZY + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_FUNCTION_ARG_ACCESS_API + select MMU_GATHER_RCU_TABLE_FREE +diff -rupN linux.orig/arch/arm64/kernel/asm-offsets.c linux/arch/arm64/kernel/asm-offsets.c +--- linux.orig/arch/arm64/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -32,6 +32,7 @@ int main(void) DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); 
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); @@ -279,11 +269,10 @@ index 1197e7679882e..e74c0415f67ea 100644 #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif -diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c -index 9ad911f1647c8..545c41a84411e 100644 ---- a/arch/arm64/kernel/signal.c -+++ b/arch/arm64/kernel/signal.c -@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *regs) +diff -rupN linux.orig/arch/arm64/kernel/signal.c linux/arch/arm64/kernel/signal.c +--- linux.orig/arch/arm64/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/signal.c 2022-12-04 10:40:26.676034147 -0500 +@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *re void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { @@ -292,34 +281,10 @@ index 9ad911f1647c8..545c41a84411e 100644 /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index cbe7bb029aec8..ad5bcc255f4e3 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -149,6 +149,7 @@ config PPC - select ARCH_STACKWALK - select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x -+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if PPC64 - select ARCH_USE_MEMTEST -@@ -241,8 +242,10 @@ config PPC - select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE -+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM - select HAVE_RSEQ - select HAVE_SETUP_PER_CPU_AREA if PPC64 - select HAVE_SOFTIRQ_ON_OWN_STACK -diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h -index 1c8460e235838..b1653c160bab9 100644 ---- a/arch/powerpc/include/asm/stackprotector.h -+++ b/arch/powerpc/include/asm/stackprotector.h -@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) +diff -rupN linux.orig/arch/powerpc/include/asm/stackprotector.h linux/arch/powerpc/include/asm/stackprotector.h +--- linux.orig/arch/powerpc/include/asm/stackprotector.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/stackprotector.h 2022-12-04 10:40:26.676034147 -0500 +@@ -24,7 +24,11 @@ static __always_inline void boot_init_st unsigned long canary; /* Try to get a semi random initial value. 
*/ @@ -331,10 +296,9 @@ index 1c8460e235838..b1653c160bab9 100644 canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; -diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h -index af58f1ed3952e..520864de8bb27 100644 ---- a/arch/powerpc/include/asm/thread_info.h -+++ b/arch/powerpc/include/asm/thread_info.h +diff -rupN linux.orig/arch/powerpc/include/asm/thread_info.h linux/arch/powerpc/include/asm/thread_info.h +--- linux.orig/arch/powerpc/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -53,6 +53,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, @@ -389,11 +353,32 @@ index af58f1ed3952e..520864de8bb27 100644 /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ -diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c -index f9db0a172401a..38aa3d06c632c 100644 ---- a/arch/powerpc/kernel/interrupt.c -+++ b/arch/powerpc/kernel/interrupt.c -@@ -184,7 +184,7 @@ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) +diff -rupN linux.orig/arch/powerpc/Kconfig linux/arch/powerpc/Kconfig +--- linux.orig/arch/powerpc/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -149,6 +149,7 @@ config PPC + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_MEMTEST +@@ -241,8 +242,10 @@ config PPC + select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_RSEQ + select HAVE_SETUP_PER_CPU_AREA if PPC64 + select HAVE_SOFTIRQ_ON_OWN_STACK +diff -rupN linux.orig/arch/powerpc/kernel/interrupt.c linux/arch/powerpc/kernel/interrupt.c +--- linux.orig/arch/powerpc/kernel/interrupt.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/interrupt.c 2022-12-04 10:40:26.676034147 -0500 +@@ -184,7 +184,7 @@ again: ti_flags = read_thread_flags(); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); @@ -402,7 +387,7 @@ index f9db0a172401a..38aa3d06c632c 100644 schedule(); } else { /* -@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) +@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_ker /* Returning to a kernel context with local irqs enabled. 
*/ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: @@ -419,10 +404,9 @@ index f9db0a172401a..38aa3d06c632c 100644 } } -diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c -index dadfcef5d6db4..3bfe55d82b042 100644 ---- a/arch/powerpc/kernel/traps.c -+++ b/arch/powerpc/kernel/traps.c +diff -rupN linux.orig/arch/powerpc/kernel/traps.c linux/arch/powerpc/kernel/traps.c +--- linux.orig/arch/powerpc/kernel/traps.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/traps.c 2022-12-04 10:40:26.676034147 -0500 @@ -260,12 +260,17 @@ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) @@ -442,10 +426,9 @@ index dadfcef5d6db4..3bfe55d82b042 100644 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", -diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig -index dcb398d5e0093..2cfa432afdb12 100644 ---- a/arch/powerpc/kvm/Kconfig -+++ b/arch/powerpc/kvm/Kconfig +diff -rupN linux.orig/arch/powerpc/kvm/Kconfig linux/arch/powerpc/kvm/Kconfig +--- linux.orig/arch/powerpc/kvm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kvm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -221,6 +221,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" @@ -454,10 +437,9 @@ index dcb398d5e0093..2cfa432afdb12 100644 select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING -diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c -index 561adac690229..61c4c0610aa6a 100644 ---- a/arch/powerpc/platforms/pseries/iommu.c -+++ b/arch/powerpc/platforms/pseries/iommu.c +diff -rupN linux.orig/arch/powerpc/platforms/pseries/iommu.c linux/arch/powerpc/platforms/pseries/iommu.c +--- linux.orig/arch/powerpc/platforms/pseries/iommu.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/platforms/pseries/iommu.c 2022-12-04 10:40:26.676034147 -0500 @@ -24,6 +24,7 @@ #include #include @@ -466,7 +448,7 @@ index 561adac690229..61c4c0610aa6a 100644 #include #include #include -@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, +@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned return ret; } @@ -481,7 +463,7 @@ index 561adac690229..61c4c0610aa6a 100644 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, -@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(stru direction, attrs); } @@ -494,7 +476,7 @@ index 561adac690229..61c4c0610aa6a 100644 /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() -@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(stru tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { @@ -509,7 +491,7 @@ index 561adac690229..61c4c0610aa6a 100644 } rpn = __pa(uaddr) >> tceshift; -@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(stru tcenum += limit; } while (npages > 0 && !rc); @@ -518,7 +500,7 @@ index 561adac690229..61c4c0610aa6a 100644 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; -@@ -440,16 +448,17 @@ 
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -440,16 +448,17 @@ static int tce_setrange_multi_pSeriesLP( DMA_BIDIRECTIONAL, 0); } @@ -540,7 +522,7 @@ index 561adac690229..61c4c0610aa6a 100644 } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; -@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP( /* error cleanup: caller will clear whole range */ @@ -549,31 +531,10 @@ index 561adac690229..61c4c0610aa6a 100644 return rc; } -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 159c025ebb03e..4d62ceece1bb0 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -109,6 +109,7 @@ config X86 - select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_LTO_CLANG - select ARCH_SUPPORTS_LTO_CLANG_THIN -+ select ARCH_SUPPORTS_RT - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_MEMTEST - select ARCH_USE_QUEUED_RWLOCKS -@@ -243,6 +244,7 @@ config X86 - select HAVE_PCI - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT - select MMU_GATHER_MERGE_VMAS - select HAVE_POSIX_CPU_TIMERS_TASK_WORK -diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h -index 5f6daea1ee248..cd20b4a5719a4 100644 ---- a/arch/x86/include/asm/preempt.h -+++ b/arch/x86/include/asm/preempt.h -@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val) +diff -rupN linux.orig/arch/x86/include/asm/preempt.h linux/arch/x86/include/asm/preempt.h +--- linux.orig/arch/x86/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -90,17 +90,48 @@ static __always_inline void __preempt_co * a decrement which hits zero means we have no preempt_count and should * reschedule. 
*/ @@ -623,10 +584,9 @@ index 5f6daea1ee248..cd20b4a5719a4 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index f0cb881c1d690..fd8fb76f324fc 100644 ---- a/arch/x86/include/asm/thread_info.h -+++ b/arch/x86/include/asm/thread_info.h +diff -rupN linux.orig/arch/x86/include/asm/thread_info.h linux/arch/x86/include/asm/thread_info.h +--- linux.orig/arch/x86/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -57,6 +57,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ @@ -660,11 +620,29 @@ index f0cb881c1d690..fd8fb76f324fc 100644 #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c -index fac8ff983aec8..65fb9bad1577a 100644 ---- a/drivers/bcma/driver_gpio.c -+++ b/drivers/bcma/driver_gpio.c -@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/arch/x86/Kconfig linux/arch/x86/Kconfig +--- linux.orig/arch/x86/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -109,6 +109,7 @@ config X86 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_MEMTEST + select ARCH_USE_QUEUED_RWLOCKS +@@ -243,6 +244,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK +diff -rupN linux.orig/drivers/bcma/driver_gpio.c linux/drivers/bcma/driver_gpio.c +--- linux.orig/drivers/bcma/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/bcma/driver_gpio.c 2022-12-04 10:40:26.680034137 -0500 +@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler return IRQ_NONE; for_each_set_bit(gpio, &irqs, gc->ngpio) @@ -673,11 +651,10 @@ index fac8ff983aec8..65fb9bad1577a 100644 bcma_chipco_gpio_polarity(cc, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index 226ea76cc8197..4043d909d41bf 100644 ---- a/drivers/block/zram/zram_drv.c -+++ b/drivers/block/zram/zram_drv.c -@@ -60,6 +60,40 @@ static void zram_free_page(struct zram *zram, size_t index); +diff -rupN linux.orig/drivers/block/zram/zram_drv.c linux/drivers/block/zram/zram_drv.c +--- linux.orig/drivers/block/zram/zram_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.c 2022-12-04 10:40:26.680034137 -0500 +@@ -60,6 +60,40 @@ static void zram_free_page(struct zram * static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); @@ -718,7 +695,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static int zram_slot_trylock(struct zram *zram, u32 index) { -@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) +@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } @@ -726,7 +703,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static inline bool init_done(struct zram *zram) { -@@ 
-1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) +@@ -1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); @@ -734,10 +711,9 @@ index 226ea76cc8197..4043d909d41bf 100644 return true; } -diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h -index 80c3b43b4828f..ff021a9728d1e 100644 ---- a/drivers/block/zram/zram_drv.h -+++ b/drivers/block/zram/zram_drv.h +diff -rupN linux.orig/drivers/block/zram/zram_drv.h linux/drivers/block/zram/zram_drv.h +--- linux.orig/drivers/block/zram/zram_drv.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.h 2022-12-04 10:40:26.680034137 -0500 @@ -63,6 +63,9 @@ struct zram_table_entry { unsigned long element; }; @@ -748,11 +724,10 @@ index 80c3b43b4828f..ff021a9728d1e 100644 #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif -diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c -index bcff6429e0b4f..4a9ae338a2bdf 100644 ---- a/drivers/char/tpm/tpm_tis.c -+++ b/drivers/char/tpm/tpm_tis.c -@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da +diff -rupN linux.orig/drivers/char/tpm/tpm_tis.c linux/drivers/char/tpm/tpm_tis.c +--- linux.orig/drivers/char/tpm/tpm_tis.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/char/tpm/tpm_tis.c 2022-12-04 10:40:26.680034137 -0500 +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to return container_of(data, struct tpm_tis_tcg_phy, priv); } @@ -784,7 +759,7 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); -@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, +@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tp switch (io_mode) { case TPM_TIS_PHYS_8: while (len--) @@ -799,11 +774,10 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 break; } -diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c -index 64cb060d9d753..77a41151c921b 100644 ---- a/drivers/gpio/gpio-mlxbf2.c -+++ b/drivers/gpio/gpio-mlxbf2.c -@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handler(int irq, void *ptr) +diff -rupN linux.orig/drivers/gpio/gpio-mlxbf2.c linux/drivers/gpio/gpio-mlxbf2.c +--- linux.orig/drivers/gpio/gpio-mlxbf2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpio/gpio-mlxbf2.c 2022-12-04 10:40:26.680034137 -0500 +@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handl pending = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_CAUSE_EVTEN0); writel(pending, gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); @@ -816,23 +790,10 @@ index 64cb060d9d753..77a41151c921b 100644 return IRQ_RETVAL(pending); } -diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig -index 7ae3b7d67fcfc..844f54f1daea9 100644 ---- a/drivers/gpu/drm/i915/Kconfig -+++ b/drivers/gpu/drm/i915/Kconfig -@@ -3,7 +3,6 @@ config DRM_I915 - tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" - depends on DRM - depends on X86 && PCI -- depends on !PREEMPT_RT - select INTEL_GTT if X86 - select INTERVAL_TREE - # we need shmfs for the swappable backing store, and in particular -diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c -index 4442aa355f868..23085e82c3ed5 100644 ---- a/drivers/gpu/drm/i915/display/intel_crtc.c -+++ b/drivers/gpu/drm/i915/display/intel_crtc.c -@@ -522,7 +522,8 @@ void 
intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +diff -rupN linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c linux/drivers/gpu/drm/i915/display/intel_crtc.c +--- linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-04 10:40:26.680034137 -0500 +@@ -522,7 +522,8 @@ void intel_pipe_update_start(struct inte */ intel_psr_wait_for_idle_locked(new_crtc_state); @@ -842,7 +803,7 @@ index 4442aa355f868..23085e82c3ed5 100644 crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; -@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct inte break; } @@ -858,7 +819,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } finish_wait(wq, &wait); -@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct inte return; irq_disable: @@ -868,7 +829,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) -@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) +@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_ */ intel_vrr_send_push(new_crtc_state); @@ -878,11 +839,10 @@ index 4442aa355f868..23085e82c3ed5 100644 if (intel_vgpu_active(dev_priv)) return; -diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -index ecc990ec1b952..8d04b10681f0d 100644 ---- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-04 10:40:26.680034137 -0500 +@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct int /* Kick the work once more to drain the signalers, and disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { @@ -895,11 +855,10 @@ index ecc990ec1b952..8d04b10681f0d 100644 } } -diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -index c718e6dc40b51..0e592999b7d60 100644 ---- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct int * and context switches) submission. */ @@ -908,7 +867,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * If the queue is higher priority than the last -@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct int * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. 
*/ @@ -917,7 +876,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; } } -@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct int if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); @@ -926,7 +885,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; /* leave this for another sibling */ } -@@ -1590,7 +1590,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1590,7 +1590,7 @@ done: */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); @@ -935,7 +894,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * We can skip poking the HW if we ended up with exactly the same set -@@ -1616,13 +1616,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1616,13 +1616,6 @@ done: } } @@ -949,7 +908,7 @@ index c718e6dc40b51..0e592999b7d60 100644 static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); -@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) +@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet } if (!engine->execlists.pending[0]) { @@ -958,11 +917,10 @@ index c718e6dc40b51..0e592999b7d60 100644 start_timeslice(engine); } -diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c -index 73cebc6aa6507..98305fb393413 100644 ---- a/drivers/gpu/drm/i915/i915_irq.c -+++ b/drivers/gpu/drm/i915/i915_irq.c -@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_irq.c linux/drivers/gpu/drm/i915/i915_irq.c +--- linux.orig/drivers/gpu/drm/i915/i915_irq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_irq.c 2022-12-04 10:40:26.680034137 -0500 +@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(str */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); @@ -972,7 +930,7 @@ index 73cebc6aa6507..98305fb393413 100644 /* Get optional system timestamp before query. 
*/ if (stime) -@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(str if (etime) *etime = ktime_get(); @@ -982,11 +940,10 @@ index 73cebc6aa6507..98305fb393413 100644 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); -diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c -index 62fad16a55e84..af07927650b24 100644 ---- a/drivers/gpu/drm/i915/i915_request.c -+++ b/drivers/gpu/drm/i915/i915_request.c -@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_request *request) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_request.c linux/drivers/gpu/drm/i915/i915_request.c +--- linux.orig/drivers/gpu/drm/i915/i915_request.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_request.c 2022-12-04 10:40:26.680034137 -0500 +@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_r RQ_TRACE(request, "\n"); @@ -994,7 +951,7 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915_request *request) +@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915 */ RQ_TRACE(request, "\n"); @@ -1002,10 +959,9 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h -index 37b5c9e9d260e..73f29d8008f0c 100644 ---- a/drivers/gpu/drm/i915/i915_trace.h -+++ b/drivers/gpu/drm/i915/i915_trace.h +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_trace.h linux/drivers/gpu/drm/i915/i915_trace.h +--- linux.orig/drivers/gpu/drm/i915/i915_trace.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_trace.h 2022-12-04 10:40:26.680034137 -0500 @@ -6,6 +6,10 @@ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ @@ -1017,7 +973,7 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 #include #include #include -@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_add, +@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_ TP_ARGS(rq) ); @@ -1026,11 +982,10 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) -diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h -index c10d68cdc3ca5..593f3a7e0e4fc 100644 ---- a/drivers/gpu/drm/i915/i915_utils.h -+++ b/drivers/gpu/drm/i915/i915_utils.h -@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_utils.h linux/drivers/gpu/drm/i915/i915_utils.h +--- linux.orig/drivers/gpu/drm/i915/i915_utils.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_utils.h 2022-12-04 10:40:26.680034137 -0500 +@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ @@ -1039,10 +994,20 @@ index c10d68cdc3ca5..593f3a7e0e4fc 100644 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) -diff --git a/drivers/net/ethernet/alacritech/slic.h b/drivers/net/ethernet/alacritech/slic.h -index 4eecbdfff3ff1..82071d0e5f7fc 100644 ---- a/drivers/net/ethernet/alacritech/slic.h -+++ b/drivers/net/ethernet/alacritech/slic.h +diff -rupN linux.orig/drivers/gpu/drm/i915/Kconfig linux/drivers/gpu/drm/i915/Kconfig +--- linux.orig/drivers/gpu/drm/i915/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/Kconfig 2022-12-04 10:40:26.680034137 -0500 +@@ -3,7 +3,6 @@ config DRM_I915 + tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" + depends on DRM + depends on X86 && PCI +- depends on !PREEMPT_RT + select INTEL_GTT if X86 + select INTERVAL_TREE + # we need shmfs for the swappable backing store, and in particular +diff -rupN linux.orig/drivers/net/ethernet/alacritech/slic.h linux/drivers/net/ethernet/alacritech/slic.h +--- linux.orig/drivers/net/ethernet/alacritech/slic.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/alacritech/slic.h 2022-12-04 10:40:26.680034137 -0500 @@ -288,13 +288,13 @@ do { \ u64_stats_update_end(&(st)->syncp); \ } while (0) @@ -1063,11 +1028,10 @@ index 4eecbdfff3ff1..82071d0e5f7fc 100644 } struct slic_upr { -diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -index 39242c5a17290..8f81d288c4880 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c -+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *src, u64 *dst, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *sr unsigned int start; do { @@ -1079,11 +1043,10 @@ index 39242c5a17290..8f81d288c4880 100644 } static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) -diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c -index 6a356a6cee15a..1c5d482990806 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_netdev.c -+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c -@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c linux/drivers/net/ethernet/amazon/ena/ena_netdev.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-04 10:40:26.680034137 -0500 +@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_d tx_ring = &adapter->tx_ring[i]; do { @@ -1096,7 +1059,7 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_device *netdev, +@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_d rx_ring = &adapter->rx_ring[i]; do { @@ -1121,11 +1084,10 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->rx_dropped = rx_drops; stats->tx_dropped = tx_drops; -diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 
-index 25129e723b575..1e8d902e1c8ea 100644 ---- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +--- linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-04 10:40:26.680034137 -0500 +@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(str /* This data should mimic aq_ethtool_queue_rx_stat_names structure */ do { count = 0; @@ -1134,7 +1096,7 @@ index 25129e723b575..1e8d902e1c8ea 100644 data[count] = self->stats.rx.packets; data[++count] = self->stats.rx.jumbo_packets; data[++count] = self->stats.rx.lro_packets; -@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(str data[++count] = self->stats.rx.xdp_tx; data[++count] = self->stats.rx.xdp_invalid; data[++count] = self->stats.rx.xdp_redirect; @@ -1153,11 +1115,10 @@ index 25129e723b575..1e8d902e1c8ea 100644 } return ++count; -diff --git a/drivers/net/ethernet/asix/ax88796c_main.c b/drivers/net/ethernet/asix/ax88796c_main.c -index 6ba5b024a7be7..25e7beb68e515 100644 ---- a/drivers/net/ethernet/asix/ax88796c_main.c -+++ b/drivers/net/ethernet/asix/ax88796c_main.c -@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/asix/ax88796c_main.c linux/drivers/net/ethernet/asix/ax88796c_main.c +--- linux.orig/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-04 10:40:26.680034137 -0500 +@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct s = per_cpu_ptr(ax_local->stats, cpu); do { @@ -1172,11 +1133,10 @@ index 6ba5b024a7be7..25e7beb68e515 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c -index e5857e88c2076..caf1714f36a18 100644 ---- a/drivers/net/ethernet/broadcom/b44.c -+++ b/drivers/net/ethernet/broadcom/b44.c -@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/b44.c linux/drivers/net/ethernet/broadcom/b44.c +--- linux.orig/drivers/net/ethernet/broadcom/b44.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/b44.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_d unsigned int start; do { @@ -1185,7 +1145,7 @@ index e5857e88c2076..caf1714f36a18 100644 /* Convert HW stats into rtnl_link_stats64 stats. 
*/ nstat->rx_packets = hwstat->rx_pkts; -@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_device *dev, +@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_d /* Carrier lost counter seems to be broken for some devices */ nstat->tx_carrier_errors = hwstat->tx_carrier_lost; #endif @@ -1194,7 +1154,7 @@ index e5857e88c2076..caf1714f36a18 100644 } -@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct net_device *dev, +@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct do { data_src = &hwstat->tx_good_octets; data_dst = data; @@ -1209,11 +1169,10 @@ index e5857e88c2076..caf1714f36a18 100644 } static void b44_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c -index 47fc8e6963d59..98d5bd15ee433 100644 ---- a/drivers/net/ethernet/broadcom/bcmsysport.c -+++ b/drivers/net/ethernet/broadcom/bcmsysport.c -@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats(struct bcm_sysport_priv *priv, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c linux/drivers/net/ethernet/broadcom/bcmsysport.c +--- linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-04 10:40:26.680034137 -0500 +@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats( for (q = 0; q < priv->netdev->num_tx_queues; q++) { ring = &priv->tx_rings[q]; do { @@ -1226,7 +1185,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 *tx_bytes += bytes; *tx_packets += packets; -@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct net_device *dev, +@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct if (s->stat_sizeof == sizeof(u64) && s->type == BCM_SYSPORT_STAT_NETDEV64) { do { @@ -1238,7 +1197,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } else data[i] = *(u32 *)p; j++; -@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(struct net_device *dev, +@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(stru &stats->tx_packets); do { @@ -1251,11 +1210,10 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } static void bcm_sysport_netif_start(struct net_device *dev) -diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c -index 6dae768671e3d..9e6de2f968fa3 100644 ---- a/drivers/net/ethernet/cortina/gemini.c -+++ b/drivers/net/ethernet/cortina/gemini.c -@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/cortina/gemini.c linux/drivers/net/ethernet/cortina/gemini.c +--- linux.orig/drivers/net/ethernet/cortina/gemini.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/cortina/gemini.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_ /* Racing with RX NAPI */ do { @@ -1264,7 +1222,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_packets = port->stats.rx_packets; stats->rx_bytes = port->stats.rx_bytes; -@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_ stats->rx_crc_errors = port->stats.rx_crc_errors; stats->rx_frame_errors = port->stats.rx_frame_errors; @@ -1278,7 +1236,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->tx_errors = port->stats.tx_errors; stats->tx_packets = port->stats.tx_packets; -@@ -1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ 
-1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_ stats->rx_missed_errors = port->stats.rx_missed_errors; stats->rx_fifo_errors = port->stats.rx_fifo_errors; @@ -1297,7 +1255,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_dropped += stats->rx_missed_errors; } -@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struc /* Racing with MIB interrupt */ do { p = values; @@ -1319,7 +1277,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < RX_STATUS_NUM; i++) *p++ = port->rx_stats[i]; -@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struc *p++ = port->rx_csum_stats[i]; *p++ = port->rx_napi_exits; @@ -1335,7 +1293,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < TX_MAX_FRAGS; i++) { *values++ = port->tx_frag_stats[i]; -@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struc *values++ = port->tx_frags_linearized; *values++ = port->tx_hw_csummed; @@ -1344,11 +1302,10 @@ index 6dae768671e3d..9e6de2f968fa3 100644 } static int gmac_get_ksettings(struct net_device *netdev, -diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c -index bd0df189d8719..39e7a4a3c15e6 100644 ---- a/drivers/net/ethernet/emulex/benet/be_ethtool.c -+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c -@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c linux/drivers/net/ethernet/emulex/benet/be_ethtool.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct struct be_rx_stats *stats = rx_stats(rxo); do { @@ -1361,7 +1318,7 @@ index bd0df189d8719..39e7a4a3c15e6 100644 for (i = 2; i < ETHTOOL_RXSTATS_NUM; i++) { p = (u8 *)stats + et_rx_stats[i].offset; -@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct net_device *netdev, +@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct struct be_tx_stats *stats = tx_stats(txo); do { @@ -1385,11 +1342,10 @@ index bd0df189d8719..39e7a4a3c15e6 100644 base += ETHTOOL_TXSTATS_NUM; } } -diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c -index 414362febbb9d..9350c901aa27b 100644 ---- a/drivers/net/ethernet/emulex/benet/be_main.c -+++ b/drivers/net/ethernet/emulex/benet/be_main.c -@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_main.c linux/drivers/net/ethernet/emulex/benet/be_main.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_de const struct be_rx_stats *rx_stats = rx_stats(rxo); do { @@ -1402,7 +1358,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->rx_packets += pkts; stats->rx_bytes += bytes; stats->multicast += rx_stats(rxo)->rx_mcast_pkts; -@@ -680,10 +680,10 @@ static void be_get_stats64(struct net_device *netdev, +@@ -680,10 +680,10 @@ static void 
be_get_stats64(struct net_de const struct be_tx_stats *tx_stats = tx_stats(txo); do { @@ -1415,7 +1371,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->tx_packets += pkts; stats->tx_bytes += bytes; } -@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_obj *eqo) +@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_o for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { do { @@ -1436,10 +1392,9 @@ index 414362febbb9d..9350c901aa27b 100644 } /* Skip, if wrapped around or first calculation */ -diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -index 671f51135c269..53b7e95213a85 100644 ---- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -+++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +diff -rupN linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +--- linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-04 10:40:26.684034126 -0500 @@ -206,9 +206,9 @@ struct funeth_rxq { #define FUN_QSTAT_READ(q, seq, stats_copy) \ @@ -1452,11 +1407,10 @@ index 671f51135c269..53b7e95213a85 100644 #define FUN_INT_NAME_LEN (IFNAMSIZ + 16) -diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c -index 7b9a2d9d96243..50b384910c839 100644 ---- a/drivers/net/ethernet/google/gve/gve_ethtool.c -+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c -@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c linux/drivers/net/ethernet/google/gve/gve_ethtool.c +--- linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device struct gve_rx_ring *rx = &priv->rx[ring]; start = @@ -1473,7 +1427,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); rx_pkts += tmp_rx_pkts; rx_bytes += tmp_rx_bytes; -@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device if (priv->tx) { do { start = @@ -1486,7 +1440,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); tx_pkts += tmp_tx_pkts; tx_bytes += tmp_tx_bytes; -@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device data[i++] = rx->fill_cnt - rx->cnt; do { start = @@ -1502,7 +1456,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_rx_bytes; data[i++] = rx->rx_cont_packet_cnt; -@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device } do { start = @@ -1514,11 +1468,10 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_tx_bytes; data[i++] = tx->wake_queue; -diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c -index 044db3ebb071c..6cafee55efc32 100644 ---- a/drivers/net/ethernet/google/gve/gve_main.c -+++ b/drivers/net/ethernet/google/gve/gve_main.c -@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_main.c linux/drivers/net/ethernet/google/gve/gve_main.c +--- 
linux.orig/drivers/net/ethernet/google/gve/gve_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { do { start = @@ -1531,7 +1484,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->rx_packets += packets; s->rx_bytes += bytes; -@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { do { start = @@ -1544,7 +1497,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->tx_packets += packets; s->tx_bytes += bytes; -@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_priv *priv) +@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_ } do { @@ -1556,11 +1509,10 @@ index 044db3ebb071c..6cafee55efc32 100644 stats[stats_idx++] = (struct stats) { .stat_name = cpu_to_be32(TX_WAKE_CNT), .value = cpu_to_be64(priv->tx[idx].wake_queue), -diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -index 35d70041b9e84..f82e98263307a 100644 ---- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -@@ -2486,7 +2486,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-04 10:40:26.684034126 -0500 +@@ -2488,7 +2488,7 @@ static void hns3_fetch_stats(struct rtnl unsigned int start; do { @@ -1569,7 +1521,7 @@ index 35d70041b9e84..f82e98263307a 100644 if (is_tx) { stats->tx_bytes += ring->stats.tx_bytes; stats->tx_packets += ring->stats.tx_pkts; -@@ -2520,7 +2520,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +@@ -2522,7 +2522,7 @@ static void hns3_fetch_stats(struct rtnl stats->multicast += ring->stats.rx_multicast; stats->rx_length_errors += ring->stats.err_pkt_len; } @@ -1578,11 +1530,5909 @@ index 35d70041b9e84..f82e98263307a 100644 } static void hns3_nic_get_stats64(struct net_device *netdev, -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -index e5828a658caf4..a866bea651103 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq, struct hinic_rxq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 2022-12-04 10:40:18.116056079 -0500 +@@ -0,0 +1,5895 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++// Copyright (c) 2016-2017 Hisilicon Limited. 
++ ++#include ++#include ++#include ++#ifdef CONFIG_RFS_ACCEL ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "hnae3.h" ++#include "hns3_enet.h" ++/* All hns3 tracepoints are defined by the include below, which ++ * must be included exactly once across the whole kernel with ++ * CREATE_TRACE_POINTS defined ++ */ ++#define CREATE_TRACE_POINTS ++#include "hns3_trace.h" ++ ++#define hns3_set_field(origin, shift, val) ((origin) |= (val) << (shift)) ++#define hns3_tx_bd_count(S) DIV_ROUND_UP(S, HNS3_MAX_BD_SIZE) ++ ++#define hns3_rl_err(fmt, ...) \ ++ do { \ ++ if (net_ratelimit()) \ ++ netdev_err(fmt, ##__VA_ARGS__); \ ++ } while (0) ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force); ++ ++static const char hns3_driver_name[] = "hns3"; ++static const char hns3_driver_string[] = ++ "Hisilicon Ethernet Network Driver for Hip08 Family"; ++static const char hns3_copyright[] = "Copyright (c) 2017 Huawei Corporation."; ++static struct hnae3_client client; ++ ++static int debug = -1; ++module_param(debug, int, 0); ++MODULE_PARM_DESC(debug, " Network interface message level setting"); ++ ++static unsigned int tx_sgl = 1; ++module_param(tx_sgl, uint, 0600); ++MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping"); ++ ++static bool page_pool_enabled = true; ++module_param(page_pool_enabled, bool, 0400); ++ ++#define HNS3_SGL_SIZE(nfrag) (sizeof(struct scatterlist) * (nfrag) + \ ++ sizeof(struct sg_table)) ++#define HNS3_MAX_SGL_SIZE ALIGN(HNS3_SGL_SIZE(HNS3_MAX_TSO_BD_NUM), \ ++ dma_get_cache_alignment()) ++ ++#define DEFAULT_MSG_LEVEL (NETIF_MSG_PROBE | NETIF_MSG_LINK | \ ++ NETIF_MSG_IFDOWN | NETIF_MSG_IFUP) ++ ++#define HNS3_INNER_VLAN_TAG 1 ++#define HNS3_OUTER_VLAN_TAG 2 ++ ++#define HNS3_MIN_TX_LEN 33U ++#define HNS3_MIN_TUN_PKT_LEN 65U ++ ++/* hns3_pci_tbl - PCI Device ID Table ++ * ++ * Last entry must be all 0s ++ * ++ * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, ++ * Class, Class Mask, private data (not used) } ++ */ ++static const struct pci_device_id hns3_pci_tbl[] = { ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_200G_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_VF), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_RDMA_DCB_PFC_VF), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ /* required last entry */ ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, hns3_pci_tbl); ++ ++#define HNS3_RX_PTYPE_ENTRY(ptype, l, s, t, h) \ ++ { ptype, \ ++ l, \ ++ CHECKSUM_##s, \ ++ HNS3_L3_TYPE_##t, \ ++ 1, \ ++ h} ++ ++#define HNS3_RX_PTYPE_UNUSED_ENTRY(ptype) \ ++ { ptype, 0, CHECKSUM_NONE, HNS3_L3_TYPE_PARSE_FAIL, 0, \ ++ PKT_HASH_TYPE_NONE } ++ ++static const struct hns3_rx_ptype hns3_rx_ptype_tbl[] = { ++ HNS3_RX_PTYPE_UNUSED_ENTRY(0), ++ HNS3_RX_PTYPE_ENTRY(1, 0, COMPLETE, ARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(2, 0, 
COMPLETE, RARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(3, 0, COMPLETE, LLDP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(4, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(5, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(6, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(7, 0, COMPLETE, CNM, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(8, 0, NONE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(9), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(10), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(11), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(12), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(13), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(14), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(15), ++ HNS3_RX_PTYPE_ENTRY(16, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(17, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(18, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(19, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(20, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(21, 0, NONE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(22, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(23, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(24, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(25, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(26), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(27), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(28), ++ HNS3_RX_PTYPE_ENTRY(29, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(30, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(31, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(32, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(33, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(34, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(35, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(36, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(37, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(38), ++ HNS3_RX_PTYPE_ENTRY(39, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(40, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(41, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(42, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(43, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(44, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(45, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(46), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(47), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(48), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(49), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(50), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(51), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(52), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(53), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(54), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(55), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(56), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(57), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(58), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(59), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(60), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(61), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(62), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(63), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(64), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(65), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(66), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(67), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(68), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(69), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(70), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(71), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(72), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(73), 
++ HNS3_RX_PTYPE_UNUSED_ENTRY(74), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(75), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(76), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(77), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(78), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(79), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(80), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(81), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(82), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(83), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(84), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(85), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(86), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(87), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(88), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(89), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(90), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(91), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(92), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(93), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(94), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(95), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(96), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(97), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(98), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(99), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(100), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(101), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(102), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(103), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(104), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(105), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(106), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(107), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(108), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(109), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(110), ++ HNS3_RX_PTYPE_ENTRY(111, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(112, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(113, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(114, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(115, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(116, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(117, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(118, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(119, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(120), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(121), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(122), ++ HNS3_RX_PTYPE_ENTRY(123, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(124, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(125, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(126, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(127, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(128, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(129, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(130, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(131, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(132), ++ HNS3_RX_PTYPE_ENTRY(133, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(134, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(135, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(136, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(137, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(138, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(139, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(140), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(141), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(142), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(143), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(144), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(145), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(146), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(147), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(148), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(149), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(150), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(151), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(152), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(153), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(154), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(155), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(156), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(157), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(158), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(159), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(160), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(161), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(162), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(163), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(164), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(165), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(166), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(167), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(168), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(169), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(170), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(171), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(172), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(173), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(174), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(175), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(176), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(177), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(178), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(179), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(180), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(181), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(182), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(183), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(184), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(185), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(186), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(187), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(188), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(189), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(190), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(191), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(192), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(193), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(194), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(195), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(196), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(197), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(198), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(199), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(200), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(201), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(202), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(203), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(204), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(205), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(206), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(207), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(208), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(209), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(210), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(211), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(212), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(213), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(214), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(215), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(216), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(217), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(218), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(219), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(220), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(221), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(222), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(223), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(224), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(225), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(226), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(227), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(228), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(229), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(230), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(231), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(232), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(233), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(234), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(235), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(236), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(237), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(238), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(239), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(240), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(241), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(242), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(243), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(244), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(245), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(246), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(247), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(248), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(249), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(250), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(251), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(252), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(253), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(254), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(255), ++}; ++ ++#define HNS3_INVALID_PTYPE \ ++ ARRAY_SIZE(hns3_rx_ptype_tbl) ++ ++static irqreturn_t hns3_irq_handle(int irq, void *vector) ++{ ++ struct hns3_enet_tqp_vector *tqp_vector = vector; ++ ++ napi_schedule_irqoff(&tqp_vector->napi); ++ tqp_vector->event_cnt++; ++ ++ return IRQ_HANDLED; ++} ++ ++static void hns3_nic_uninit_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ unsigned int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag != HNS3_VECTOR_INITED) ++ continue; ++ ++ /* clear the affinity mask */ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, NULL); ++ ++ /* release the irq resource */ ++ free_irq(tqp_vectors->vector_irq, tqp_vectors); ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_NOT_INITED; ++ } ++} ++ ++static int hns3_nic_init_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ int txrx_int_idx = 0; ++ int rx_int_idx = 0; ++ int tx_int_idx = 0; ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag == HNS3_VECTOR_INITED) ++ continue; ++ ++ if (tqp_vectors->tx_group.ring && tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "TxRx", txrx_int_idx++); ++ txrx_int_idx++; ++ } else if (tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Rx", rx_int_idx++); ++ } else if (tqp_vectors->tx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Tx", tx_int_idx++); ++ } else { ++ /* Skip this unused q_vector */ ++ continue; ++ } ++ ++ tqp_vectors->name[HNAE3_INT_NAME_LEN - 1] = '\0'; ++ ++ irq_set_status_flags(tqp_vectors->vector_irq, IRQ_NOAUTOEN); ++ ret = request_irq(tqp_vectors->vector_irq, hns3_irq_handle, 0, ++ tqp_vectors->name, tqp_vectors); ++ if (ret) { ++ netdev_err(priv->netdev, "request irq(%d) fail\n", ++ tqp_vectors->vector_irq); ++ hns3_nic_uninit_irq(priv); ++ return ret; ++ } ++ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, ++ &tqp_vectors->affinity_mask); ++ ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_INITED; ++ } ++ ++ return 0; ++} ++ ++static void hns3_mask_vector_irq(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 mask_en) ++{ ++ writel(mask_en, tqp_vector->mask_addr); ++} ++ ++static void hns3_vector_enable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ napi_enable(&tqp_vector->napi); ++ enable_irq(tqp_vector->vector_irq); ++ ++ /* enable vector */ ++ hns3_mask_vector_irq(tqp_vector, 1); ++} ++ ++static void hns3_vector_disable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ /* disable vector */ ++ hns3_mask_vector_irq(tqp_vector, 0); ++ ++ disable_irq(tqp_vector->vector_irq); ++ napi_disable(&tqp_vector->napi); ++ cancel_work_sync(&tqp_vector->rx_group.dim.work); ++ cancel_work_sync(&tqp_vector->tx_group.dim.work); ++} ++ ++void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 rl_value) ++{ ++ u32 rl_reg = hns3_rl_usec_to_reg(rl_value); ++ ++ /* this 
defines the configuration for RL (Interrupt Rate Limiter). ++ * Rl defines rate of interrupts i.e. number of interrupts-per-second ++ * GL and RL(Rate Limiter) are 2 ways to acheive interrupt coalescing ++ */ ++ if (rl_reg > 0 && !tqp_vector->tx_group.coal.adapt_enable && ++ !tqp_vector->rx_group.coal.adapt_enable) ++ /* According to the hardware, the range of rl_reg is ++ * 0-59 and the unit is 4. ++ */ ++ rl_reg |= HNS3_INT_RL_ENABLE_MASK; ++ ++ writel(rl_reg, tqp_vector->mask_addr + HNS3_VECTOR_RL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->rx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->tx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_TX_QL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_RX_QL_OFFSET); ++} ++ ++static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hns3_enet_coalesce *ptx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *prx_coal = &priv->rx_coal; ++ ++ tx_coal->adapt_enable = ptx_coal->adapt_enable; ++ rx_coal->adapt_enable = prx_coal->adapt_enable; ++ ++ tx_coal->int_gl = ptx_coal->int_gl; ++ rx_coal->int_gl = prx_coal->int_gl; ++ ++ rx_coal->flow_level = prx_coal->flow_level; ++ tx_coal->flow_level = ptx_coal->flow_level; ++ ++ /* device version above V3(include V3), GL can configure 1us ++ * unit, so uses 1us unit. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { ++ tx_coal->unit_1us = 1; ++ rx_coal->unit_1us = 1; ++ } ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->ql_enable = 1; ++ rx_coal->ql_enable = 1; ++ tx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ rx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ tx_coal->int_ql = ptx_coal->int_ql; ++ rx_coal->int_ql = prx_coal->int_ql; ++ } ++} ++ ++static void ++hns3_vector_coalesce_init_hw(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, tx_coal->int_gl); ++ hns3_set_vector_coalesce_rx_gl(tqp_vector, rx_coal->int_gl); ++ hns3_set_vector_coalesce_rl(tqp_vector, h->kinfo.int_rl_setting); ++ ++ if (tx_coal->ql_enable) ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, tx_coal->int_ql); ++ ++ if (rx_coal->ql_enable) ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, rx_coal->int_ql); ++} ++ ++static int hns3_nic_set_real_num_queue(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ unsigned int queue_size = kinfo->num_tqps; ++ int i, ret; ++ ++ if (tc_info->num_tc <= 1 && !tc_info->mqprio_active) { ++ netdev_reset_tc(netdev); ++ } else { ++ ret = netdev_set_num_tc(netdev, tc_info->num_tc); ++ if (ret) { ++ netdev_err(netdev, ++ "netdev_set_num_tc fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ for (i = 0; i < tc_info->num_tc; i++) ++ netdev_set_tc_queue(netdev, i, tc_info->tqp_count[i], ++ tc_info->tqp_offset[i]); ++ } ++ ++ ret = netif_set_real_num_tx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_tx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ ret = netif_set_real_num_rx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_rx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++u16 hns3_get_max_available_channels(struct hnae3_handle *h) ++{ ++ u16 alloc_tqps, max_rss_size, rss_size; ++ ++ h->ae_algo->ops->get_tqps_and_rss_info(h, &alloc_tqps, &max_rss_size); ++ rss_size = alloc_tqps / h->kinfo.tc_info.num_tc; ++ ++ return min_t(u16, rss_size, max_rss_size); ++} ++ ++static void hns3_tqp_enable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg |= BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_tqp_disable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg &= ~BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_free_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ free_irq_cpu_rmap(netdev->rx_cpu_rmap); ++ netdev->rx_cpu_rmap = NULL; ++#endif ++} ++ ++static int hns3_set_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i, ret; ++ ++ if (!netdev->rx_cpu_rmap) { ++ netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->vector_num); ++ if (!netdev->rx_cpu_rmap) ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = 
irq_cpu_rmap_add(netdev->rx_cpu_rmap, ++ tqp_vector->vector_irq); ++ if (ret) { ++ hns3_free_rx_cpu_rmap(netdev); ++ return ret; ++ } ++ } ++#endif ++ return 0; ++} ++ ++static int hns3_nic_net_up(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ int i, j; ++ int ret; ++ ++ ret = hns3_nic_reset_all_ring(h); ++ if (ret) ++ return ret; ++ ++ clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ /* enable the vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_enable(&priv->tqp_vector[i]); ++ ++ /* enable rcb */ ++ for (j = 0; j < h->kinfo.num_tqps; j++) ++ hns3_tqp_enable(h->kinfo.tqp[j]); ++ ++ /* start the ae_dev */ ++ ret = h->ae_algo->ops->start ? h->ae_algo->ops->start(h) : 0; ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ while (j--) ++ hns3_tqp_disable(h->kinfo.tqp[j]); ++ ++ for (j = i - 1; j >= 0; j--) ++ hns3_vector_disable(&priv->tqp_vector[j]); ++ } ++ ++ return ret; ++} ++ ++static void hns3_config_xps(struct hns3_nic_priv *priv) ++{ ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector = &priv->tqp_vector[i]; ++ struct hns3_enet_ring *ring = tqp_vector->tx_group.ring; ++ ++ while (ring) { ++ int ret; ++ ++ ret = netif_set_xps_queue(priv->netdev, ++ &tqp_vector->affinity_mask, ++ ring->tqp->tqp_index); ++ if (ret) ++ netdev_warn(priv->netdev, ++ "set xps queue failed: %d", ret); ++ ++ ring = ring->next; ++ } ++ } ++} ++ ++static int hns3_nic_net_open(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo; ++ int i, ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netdev_warn(netdev, "net open repeatedly!\n"); ++ return 0; ++ } ++ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_nic_set_real_num_queue(netdev); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_net_up(netdev); ++ if (ret) { ++ netdev_err(netdev, "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ kinfo = &h->kinfo; ++ for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) ++ netdev_set_prio_tc_map(netdev, i, kinfo->tc_info.prio_tc[i]); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, true); ++ ++ hns3_config_xps(priv); ++ ++ netif_dbg(h, drv, netdev, "net open\n"); ++ ++ return 0; ++} ++ ++static void hns3_reset_tx_queue(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct netdev_queue *dev_queue; ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ dev_queue = netdev_get_tx_queue(ndev, ++ priv->ring[i].queue_index); ++ netdev_tx_reset_queue(dev_queue); ++ } ++} ++ ++static void hns3_nic_net_down(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ const struct hnae3_ae_ops *ops; ++ int i; ++ ++ /* disable vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_disable(&priv->tqp_vector[i]); ++ ++ /* disable rcb */ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_tqp_disable(h->kinfo.tqp[i]); ++ ++ /* stop ae_dev */ ++ ops = priv->ae_handle->ae_algo->ops; ++ if (ops->stop) ++ ops->stop(priv->ae_handle); ++ ++ /* delay ring buffer clearing to hns3_reset_notify_uninit_enet ++ * during reset process, because driver may not be able ++ * to disable 
the ring through firmware when downing the netdev. ++ */ ++ if (!hns3_nic_resetting(netdev)) ++ hns3_clear_all_ring(priv->ae_handle, false); ++ ++ hns3_reset_tx_queue(priv->ae_handle); ++} ++ ++static int hns3_nic_net_stop(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, "net stop\n"); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, false); ++ ++ netif_carrier_off(netdev); ++ netif_tx_disable(netdev); ++ ++ hns3_nic_net_down(netdev); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_uc_addr) ++ return h->ae_algo->ops->add_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ /* need ignore the request of removing device address, because ++ * we store the device address and other addresses of uc list ++ * in the function's mac filter list. ++ */ ++ if (ether_addr_equal(addr, netdev->dev_addr)) ++ return 0; ++ ++ if (h->ae_algo->ops->rm_uc_addr) ++ return h->ae_algo->ops->rm_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_mc_addr) ++ return h->ae_algo->ops->add_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->rm_mc_addr) ++ return h->ae_algo->ops->rm_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static u8 hns3_get_netdev_flags(struct net_device *netdev) ++{ ++ u8 flags = 0; ++ ++ if (netdev->flags & IFF_PROMISC) ++ flags = HNAE3_USER_UPE | HNAE3_USER_MPE | HNAE3_BPE; ++ else if (netdev->flags & IFF_ALLMULTI) ++ flags = HNAE3_USER_MPE; ++ ++ return flags; ++} ++ ++static void hns3_nic_set_rx_mode(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u8 new_flags; ++ ++ new_flags = hns3_get_netdev_flags(netdev); ++ ++ __dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync); ++ __dev_mc_sync(netdev, hns3_nic_mc_sync, hns3_nic_mc_unsync); ++ ++ /* User mode Promisc mode enable and vlan filtering is disabled to ++ * let all packets in. ++ */ ++ h->netdev_flags = new_flags; ++ hns3_request_update_promisc_mode(h); ++} ++ ++void hns3_request_update_promisc_mode(struct hnae3_handle *handle) ++{ ++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops; ++ ++ if (ops->request_update_promisc_mode) ++ ops->request_update_promisc_mode(handle); ++} ++ ++static u32 hns3_tx_spare_space(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc, ntu; ++ ++ /* This smp_load_acquire() pairs with smp_store_release() in ++ * hns3_tx_spare_update() called in tx desc cleaning process. ++ */ ++ ntc = smp_load_acquire(&tx_spare->last_to_clean); ++ ntu = tx_spare->next_to_use; ++ ++ if (ntc > ntu) ++ return ntc - ntu - 1; ++ ++ /* The free tx buffer is divided into two part, so pick the ++ * larger one. 
++ */ ++ return max(ntc, tx_spare->len - ntu) - 1; ++} ++ ++static void hns3_tx_spare_update(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (!tx_spare || ++ tx_spare->last_to_clean == tx_spare->next_to_clean) ++ return; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * hns3_tx_spare_space() called in xmit process. ++ */ ++ smp_store_release(&tx_spare->last_to_clean, ++ tx_spare->next_to_clean); ++} ++ ++static bool hns3_can_use_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ u32 len = skb->len <= ring->tx_copybreak ? skb->len : ++ skb_headlen(skb); ++ ++ if (len > ring->tx_copybreak) ++ return false; ++ ++ if (ALIGN(len, dma_get_cache_alignment()) > space) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ if (skb->len <= ring->tx_copybreak || !tx_sgl || ++ (!skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < tx_sgl)) ++ return false; ++ ++ if (space < HNS3_MAX_SGL_SIZE) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) ++{ ++ u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; ++ struct hns3_tx_spare *tx_spare; ++ struct page *page; ++ dma_addr_t dma; ++ int order; ++ ++ if (!alloc_size) ++ return; ++ ++ order = get_order(alloc_size); ++ if (order >= MAX_ORDER) { ++ if (net_ratelimit()) ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); ++ return; ++ } ++ ++ tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare), ++ GFP_KERNEL); ++ if (!tx_spare) { ++ /* The driver still work without the tx spare buffer */ ++ dev_warn(ring_to_dev(ring), "failed to allocate hns3_tx_spare\n"); ++ goto devm_kzalloc_error; ++ } ++ ++ page = alloc_pages_node(dev_to_node(ring_to_dev(ring)), ++ GFP_KERNEL, order); ++ if (!page) { ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare pages\n"); ++ goto alloc_pages_error; ++ } ++ ++ dma = dma_map_page(ring_to_dev(ring), page, 0, ++ PAGE_SIZE << order, DMA_TO_DEVICE); ++ if (dma_mapping_error(ring_to_dev(ring), dma)) { ++ dev_warn(ring_to_dev(ring), "failed to map pages for tx spare\n"); ++ goto dma_mapping_error; ++ } ++ ++ tx_spare->dma = dma; ++ tx_spare->buf = page_address(page); ++ tx_spare->len = PAGE_SIZE << order; ++ ring->tx_spare = tx_spare; ++ return; ++ ++dma_mapping_error: ++ put_page(page); ++alloc_pages_error: ++ devm_kfree(ring_to_dev(ring), tx_spare); ++devm_kzalloc_error: ++ ring->tqp->handle->kinfo.tx_spare_buf_size = 0; ++} ++ ++/* Use hns3_tx_spare_space() to make sure there is enough buffer ++ * before calling below function to allocate tx buffer. ++ */ ++static void *hns3_tx_spare_alloc(struct hns3_enet_ring *ring, ++ unsigned int size, dma_addr_t *dma, ++ u32 *cb_len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntu = tx_spare->next_to_use; ++ ++ size = ALIGN(size, dma_get_cache_alignment()); ++ *cb_len = size; ++ ++ /* Tx spare buffer wraps back here because the end of ++ * freed tx buffer is not enough. 
++ */ ++ if (ntu + size > tx_spare->len) { ++ *cb_len += (tx_spare->len - ntu); ++ ntu = 0; ++ } ++ ++ tx_spare->next_to_use = ntu + size; ++ if (tx_spare->next_to_use == tx_spare->len) ++ tx_spare->next_to_use = 0; ++ ++ *dma = tx_spare->dma + ntu; ++ ++ return tx_spare->buf + ntu; ++} ++ ++static void hns3_tx_spare_rollback(struct hns3_enet_ring *ring, u32 len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (len > tx_spare->next_to_use) { ++ len -= tx_spare->next_to_use; ++ tx_spare->next_to_use = tx_spare->len - len; ++ } else { ++ tx_spare->next_to_use -= len; ++ } ++} ++ ++static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc = tx_spare->next_to_clean; ++ u32 len = cb->length; ++ ++ tx_spare->next_to_clean += len; ++ ++ if (tx_spare->next_to_clean >= tx_spare->len) { ++ tx_spare->next_to_clean -= tx_spare->len; ++ ++ if (tx_spare->next_to_clean) { ++ ntc = 0; ++ len = tx_spare->next_to_clean; ++ } ++ } ++ ++ /* This tx spare buffer is only really reclaimed after calling ++ * hns3_tx_spare_update(), so it is still safe to use the info in ++ * the tx buffer to do the dma sync or sg unmapping after ++ * tx_spare->next_to_clean is moved forword. ++ */ ++ if (cb->type & (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) { ++ dma_addr_t dma = tx_spare->dma + ntc; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), dma, len, ++ DMA_TO_DEVICE); ++ } else { ++ struct sg_table *sgt = tx_spare->buf + ntc; ++ ++ dma_unmap_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ } ++} ++ ++static int hns3_set_tso(struct sk_buff *skb, u32 *paylen_fdop_ol4cs, ++ u16 *mss, u32 *type_cs_vlan_tso, u32 *send_bytes) ++{ ++ u32 l4_offset, hdr_len; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ u32 l4_paylen; ++ int ret; ++ ++ if (!skb_is_gso(skb)) ++ return 0; ++ ++ ret = skb_cow_head(skb, 0); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when tso is ++ * needed. ++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ ++ /* tunnel packet */ ++ if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | ++ SKB_GSO_GRE_CSUM | ++ SKB_GSO_UDP_TUNNEL | ++ SKB_GSO_UDP_TUNNEL_CSUM)) { ++ /* reset l3&l4 pointers from outer to inner headers */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when ++ * tso is needed. 
++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ } ++ ++ /* normal or tunnel packet */ ++ l4_offset = l4.hdr - skb->data; ++ ++ /* remove payload length from inner pseudo checksum when tso */ ++ l4_paylen = skb->len - l4_offset; ++ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { ++ hdr_len = sizeof(*l4.udp) + l4_offset; ++ csum_replace_by_diff(&l4.udp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } else { ++ hdr_len = (l4.tcp->doff << 2) + l4_offset; ++ csum_replace_by_diff(&l4.tcp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } ++ ++ *send_bytes = (skb_shinfo(skb)->gso_segs - 1) * hdr_len + skb->len; ++ ++ /* find the txbd field values */ ++ *paylen_fdop_ol4cs = skb->len - hdr_len; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_TSO_B, 1); ++ ++ /* offload outer UDP header checksum */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) ++ hns3_set_field(*paylen_fdop_ol4cs, HNS3_TXD_OL4CS_B, 1); ++ ++ /* get MSS for TSO */ ++ *mss = skb_shinfo(skb)->gso_size; ++ ++ trace_hns3_tso(skb); ++ ++ return 0; ++} ++ ++static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto, ++ u8 *il4_proto) ++{ ++ union l3_hdr_info l3; ++ unsigned char *l4_hdr; ++ unsigned char *exthdr; ++ u8 l4_proto_tmp; ++ __be16 frag_off; ++ ++ /* find outer header point */ ++ l3.hdr = skb_network_header(skb); ++ l4_hdr = skb_transport_header(skb); ++ ++ if (skb->protocol == htons(ETH_P_IPV6)) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (skb->protocol == htons(ETH_P_IP)) { ++ l4_proto_tmp = l3.v4->protocol; ++ } else { ++ return -EINVAL; ++ } ++ ++ *ol4_proto = l4_proto_tmp; ++ ++ /* tunnel packet */ ++ if (!skb->encapsulation) { ++ *il4_proto = 0; ++ return 0; ++ } ++ ++ /* find inner header point */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4_hdr = skb_inner_transport_header(skb); ++ ++ if (l3.v6->version == 6) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (l3.v4->version == 4) { ++ l4_proto_tmp = l3.v4->protocol; ++ } ++ ++ *il4_proto = l4_proto_tmp; ++ ++ return 0; ++} ++ ++/* when skb->encapsulation is 0, skb->ip_summed is CHECKSUM_PARTIAL ++ * and it is udp packet, which has a dest port as the IANA assigned. ++ * the hardware is expected to do the checksum offload, but the ++ * hardware will not do the checksum offload when udp dest port is ++ * 4789, 4790 or 6081. ++ */ ++static bool hns3_tunnel_csum_bug(struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(skb->dev); ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ union l4_hdr_info l4; ++ ++ /* device version above V3(include V3), the hardware can ++ * do this checksum offload. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ return false; ++ ++ l4.hdr = skb_transport_header(skb); ++ ++ if (!(!skb->encapsulation && ++ (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) || ++ l4.udp->dest == htons(GENEVE_UDP_PORT) || ++ l4.udp->dest == htons(IANA_VXLAN_GPE_UDP_PORT)))) ++ return false; ++ ++ return true; ++} ++ ++static void hns3_set_outer_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u32 *ol_type_vlan_len_msec) ++{ ++ u32 l2_len, l3_len, l4_len; ++ unsigned char *il2_hdr; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* compute OL2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - skb->data; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute OL3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ il2_hdr = skb_inner_mac_header(skb); ++ /* compute OL4 header size, defined in 4 Bytes */ ++ l4_len = il2_hdr - l4.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L4LEN_S, l4_len >> 2); ++ ++ /* define outer network header type */ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ if (skb_is_gso(skb)) ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_CSUM); ++ else ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_NO_CSUM); ++ } else if (skb->protocol == htons(ETH_P_IPV6)) { ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV6); ++ } ++ ++ if (ol4_proto == IPPROTO_UDP) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_MAC_IN_UDP); ++ else if (ol4_proto == IPPROTO_GRE) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_NVGRE); ++} ++ ++static void hns3_set_l3_type(struct sk_buff *skb, union l3_hdr_info l3, ++ u32 *type_cs_vlan_tso) ++{ ++ if (l3.v4->version == 4) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV4); ++ ++ /* the stack computes the IP header already, the only time we ++ * need the hardware to recompute it is in the case of TSO. ++ */ ++ if (skb_is_gso(skb)) ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3CS_B, 1); ++ } else if (l3.v6->version == 6) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV6); ++ } ++} ++ ++static int hns3_set_l4_csum_length(struct sk_buff *skb, union l4_hdr_info l4, ++ u32 l4_proto, u32 *type_cs_vlan_tso) ++{ ++ /* compute inner(/normal) L4 header size, defined in 4 Bytes */ ++ switch (l4_proto) { ++ case IPPROTO_TCP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_TCP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ l4.tcp->doff); ++ break; ++ case IPPROTO_UDP: ++ if (hns3_tunnel_csum_bug(skb)) { ++ int ret = skb_put_padto(skb, HNS3_MIN_TUN_PKT_LEN); ++ ++ return ret ? 
ret : skb_checksum_help(skb); ++ } ++ ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_UDP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct udphdr) >> 2)); ++ break; ++ case IPPROTO_SCTP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_SCTP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct sctphdr) >> 2)); ++ break; ++ default: ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ return 0; ++} ++ ++static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u8 il4_proto, u32 *type_cs_vlan_tso, ++ u32 *ol_type_vlan_len_msec) ++{ ++ unsigned char *l2_hdr = skb->data; ++ u32 l4_proto = ol4_proto; ++ union l4_hdr_info l4; ++ union l3_hdr_info l3; ++ u32 l2_len, l3_len; ++ ++ l4.hdr = skb_transport_header(skb); ++ l3.hdr = skb_network_header(skb); ++ ++ /* handle encapsulation skb */ ++ if (skb->encapsulation) { ++ /* If this is a not UDP/GRE encapsulation skb */ ++ if (!(ol4_proto == IPPROTO_UDP || ol4_proto == IPPROTO_GRE)) { ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ hns3_set_outer_l2l3l4(skb, ol4_proto, ol_type_vlan_len_msec); ++ ++ /* switch to inner header */ ++ l2_hdr = skb_inner_mac_header(skb); ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ l4_proto = il4_proto; ++ } ++ ++ hns3_set_l3_type(skb, l3, type_cs_vlan_tso); ++ ++ /* compute inner(/normal) L2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - l2_hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute inner(/normal) L3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ return hns3_set_l4_csum_length(skb, l4, l4_proto, type_cs_vlan_tso); ++} ++ ++static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring, ++ struct sk_buff *skb) ++{ ++ struct hnae3_handle *handle = tx_ring->tqp->handle; ++ struct hnae3_ae_dev *ae_dev; ++ struct vlan_ethhdr *vhdr; ++ int rc; ++ ++ if (!(skb->protocol == htons(ETH_P_8021Q) || ++ skb_vlan_tag_present(skb))) ++ return 0; ++ ++ /* For HW limitation on HNAE3_DEVICE_VERSION_V2, if port based insert ++ * VLAN enabled, only one VLAN header is allowed in skb, otherwise it ++ * will cause RAS error. ++ */ ++ ae_dev = pci_get_drvdata(handle->pdev); ++ if (unlikely(skb_vlan_tagged_multi(skb) && ++ ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 && ++ handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_ENABLE)) ++ return -EINVAL; ++ ++ if (skb->protocol == htons(ETH_P_8021Q) && ++ !(handle->kinfo.netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { ++ /* When HW VLAN acceleration is turned off, and the stack ++ * sets the protocol to 802.1q, the driver just need to ++ * set the protocol to the encapsulated ethertype. 
++ */
++ skb->protocol = vlan_get_protocol(skb);
++ return 0;
++ }
++
++ if (skb_vlan_tag_present(skb)) {
++ /* Based on hw strategy, use out_vtag in two layer tag case,
++ * and use inner_vtag in one tag case.
++ */
++ if (skb->protocol == htons(ETH_P_8021Q) &&
++ handle->port_base_vlan_state ==
++ HNAE3_PORT_BASE_VLAN_DISABLE)
++ rc = HNS3_OUTER_VLAN_TAG;
++ else
++ rc = HNS3_INNER_VLAN_TAG;
++
++ skb->protocol = vlan_get_protocol(skb);
++ return rc;
++ }
++
++ rc = skb_cow_head(skb, 0);
++ if (unlikely(rc < 0))
++ return rc;
++
++ vhdr = (struct vlan_ethhdr *)skb->data;
++ vhdr->h_vlan_TCI |= cpu_to_be16((skb->priority << VLAN_PRIO_SHIFT)
++ & VLAN_PRIO_MASK);
++
++ skb->protocol = vlan_get_protocol(skb);
++ return 0;
++}
++
++/* check if the hardware is capable of checksum offloading */
++static bool hns3_check_hw_tx_csum(struct sk_buff *skb)
++{
++ struct hns3_nic_priv *priv = netdev_priv(skb->dev);
++
++ /* Kindly note, due to backward compatibility of the TX descriptor,
++ * HW checksum of the non-IP packets and GSO packets is handled at
++ * different place in the following code
++ */
++ if (skb_csum_is_sctp(skb) || skb_is_gso(skb) ||
++ !test_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state))
++ return false;
++
++ return true;
++}
++
++struct hns3_desc_param {
++ u32 paylen_ol4cs;
++ u32 ol_type_vlan_len_msec;
++ u32 type_cs_vlan_tso;
++ u16 mss_hw_csum;
++ u16 inner_vtag;
++ u16 out_vtag;
++};
++
++static void hns3_init_desc_data(struct sk_buff *skb, struct hns3_desc_param *pa)
++{
++ pa->paylen_ol4cs = skb->len;
++ pa->ol_type_vlan_len_msec = 0;
++ pa->type_cs_vlan_tso = 0;
++ pa->mss_hw_csum = 0;
++ pa->inner_vtag = 0;
++ pa->out_vtag = 0;
++}
++
++static int hns3_handle_vlan_info(struct hns3_enet_ring *ring,
++ struct sk_buff *skb,
++ struct hns3_desc_param *param)
++{
++ int ret;
++
++ ret = hns3_handle_vtags(ring, skb);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_vlan_err);
++ return ret;
++ } else if (ret == HNS3_INNER_VLAN_TAG) {
++ param->inner_vtag = skb_vlan_tag_get(skb);
++ param->inner_vtag |= (skb->priority << VLAN_PRIO_SHIFT) &
++ VLAN_PRIO_MASK;
++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_VLAN_B, 1);
++ } else if (ret == HNS3_OUTER_VLAN_TAG) {
++ param->out_vtag = skb_vlan_tag_get(skb);
++ param->out_vtag |= (skb->priority << VLAN_PRIO_SHIFT) &
++ VLAN_PRIO_MASK;
++ hns3_set_field(param->ol_type_vlan_len_msec, HNS3_TXD_OVLAN_B,
++ 1);
++ }
++ return 0;
++}
++
++static int hns3_handle_csum_partial(struct hns3_enet_ring *ring,
++ struct sk_buff *skb,
++ struct hns3_desc_cb *desc_cb,
++ struct hns3_desc_param *param)
++{
++ u8 ol4_proto, il4_proto;
++ int ret;
++
++ if (hns3_check_hw_tx_csum(skb)) {
++ /* set checksum start and offset, defined in 2 Bytes */
++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_CSUM_START_S,
++ skb_checksum_start_offset(skb) >> 1);
++ hns3_set_field(param->ol_type_vlan_len_msec,
++ HNS3_TXD_CSUM_OFFSET_S,
++ skb->csum_offset >> 1);
++ param->mss_hw_csum |= BIT(HNS3_TXD_HW_CS_B);
++ return 0;
++ }
++
++ skb_reset_mac_len(skb);
++
++ ret = hns3_get_l4_protocol(skb, &ol4_proto, &il4_proto);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_l4_proto_err);
++ return ret;
++ }
++
++ ret = hns3_set_l2l3l4(skb, ol4_proto, il4_proto,
++ &param->type_cs_vlan_tso,
++ &param->ol_type_vlan_len_msec);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_l2l3l4_err);
++ return ret;
++ }
++
++ ret = hns3_set_tso(skb, &param->paylen_ol4cs, &param->mss_hw_csum,
++ &param->type_cs_vlan_tso, &desc_cb->send_bytes);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_tso_err);
++ return ret;
++ }
++ return 0;
++}
++
++static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
++ struct sk_buff *skb, struct hns3_desc *desc,
++ struct hns3_desc_cb *desc_cb)
++{
++ struct hns3_desc_param param;
++ int ret;
++
++ hns3_init_desc_data(skb, &param);
++ ret = hns3_handle_vlan_info(ring, skb, &param);
++ if (unlikely(ret < 0))
++ return ret;
++
++ desc_cb->send_bytes = skb->len;
++
++ if (skb->ip_summed == CHECKSUM_PARTIAL) {
++ ret = hns3_handle_csum_partial(ring, skb, desc_cb, &param);
++ if (ret)
++ return ret;
++ }
++
++ /* Set txbd */
++ desc->tx.ol_type_vlan_len_msec =
++ cpu_to_le32(param.ol_type_vlan_len_msec);
++ desc->tx.type_cs_vlan_tso_len = cpu_to_le32(param.type_cs_vlan_tso);
++ desc->tx.paylen_ol4cs = cpu_to_le32(param.paylen_ol4cs);
++ desc->tx.mss_hw_csum = cpu_to_le16(param.mss_hw_csum);
++ desc->tx.vlan_tag = cpu_to_le16(param.inner_vtag);
++ desc->tx.outer_vlan_tag = cpu_to_le16(param.out_vtag);
++
++ return 0;
++}
++
++static int hns3_fill_desc(struct hns3_enet_ring *ring, dma_addr_t dma,
++ unsigned int size)
++{
++#define HNS3_LIKELY_BD_NUM 1
++
++ struct hns3_desc *desc = &ring->desc[ring->next_to_use];
++ unsigned int frag_buf_num;
++ int k, sizeoflast;
++
++ if (likely(size <= HNS3_MAX_BD_SIZE)) {
++ desc->addr = cpu_to_le64(dma);
++ desc->tx.send_size = cpu_to_le16(size);
++ desc->tx.bdtp_fe_sc_vld_ra_ri =
++ cpu_to_le16(BIT(HNS3_TXD_VLD_B));
++
++ trace_hns3_tx_desc(ring, ring->next_to_use);
++ ring_ptr_move_fw(ring, next_to_use);
++ return HNS3_LIKELY_BD_NUM;
++ }
++
++ frag_buf_num = hns3_tx_bd_count(size);
++ sizeoflast = size % HNS3_MAX_BD_SIZE;
++ sizeoflast = sizeoflast ? sizeoflast : HNS3_MAX_BD_SIZE;
++
++ /* When frag size is bigger than hardware limit, split this frag */
++ for (k = 0; k < frag_buf_num; k++) {
++ /* now, fill the descriptor */
++ desc->addr = cpu_to_le64(dma + HNS3_MAX_BD_SIZE * k);
++ desc->tx.send_size = cpu_to_le16((k == frag_buf_num - 1) ?
++ (u16)sizeoflast : (u16)HNS3_MAX_BD_SIZE);
++ desc->tx.bdtp_fe_sc_vld_ra_ri =
++ cpu_to_le16(BIT(HNS3_TXD_VLD_B));
++
++ trace_hns3_tx_desc(ring, ring->next_to_use);
++ /* move ring pointer to next */
++ ring_ptr_move_fw(ring, next_to_use);
++
++ desc = &ring->desc[ring->next_to_use];
++ }
++
++ return frag_buf_num;
++}
++
++static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv,
++ unsigned int type)
++{
++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use];
++ struct device *dev = ring_to_dev(ring);
++ unsigned int size;
++ dma_addr_t dma;
++
++ if (type & (DESC_TYPE_FRAGLIST_SKB | DESC_TYPE_SKB)) {
++ struct sk_buff *skb = (struct sk_buff *)priv;
++
++ size = skb_headlen(skb);
++ if (!size)
++ return 0;
++
++ dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE);
++ } else if (type & DESC_TYPE_BOUNCE_HEAD) {
++ /* Head data has been filled in hns3_handle_tx_bounce(),
++ * just return 0 here.
++ */ ++ return 0; ++ } else { ++ skb_frag_t *frag = (skb_frag_t *)priv; ++ ++ size = skb_frag_size(frag); ++ if (!size) ++ return 0; ++ ++ dma = skb_frag_dma_map(dev, frag, 0, size, DMA_TO_DEVICE); ++ } ++ ++ if (unlikely(dma_mapping_error(dev, dma))) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = priv; ++ desc_cb->length = size; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ return hns3_fill_desc(ring, dma, size); ++} ++ ++static unsigned int hns3_skb_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num) ++{ ++ unsigned int size; ++ int i; ++ ++ size = skb_headlen(skb); ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ if (size) { ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ size = skb_frag_size(frag); ++ if (!size) ++ continue; ++ ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_tx_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ u8 max_non_tso_bd_num, unsigned int bd_num, ++ unsigned int recursion_level) ++{ ++#define HNS3_MAX_RECURSION_LEVEL 24 ++ ++ struct sk_buff *frag_skb; ++ ++ /* If the total len is within the max bd limit */ ++ if (likely(skb->len <= HNS3_MAX_BD_SIZE && !recursion_level && ++ !skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < max_non_tso_bd_num)) ++ return skb_shinfo(skb)->nr_frags + 1U; ++ ++ if (unlikely(recursion_level >= HNS3_MAX_RECURSION_LEVEL)) ++ return UINT_MAX; ++ ++ bd_num = hns3_skb_bd_num(skb, bd_size, bd_num); ++ if (!skb_has_frag_list(skb) || bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ ++ skb_walk_frags(skb, frag_skb) { ++ bd_num = hns3_tx_bd_num(frag_skb, bd_size, max_non_tso_bd_num, ++ bd_num, recursion_level + 1); ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_gso_hdr_len(struct sk_buff *skb) ++{ ++ if (!skb->encapsulation) ++ return skb_tcp_all_headers(skb); ++ ++ return skb_inner_tcp_all_headers(skb); ++} ++ ++/* HW need every continuous max_non_tso_bd_num buffer data to be larger ++ * than MSS, we simplify it by ensuring skb_headlen + the first continuous ++ * max_non_tso_bd_num - 1 frags to be larger than gso header len + mss, ++ * and the remaining continuous max_non_tso_bd_num - 1 frags to be larger ++ * than MSS except the last max_non_tso_bd_num - 1 frags. ++ */ ++static bool hns3_skb_need_linearized(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num, u8 max_non_tso_bd_num) ++{ ++ unsigned int tot_len = 0; ++ int i; ++ ++ for (i = 0; i < max_non_tso_bd_num - 1U; i++) ++ tot_len += bd_size[i]; ++ ++ /* ensure the first max_non_tso_bd_num frags is greater than ++ * mss + header ++ */ ++ if (tot_len + bd_size[max_non_tso_bd_num - 1U] < ++ skb_shinfo(skb)->gso_size + hns3_gso_hdr_len(skb)) ++ return true; ++ ++ /* ensure every continuous max_non_tso_bd_num - 1 buffer is greater ++ * than mss except the last one. 
++ */ ++ for (i = 0; i < bd_num - max_non_tso_bd_num; i++) { ++ tot_len -= bd_size[i]; ++ tot_len += bd_size[i + max_non_tso_bd_num - 1U]; ++ ++ if (tot_len < skb_shinfo(skb)->gso_size) ++ return true; ++ } ++ ++ return false; ++} ++ ++void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_SKB_FRAGS; i++) ++ size[i] = skb_frag_size(&shinfo->frags[i]); ++} ++ ++static int hns3_skb_linearize(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ unsigned int bd_num) ++{ ++ /* 'bd_num == UINT_MAX' means the skb' fraglist has a ++ * recursion level of over HNS3_MAX_RECURSION_LEVEL. ++ */ ++ if (bd_num == UINT_MAX) { ++ hns3_ring_stats_update(ring, over_max_recursion); ++ return -ENOMEM; ++ } ++ ++ /* The skb->len has exceeded the hw limitation, linearization ++ * will not help. ++ */ ++ if (skb->len > HNS3_MAX_TSO_SIZE || ++ (!skb_is_gso(skb) && skb->len > HNS3_MAX_NON_TSO_SIZE)) { ++ hns3_ring_stats_update(ring, hw_limitation); ++ return -ENOMEM; ++ } ++ ++ if (__skb_linearize(skb)) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, ++ struct net_device *netdev, ++ struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u8 max_non_tso_bd_num = priv->max_non_tso_bd_num; ++ unsigned int bd_size[HNS3_MAX_TSO_BD_NUM + 1U]; ++ unsigned int bd_num; ++ ++ bd_num = hns3_tx_bd_num(skb, bd_size, max_non_tso_bd_num, 0, 0); ++ if (unlikely(bd_num > max_non_tso_bd_num)) { ++ if (bd_num <= HNS3_MAX_TSO_BD_NUM && skb_is_gso(skb) && ++ !hns3_skb_need_linearized(skb, bd_size, bd_num, ++ max_non_tso_bd_num)) { ++ trace_hns3_over_max_bd(skb); ++ goto out; ++ } ++ ++ if (hns3_skb_linearize(ring, skb, bd_num)) ++ return -ENOMEM; ++ ++ bd_num = hns3_tx_bd_count(skb->len); ++ ++ hns3_ring_stats_update(ring, tx_copy); ++ } ++ ++out: ++ if (likely(ring_space(ring) >= bd_num)) ++ return bd_num; ++ ++ netif_stop_subqueue(netdev, ring->queue_index); ++ smp_mb(); /* Memory barrier before checking ring_space */ ++ ++ /* Start queue in case hns3_clean_tx_ring has just made room ++ * available and has not seen the queue stopped state performed ++ * by netif_stop_subqueue above. 
++ */ ++ if (ring_space(ring) >= bd_num && netif_carrier_ok(netdev) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_start_subqueue(netdev, ring->queue_index); ++ return bd_num; ++ } ++ ++ hns3_ring_stats_update(ring, tx_busy); ++ ++ return -EBUSY; ++} ++ ++static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig) ++{ ++ struct device *dev = ring_to_dev(ring); ++ unsigned int i; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ struct hns3_desc *desc = &ring->desc[ring->next_to_use]; ++ struct hns3_desc_cb *desc_cb; ++ ++ memset(desc, 0, sizeof(*desc)); ++ ++ /* check if this is where we started */ ++ if (ring->next_to_use == next_to_use_orig) ++ break; ++ ++ /* rollback one */ ++ ring_ptr_move_bw(ring, next_to_use); ++ ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ ++ if (!desc_cb->dma) ++ continue; ++ ++ /* unmap the descriptor dma address */ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ else if (desc_cb->type & ++ (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) ++ hns3_tx_spare_rollback(ring, desc_cb->length); ++ else if (desc_cb->length) ++ dma_unmap_page(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ ++ desc_cb->length = 0; ++ desc_cb->dma = 0; ++ desc_cb->type = DESC_TYPE_UNKNOWN; ++ } ++} ++ ++static int hns3_fill_skb_to_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, unsigned int type) ++{ ++ struct sk_buff *frag_skb; ++ int i, ret, bd_num = 0; ++ ++ ret = hns3_map_and_fill_desc(ring, skb, type); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ ret = hns3_map_and_fill_desc(ring, frag, DESC_TYPE_PAGE); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ skb_walk_frags(skb, frag_skb) { ++ ret = hns3_fill_skb_to_desc(ring, frag_skb, ++ DESC_TYPE_FRAGLIST_SKB); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ return bd_num; ++} ++ ++static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num) ++{ ++#define HNS3_BYTES_PER_64BIT 8 ++ ++ struct hns3_desc desc[HNS3_MAX_PUSH_BD_NUM] = {}; ++ int offset = 0; ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ do { ++ int idx = (ring->next_to_use - num + ring->desc_num) % ++ ring->desc_num; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_push++; ++ u64_stats_update_end(&ring->syncp); ++ memcpy(&desc[offset], &ring->desc[idx], ++ sizeof(struct hns3_desc)); ++ offset++; ++ } while (--num); ++ ++ __iowrite64_copy(ring->tqp->mem_base, desc, ++ (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) / ++ HNS3_BYTES_PER_64BIT); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) ++{ ++#define HNS3_MEM_DOORBELL_OFFSET 64 ++ ++ __le64 bd_num = cpu_to_le64((u64)ring->pending_buf); ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET, ++ &bd_num, 1); ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_mem_doorbell += ring->pending_buf; ++ u64_stats_update_end(&ring->syncp); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num, ++ bool doorbell) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv 
*priv = netdev_priv(netdev); ++ ++ /* when tx push is enabled, the packet whose number of BD below ++ * HNS3_MAX_PUSH_BD_NUM can be pushed directly. ++ */ ++ if (test_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state) && num && ++ !ring->pending_buf && num <= HNS3_MAX_PUSH_BD_NUM && doorbell) { ++ hns3_tx_push_bd(ring, num); ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++ return; ++ } ++ ++ ring->pending_buf += num; ++ ++ if (!doorbell) { ++ hns3_ring_stats_update(ring, tx_more); ++ return; ++ } ++ ++ if (ring->tqp->mem_base) ++ hns3_tx_mem_doorbell(ring); ++ else ++ writel(ring->pending_buf, ++ ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG); ++ ++ ring->pending_buf = 0; ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++} ++ ++static void hns3_tsyn(struct net_device *netdev, struct sk_buff *skb, ++ struct hns3_desc *desc) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!(h->ae_algo->ops->set_tx_hwts_info && ++ h->ae_algo->ops->set_tx_hwts_info(h, skb))) ++ return; ++ ++ desc->tx.bdtp_fe_sc_vld_ra_ri |= cpu_to_le16(BIT(HNS3_TXD_TSYN_B)); ++} ++ ++static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ unsigned int type = DESC_TYPE_BOUNCE_HEAD; ++ unsigned int size = skb_headlen(skb); ++ dma_addr_t dma; ++ int bd_num = 0; ++ u32 cb_len; ++ void *buf; ++ int ret; ++ ++ if (skb->len <= ring->tx_copybreak) { ++ size = skb->len; ++ type = DESC_TYPE_BOUNCE_ALL; ++ } ++ ++ /* hns3_can_use_tx_bounce() is called to ensure the below ++ * function can always return the tx buffer. ++ */ ++ buf = hns3_tx_spare_alloc(ring, size, &dma, &cb_len); ++ ++ ret = skb_copy_bits(skb, 0, buf, size); ++ if (unlikely(ret < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, copy_bits_err); ++ return ret; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ bd_num += hns3_fill_desc(ring, dma, size); ++ ++ if (type == DESC_TYPE_BOUNCE_HEAD) { ++ ret = hns3_fill_skb_to_desc(ring, skb, ++ DESC_TYPE_BOUNCE_HEAD); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ dma_sync_single_for_device(ring_to_dev(ring), dma, size, ++ DMA_TO_DEVICE); ++ ++ hns3_ring_stats_update(ring, tx_bounce); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ u32 nfrag = skb_shinfo(skb)->nr_frags + 1; ++ struct sg_table *sgt; ++ int i, bd_num = 0; ++ dma_addr_t dma; ++ u32 cb_len; ++ int nents; ++ ++ if (skb_has_frag_list(skb)) ++ nfrag = HNS3_MAX_TSO_BD_NUM; ++ ++ /* hns3_can_use_tx_sgl() is called to ensure the below ++ * function can always return the tx buffer. 
++ */ ++ sgt = hns3_tx_spare_alloc(ring, HNS3_SGL_SIZE(nfrag), ++ &dma, &cb_len); ++ ++ /* scatterlist follows by the sg table */ ++ sgt->sgl = (struct scatterlist *)(sgt + 1); ++ sg_init_table(sgt->sgl, nfrag); ++ nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); ++ if (unlikely(nents < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, skb2sgl_err); ++ return -ENOMEM; ++ } ++ ++ sgt->orig_nents = nents; ++ sgt->nents = dma_map_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ if (unlikely(!sgt->nents)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, map_sg_err); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = DESC_TYPE_SGL_SKB; ++ ++ for (i = 0; i < sgt->nents; i++) ++ bd_num += hns3_fill_desc(ring, sg_dma_address(sgt->sgl + i), ++ sg_dma_len(sgt->sgl + i)); ++ hns3_ring_stats_update(ring, tx_sgl); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_desc_filling(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ u32 space; ++ ++ if (!ring->tx_spare) ++ goto out; ++ ++ space = hns3_tx_spare_space(ring); ++ ++ if (hns3_can_use_tx_sgl(ring, skb, space)) ++ return hns3_handle_tx_sgl(ring, skb); ++ ++ if (hns3_can_use_tx_bounce(ring, skb, space)) ++ return hns3_handle_tx_bounce(ring, skb); ++ ++out: ++ return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB); ++} ++ ++static int hns3_handle_skb_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_cb *desc_cb, ++ int next_to_use_head) ++{ ++ int ret; ++ ++ ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use], ++ desc_cb); ++ if (unlikely(ret < 0)) ++ goto fill_err; ++ ++ /* 'ret < 0' means filling error, 'ret == 0' means skb->len is ++ * zero, which is unlikely, and 'ret > 0' means how many tx desc ++ * need to be notified to the hw. ++ */ ++ ret = hns3_handle_desc_filling(ring, skb); ++ if (likely(ret > 0)) ++ return ret; ++ ++fill_err: ++ hns3_clear_desc(ring, next_to_use_head); ++ return ret; ++} ++ ++netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_ring *ring = &priv->ring[skb->queue_mapping]; ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ struct netdev_queue *dev_queue; ++ int pre_ntu, ret; ++ bool doorbell; ++ ++ /* Hardware can only handle short frames above 32 bytes */ ++ if (skb_put_padto(skb, HNS3_MIN_TX_LEN)) { ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return NETDEV_TX_OK; ++ } ++ ++ /* Prefetch the data used later */ ++ prefetch(skb->data); ++ ++ ret = hns3_nic_maybe_stop_tx(ring, netdev, skb); ++ if (unlikely(ret <= 0)) { ++ if (ret == -EBUSY) { ++ hns3_tx_doorbell(ring, 0, true); ++ return NETDEV_TX_BUSY; ++ } ++ ++ hns3_rl_err(netdev, "xmit error: %d!\n", ret); ++ goto out_err_tx_ok; ++ } ++ ++ ret = hns3_handle_skb_desc(ring, skb, desc_cb, ring->next_to_use); ++ if (unlikely(ret <= 0)) ++ goto out_err_tx_ok; ++ ++ pre_ntu = ring->next_to_use ? 
(ring->next_to_use - 1) : ++ (ring->desc_num - 1); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) ++ hns3_tsyn(netdev, skb, &ring->desc[pre_ntu]); ++ ++ ring->desc[pre_ntu].tx.bdtp_fe_sc_vld_ra_ri |= ++ cpu_to_le16(BIT(HNS3_TXD_FE_B)); ++ trace_hns3_tx_desc(ring, pre_ntu); ++ ++ skb_tx_timestamp(skb); ++ ++ /* Complete translate all packets */ ++ dev_queue = netdev_get_tx_queue(netdev, ring->queue_index); ++ doorbell = __netdev_tx_sent_queue(dev_queue, desc_cb->send_bytes, ++ netdev_xmit_more()); ++ hns3_tx_doorbell(ring, ret, doorbell); ++ ++ return NETDEV_TX_OK; ++ ++out_err_tx_ok: ++ dev_kfree_skb_any(skb); ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ return NETDEV_TX_OK; ++} ++ ++static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) ++{ ++ char format_mac_addr_perm[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ char format_mac_addr_sa[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct sockaddr *mac_addr = p; ++ int ret; ++ ++ if (!mac_addr || !is_valid_ether_addr((const u8 *)mac_addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (ether_addr_equal(netdev->dev_addr, mac_addr->sa_data)) { ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_info(netdev, "already using mac address %s\n", ++ format_mac_addr_sa); ++ return 0; ++ } ++ ++ /* For VF device, if there is a perm_addr, then the user will not ++ * be allowed to change the address. ++ */ ++ if (!hns3_is_phys_func(h->pdev) && ++ !is_zero_ether_addr(netdev->perm_addr)) { ++ hnae3_format_mac_addr(format_mac_addr_perm, netdev->perm_addr); ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_err(netdev, "has permanent MAC %s, user MAC %s not allow\n", ++ format_mac_addr_perm, format_mac_addr_sa); ++ return -EPERM; ++ } ++ ++ ret = h->ae_algo->ops->set_mac_addr(h, mac_addr->sa_data, false); ++ if (ret) { ++ netdev_err(netdev, "set_mac_address fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ eth_hw_addr_set(netdev, mac_addr->sa_data); ++ ++ return 0; ++} ++ ++static int hns3_nic_do_ioctl(struct net_device *netdev, ++ struct ifreq *ifr, int cmd) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!netif_running(netdev)) ++ return -EINVAL; ++ ++ if (!h->ae_algo->ops->do_ioctl) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->do_ioctl(h, ifr, cmd); ++} ++ ++static int hns3_nic_set_features(struct net_device *netdev, ++ netdev_features_t features) ++{ ++ netdev_features_t changed = netdev->features ^ features; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ bool enable; ++ int ret; ++ ++ if (changed & (NETIF_F_GRO_HW) && h->ae_algo->ops->set_gro_en) { ++ enable = !!(features & NETIF_F_GRO_HW); ++ ret = h->ae_algo->ops->set_gro_en(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_RX) && ++ h->ae_algo->ops->enable_hw_strip_rxvtag) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_RX); ++ ret = h->ae_algo->ops->enable_hw_strip_rxvtag(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_NTUPLE) && h->ae_algo->ops->enable_fd) { ++ enable = !!(features & NETIF_F_NTUPLE); ++ h->ae_algo->ops->enable_fd(h, enable); ++ } ++ ++ if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) && ++ h->ae_algo->ops->cls_flower_active(h)) { ++ netdev_err(netdev, ++ "there are offloaded TC filters active, cannot disable HW TC offload"); ++ return -EINVAL; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_FILTER) && ++ 
h->ae_algo->ops->enable_vlan_filter) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_FILTER); ++ ret = h->ae_algo->ops->enable_vlan_filter(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ netdev->features = features; ++ return 0; ++} ++ ++static netdev_features_t hns3_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++#define HNS3_MAX_HDR_LEN 480U ++#define HNS3_MAX_L4_HDR_LEN 60U ++ ++ size_t len; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ return features; ++ ++ if (skb->encapsulation) ++ len = skb_inner_transport_header(skb) - skb->data; ++ else ++ len = skb_transport_header(skb) - skb->data; ++ ++ /* Assume L4 is 60 byte as TCP is the only protocol with a ++ * a flexible value, and it's max len is 60 bytes. ++ */ ++ len += HNS3_MAX_L4_HDR_LEN; ++ ++ /* Hardware only supports checksum on the skb with a max header ++ * len of 480 bytes. ++ */ ++ if (len > HNS3_MAX_HDR_LEN) ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ ++ return features; ++} ++ ++static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, ++ struct hns3_enet_ring *ring, bool is_tx) ++{ ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&ring->syncp); ++ if (is_tx) { ++ stats->tx_bytes += ring->stats.tx_bytes; ++ stats->tx_packets += ring->stats.tx_pkts; ++ stats->tx_dropped += ring->stats.sw_err_cnt; ++ stats->tx_dropped += ring->stats.tx_vlan_err; ++ stats->tx_dropped += ring->stats.tx_l4_proto_err; ++ stats->tx_dropped += ring->stats.tx_l2l3l4_err; ++ stats->tx_dropped += ring->stats.tx_tso_err; ++ stats->tx_dropped += ring->stats.over_max_recursion; ++ stats->tx_dropped += ring->stats.hw_limitation; ++ stats->tx_dropped += ring->stats.copy_bits_err; ++ stats->tx_dropped += ring->stats.skb2sgl_err; ++ stats->tx_dropped += ring->stats.map_sg_err; ++ stats->tx_errors += ring->stats.sw_err_cnt; ++ stats->tx_errors += ring->stats.tx_vlan_err; ++ stats->tx_errors += ring->stats.tx_l4_proto_err; ++ stats->tx_errors += ring->stats.tx_l2l3l4_err; ++ stats->tx_errors += ring->stats.tx_tso_err; ++ stats->tx_errors += ring->stats.over_max_recursion; ++ stats->tx_errors += ring->stats.hw_limitation; ++ stats->tx_errors += ring->stats.copy_bits_err; ++ stats->tx_errors += ring->stats.skb2sgl_err; ++ stats->tx_errors += ring->stats.map_sg_err; ++ } else { ++ stats->rx_bytes += ring->stats.rx_bytes; ++ stats->rx_packets += ring->stats.rx_pkts; ++ stats->rx_dropped += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l3l4_csum_err; ++ stats->rx_crc_errors += ring->stats.l2_err; ++ stats->multicast += ring->stats.rx_multicast; ++ stats->rx_length_errors += ring->stats.err_pkt_len; ++ } ++ } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); ++} ++ ++static void hns3_nic_get_stats64(struct net_device *netdev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hnae3_handle *handle = priv->ae_handle; ++ struct rtnl_link_stats64 ring_total_stats; ++ struct hns3_enet_ring *ring; ++ unsigned int idx; ++ ++ if (test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return; ++ ++ handle->ae_algo->ops->update_stats(handle, &netdev->stats); ++ ++ memset(&ring_total_stats, 0, sizeof(ring_total_stats)); ++ for (idx = 0; idx < queue_num; idx++) { ++ /* fetch the tx stats */ ++ ring = &priv->ring[idx]; ++ hns3_fetch_stats(&ring_total_stats, ring, true); ++ ++ /* fetch the rx stats */ ++ ring = &priv->ring[idx + 
queue_num]; ++ hns3_fetch_stats(&ring_total_stats, ring, false); ++ } ++ ++ stats->tx_bytes = ring_total_stats.tx_bytes; ++ stats->tx_packets = ring_total_stats.tx_packets; ++ stats->rx_bytes = ring_total_stats.rx_bytes; ++ stats->rx_packets = ring_total_stats.rx_packets; ++ ++ stats->rx_errors = ring_total_stats.rx_errors; ++ stats->multicast = ring_total_stats.multicast; ++ stats->rx_length_errors = ring_total_stats.rx_length_errors; ++ stats->rx_crc_errors = ring_total_stats.rx_crc_errors; ++ stats->rx_missed_errors = netdev->stats.rx_missed_errors; ++ ++ stats->tx_errors = ring_total_stats.tx_errors; ++ stats->rx_dropped = ring_total_stats.rx_dropped; ++ stats->tx_dropped = ring_total_stats.tx_dropped; ++ stats->collisions = netdev->stats.collisions; ++ stats->rx_over_errors = netdev->stats.rx_over_errors; ++ stats->rx_frame_errors = netdev->stats.rx_frame_errors; ++ stats->rx_fifo_errors = netdev->stats.rx_fifo_errors; ++ stats->tx_aborted_errors = netdev->stats.tx_aborted_errors; ++ stats->tx_carrier_errors = netdev->stats.tx_carrier_errors; ++ stats->tx_fifo_errors = netdev->stats.tx_fifo_errors; ++ stats->tx_heartbeat_errors = netdev->stats.tx_heartbeat_errors; ++ stats->tx_window_errors = netdev->stats.tx_window_errors; ++ stats->rx_compressed = netdev->stats.rx_compressed; ++ stats->tx_compressed = netdev->stats.tx_compressed; ++} ++ ++static int hns3_setup_tc(struct net_device *netdev, void *type_data) ++{ ++ struct tc_mqprio_qopt_offload *mqprio_qopt = type_data; ++ struct hnae3_knic_private_info *kinfo; ++ u8 tc = mqprio_qopt->qopt.num_tc; ++ u16 mode = mqprio_qopt->mode; ++ u8 hw = mqprio_qopt->qopt.hw; ++ struct hnae3_handle *h; ++ ++ if (!((hw == TC_MQPRIO_HW_OFFLOAD_TCS && ++ mode == TC_MQPRIO_MODE_CHANNEL) || (!hw && tc == 0))) ++ return -EOPNOTSUPP; ++ ++ if (tc > HNAE3_MAX_TC) ++ return -EINVAL; ++ ++ if (!netdev) ++ return -EINVAL; ++ ++ h = hns3_get_handle(netdev); ++ kinfo = &h->kinfo; ++ ++ netif_dbg(h, drv, netdev, "setup tc: num_tc=%u\n", tc); ++ ++ return (kinfo->dcb_ops && kinfo->dcb_ops->setup_tc) ? 
++ kinfo->dcb_ops->setup_tc(h, mqprio_qopt) : -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_cls_flower(struct hns3_nic_priv *priv, ++ struct flow_cls_offload *flow) ++{ ++ int tc = tc_classid_to_hwtc(priv->netdev, flow->classid); ++ struct hnae3_handle *h = hns3_get_handle(priv->netdev); ++ ++ switch (flow->command) { ++ case FLOW_CLS_REPLACE: ++ if (h->ae_algo->ops->add_cls_flower) ++ return h->ae_algo->ops->add_cls_flower(h, flow, tc); ++ break; ++ case FLOW_CLS_DESTROY: ++ if (h->ae_algo->ops->del_cls_flower) ++ return h->ae_algo->ops->del_cls_flower(h, flow); ++ break; ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_block_cb(enum tc_setup_type type, void *type_data, ++ void *cb_priv) ++{ ++ struct hns3_nic_priv *priv = cb_priv; ++ ++ if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) ++ return -EOPNOTSUPP; ++ ++ switch (type) { ++ case TC_SETUP_CLSFLOWER: ++ return hns3_setup_tc_cls_flower(priv, type_data); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static LIST_HEAD(hns3_block_cb_list); ++ ++static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, ++ void *type_data) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(dev); ++ int ret; ++ ++ switch (type) { ++ case TC_SETUP_QDISC_MQPRIO: ++ ret = hns3_setup_tc(dev, type_data); ++ break; ++ case TC_SETUP_BLOCK: ++ ret = flow_block_cb_setup_simple(type_data, ++ &hns3_block_cb_list, ++ hns3_setup_tc_block_cb, ++ priv, priv, true); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_add_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, false); ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_kill_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, true); ++ ++ return ret; ++} ++ ++static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, ++ u8 qos, __be16 vlan_proto) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ netif_dbg(h, drv, netdev, ++ "set vf vlan: vf=%d, vlan=%u, qos=%u, vlan_proto=0x%x\n", ++ vf, vlan, qos, ntohs(vlan_proto)); ++ ++ if (h->ae_algo->ops->set_vf_vlan_filter) ++ ret = h->ae_algo->ops->set_vf_vlan_filter(h, vf, vlan, ++ qos, vlan_proto); ++ ++ return ret; ++} ++ ++static int hns3_set_vf_spoofchk(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!handle->ae_algo->ops->set_vf_spoofchk) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_spoofchk(handle, vf, enable); ++} ++ ++static int hns3_set_vf_trust(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (!handle->ae_algo->ops->set_vf_trust) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_trust(handle, vf, enable); ++} ++ ++static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!h->ae_algo->ops->set_mtu) ++ return -EOPNOTSUPP; ++ ++ netif_dbg(h, drv, netdev, ++ "change 
mtu from %u to %d\n", netdev->mtu, new_mtu); ++ ++ ret = h->ae_algo->ops->set_mtu(h, new_mtu); ++ if (ret) ++ netdev_err(netdev, "failed to change MTU in hardware %d\n", ++ ret); ++ else ++ netdev->mtu = new_mtu; ++ ++ return ret; ++} ++ ++static int hns3_get_timeout_queue(struct net_device *ndev) ++{ ++ int i; ++ ++ /* Find the stopped queue the same way the stack does */ ++ for (i = 0; i < ndev->num_tx_queues; i++) { ++ struct netdev_queue *q; ++ unsigned long trans_start; ++ ++ q = netdev_get_tx_queue(ndev, i); ++ trans_start = READ_ONCE(q->trans_start); ++ if (netif_xmit_stopped(q) && ++ time_after(jiffies, ++ (trans_start + ndev->watchdog_timeo))) { ++#ifdef CONFIG_BQL ++ struct dql *dql = &q->dql; ++ ++ netdev_info(ndev, "DQL info last_cnt: %u, queued: %u, adj_limit: %u, completed: %u\n", ++ dql->last_obj_cnt, dql->num_queued, ++ dql->adj_limit, dql->num_completed); ++#endif ++ netdev_info(ndev, "queue state: 0x%lx, delta msecs: %u\n", ++ q->state, ++ jiffies_to_msecs(jiffies - trans_start)); ++ break; ++ } ++ } ++ ++ return i; ++} ++ ++static void hns3_dump_queue_stats(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring, ++ int timeout_queue) ++{ ++ struct napi_struct *napi = &tx_ring->tqp_vector->napi; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ netdev_info(ndev, ++ "tx_timeout count: %llu, queue id: %d, SW_NTU: 0x%x, SW_NTC: 0x%x, napi state: %lu\n", ++ priv->tx_timeout_count, timeout_queue, tx_ring->next_to_use, ++ tx_ring->next_to_clean, napi->state); ++ ++ netdev_info(ndev, ++ "tx_pkts: %llu, tx_bytes: %llu, sw_err_cnt: %llu, tx_pending: %d\n", ++ tx_ring->stats.tx_pkts, tx_ring->stats.tx_bytes, ++ tx_ring->stats.sw_err_cnt, tx_ring->pending_buf); ++ ++ netdev_info(ndev, ++ "seg_pkt_cnt: %llu, tx_more: %llu, restart_queue: %llu, tx_busy: %llu\n", ++ tx_ring->stats.seg_pkt_cnt, tx_ring->stats.tx_more, ++ tx_ring->stats.restart_queue, tx_ring->stats.tx_busy); ++ ++ netdev_info(ndev, "tx_push: %llu, tx_mem_doorbell: %llu\n", ++ tx_ring->stats.tx_push, tx_ring->stats.tx_mem_doorbell); ++} ++ ++static void hns3_dump_queue_reg(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring) ++{ ++ netdev_info(ndev, ++ "BD_NUM: 0x%x HW_HEAD: 0x%x, HW_TAIL: 0x%x, BD_ERR: 0x%x, INT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_NUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_HEAD_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TAIL_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_ERR_REG), ++ readl(tx_ring->tqp_vector->mask_addr)); ++ netdev_info(ndev, ++ "RING_EN: 0x%x, TC: 0x%x, FBD_NUM: 0x%x FBD_OFT: 0x%x, EBD_NUM: 0x%x, EBD_OFT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_EN_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TC_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_FBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_OFFSET_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_EBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, ++ HNS3_RING_TX_RING_EBD_OFFSET_REG)); ++} ++ ++static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ struct hns3_enet_ring *tx_ring; ++ int timeout_queue; ++ ++ timeout_queue = hns3_get_timeout_queue(ndev); ++ if (timeout_queue >= ndev->num_tx_queues) { ++ netdev_info(ndev, ++ "no netdev TX timeout queue found, timeout count: %llu\n", ++ priv->tx_timeout_count); ++ return false; ++ } ++ ++ priv->tx_timeout_count++; ++ ++ tx_ring = &priv->ring[timeout_queue]; ++ 
hns3_dump_queue_stats(ndev, tx_ring, timeout_queue); ++ ++ /* When mac received many pause frames continuous, it's unable to send ++ * packets, which may cause tx timeout ++ */ ++ if (h->ae_algo->ops->get_mac_stats) { ++ struct hns3_mac_stats mac_stats; ++ ++ h->ae_algo->ops->get_mac_stats(h, &mac_stats); ++ netdev_info(ndev, "tx_pause_cnt: %llu, rx_pause_cnt: %llu\n", ++ mac_stats.tx_pause_cnt, mac_stats.rx_pause_cnt); ++ } ++ ++ hns3_dump_queue_reg(ndev, tx_ring); ++ ++ return true; ++} ++ ++static void hns3_nic_net_timeout(struct net_device *ndev, unsigned int txqueue) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ if (!hns3_get_tx_timeo_queue_info(ndev)) ++ return; ++ ++ /* request the reset, and let the hclge to determine ++ * which reset level should be done ++ */ ++ if (h->ae_algo->ops->reset_event) ++ h->ae_algo->ops->reset_event(h->pdev, h); ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++static int hns3_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, ++ u16 rxq_index, u32 flow_id) ++{ ++ struct hnae3_handle *h = hns3_get_handle(dev); ++ struct flow_keys fkeys; ++ ++ if (!h->ae_algo->ops->add_arfs_entry) ++ return -EOPNOTSUPP; ++ ++ if (skb->encapsulation) ++ return -EPROTONOSUPPORT; ++ ++ if (!skb_flow_dissect_flow_keys(skb, &fkeys, 0)) ++ return -EPROTONOSUPPORT; ++ ++ if ((fkeys.basic.n_proto != htons(ETH_P_IP) && ++ fkeys.basic.n_proto != htons(ETH_P_IPV6)) || ++ (fkeys.basic.ip_proto != IPPROTO_TCP && ++ fkeys.basic.ip_proto != IPPROTO_UDP)) ++ return -EPROTONOSUPPORT; ++ ++ return h->ae_algo->ops->add_arfs_entry(h, rxq_index, flow_id, &fkeys); ++} ++#endif ++ ++static int hns3_nic_get_vf_config(struct net_device *ndev, int vf, ++ struct ifla_vf_info *ivf) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->get_vf_config) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->get_vf_config(h, vf, ivf); ++} ++ ++static int hns3_nic_set_vf_link_state(struct net_device *ndev, int vf, ++ int link_state) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_link_state) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_link_state(h, vf, link_state); ++} ++ ++static int hns3_nic_set_vf_rate(struct net_device *ndev, int vf, ++ int min_tx_rate, int max_tx_rate) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_rate) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_rate(h, vf, min_tx_rate, max_tx_rate, ++ false); ++} ++ ++static int hns3_nic_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ if (!h->ae_algo->ops->set_vf_mac) ++ return -EOPNOTSUPP; ++ ++ if (is_multicast_ether_addr(mac)) { ++ hnae3_format_mac_addr(format_mac_addr, mac); ++ netdev_err(netdev, ++ "Invalid MAC:%s specified. 
Could not set MAC\n", ++ format_mac_addr); ++ return -EINVAL; ++ } ++ ++ return h->ae_algo->ops->set_vf_mac(h, vf_id, mac); ++} ++ ++static const struct net_device_ops hns3_nic_netdev_ops = { ++ .ndo_open = hns3_nic_net_open, ++ .ndo_stop = hns3_nic_net_stop, ++ .ndo_start_xmit = hns3_nic_net_xmit, ++ .ndo_tx_timeout = hns3_nic_net_timeout, ++ .ndo_set_mac_address = hns3_nic_net_set_mac_address, ++ .ndo_eth_ioctl = hns3_nic_do_ioctl, ++ .ndo_change_mtu = hns3_nic_change_mtu, ++ .ndo_set_features = hns3_nic_set_features, ++ .ndo_features_check = hns3_features_check, ++ .ndo_get_stats64 = hns3_nic_get_stats64, ++ .ndo_setup_tc = hns3_nic_setup_tc, ++ .ndo_set_rx_mode = hns3_nic_set_rx_mode, ++ .ndo_vlan_rx_add_vid = hns3_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = hns3_vlan_rx_kill_vid, ++ .ndo_set_vf_vlan = hns3_ndo_set_vf_vlan, ++ .ndo_set_vf_spoofchk = hns3_set_vf_spoofchk, ++ .ndo_set_vf_trust = hns3_set_vf_trust, ++#ifdef CONFIG_RFS_ACCEL ++ .ndo_rx_flow_steer = hns3_rx_flow_steer, ++#endif ++ .ndo_get_vf_config = hns3_nic_get_vf_config, ++ .ndo_set_vf_link_state = hns3_nic_set_vf_link_state, ++ .ndo_set_vf_rate = hns3_nic_set_vf_rate, ++ .ndo_set_vf_mac = hns3_nic_set_vf_mac, ++}; ++ ++bool hns3_is_phys_func(struct pci_dev *pdev) ++{ ++ u32 dev_id = pdev->device; ++ ++ switch (dev_id) { ++ case HNAE3_DEV_ID_GE: ++ case HNAE3_DEV_ID_25GE: ++ case HNAE3_DEV_ID_25GE_RDMA: ++ case HNAE3_DEV_ID_25GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_50GE_RDMA: ++ case HNAE3_DEV_ID_50GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_100G_RDMA_MACSEC: ++ case HNAE3_DEV_ID_200G_RDMA: ++ return true; ++ case HNAE3_DEV_ID_VF: ++ case HNAE3_DEV_ID_RDMA_DCB_PFC_VF: ++ return false; ++ default: ++ dev_warn(&pdev->dev, "un-recognized pci device-id %u", ++ dev_id); ++ } ++ ++ return false; ++} ++ ++static void hns3_disable_sriov(struct pci_dev *pdev) ++{ ++ /* If our VFs are assigned we cannot shut down SR-IOV ++ * without causing issues, so just leave the hardware ++ * available but disabled ++ */ ++ if (pci_vfs_assigned(pdev)) { ++ dev_warn(&pdev->dev, ++ "disabling driver while VFs are assigned\n"); ++ return; ++ } ++ ++ pci_disable_sriov(pdev); ++} ++ ++/* hns3_probe - Device initialization routine ++ * @pdev: PCI device information struct ++ * @ent: entry in hns3_pci_tbl ++ * ++ * hns3_probe initializes a PF identified by a pci_dev structure. ++ * The OS initialization, configuring of the PF private structure, ++ * and a hardware reset occur. 
++ * ++ * Returns 0 on success, negative on failure ++ */ ++static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ++{ ++ struct hnae3_ae_dev *ae_dev; ++ int ret; ++ ++ ae_dev = devm_kzalloc(&pdev->dev, sizeof(*ae_dev), GFP_KERNEL); ++ if (!ae_dev) ++ return -ENOMEM; ++ ++ ae_dev->pdev = pdev; ++ ae_dev->flag = ent->driver_data; ++ pci_set_drvdata(pdev, ae_dev); ++ ++ ret = hnae3_register_ae_dev(ae_dev); ++ if (ret) ++ pci_set_drvdata(pdev, NULL); ++ ++ return ret; ++} ++ ++/** ++ * hns3_clean_vf_config ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs allocated ++ * ++ * Clean residual vf config after disable sriov ++ **/ ++static void hns3_clean_vf_config(struct pci_dev *pdev, int num_vfs) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (ae_dev->ops->clean_vf_config) ++ ae_dev->ops->clean_vf_config(ae_dev, num_vfs); ++} ++ ++/* hns3_remove - Device removal routine ++ * @pdev: PCI device information struct ++ */ ++static void hns3_remove(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV)) ++ hns3_disable_sriov(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++} ++ ++/** ++ * hns3_pci_sriov_configure ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs to allocate ++ * ++ * Enable or change the number of VFs. Called when the user updates the number ++ * of VFs in sysfs. ++ **/ ++static int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) ++{ ++ int ret; ++ ++ if (!(hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))) { ++ dev_warn(&pdev->dev, "Can not config SRIOV\n"); ++ return -EINVAL; ++ } ++ ++ if (num_vfs) { ++ ret = pci_enable_sriov(pdev, num_vfs); ++ if (ret) ++ dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret); ++ else ++ return num_vfs; ++ } else if (!pci_vfs_assigned(pdev)) { ++ int num_vfs_pre = pci_num_vf(pdev); ++ ++ pci_disable_sriov(pdev); ++ hns3_clean_vf_config(pdev, num_vfs_pre); ++ } else { ++ dev_warn(&pdev->dev, ++ "Unable to free VFs because some are assigned to VMs.\n"); ++ } ++ ++ return 0; ++} ++ ++static void hns3_shutdown(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++ ++ if (system_state == SYSTEM_POWER_OFF) ++ pci_set_power_state(pdev, PCI_D3hot); ++} ++ ++static int __maybe_unused hns3_suspend(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to suspend.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FUNC_RESET); ++ } ++ ++ return 0; ++} ++ ++static int __maybe_unused hns3_resume(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to resume.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++ } ++ ++ return 0; ++} ++ ++static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev, ++ pci_channel_state_t state) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ pci_ers_result_t ret; ++ ++ dev_info(&pdev->dev, "PCI error detected, state(=%u)!!\n", state); ++ ++ if (state == pci_channel_io_perm_failure) ++ return PCI_ERS_RESULT_DISCONNECT; ++ ++ if (!ae_dev || !ae_dev->ops) { ++ dev_err(&pdev->dev, ++ "Can't recover - 
error happened before device initialized\n"); ++ return PCI_ERS_RESULT_NONE; ++ } ++ ++ if (ae_dev->ops->handle_hw_ras_error) ++ ret = ae_dev->ops->handle_hw_ras_error(ae_dev); ++ else ++ return PCI_ERS_RESULT_NONE; ++ ++ return ret; ++} ++ ++static pci_ers_result_t hns3_slot_reset(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ const struct hnae3_ae_ops *ops; ++ enum hnae3_reset_type reset_type; ++ struct device *dev = &pdev->dev; ++ ++ if (!ae_dev || !ae_dev->ops) ++ return PCI_ERS_RESULT_NONE; ++ ++ ops = ae_dev->ops; ++ /* request the reset */ ++ if (ops->reset_event && ops->get_reset_level && ++ ops->set_default_reset_request) { ++ if (ae_dev->hw_err_reset_req) { ++ reset_type = ops->get_reset_level(ae_dev, ++ &ae_dev->hw_err_reset_req); ++ ops->set_default_reset_request(ae_dev, reset_type); ++ dev_info(dev, "requesting reset due to PCI error\n"); ++ ops->reset_event(pdev, NULL); ++ } ++ ++ return PCI_ERS_RESULT_RECOVERED; ++ } ++ ++ return PCI_ERS_RESULT_DISCONNECT; ++} ++ ++static void hns3_reset_prepare(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR prepare\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FLR_RESET); ++} ++ ++static void hns3_reset_done(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR done\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++} ++ ++static const struct pci_error_handlers hns3_err_handler = { ++ .error_detected = hns3_error_detected, ++ .slot_reset = hns3_slot_reset, ++ .reset_prepare = hns3_reset_prepare, ++ .reset_done = hns3_reset_done, ++}; ++ ++static SIMPLE_DEV_PM_OPS(hns3_pm_ops, hns3_suspend, hns3_resume); ++ ++static struct pci_driver hns3_driver = { ++ .name = hns3_driver_name, ++ .id_table = hns3_pci_tbl, ++ .probe = hns3_probe, ++ .remove = hns3_remove, ++ .shutdown = hns3_shutdown, ++ .driver.pm = &hns3_pm_ops, ++ .sriov_configure = hns3_pci_sriov_configure, ++ .err_handler = &hns3_err_handler, ++}; ++ ++/* set default feature to hns3 */ ++static void hns3_set_default_feature(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct pci_dev *pdev = h->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ ++ netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM; ++ ++ netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | ++ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | ++ NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | ++ NETIF_F_GRO | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_GRE | ++ NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_UDP_TUNNEL | ++ NETIF_F_SCTP_CRC | NETIF_F_FRAGLIST; ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V2) { ++ netdev->features |= NETIF_F_GRO_HW; ++ ++ if (!(h->flags & HNAE3_SUPPORT_VF)) ++ netdev->features |= NETIF_F_NTUPLE; ++ } ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_GSO_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_L4; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_CSUM; ++ else ++ netdev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_FD_FORWARD_TC_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_TC; ++ ++ netdev->hw_features |= 
netdev->features; ++ if (!test_bit(HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, ae_dev->caps)) ++ netdev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; ++ ++ netdev->vlan_features |= netdev->features & ++ ~(NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_GRO_HW | NETIF_F_NTUPLE | ++ NETIF_F_HW_TC); ++ ++ netdev->hw_enc_features |= netdev->vlan_features | NETIF_F_TSO_MANGLEID; ++} ++ ++static int hns3_alloc_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ unsigned int order = hns3_page_order(ring); ++ struct page *p; ++ ++ if (ring->page_pool) { ++ p = page_pool_dev_alloc_frag(ring->page_pool, ++ &cb->page_offset, ++ hns3_buf_size(ring)); ++ if (unlikely(!p)) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->buf = page_address(p); ++ cb->dma = page_pool_get_dma_addr(p); ++ cb->type = DESC_TYPE_PP_FRAG; ++ cb->reuse_flag = 0; ++ return 0; ++ } ++ ++ p = dev_alloc_pages(order); ++ if (!p) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->page_offset = 0; ++ cb->reuse_flag = 0; ++ cb->buf = page_address(p); ++ cb->length = hns3_page_size(ring); ++ cb->type = DESC_TYPE_PAGE; ++ page_ref_add(p, USHRT_MAX - 1); ++ cb->pagecnt_bias = USHRT_MAX; ++ ++ return 0; ++} ++ ++static void hns3_free_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb, int budget) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) ++ napi_consume_skb(cb->priv, budget); ++ else if (!HNAE3_IS_TX_RING(ring)) { ++ if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias) ++ __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); ++ else if (cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, cb->priv, ++ false); ++ } ++ memset(cb, 0, sizeof(*cb)); ++} ++ ++static int hns3_map_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb) ++{ ++ cb->dma = dma_map_page(ring_to_dev(ring), cb->priv, 0, ++ cb->length, ring_to_dma_dir(ring)); ++ ++ if (unlikely(dma_mapping_error(ring_to_dev(ring), cb->dma))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static void hns3_unmap_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if ((cb->type & DESC_TYPE_PAGE) && cb->length) ++ dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) ++ hns3_tx_spare_reclaim_cb(ring, cb); ++} ++ ++static void hns3_buffer_detach(struct hns3_enet_ring *ring, int i) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc[i].addr = 0; ++ ring->desc_cb[i].refill = 0; ++} ++ ++static void hns3_free_buffer_detach(struct hns3_enet_ring *ring, int i, ++ int budget) ++{ ++ struct hns3_desc_cb *cb = &ring->desc_cb[i]; ++ ++ if (!ring->desc_cb[i].dma) ++ return; ++ ++ hns3_buffer_detach(ring, i); ++ hns3_free_buffer(ring, cb, budget); ++} ++ ++static void hns3_free_buffers(struct hns3_enet_ring *ring) ++{ ++ int i; ++ ++ for (i = 0; i < ring->desc_num; i++) ++ hns3_free_buffer_detach(ring, i, 0); ++} ++ ++/* free desc along with its attached buffer */ ++static void hns3_free_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ hns3_free_buffers(ring); ++ ++ if (ring->desc) { ++ dma_free_coherent(ring_to_dev(ring), size, ++ ring->desc, ring->desc_dma_addr); ++ ring->desc = NULL; ++ } ++} ++ ++static int 
hns3_alloc_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ ring->desc = dma_alloc_coherent(ring_to_dev(ring), size, ++ &ring->desc_dma_addr, GFP_KERNEL); ++ if (!ring->desc) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ int ret; ++ ++ ret = hns3_alloc_buffer(ring, cb); ++ if (ret || ring->page_pool) ++ goto out; ++ ++ ret = hns3_map_buffer(ring, cb); ++ if (ret) ++ goto out_with_buf; ++ ++ return 0; ++ ++out_with_buf: ++ hns3_free_buffer(ring, cb, 0); ++out: ++ return ret; ++} ++ ++static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ int ret = hns3_alloc_and_map_buffer(ring, &ring->desc_cb[i]); ++ ++ if (ret) ++ return ret; ++ ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc_cb[i].refill = 1; ++ ++ return 0; ++} ++ ++/* Allocate memory for raw pkg, and map with dma */ ++static int hns3_alloc_ring_buffers(struct hns3_enet_ring *ring) ++{ ++ int i, j, ret; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ ret = hns3_alloc_and_attach_buffer(ring, i); ++ if (ret) ++ goto out_buffer_fail; ++ } ++ ++ return 0; ++ ++out_buffer_fail: ++ for (j = i - 1; j >= 0; j--) ++ hns3_free_buffer_detach(ring, j, 0); ++ return ret; ++} ++ ++/* detach a in-used buffer and replace with a reserved one */ ++static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, ++ struct hns3_desc_cb *res_cb) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc_cb[i] = *res_cb; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++} ++ ++static void hns3_reuse_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ ring->desc_cb[i].reuse_flag = 0; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++ ++ dma_sync_single_for_device(ring_to_dev(ring), ++ ring->desc_cb[i].dma + ring->desc_cb[i].page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++} ++ ++static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring, ++ int *bytes, int *pkts, int budget) ++{ ++ /* pair with ring->last_to_use update in hns3_tx_doorbell(), ++ * smp_store_release() is not used in hns3_tx_doorbell() because ++ * the doorbell operation already have the needed barrier operation. ++ */ ++ int ltu = smp_load_acquire(&ring->last_to_use); ++ int ntc = ring->next_to_clean; ++ struct hns3_desc_cb *desc_cb; ++ bool reclaimed = false; ++ struct hns3_desc *desc; ++ ++ while (ltu != ntc) { ++ desc = &ring->desc[ntc]; ++ ++ if (le16_to_cpu(desc->tx.bdtp_fe_sc_vld_ra_ri) & ++ BIT(HNS3_TXD_VLD_B)) ++ break; ++ ++ desc_cb = &ring->desc_cb[ntc]; ++ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_ALL | ++ DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) { ++ (*pkts)++; ++ (*bytes) += desc_cb->send_bytes; ++ } ++ ++ /* desc_cb will be cleaned, after hnae3_free_buffer_detach */ ++ hns3_free_buffer_detach(ring, ntc, budget); ++ ++ if (++ntc == ring->desc_num) ++ ntc = 0; ++ ++ /* Issue prefetch for next Tx descriptor */ ++ prefetch(&ring->desc_cb[ntc]); ++ reclaimed = true; ++ } ++ ++ if (unlikely(!reclaimed)) ++ return false; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * ring_space called by hns3_nic_net_xmit. 
++ */ ++ smp_store_release(&ring->next_to_clean, ntc); ++ ++ hns3_tx_spare_update(ring); ++ ++ return true; ++} ++ ++void hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct netdev_queue *dev_queue; ++ int bytes, pkts; ++ ++ bytes = 0; ++ pkts = 0; ++ ++ if (unlikely(!hns3_nic_reclaim_desc(ring, &bytes, &pkts, budget))) ++ return; ++ ++ ring->tqp_vector->tx_group.total_bytes += bytes; ++ ring->tqp_vector->tx_group.total_packets += pkts; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_bytes += bytes; ++ ring->stats.tx_pkts += pkts; ++ u64_stats_update_end(&ring->syncp); ++ ++ dev_queue = netdev_get_tx_queue(netdev, ring->tqp->tqp_index); ++ netdev_tx_completed_queue(dev_queue, pkts, bytes); ++ ++ if (unlikely(netif_carrier_ok(netdev) && ++ ring_space(ring) > HNS3_MAX_TSO_BD_NUM)) { ++ /* Make sure that anybody stopping the queue after this ++ * sees the new next_to_clean. ++ */ ++ smp_mb(); ++ if (netif_tx_queue_stopped(dev_queue) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_tx_wake_queue(dev_queue); ++ ring->stats.restart_queue++; ++ } ++ } ++} ++ ++static int hns3_desc_unused(struct hns3_enet_ring *ring) ++{ ++ int ntc = ring->next_to_clean; ++ int ntu = ring->next_to_use; ++ ++ if (unlikely(ntc == ntu && !ring->desc_cb[ntc].refill)) ++ return ring->desc_num; ++ ++ return ((ntc >= ntu) ? 0 : ring->desc_num) + ntc - ntu; ++} ++ ++/* Return true if there is any allocation failure */ ++static bool hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, ++ int cleand_count) ++{ ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc_cb res_cbs; ++ int i, ret; ++ ++ for (i = 0; i < cleand_count; i++) { ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ if (desc_cb->reuse_flag) { ++ hns3_ring_stats_update(ring, reuse_pg_cnt); ++ ++ hns3_reuse_buffer(ring, ring->next_to_use); ++ } else { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx buffer failed: %d\n", ++ ret); ++ ++ writel(i, ring->tqp->io_base + ++ HNS3_RING_RX_RING_HEAD_REG); ++ return true; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ ++ hns3_ring_stats_update(ring, non_reuse_pg); ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ writel(i, ring->tqp->io_base + HNS3_RING_RX_RING_HEAD_REG); ++ return false; ++} ++ ++static bool hns3_can_reuse_page(struct hns3_desc_cb *cb) ++{ ++ return page_count(cb->priv) == cb->pagecnt_bias; ++} ++ ++static int hns3_handle_rx_copybreak(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, ++ int pull_len, ++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 frag_size = size - pull_len; ++ void *frag = napi_alloc_frag(frag_size); ++ ++ if (unlikely(!frag)) { ++ hns3_ring_stats_update(ring, frag_alloc_err); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "failed to allocate rx frag\n"); ++ return -ENOMEM; ++ } ++ ++ desc_cb->reuse_flag = 1; ++ memcpy(frag, desc_cb->buf + frag_offset, frag_size); ++ skb_add_rx_frag(skb, i, virt_to_page(frag), ++ offset_in_page(frag), frag_size, frag_size); ++ ++ hns3_ring_stats_update(ring, frag_alloc); ++ return 0; ++} ++ ++static void hns3_nic_reuse_page(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, int pull_len, 
++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 truesize = hns3_buf_size(ring); ++ u32 frag_size = size - pull_len; ++ int ret = 0; ++ bool reused; ++ ++ if (ring->page_pool) { ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ return; ++ } ++ ++ /* Avoid re-using remote or pfmem page */ ++ if (unlikely(!dev_page_is_reusable(desc_cb->priv))) ++ goto out; ++ ++ reused = hns3_can_reuse_page(desc_cb); ++ ++ /* Rx page can be reused when: ++ * 1. Rx page is only owned by the driver when page_offset ++ * is zero, which means 0 @ truesize will be used by ++ * stack after skb_add_rx_frag() is called, and the rest ++ * of rx page can be reused by driver. ++ * Or ++ * 2. Rx page is only owned by the driver when page_offset ++ * is non-zero, which means page_offset @ truesize will ++ * be used by stack after skb_add_rx_frag() is called, ++ * and 0 @ truesize can be reused by driver. ++ */ ++ if ((!desc_cb->page_offset && reused) || ++ ((desc_cb->page_offset + truesize + truesize) <= ++ hns3_page_size(ring) && desc_cb->page_offset)) { ++ desc_cb->page_offset += truesize; ++ desc_cb->reuse_flag = 1; ++ } else if (desc_cb->page_offset && reused) { ++ desc_cb->page_offset = 0; ++ desc_cb->reuse_flag = 1; ++ } else if (frag_size <= ring->rx_copybreak) { ++ ret = hns3_handle_rx_copybreak(skb, i, ring, pull_len, desc_cb); ++ if (!ret) ++ return; ++ } ++ ++out: ++ desc_cb->pagecnt_bias--; ++ ++ if (unlikely(!desc_cb->pagecnt_bias)) { ++ page_ref_add(desc_cb->priv, USHRT_MAX); ++ desc_cb->pagecnt_bias = USHRT_MAX; ++ } ++ ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ ++ if (unlikely(!desc_cb->reuse_flag)) ++ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); ++} ++ ++static int hns3_gro_complete(struct sk_buff *skb, u32 l234info) ++{ ++ __be16 type = skb->protocol; ++ struct tcphdr *th; ++ int depth = 0; ++ ++ while (eth_type_vlan(type)) { ++ struct vlan_hdr *vh; ++ ++ if ((depth + VLAN_HLEN) > skb_headlen(skb)) ++ return -EFAULT; ++ ++ vh = (struct vlan_hdr *)(skb->data + depth); ++ type = vh->h_vlan_encapsulated_proto; ++ depth += VLAN_HLEN; ++ } ++ ++ skb_set_network_header(skb, depth); ++ ++ if (type == htons(ETH_P_IP)) { ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ depth += sizeof(struct iphdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v4_check(skb->len - depth, iph->saddr, ++ iph->daddr, 0); ++ } else if (type == htons(ETH_P_IPV6)) { ++ const struct ipv6hdr *iph = ipv6_hdr(skb); ++ ++ depth += sizeof(struct ipv6hdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v6_check(skb->len - depth, &iph->saddr, ++ &iph->daddr, 0); ++ } else { ++ hns3_rl_err(skb->dev, ++ "Error: FW GRO supports only IPv4/IPv6, not 0x%04x, depth: %d\n", ++ be16_to_cpu(type), depth); ++ return -EFAULT; ++ } ++ ++ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; ++ if (th->cwr) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; ++ ++ if (l234info & BIT(HNS3_RXD_GRO_FIXID_B)) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; ++ ++ skb->csum_start = (unsigned char *)th - skb->head; ++ skb->csum_offset = offsetof(struct tcphdr, check); ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ trace_hns3_gro(skb); ++ ++ return 0; ++} ++ ++static bool hns3_checksum_complete(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 
ptype, u16 csum) ++{ ++ if (ptype == HNS3_INVALID_PTYPE || ++ hns3_rx_ptype_tbl[ptype].ip_summed != CHECKSUM_COMPLETE) ++ return false; ++ ++ hns3_ring_stats_update(ring, csum_complete); ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum = csum_unfold((__force __sum16)csum); ++ ++ return true; ++} ++ ++static void hns3_rx_handle_csum(struct sk_buff *skb, u32 l234info, ++ u32 ol_info, u32 ptype) ++{ ++ int l3_type, l4_type; ++ int ol4_type; ++ ++ if (ptype != HNS3_INVALID_PTYPE) { ++ skb->csum_level = hns3_rx_ptype_tbl[ptype].csum_level; ++ skb->ip_summed = hns3_rx_ptype_tbl[ptype].ip_summed; ++ ++ return; ++ } ++ ++ ol4_type = hnae3_get_field(ol_info, HNS3_RXD_OL4ID_M, ++ HNS3_RXD_OL4ID_S); ++ switch (ol4_type) { ++ case HNS3_OL4_TYPE_MAC_IN_UDP: ++ case HNS3_OL4_TYPE_NVGRE: ++ skb->csum_level = 1; ++ fallthrough; ++ case HNS3_OL4_TYPE_NO_TUN: ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ /* Can checksum ipv4 or ipv6 + UDP/TCP/SCTP packets */ ++ if ((l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) && ++ (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP)) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ break; ++ default: ++ break; ++ } ++} ++ ++static void hns3_rx_checksum(struct hns3_enet_ring *ring, struct sk_buff *skb, ++ u32 l234info, u32 bd_base_info, u32 ol_info, ++ u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 ptype = HNS3_INVALID_PTYPE; ++ ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ skb_checksum_none_assert(skb); ++ ++ if (!(netdev->features & NETIF_F_RXCSUM)) ++ return; ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) ++ ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ if (hns3_checksum_complete(ring, skb, ptype, csum)) ++ return; ++ ++ /* check if hardware has done checksum */ ++ if (!(bd_base_info & BIT(HNS3_RXD_L3L4P_B))) ++ return; ++ ++ if (unlikely(l234info & (BIT(HNS3_RXD_L3E_B) | BIT(HNS3_RXD_L4E_B) | ++ BIT(HNS3_RXD_OL3E_B) | ++ BIT(HNS3_RXD_OL4E_B)))) { ++ hns3_ring_stats_update(ring, l3l4_csum_err); ++ ++ return; ++ } ++ ++ hns3_rx_handle_csum(skb, l234info, ol_info, ptype); ++} ++ ++static void hns3_rx_skb(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ if (skb_has_frag_list(skb)) ++ napi_gro_flush(&ring->tqp_vector->napi, false); ++ ++ napi_gro_receive(&ring->tqp_vector->napi, skb); ++} ++ ++static bool hns3_parse_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, u32 l234info, ++ u16 *vlan_tag) ++{ ++ struct hnae3_handle *handle = ring->tqp->handle; ++ struct pci_dev *pdev = ring->tqp->handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (unlikely(ae_dev->dev_version < HNAE3_DEVICE_VERSION_V2)) { ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ if (!(*vlan_tag & VLAN_VID_MASK)) ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return (*vlan_tag != 0); ++ } ++ ++#define HNS3_STRP_OUTER_VLAN 0x1 ++#define HNS3_STRP_INNER_VLAN 0x2 ++#define HNS3_STRP_BOTH 0x3 ++ ++ /* Hardware always insert VLAN tag into RX descriptor when ++ * remove the tag from packet, driver needs to determine ++ * reporting which tag to stack. 
++ */ ++ switch (hnae3_get_field(l234info, HNS3_RXD_STRP_TAGP_M, ++ HNS3_RXD_STRP_TAGP_S)) { ++ case HNS3_STRP_OUTER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ return true; ++ case HNS3_STRP_INNER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ return true; ++ case HNS3_STRP_BOTH: ++ if (handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ else ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static void hns3_rx_ring_move_fw(struct hns3_enet_ring *ring) ++{ ++ ring->desc[ring->next_to_clean].rx.bd_base_info &= ++ cpu_to_le32(~BIT(HNS3_RXD_VLD_B)); ++ ring->desc_cb[ring->next_to_clean].refill = 0; ++ ring->next_to_clean += 1; ++ ++ if (unlikely(ring->next_to_clean == ring->desc_num)) ++ ring->next_to_clean = 0; ++} ++ ++static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, ++ unsigned char *va) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct sk_buff *skb; ++ ++ ring->skb = napi_alloc_skb(&ring->tqp_vector->napi, HNS3_RX_HEAD_SIZE); ++ skb = ring->skb; ++ if (unlikely(!skb)) { ++ hns3_rl_err(netdev, "alloc rx skb fail\n"); ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return -ENOMEM; ++ } ++ ++ trace_hns3_rx_desc(ring); ++ prefetchw(skb->data); ++ ++ ring->pending_buf = 1; ++ ring->frag_num = 0; ++ ring->tail_skb = NULL; ++ if (length <= HNS3_RX_HEAD_SIZE) { ++ memcpy(__skb_put(skb, length), va, ALIGN(length, sizeof(long))); ++ ++ /* We can reuse buffer as-is, just make sure it is reusable */ ++ if (dev_page_is_reusable(desc_cb->priv)) ++ desc_cb->reuse_flag = 1; ++ else if (desc_cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, desc_cb->priv, ++ false); ++ else /* This page cannot be reused so discard it */ ++ __page_frag_cache_drain(desc_cb->priv, ++ desc_cb->pagecnt_bias); ++ ++ hns3_rx_ring_move_fw(ring); ++ return 0; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(skb); ++ ++ hns3_ring_stats_update(ring, seg_pkt_cnt); ++ ++ ring->pull_len = eth_get_headlen(netdev, va, HNS3_RX_HEAD_SIZE); ++ __skb_put(skb, ring->pull_len); ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len, ++ desc_cb); ++ hns3_rx_ring_move_fw(ring); ++ ++ return 0; ++} ++ ++static int hns3_add_frag(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct sk_buff *head_skb = skb; ++ struct sk_buff *new_skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ u32 bd_base_info; ++ ++ do { ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* make sure HW write desc complete */ ++ dma_rmb(); ++ if (!(bd_base_info & BIT(HNS3_RXD_VLD_B))) ++ return -ENXIO; ++ ++ if (unlikely(ring->frag_num >= MAX_SKB_FRAGS)) { ++ new_skb = napi_alloc_skb(&ring->tqp_vector->napi, 0); ++ if (unlikely(!new_skb)) { ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx fraglist skb fail\n"); ++ return -ENXIO; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(new_skb); ++ ++ ring->frag_num = 0; ++ ++ if (ring->tail_skb) { ++ ring->tail_skb->next = new_skb; ++ ring->tail_skb = new_skb; ++ } else { ++ skb_shinfo(skb)->frag_list = 
new_skb; ++ ring->tail_skb = new_skb; ++ } ++ } ++ ++ if (ring->tail_skb) { ++ head_skb->truesize += hns3_buf_size(ring); ++ head_skb->data_len += le16_to_cpu(desc->rx.size); ++ head_skb->len += le16_to_cpu(desc->rx.size); ++ skb = ring->tail_skb; ++ } ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, 0, desc_cb); ++ trace_hns3_rx_desc(ring); ++ hns3_rx_ring_move_fw(ring); ++ ring->pending_buf++; ++ } while (!(bd_base_info & BIT(HNS3_RXD_FE_B))); ++ ++ return 0; ++} ++ ++static int hns3_set_gro_and_checksum(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 l234info, ++ u32 bd_base_info, u32 ol_info, u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 l3_type; ++ ++ skb_shinfo(skb)->gso_size = hnae3_get_field(bd_base_info, ++ HNS3_RXD_GRO_SIZE_M, ++ HNS3_RXD_GRO_SIZE_S); ++ /* if there is no HW GRO, do not set gro params */ ++ if (!skb_shinfo(skb)->gso_size) { ++ hns3_rx_checksum(ring, skb, l234info, bd_base_info, ol_info, ++ csum); ++ return 0; ++ } ++ ++ NAPI_GRO_CB(skb)->count = hnae3_get_field(l234info, ++ HNS3_RXD_GRO_COUNT_M, ++ HNS3_RXD_GRO_COUNT_S); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ l3_type = hns3_rx_ptype_tbl[ptype].l3_type; ++ } else { ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ } ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; ++ else if (l3_type == HNS3_L3_TYPE_IPV6) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; ++ else ++ return -EFAULT; ++ ++ return hns3_gro_complete(skb, l234info); ++} ++ ++static void hns3_set_rx_skb_rss_type(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 rss_hash, ++ u32 l234info, u32 ol_info) ++{ ++ enum pkt_hash_types rss_type = PKT_HASH_TYPE_NONE; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ rss_type = hns3_rx_ptype_tbl[ptype].hash_type; ++ } else { ++ int l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ int l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) { ++ if (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP) ++ rss_type = PKT_HASH_TYPE_L4; ++ else if (l4_type == HNS3_L4_TYPE_IGMP || ++ l4_type == HNS3_L4_TYPE_ICMP) ++ rss_type = PKT_HASH_TYPE_L3; ++ } ++ } ++ ++ skb_set_hash(skb, rss_hash, rss_type); ++} ++ ++static void hns3_handle_rx_ts_info(struct net_device *netdev, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 bd_base_info) ++{ ++ if (unlikely(bd_base_info & BIT(HNS3_RXD_TS_VLD_B))) { ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u32 nsec = le32_to_cpu(desc->ts_nsec); ++ u32 sec = le32_to_cpu(desc->ts_sec); ++ ++ if (h->ae_algo->ops->get_rx_hwts) ++ h->ae_algo->ops->get_rx_hwts(h, skb, nsec, sec); ++ } ++} ++ ++static void hns3_handle_rx_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 l234info) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ ++ /* Based on 
hw strategy, the tag offloaded will be stored at ++ * ot_vlan_tag in two layer tag case, and stored at vlan_tag ++ * in one layer tag case. ++ */ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ u16 vlan_tag; ++ ++ if (hns3_parse_vlan_tag(ring, desc, l234info, &vlan_tag)) ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ vlan_tag); ++ } ++} ++ ++static int hns3_handle_bdinfo(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ enum hns3_pkt_l2t_type l2_frame_type; ++ u32 bd_base_info, l234info, ol_info; ++ struct hns3_desc *desc; ++ unsigned int len; ++ int pre_ntc, ret; ++ u16 csum; ++ ++ /* bdinfo handled below is only valid on the last BD of the ++ * current packet, and ring->next_to_clean indicates the first ++ * descriptor of next packet, so need - 1 below. ++ */ ++ pre_ntc = ring->next_to_clean ? (ring->next_to_clean - 1) : ++ (ring->desc_num - 1); ++ desc = &ring->desc[pre_ntc]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ l234info = le32_to_cpu(desc->rx.l234_info); ++ ol_info = le32_to_cpu(desc->rx.ol_info); ++ csum = le16_to_cpu(desc->csum); ++ ++ hns3_handle_rx_ts_info(netdev, desc, skb, bd_base_info); ++ ++ hns3_handle_rx_vlan_tag(ring, desc, skb, l234info); ++ ++ if (unlikely(!desc->rx.pkt_len || (l234info & (BIT(HNS3_RXD_TRUNCAT_B) | ++ BIT(HNS3_RXD_L2E_B))))) { ++ u64_stats_update_begin(&ring->syncp); ++ if (l234info & BIT(HNS3_RXD_L2E_B)) ++ ring->stats.l2_err++; ++ else ++ ring->stats.err_pkt_len++; ++ u64_stats_update_end(&ring->syncp); ++ ++ return -EFAULT; ++ } ++ ++ len = skb->len; ++ ++ /* Do update ip stack process */ ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ /* This is needed in order to enable forwarding support */ ++ ret = hns3_set_gro_and_checksum(ring, skb, l234info, ++ bd_base_info, ol_info, csum); ++ if (unlikely(ret)) { ++ hns3_ring_stats_update(ring, rx_err_cnt); ++ return ret; ++ } ++ ++ l2_frame_type = hnae3_get_field(l234info, HNS3_RXD_DMAC_M, ++ HNS3_RXD_DMAC_S); ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.rx_pkts++; ++ ring->stats.rx_bytes += len; ++ ++ if (l2_frame_type == HNS3_L2_TYPE_MULTICAST) ++ ring->stats.rx_multicast++; ++ ++ u64_stats_update_end(&ring->syncp); ++ ++ ring->tqp_vector->rx_group.total_bytes += len; ++ ++ hns3_set_rx_skb_rss_type(ring, skb, le32_to_cpu(desc->rx.rss_hash), ++ l234info, ol_info); ++ return 0; ++} ++ ++static int hns3_handle_rx_bd(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ unsigned int length; ++ u32 bd_base_info; ++ int ret; ++ ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ ++ prefetch(desc); ++ ++ if (!skb) { ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* Check valid BD */ ++ if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B)))) ++ return -ENXIO; ++ ++ dma_rmb(); ++ length = le16_to_cpu(desc->rx.size); ++ ++ ring->va = desc_cb->buf + desc_cb->page_offset; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ /* Prefetch first cache line of first page. ++ * Idea is to cache few bytes of the header of the packet. ++ * Our L1 Cache line size is 64B so need to prefetch twice to make ++ * it 128B. But in actual we can have greater size of caches with ++ * 128B Level 1 cache lines. In such a case, single fetch would ++ * suffice to cache in the relevant part of the header. 
++ */ ++ net_prefetch(ring->va); ++ ++ ret = hns3_alloc_skb(ring, length, ring->va); ++ skb = ring->skb; ++ ++ if (ret < 0) /* alloc buffer fail */ ++ return ret; ++ if (!(bd_base_info & BIT(HNS3_RXD_FE_B))) { /* need add frag */ ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ } else { ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ ++ /* As the head data may be changed when GRO enable, copy ++ * the head data in after other data rx completed ++ */ ++ if (skb->len > HNS3_RX_HEAD_SIZE) ++ memcpy(skb->data, ring->va, ++ ALIGN(ring->pull_len, sizeof(long))); ++ ++ ret = hns3_handle_bdinfo(ring, skb); ++ if (unlikely(ret)) { ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ ++ skb_record_rx_queue(skb, ring->tqp->tqp_index); ++ return 0; ++} ++ ++int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, ++ void (*rx_fn)(struct hns3_enet_ring *, struct sk_buff *)) ++{ ++#define RCB_NOF_ALLOC_RX_BUFF_ONCE 16 ++ int unused_count = hns3_desc_unused(ring); ++ bool failure = false; ++ int recv_pkts = 0; ++ int err; ++ ++ unused_count -= ring->pending_buf; ++ ++ while (recv_pkts < budget) { ++ /* Reuse or realloc buffers */ ++ if (unused_count >= RCB_NOF_ALLOC_RX_BUFF_ONCE) { ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ unused_count = 0; ++ } ++ ++ /* Poll one pkt */ ++ err = hns3_handle_rx_bd(ring); ++ /* Do not get FE for the packet or failed to alloc skb */ ++ if (unlikely(!ring->skb || err == -ENXIO)) { ++ goto out; ++ } else if (likely(!err)) { ++ rx_fn(ring, ring->skb); ++ recv_pkts++; ++ } ++ ++ unused_count += ring->pending_buf; ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++out: ++ /* sync head pointer before exiting, since hardware will calculate ++ * FBD number with head pointer ++ */ ++ if (unused_count > 0) ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ ++ return failure ? budget : recv_pkts; ++} ++ ++static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *rx_group = &tqp_vector->rx_group; ++ struct dim_sample sample = {}; ++ ++ if (!rx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, rx_group->total_packets, ++ rx_group->total_bytes, &sample); ++ net_dim(&rx_group->dim, sample); ++} ++ ++static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *tx_group = &tqp_vector->tx_group; ++ struct dim_sample sample = {}; ++ ++ if (!tx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets, ++ tx_group->total_bytes, &sample); ++ net_dim(&tx_group->dim, sample); ++} ++ ++static int hns3_nic_common_poll(struct napi_struct *napi, int budget) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(napi->dev); ++ struct hns3_enet_ring *ring; ++ int rx_pkt_total = 0; ++ ++ struct hns3_enet_tqp_vector *tqp_vector = ++ container_of(napi, struct hns3_enet_tqp_vector, napi); ++ bool clean_complete = true; ++ int rx_budget = budget; ++ ++ if (unlikely(test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ napi_complete(napi); ++ return 0; ++ } ++ ++ /* Since the actual Tx work is minimal, we can give the Tx a larger ++ * budget and be more aggressive about cleaning up the Tx descriptors. 
++ */ ++ hns3_for_each_ring(ring, tqp_vector->tx_group) ++ hns3_clean_tx_ring(ring, budget); ++ ++ /* make sure rx ring budget not smaller than 1 */ ++ if (tqp_vector->num_tqps > 1) ++ rx_budget = max(budget / tqp_vector->num_tqps, 1); ++ ++ hns3_for_each_ring(ring, tqp_vector->rx_group) { ++ int rx_cleaned = hns3_clean_rx_ring(ring, rx_budget, ++ hns3_rx_skb); ++ if (rx_cleaned >= rx_budget) ++ clean_complete = false; ++ ++ rx_pkt_total += rx_cleaned; ++ } ++ ++ tqp_vector->rx_group.total_packets += rx_pkt_total; ++ ++ if (!clean_complete) ++ return budget; ++ ++ if (napi_complete(napi) && ++ likely(!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ hns3_update_rx_int_coalesce(tqp_vector); ++ hns3_update_tx_int_coalesce(tqp_vector); ++ ++ hns3_mask_vector_irq(tqp_vector, 1); ++ } ++ ++ return rx_pkt_total; ++} ++ ++static int hns3_create_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node **head, ++ bool is_tx) ++{ ++ u32 bit_value = is_tx ? HNAE3_RING_TYPE_TX : HNAE3_RING_TYPE_RX; ++ u32 field_value = is_tx ? HNAE3_RING_GL_TX : HNAE3_RING_GL_RX; ++ struct hnae3_ring_chain_node *cur_chain = *head; ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain; ++ struct hns3_enet_ring *ring; ++ ++ ring = is_tx ? tqp_vector->tx_group.ring : tqp_vector->rx_group.ring; ++ ++ if (cur_chain) { ++ while (cur_chain->next) ++ cur_chain = cur_chain->next; ++ } ++ ++ while (ring) { ++ chain = devm_kzalloc(&pdev->dev, sizeof(*chain), GFP_KERNEL); ++ if (!chain) ++ return -ENOMEM; ++ if (cur_chain) ++ cur_chain->next = chain; ++ else ++ *head = chain; ++ chain->tqp_index = ring->tqp->tqp_index; ++ hnae3_set_bit(chain->flag, HNAE3_RING_TYPE_B, ++ bit_value); ++ hnae3_set_field(chain->int_gl_idx, ++ HNAE3_RING_GL_IDX_M, ++ HNAE3_RING_GL_IDX_S, field_value); ++ ++ cur_chain = chain; ++ ++ ring = ring->next; ++ } ++ ++ return 0; ++} ++ ++static struct hnae3_ring_chain_node * ++hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *cur_chain = NULL; ++ struct hnae3_ring_chain_node *chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, true)) ++ goto err_free_chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, false)) ++ goto err_free_chain; ++ ++ return cur_chain; ++ ++err_free_chain: ++ while (cur_chain) { ++ chain = cur_chain->next; ++ devm_kfree(&pdev->dev, cur_chain); ++ cur_chain = chain; ++ } ++ ++ return NULL; ++} ++ ++static void hns3_free_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node *head) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain_tmp, *chain; ++ ++ chain = head; ++ ++ while (chain) { ++ chain_tmp = chain->next; ++ devm_kfree(&pdev->dev, chain); ++ chain = chain_tmp; ++ } ++} ++ ++static void hns3_add_ring_to_group(struct hns3_enet_ring_group *group, ++ struct hns3_enet_ring *ring) ++{ ++ ring->next = group->ring; ++ group->ring = ring; ++ ++ group->count++; ++} ++ ++static void hns3_nic_set_cpumask(struct hns3_nic_priv *priv) ++{ ++ struct pci_dev *pdev = priv->ae_handle->pdev; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int num_vectors = priv->vector_num; ++ int numa_node; ++ int vector_i; ++ ++ numa_node = dev_to_node(&pdev->dev); ++ ++ for (vector_i = 0; vector_i < num_vectors; vector_i++) { ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ cpumask_set_cpu(cpumask_local_spread(vector_i, numa_node), ++ 
&tqp_vector->affinity_mask); ++ } ++} ++ ++static void hns3_rx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_rx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_rx_gl(group->ring->tqp_vector, cur_moder.usec); ++ tqp_vector->rx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->rx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->rx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_tx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_tx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, cur_moder.usec); ++ tqp_vector->tx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->tx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->tx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_nic_init_dim(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ INIT_WORK(&tqp_vector->rx_group.dim.work, hns3_rx_dim_work); ++ INIT_WORK(&tqp_vector->tx_group.dim.work, hns3_tx_dim_work); ++} ++ ++static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int ret; ++ int i; ++ ++ hns3_nic_set_cpumask(priv); ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ hns3_vector_coalesce_init_hw(tqp_vector, priv); ++ tqp_vector->num_tqps = 0; ++ hns3_nic_init_dim(tqp_vector); ++ } ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ u16 vector_i = i % priv->vector_num; ++ u16 tqp_num = h->kinfo.num_tqps; ++ ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ ++ hns3_add_ring_to_group(&tqp_vector->tx_group, ++ &priv->ring[i]); ++ ++ hns3_add_ring_to_group(&tqp_vector->rx_group, ++ &priv->ring[i + tqp_num]); ++ ++ priv->ring[i].tqp_vector = tqp_vector; ++ priv->ring[i + tqp_num].tqp_vector = tqp_vector; ++ tqp_vector->num_tqps++; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ tqp_vector->rx_group.total_bytes = 0; ++ tqp_vector->rx_group.total_packets = 0; ++ tqp_vector->tx_group.total_bytes = 0; ++ tqp_vector->tx_group.total_packets = 0; ++ tqp_vector->handle = h; ++ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) { ++ ret = -ENOMEM; ++ goto map_ring_fail; ++ } ++ ++ ret = h->ae_algo->ops->map_ring_to_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ if (ret) ++ goto map_ring_fail; ++ ++ netif_napi_add(priv->netdev, &tqp_vector->napi, ++ hns3_nic_common_poll, NAPI_POLL_WEIGHT); ++ } ++ ++ return 0; ++ ++map_ring_fail: ++ while (i--) ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ ++ return ret; ++} ++ ++static void 
hns3_nic_init_coal_cfg(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *rx_coal = &priv->rx_coal; ++ ++ /* initialize the configuration for interrupt coalescing. ++ * 1. GL (Interrupt Gap Limiter) ++ * 2. RL (Interrupt Rate Limiter) ++ * 3. QL (Interrupt Quantity Limiter) ++ * ++ * Default: enable interrupt coalescing self-adaptive and GL ++ */ ++ tx_coal->adapt_enable = 1; ++ rx_coal->adapt_enable = 1; ++ ++ tx_coal->int_gl = HNS3_INT_GL_50K; ++ rx_coal->int_gl = HNS3_INT_GL_50K; ++ ++ rx_coal->flow_level = HNS3_FLOW_LOW; ++ tx_coal->flow_level = HNS3_FLOW_LOW; ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ rx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ } ++} ++ ++static int hns3_nic_alloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ struct hnae3_vector_info *vector; ++ struct pci_dev *pdev = h->pdev; ++ u16 tqp_num = h->kinfo.num_tqps; ++ u16 vector_num; ++ int ret = 0; ++ u16 i; ++ ++ /* RSS size, cpu online and vector_num should be the same */ ++ /* Should consider 2p/4p later */ ++ vector_num = min_t(u16, num_online_cpus(), tqp_num); ++ ++ vector = devm_kcalloc(&pdev->dev, vector_num, sizeof(*vector), ++ GFP_KERNEL); ++ if (!vector) ++ return -ENOMEM; ++ ++ /* save the actual available vector number */ ++ vector_num = h->ae_algo->ops->get_vector(h, vector_num, vector); ++ ++ priv->vector_num = vector_num; ++ priv->tqp_vector = (struct hns3_enet_tqp_vector *) ++ devm_kcalloc(&pdev->dev, vector_num, sizeof(*priv->tqp_vector), ++ GFP_KERNEL); ++ if (!priv->tqp_vector) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ tqp_vector->idx = i; ++ tqp_vector->mask_addr = vector[i].io_addr; ++ tqp_vector->vector_irq = vector[i].vector; ++ hns3_vector_coalesce_init(tqp_vector, priv); ++ } ++ ++out: ++ devm_kfree(&pdev->dev, vector); ++ return ret; ++} ++ ++static void hns3_clear_ring_group(struct hns3_enet_ring_group *group) ++{ ++ group->ring = NULL; ++ group->count = 0; ++} ++ ++static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ if (!tqp_vector->rx_group.ring && !tqp_vector->tx_group.ring) ++ continue; ++ ++ /* Since the mapping can be overwritten, when fail to get the ++ * chain between vector and ring, we should go on to deal with ++ * the remaining options. 
++ */ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) ++ dev_warn(priv->dev, "failed to get ring chain\n"); ++ ++ h->ae_algo->ops->unmap_ring_from_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ hns3_clear_ring_group(&tqp_vector->rx_group); ++ hns3_clear_ring_group(&tqp_vector->tx_group); ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ } ++} ++ ++static void hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i, ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = h->ae_algo->ops->put_vector(h, tqp_vector->vector_irq); ++ if (ret) ++ return; ++ } ++ ++ devm_kfree(&pdev->dev, priv->tqp_vector); ++} ++ ++static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, ++ unsigned int ring_type) ++{ ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hns3_enet_ring *ring; ++ int desc_num; ++ ++ if (ring_type == HNAE3_RING_TYPE_TX) { ++ ring = &priv->ring[q->tqp_index]; ++ desc_num = priv->ae_handle->kinfo.num_tx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->tx_copybreak = priv->tx_copybreak; ++ ring->last_to_use = 0; ++ } else { ++ ring = &priv->ring[q->tqp_index + queue_num]; ++ desc_num = priv->ae_handle->kinfo.num_rx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->rx_copybreak = priv->rx_copybreak; ++ } ++ ++ hnae3_set_bit(ring->flag, HNAE3_RING_TYPE_B, ring_type); ++ ++ ring->tqp = q; ++ ring->desc = NULL; ++ ring->desc_cb = NULL; ++ ring->dev = priv->dev; ++ ring->desc_dma_addr = 0; ++ ring->buf_size = q->buf_size; ++ ring->desc_num = desc_num; ++ ring->next_to_use = 0; ++ ring->next_to_clean = 0; ++} ++ ++static void hns3_queue_to_ring(struct hnae3_queue *tqp, ++ struct hns3_nic_priv *priv) ++{ ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_TX); ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_RX); ++} ++ ++static int hns3_get_ring_config(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i; ++ ++ priv->ring = devm_kzalloc(&pdev->dev, ++ array3_size(h->kinfo.num_tqps, ++ sizeof(*priv->ring), 2), ++ GFP_KERNEL); ++ if (!priv->ring) ++ return -ENOMEM; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_queue_to_ring(h->kinfo.tqp[i], priv); ++ ++ return 0; ++} ++ ++static void hns3_put_ring_config(struct hns3_nic_priv *priv) ++{ ++ if (!priv->ring) ++ return; ++ ++ devm_kfree(priv->dev, priv->ring); ++ priv->ring = NULL; ++} ++ ++static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) ++{ ++ struct page_pool_params pp_params = { ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | ++ PP_FLAG_DMA_SYNC_DEV, ++ .order = hns3_page_order(ring), ++ .pool_size = ring->desc_num * hns3_buf_size(ring) / ++ (PAGE_SIZE << hns3_page_order(ring)), ++ .nid = dev_to_node(ring_to_dev(ring)), ++ .dev = ring_to_dev(ring), ++ .dma_dir = DMA_FROM_DEVICE, ++ .offset = 0, ++ .max_len = PAGE_SIZE << hns3_page_order(ring), ++ }; ++ ++ ring->page_pool = page_pool_create(&pp_params); ++ if (IS_ERR(ring->page_pool)) { ++ dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n", ++ PTR_ERR(ring->page_pool)); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) ++{ ++ int ret; ++ ++ if (ring->desc_num <= 0 || ring->buf_size <= 0) ++ return -EINVAL; 
++ ++ ring->desc_cb = devm_kcalloc(ring_to_dev(ring), ring->desc_num, ++ sizeof(ring->desc_cb[0]), GFP_KERNEL); ++ if (!ring->desc_cb) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = hns3_alloc_desc(ring); ++ if (ret) ++ goto out_with_desc_cb; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ if (page_pool_enabled) ++ hns3_alloc_page_pool(ring); ++ ++ ret = hns3_alloc_ring_buffers(ring); ++ if (ret) ++ goto out_with_desc; ++ } else { ++ hns3_init_tx_spare_buffer(ring); ++ } ++ ++ return 0; ++ ++out_with_desc: ++ hns3_free_desc(ring); ++out_with_desc_cb: ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++out: ++ return ret; ++} ++ ++void hns3_fini_ring(struct hns3_enet_ring *ring) ++{ ++ hns3_free_desc(ring); ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++ ring->next_to_clean = 0; ++ ring->next_to_use = 0; ++ ring->last_to_use = 0; ++ ring->pending_buf = 0; ++ if (!HNAE3_IS_TX_RING(ring) && ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ } else if (HNAE3_IS_TX_RING(ring) && ring->tx_spare) { ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ dma_unmap_page(ring_to_dev(ring), tx_spare->dma, tx_spare->len, ++ DMA_TO_DEVICE); ++ free_pages((unsigned long)tx_spare->buf, ++ get_order(tx_spare->len)); ++ devm_kfree(ring_to_dev(ring), tx_spare); ++ ring->tx_spare = NULL; ++ } ++ ++ if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) { ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_buf_size2type(u32 buf_size) ++{ ++ int bd_size_type; ++ ++ switch (buf_size) { ++ case 512: ++ bd_size_type = HNS3_BD_SIZE_512_TYPE; ++ break; ++ case 1024: ++ bd_size_type = HNS3_BD_SIZE_1024_TYPE; ++ break; ++ case 2048: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ break; ++ case 4096: ++ bd_size_type = HNS3_BD_SIZE_4096_TYPE; ++ break; ++ default: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ } ++ ++ return bd_size_type; ++} ++ ++static void hns3_init_ring_hw(struct hns3_enet_ring *ring) ++{ ++ dma_addr_t dma = ring->desc_dma_addr; ++ struct hnae3_queue *q = ring->tqp; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_L_REG, (u32)dma); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_LEN_REG, ++ hns3_buf_size2type(ring->buf_size)); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } else { ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_L_REG, ++ (u32)dma); ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_TX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } ++} ++ ++static void hns3_init_tx_ring_tc(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ int i; ++ ++ for (i = 0; i < tc_info->num_tc; i++) { ++ int j; ++ ++ for (j = 0; j < tc_info->tqp_count[i]; j++) { ++ struct hnae3_queue *q; ++ ++ q = priv->ring[tc_info->tqp_offset[i] + j].tqp; ++ hns3_write_dev(q, HNS3_RING_TX_RING_TC_REG, i); ++ } ++ } ++} ++ ++int hns3_init_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int ring_num = h->kinfo.num_tqps * 2; ++ int i, j; ++ int ret; ++ ++ for (i = 0; i < ring_num; i++) { ++ ret = hns3_alloc_ring_memory(&priv->ring[i]); ++ if (ret) { ++ dev_err(priv->dev, ++ "Alloc ring memory fail! 
ret=%d\n", ret); ++ goto out_when_alloc_ring_memory; ++ } ++ ++ u64_stats_init(&priv->ring[i].syncp); ++ } ++ ++ return 0; ++ ++out_when_alloc_ring_memory: ++ for (j = i - 1; j >= 0; j--) ++ hns3_fini_ring(&priv->ring[j]); ++ ++ return -ENOMEM; ++} ++ ++static void hns3_uninit_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_fini_ring(&priv->ring[i]); ++ hns3_fini_ring(&priv->ring[i + h->kinfo.num_tqps]); ++ } ++} ++ ++/* Set mac addr if it is configured. or leave it to the AE driver */ ++static int hns3_init_mac_addr(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = priv->ae_handle; ++ u8 mac_addr_temp[ETH_ALEN]; ++ int ret = 0; ++ ++ if (h->ae_algo->ops->get_mac_addr) ++ h->ae_algo->ops->get_mac_addr(h, mac_addr_temp); ++ ++ /* Check if the MAC address is valid, if not get a random one */ ++ if (!is_valid_ether_addr(mac_addr_temp)) { ++ eth_hw_addr_random(netdev); ++ hnae3_format_mac_addr(format_mac_addr, netdev->dev_addr); ++ dev_warn(priv->dev, "using random MAC address %s\n", ++ format_mac_addr); ++ } else if (!ether_addr_equal(netdev->dev_addr, mac_addr_temp)) { ++ eth_hw_addr_set(netdev, mac_addr_temp); ++ ether_addr_copy(netdev->perm_addr, mac_addr_temp); ++ } else { ++ return 0; ++ } ++ ++ if (h->ae_algo->ops->set_mac_addr) ++ ret = h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true); ++ ++ return ret; ++} ++ ++static int hns3_init_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = 0; ++ ++ if (h->ae_algo->ops->mac_connect_phy) ++ ret = h->ae_algo->ops->mac_connect_phy(h); ++ ++ return ret; ++} ++ ++static void hns3_uninit_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->mac_disconnect_phy) ++ h->ae_algo->ops->mac_disconnect_phy(h); ++} ++ ++static int hns3_client_start(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_start) ++ return 0; ++ ++ return handle->ae_algo->ops->client_start(handle); ++} ++ ++static void hns3_client_stop(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_stop) ++ return; ++ ++ handle->ae_algo->ops->client_stop(handle); ++} ++ ++static void hns3_info_show(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ hnae3_format_mac_addr(format_mac_addr, priv->netdev->dev_addr); ++ dev_info(priv->dev, "MAC address: %s\n", format_mac_addr); ++ dev_info(priv->dev, "Task queue pairs numbers: %u\n", kinfo->num_tqps); ++ dev_info(priv->dev, "RSS size: %u\n", kinfo->rss_size); ++ dev_info(priv->dev, "Allocated RSS size: %u\n", kinfo->req_rss_size); ++ dev_info(priv->dev, "RX buffer length: %u\n", kinfo->rx_buf_len); ++ dev_info(priv->dev, "Desc num per TX queue: %u\n", kinfo->num_tx_desc); ++ dev_info(priv->dev, "Desc num per RX queue: %u\n", kinfo->num_rx_desc); ++ dev_info(priv->dev, "Total number of enabled TCs: %u\n", ++ kinfo->tc_info.num_tc); ++ dev_info(priv->dev, "Max mtu size: %u\n", priv->netdev->max_mtu); ++} ++ ++static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode mode, bool is_tx) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hnae3_handle *handle = priv->ae_handle; ++ int i; ++ ++ if (is_tx) { ++ 
priv->tx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].tx_group.dim.mode = mode; ++ } else { ++ priv->rx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].rx_group.dim.mode = mode; ++ } ++ ++ if (hnae3_ae_dev_cq_supported(ae_dev)) { ++ u32 new_mode; ++ u64 reg; ++ ++ new_mode = (mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) ? ++ HNS3_CQ_MODE_CQE : HNS3_CQ_MODE_EQE; ++ reg = is_tx ? HNS3_GL1_CQ_MODE_REG : HNS3_GL0_CQ_MODE_REG; ++ ++ writel(new_mode, handle->kinfo.io_base + reg); ++ } ++} ++ ++void hns3_cq_period_mode_init(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode tx_mode, ++ enum dim_cq_period_mode rx_mode) ++{ ++ hns3_set_cq_period_mode(priv, tx_mode, true); ++ hns3_set_cq_period_mode(priv, rx_mode, false); ++} ++ ++static void hns3_state_init(struct hnae3_handle *handle) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state); ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ set_bit(HNAE3_PFLAG_LIMIT_PROMISC, &handle->supported_pflags); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state); ++ ++ if (hnae3_ae_dev_rxd_adv_layout_supported(ae_dev)) ++ set_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state); ++} ++ ++static void hns3_state_uninit(struct hnae3_handle *handle) ++{ ++ struct hns3_nic_priv *priv = handle->priv; ++ ++ clear_bit(HNS3_NIC_STATE_INITED, &priv->state); ++} ++ ++static int hns3_client_init(struct hnae3_handle *handle) ++{ ++ struct pci_dev *pdev = handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ u16 alloc_tqps, max_rss_size; ++ struct hns3_nic_priv *priv; ++ struct net_device *netdev; ++ int ret; ++ ++ handle->ae_algo->ops->get_tqps_and_rss_info(handle, &alloc_tqps, ++ &max_rss_size); ++ netdev = alloc_etherdev_mq(sizeof(struct hns3_nic_priv), alloc_tqps); ++ if (!netdev) ++ return -ENOMEM; ++ ++ priv = netdev_priv(netdev); ++ priv->dev = &pdev->dev; ++ priv->netdev = netdev; ++ priv->ae_handle = handle; ++ priv->tx_timeout_count = 0; ++ priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num; ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL); ++ ++ handle->kinfo.netdev = netdev; ++ handle->priv = (void *)priv; ++ ++ hns3_init_mac_addr(netdev); ++ ++ hns3_set_default_feature(netdev); ++ ++ netdev->watchdog_timeo = HNS3_TX_TIMEOUT; ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ netdev->netdev_ops = &hns3_nic_netdev_ops; ++ SET_NETDEV_DEV(netdev, &pdev->dev); ++ hns3_ethtool_set_ops(netdev); ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_get_ring_cfg; ++ } ++ ++ hns3_nic_init_coal_cfg(priv); ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_alloc_vector_data; ++ } ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_vector_data; ++ } ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_ring; ++ } ++ ++ hns3_cq_period_mode_init(priv, DIM_CQ_PERIOD_MODE_START_FROM_EQE, ++ 
DIM_CQ_PERIOD_MODE_START_FROM_EQE); ++ ++ ret = hns3_init_phy(netdev); ++ if (ret) ++ goto out_init_phy; ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto out_init_irq_fail; ++ } ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); ++ goto out_client_start; ++ } ++ ++ hns3_dcbnl_setup(handle); ++ ++ ret = hns3_dbg_init(handle); ++ if (ret) { ++ dev_err(priv->dev, "failed to init debugfs, ret = %d\n", ++ ret); ++ goto out_client_start; ++ } ++ ++ netdev->max_mtu = HNS3_MAX_MTU(ae_dev->dev_specs.max_frm_size); ++ ++ hns3_state_init(handle); ++ ++ ret = register_netdev(netdev); ++ if (ret) { ++ dev_err(priv->dev, "probe register netdev fail!\n"); ++ goto out_reg_netdev_fail; ++ } ++ ++ if (netif_msg_drv(handle)) ++ hns3_info_show(priv); ++ ++ return ret; ++ ++out_reg_netdev_fail: ++ hns3_state_uninit(handle); ++ hns3_dbg_uninit(handle); ++ hns3_client_stop(handle); ++out_client_start: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++out_init_irq_fail: ++ hns3_uninit_phy(netdev); ++out_init_phy: ++ hns3_uninit_all_ring(priv); ++out_init_ring: ++ hns3_nic_uninit_vector_data(priv); ++out_init_vector_data: ++ hns3_nic_dealloc_vector_data(priv); ++out_alloc_vector_data: ++ priv->ring = NULL; ++out_get_ring_cfg: ++ priv->ae_handle = NULL; ++ free_netdev(netdev); ++ return ret; ++} ++ ++static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (netdev->reg_state != NETREG_UNINITIALIZED) ++ unregister_netdev(netdev); ++ ++ hns3_client_stop(handle); ++ ++ hns3_uninit_phy(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ goto out_netdev_free; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ ++ hns3_nic_uninit_irq(priv); ++ ++ hns3_clear_all_ring(handle, true); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++out_netdev_free: ++ hns3_dbg_uninit(handle); ++ free_netdev(netdev); ++} ++ ++static void hns3_link_status_change(struct hnae3_handle *handle, bool linkup) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ ++ if (!netdev) ++ return; ++ ++ if (linkup) { ++ netif_tx_wake_all_queues(netdev); ++ netif_carrier_on(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link up\n"); ++ } else { ++ netif_carrier_off(netdev); ++ netif_tx_stop_all_queues(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link down\n"); ++ } ++} ++ ++static void hns3_clear_tx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_clean != ring->next_to_use) { ++ ring->desc[ring->next_to_clean].tx.bdtp_fe_sc_vld_ra_ri = 0; ++ hns3_free_buffer_detach(ring, ring->next_to_clean, 0); ++ ring_ptr_move_fw(ring, next_to_clean); ++ } ++ ++ ring->pending_buf = 0; ++} ++ ++static int hns3_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ struct hns3_desc_cb res_cbs; ++ int ret; ++ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will 
be freed by ++ * stack, so we need to replace the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ /* if alloc new buffer fail, exit directly ++ * and reclear in up flow. ++ */ ++ netdev_warn(ring_to_netdev(ring), ++ "reserve buffer map failed, ret = %d\n", ++ ret); ++ return ret; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ } ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ /* Free the pending skb in rx ring */ ++ if (ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++ return 0; ++} ++ ++static void hns3_force_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will be freed by ++ * stack, so only need to unmap the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ hns3_unmap_buffer(ring, ++ &ring->desc_cb[ring->next_to_use]); ++ ring->desc_cb[ring->next_to_use].dma = 0; ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++} ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ struct hns3_enet_ring *ring; ++ ++ ring = &priv->ring[i]; ++ hns3_clear_tx_ring(ring); ++ ++ ring = &priv->ring[i + h->kinfo.num_tqps]; ++ /* Continue to clear other rings even if clearing some ++ * rings failed. ++ */ ++ if (force) ++ hns3_force_clear_rx_ring(ring); ++ else ++ hns3_clear_rx_ring(ring); ++ } ++} ++ ++int hns3_nic_reset_all_ring(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hns3_enet_ring *rx_ring; ++ int i, j; ++ int ret; ++ ++ ret = h->ae_algo->ops->reset_queue(h); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_init_ring_hw(&priv->ring[i]); ++ ++ /* We need to clear tx ring here because self test will ++ * use the ring and will not run down before up ++ */ ++ hns3_clear_tx_ring(&priv->ring[i]); ++ priv->ring[i].next_to_clean = 0; ++ priv->ring[i].next_to_use = 0; ++ priv->ring[i].last_to_use = 0; ++ ++ rx_ring = &priv->ring[i + h->kinfo.num_tqps]; ++ hns3_init_ring_hw(rx_ring); ++ ret = hns3_clear_rx_ring(rx_ring); ++ if (ret) ++ return ret; ++ ++ /* We can not know the hardware head and tail when this ++ * function is called in reset flow, so we reuse all desc. 
++ */ ++ for (j = 0; j < rx_ring->desc_num; j++) ++ hns3_reuse_buffer(rx_ring, j); ++ ++ rx_ring->next_to_clean = 0; ++ rx_ring->next_to_use = 0; ++ } ++ ++ hns3_init_tx_ring_tc(priv); ++ ++ return 0; ++} ++ ++static int hns3_reset_notify_down_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct net_device *ndev = kinfo->netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) ++ return 0; ++ ++ if (!netif_running(ndev)) ++ return 0; ++ ++ return hns3_nic_net_stop(ndev); ++} ++ ++static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); ++ int ret = 0; ++ ++ if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_err(kinfo->netdev, "device is not initialized yet\n"); ++ return -EFAULT; ++ } ++ ++ clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ ++ if (netif_running(kinfo->netdev)) { ++ ret = hns3_nic_net_open(kinfo->netdev); ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ netdev_err(kinfo->netdev, ++ "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ } ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int ret; ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) ++ goto err_put_ring; ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) ++ goto err_dealloc_vector; ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) ++ goto err_uninit_vector; ++ ++ hns3_cq_period_mode_init(priv, priv->tx_cqe_mode, priv->rx_cqe_mode); ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto err_init_irq_fail; ++ } ++ ++ if (!hns3_is_phys_func(handle->pdev)) ++ hns3_init_mac_addr(netdev); ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! 
ret=%d\n", ret); ++ goto err_client_start_fail; ++ } ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ return ret; ++ ++err_client_start_fail: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++err_init_irq_fail: ++ hns3_uninit_all_ring(priv); ++err_uninit_vector: ++ hns3_nic_uninit_vector_data(priv); ++err_dealloc_vector: ++ hns3_nic_dealloc_vector_data(priv); ++err_put_ring: ++ hns3_put_ring_config(priv); ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ return 0; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++ hns3_clear_all_ring(handle, true); ++ hns3_reset_tx_queue(priv->ae_handle); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++ return 0; ++} ++ ++int hns3_reset_notify(struct hnae3_handle *handle, ++ enum hnae3_reset_notify_type type) ++{ ++ int ret = 0; ++ ++ switch (type) { ++ case HNAE3_UP_CLIENT: ++ ret = hns3_reset_notify_up_enet(handle); ++ break; ++ case HNAE3_DOWN_CLIENT: ++ ret = hns3_reset_notify_down_enet(handle); ++ break; ++ case HNAE3_INIT_CLIENT: ++ ret = hns3_reset_notify_init_enet(handle); ++ break; ++ case HNAE3_UNINIT_CLIENT: ++ ret = hns3_reset_notify_uninit_enet(handle); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int hns3_change_channels(struct hnae3_handle *handle, u32 new_tqp_num, ++ bool rxfh_configured) ++{ ++ int ret; ++ ++ ret = handle->ae_algo->ops->set_channels(handle, new_tqp_num, ++ rxfh_configured); ++ if (ret) { ++ dev_err(&handle->pdev->dev, ++ "Change tqp num(%u) fail.\n", new_tqp_num); ++ return ret; ++ } ++ ++ ret = hns3_reset_notify(handle, HNAE3_INIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(handle, HNAE3_UP_CLIENT); ++ if (ret) ++ hns3_reset_notify(handle, HNAE3_UNINIT_CLIENT); ++ ++ return ret; ++} ++ ++int hns3_set_channels(struct net_device *netdev, ++ struct ethtool_channels *ch) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ bool rxfh_configured = netif_is_rxfh_configured(netdev); ++ u32 new_tqp_num = ch->combined_count; ++ u16 org_tqp_num; ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (ch->rx_count || ch->tx_count) ++ return -EINVAL; ++ ++ if (kinfo->tc_info.mqprio_active) { ++ dev_err(&netdev->dev, ++ "it's not allowed to set channels via ethtool when MQPRIO mode is on\n"); ++ return -EINVAL; ++ } ++ ++ if (new_tqp_num > hns3_get_max_available_channels(h) || ++ new_tqp_num < 1) { ++ dev_err(&netdev->dev, ++ "Change tqps fail, the tqp range is from 1 to %u", ++ hns3_get_max_available_channels(h)); ++ return -EINVAL; ++ } ++ ++ if (kinfo->rss_size == new_tqp_num) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, ++ "set channels: tqp_num=%u, rxfh=%d\n", ++ new_tqp_num, rxfh_configured); ++ ++ ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(h, HNAE3_UNINIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ org_tqp_num = h->kinfo.num_tqps; ++ ret = hns3_change_channels(h, new_tqp_num, rxfh_configured); ++ if (ret) { ++ int ret1; ++ ++ netdev_warn(netdev, ++ "Change channels fail, revert to old value\n"); ++ ret1 = 
hns3_change_channels(h, org_tqp_num, rxfh_configured); ++ if (ret1) { ++ netdev_err(netdev, ++ "revert to old channel fail\n"); ++ return ret1; ++ } ++ ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static const struct hns3_hw_error_info hns3_hw_err[] = { ++ { .type = HNAE3_PPU_POISON_ERROR, ++ .msg = "PPU poison" }, ++ { .type = HNAE3_CMDQ_ECC_ERROR, ++ .msg = "IMP CMDQ error" }, ++ { .type = HNAE3_IMP_RD_POISON_ERROR, ++ .msg = "IMP RD poison" }, ++ { .type = HNAE3_ROCEE_AXI_RESP_ERROR, ++ .msg = "ROCEE AXI RESP error" }, ++}; ++ ++static void hns3_process_hw_error(struct hnae3_handle *handle, ++ enum hnae3_hw_error_type type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(hns3_hw_err); i++) { ++ if (hns3_hw_err[i].type == type) { ++ dev_err(&handle->pdev->dev, "Detected %s!\n", ++ hns3_hw_err[i].msg); ++ break; ++ } ++ } ++} ++ ++static const struct hnae3_client_ops client_ops = { ++ .init_instance = hns3_client_init, ++ .uninit_instance = hns3_client_uninit, ++ .link_status_change = hns3_link_status_change, ++ .reset_notify = hns3_reset_notify, ++ .process_hw_error = hns3_process_hw_error, ++}; ++ ++/* hns3_init_module - Driver registration routine ++ * hns3_init_module is the first routine called when the driver is ++ * loaded. All it does is register with the PCI subsystem. ++ */ ++static int __init hns3_init_module(void) ++{ ++ int ret; ++ ++ pr_info("%s: %s - version\n", hns3_driver_name, hns3_driver_string); ++ pr_info("%s: %s\n", hns3_driver_name, hns3_copyright); ++ ++ client.type = HNAE3_CLIENT_KNIC; ++ snprintf(client.name, HNAE3_CLIENT_NAME_LENGTH, "%s", ++ hns3_driver_name); ++ ++ client.ops = &client_ops; ++ ++ INIT_LIST_HEAD(&client.node); ++ ++ hns3_dbg_register_debugfs(hns3_driver_name); ++ ++ ret = hnae3_register_client(&client); ++ if (ret) ++ goto err_reg_client; ++ ++ ret = pci_register_driver(&hns3_driver); ++ if (ret) ++ goto err_reg_driver; ++ ++ return ret; ++ ++err_reg_driver: ++ hnae3_unregister_client(&client); ++err_reg_client: ++ hns3_dbg_unregister_debugfs(); ++ return ret; ++} ++module_init(hns3_init_module); ++ ++/* hns3_exit_module - Driver exit cleanup routine ++ * hns3_exit_module is called just before the driver is removed ++ * from memory. ++ */ ++static void __exit hns3_exit_module(void) ++{ ++ pci_unregister_driver(&hns3_driver); ++ hnae3_unregister_client(&client); ++ hns3_dbg_unregister_debugfs(); ++} ++module_exit(hns3_exit_module); ++ ++MODULE_DESCRIPTION("HNS3: Hisilicon Ethernet Driver"); ++MODULE_AUTHOR("Huawei Tech. 
Co., Ltd."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("pci:hns-nic"); +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rx unsigned int start; do { @@ -1599,11 +7449,10 @@ index e5828a658caf4..a866bea651103 100644 } /** -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -index 3b6c7b5857376..5051cdff2384b 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_txq *txq, struct hinic_txq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_tx unsigned int start; do { @@ -1620,11 +7469,10 @@ index 3b6c7b5857376..5051cdff2384b 100644 } /** -diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -index 2cca9e84e31e1..34ab5ff9823b7 100644 ---- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +--- linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-04 10:40:26.684034126 -0500 +@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1637,7 +7485,7 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1650,11 +7498,10 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -index e9cd0fa6a0d2f..90f2eee78a3ee 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1663,7 +7510,7 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 * Assumes that queue stats are defined in i40e_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct i40e_ring *ring) +@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1683,11 +7530,10 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c -index e3d9804aeb25e..09a9f67d9ebc0 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_main.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c -@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct_tx(struct i40e_ring *ring, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c linux/drivers/net/ethernet/intel/i40e/i40e_main.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct unsigned int start; do { @@ -1700,7 +7546,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev, +@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct if (!ring) continue; do { @@ -1713,7 +7559,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1726,7 +7572,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1739,7 +7585,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 rx_b += bytes; rx_p += packets; rx_buf += p->rx_stats.alloc_buff_failed; -@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1752,11 +7598,10 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -index e535d4c3da49d..fafa3406e0bcc 100644 ---- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1765,7 +7610,7 @@ index e535d4c3da49d..fafa3406e0bcc 100644 * Assumes that queue stats are defined in iavf_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct iavf_ring *ring) +@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1783,11 +7628,10 @@ index e535d4c3da49d..fafa3406e0bcc 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c -index e109cb93886be..b7394c7e5eed2 100644 ---- a/drivers/net/ethernet/intel/ice/ice_main.c -+++ b/drivers/net/ethernet/intel/ice/ice_main.c -@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_stats_sync *syncp, +diff -rupN linux.orig/drivers/net/ethernet/intel/ice/ice_main.c linux/drivers/net/ethernet/intel/ice/ice_main.c +--- linux.orig/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_ unsigned int start; do { @@ -1800,11 +7644,10 @@ index e109cb93886be..b7394c7e5eed2 100644 } /** -diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c -index c14fc871dd417..23c6fcfcb905c 100644 ---- a/drivers/net/ethernet/intel/igb/igb_ethtool.c -+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c -@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c linux/drivers/net/ethernet/intel/igb/igb_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct ring = adapter->tx_ring[j]; do { @@ -1824,7 +7667,7 @@ index c14fc871dd417..23c6fcfcb905c 100644 data[i+2] += restart2; i += IGB_TX_QUEUE_STATS_LEN; -@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1840,11 +7683,10 @@ index c14fc871dd417..23c6fcfcb905c 100644 i += IGB_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c -index 2796e81d27260..98df55dc1e933 100644 ---- a/drivers/net/ethernet/intel/igb/igb_main.c -+++ b/drivers/net/ethernet/intel/igb/igb_main.c -@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_main.c linux/drivers/net/ethernet/intel/igb/igb_main.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter } do { @@ -1857,7 +7699,7 @@ index 2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter *adapter) +@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter for (i = 0; i < adapter->num_tx_queues; i++) { struct igb_ring *ring = adapter->tx_ring[i]; do { @@ -1870,11 +7712,10 @@ index 
2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c -index 8cc077b712add..5a26a7805ef80 100644 ---- a/drivers/net/ethernet/intel/igc/igc_ethtool.c -+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c -@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c linux/drivers/net/ethernet/intel/igc/igc_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct ring = adapter->tx_ring[j]; do { @@ -1894,7 +7735,7 @@ index 8cc077b712add..5a26a7805ef80 100644 data[i + 2] += restart2; i += IGC_TX_QUEUE_STATS_LEN; -@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1910,11 +7751,10 @@ index 8cc077b712add..5a26a7805ef80 100644 i += IGC_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c -index ebff0e04045d6..944299b06cc3d 100644 ---- a/drivers/net/ethernet/intel/igc/igc_main.c -+++ b/drivers/net/ethernet/intel/igc/igc_main.c -@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_main.c linux/drivers/net/ethernet/intel/igc/igc_main.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter } do { @@ -1927,7 +7767,7 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter *adapter) +@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter struct igc_ring *ring = adapter->tx_ring[i]; do { @@ -1940,11 +7780,10 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -index 04f453eabef64..51bcf0df3adcc 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1957,7 +7796,7 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } for (j = 0; j < IXGBE_NUM_RX_QUEUES; j++) { -@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1970,11 +7809,10 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -index d1e430b8c8aa1..01c5548f181d5 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struc if (ring) { do { @@ -1987,7 +7825,7 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net_device *netdev, +@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net if (ring) { do { @@ -2000,11 +7838,10 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -index fed46872af2bf..b4632b67ab143 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2017,7 +7854,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2030,7 +7867,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2043,11 +7880,10 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -index 2f12fbe229c15..1d31b8cff4f10 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(st if (ring) { do { @@ -2060,7 +7896,7 @@ index 2f12fbe229c15..1d31b8cff4f10 100644 stats->tx_bytes += bytes; stats->tx_packets += packets; } -@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net_device *netdev, +@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net for (i = 0; i < adapter->num_rx_queues; i++) { ring = adapter->rx_ring[i]; do { @@ -2073,11 +7909,10 @@ index 2f12fbe229c15..1d31b8cff4f10 
100644 stats->rx_bytes += bytes; stats->rx_packets += packets; } -diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c -index 0caa2df87c044..89ea3ef0ee162 100644 ---- a/drivers/net/ethernet/marvell/mvneta.c -+++ b/drivers/net/ethernet/marvell/mvneta.c -@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvneta.c linux/drivers/net/ethernet/marvell/mvneta.c +--- linux.orig/drivers/net/ethernet/marvell/mvneta.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvneta.c 2022-12-04 10:40:26.692034106 -0500 +@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *de cpu_stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2094,7 +7929,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2103,7 +7938,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 skb_alloc_error = stats->es.skb_alloc_error; refill_error = stats->es.refill_error; xdp_redirect = stats->es.ps.xdp_redirect; -@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct xdp_xmit_err = stats->es.ps.xdp_xmit_err; xdp_tx = stats->es.ps.xdp_tx; xdp_tx_err = stats->es.ps.xdp_tx_err; @@ -2112,11 +7947,10 @@ index 0caa2df87c044..89ea3ef0ee162 100644 es->skb_alloc_error += skb_alloc_error; es->refill_error += refill_error; -diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -index eaa51cd7456b6..9dd8e0315dd4f 100644 ---- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +--- linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2125,7 +7959,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_redirect = cpu_stats->xdp_redirect; xdp_pass = cpu_stats->xdp_pass; xdp_drop = cpu_stats->xdp_drop; -@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p xdp_xmit_err = cpu_stats->xdp_xmit_err; xdp_tx = cpu_stats->xdp_tx; xdp_tx_err = cpu_stats->xdp_tx_err; @@ -2134,7 +7968,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_stats->xdp_redirect += xdp_redirect; xdp_stats->xdp_pass += xdp_pass; -@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2149,11 +7983,10 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c -index bbea5458000bf..c9bb92187719c 100644 ---- a/drivers/net/ethernet/marvell/sky2.c -+++ 
b/drivers/net/ethernet/marvell/sky2.c -@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/sky2.c linux/drivers/net/ethernet/marvell/sky2.c +--- linux.orig/drivers/net/ethernet/marvell/sky2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/sky2.c 2022-12-04 10:40:26.692034106 -0500 +@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_de u64 _bytes, _packets; do { @@ -2177,11 +8010,10 @@ index bbea5458000bf..c9bb92187719c 100644 stats->tx_packets = _packets; stats->tx_bytes = _bytes; -diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -index b344632beaddf..988927f8c5d7d 100644 ---- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c -+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-04 10:40:26.692034106 -0500 +@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_d } do { @@ -2190,7 +8022,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->rx_packets = hw_stats->rx_packets; storage->tx_packets = hw_stats->tx_packets; storage->rx_bytes = hw_stats->rx_bytes; -@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_device *dev, +@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_d storage->rx_crc_errors = hw_stats->rx_fcs_errors; storage->rx_errors = hw_stats->rx_checksum_errors; storage->tx_aborted_errors = hw_stats->tx_skip; @@ -2199,7 +8031,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->tx_errors = dev->stats.tx_errors; storage->rx_dropped = dev->stats.rx_dropped; -@@ -3664,13 +3664,13 @@ static void mtk_get_ethtool_stats(struct net_device *dev, +@@ -3668,13 +3668,13 @@ static void mtk_get_ethtool_stats(struct do { data_dst = data; @@ -2215,11 +8047,4339 @@ index b344632beaddf..988927f8c5d7d 100644 } static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, -diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -index 30c7b0e157218..fa2753318cdf7 100644 ---- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 2022-12-04 10:40:18.136056029 -0500 +@@ -0,0 +1,4325 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * ++ * Copyright (C) 2009-2016 John Crispin ++ * Copyright (C) 2009-2016 Felix Fietkau ++ * Copyright (C) 2013-2016 Michael Lee ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "mtk_eth_soc.h" ++#include "mtk_wed.h" ++ ++static int mtk_msg_level = -1; ++module_param_named(msg_level, mtk_msg_level, int, 0); ++MODULE_PARM_DESC(msg_level, "Message level (-1=defaults,0=none,...,16=all)"); ++ ++#define 
MTK_ETHTOOL_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, x) / sizeof(u64) } ++ ++#define MTK_ETHTOOL_XDP_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, xdp_stats.x) / \ ++ sizeof(u64) } ++ ++static const struct mtk_reg_map mtk_reg_map = { ++ .tx_irq_mask = 0x1a1c, ++ .tx_irq_status = 0x1a18, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x1800, ++ .rx_ptr = 0x1900, ++ .rx_cnt_cfg = 0x1904, ++ .qcrx_ptr = 0x1908, ++ .glo_cfg = 0x1a04, ++ .rst_idx = 0x1a08, ++ .delay_irq = 0x1a0c, ++ .fc_th = 0x1a10, ++ .int_grp = 0x1a20, ++ .hred = 0x1a44, ++ .ctx_ptr = 0x1b00, ++ .dtx_ptr = 0x1b04, ++ .crx_ptr = 0x1b10, ++ .drx_ptr = 0x1b14, ++ .fq_head = 0x1b20, ++ .fq_tail = 0x1b24, ++ .fq_count = 0x1b28, ++ .fq_blen = 0x1b2c, ++ }, ++ .gdm1_cnt = 0x2400, ++}; ++ ++static const struct mtk_reg_map mt7628_reg_map = { ++ .tx_irq_mask = 0x0a28, ++ .tx_irq_status = 0x0a20, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++}; ++ ++static const struct mtk_reg_map mt7986_reg_map = { ++ .tx_irq_mask = 0x461c, ++ .tx_irq_status = 0x4618, ++ .pdma = { ++ .rx_ptr = 0x6100, ++ .rx_cnt_cfg = 0x6104, ++ .pcrx_ptr = 0x6108, ++ .glo_cfg = 0x6204, ++ .rst_idx = 0x6208, ++ .delay_irq = 0x620c, ++ .irq_status = 0x6220, ++ .irq_mask = 0x6228, ++ .int_grp = 0x6250, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x4400, ++ .rx_ptr = 0x4500, ++ .rx_cnt_cfg = 0x4504, ++ .qcrx_ptr = 0x4508, ++ .glo_cfg = 0x4604, ++ .rst_idx = 0x4608, ++ .delay_irq = 0x460c, ++ .fc_th = 0x4610, ++ .int_grp = 0x4620, ++ .hred = 0x4644, ++ .ctx_ptr = 0x4700, ++ .dtx_ptr = 0x4704, ++ .crx_ptr = 0x4710, ++ .drx_ptr = 0x4714, ++ .fq_head = 0x4720, ++ .fq_tail = 0x4724, ++ .fq_count = 0x4728, ++ .fq_blen = 0x472c, ++ }, ++ .gdm1_cnt = 0x1c00, ++}; ++ ++/* strings used by ethtool */ ++static const struct mtk_ethtool_stats { ++ char str[ETH_GSTRING_LEN]; ++ u32 offset; ++} mtk_ethtool_stats[] = { ++ MTK_ETHTOOL_STAT(tx_bytes), ++ MTK_ETHTOOL_STAT(tx_packets), ++ MTK_ETHTOOL_STAT(tx_skip), ++ MTK_ETHTOOL_STAT(tx_collisions), ++ MTK_ETHTOOL_STAT(rx_bytes), ++ MTK_ETHTOOL_STAT(rx_packets), ++ MTK_ETHTOOL_STAT(rx_overflow), ++ MTK_ETHTOOL_STAT(rx_fcs_errors), ++ MTK_ETHTOOL_STAT(rx_short_errors), ++ MTK_ETHTOOL_STAT(rx_long_errors), ++ MTK_ETHTOOL_STAT(rx_checksum_errors), ++ MTK_ETHTOOL_STAT(rx_flow_control_packets), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_redirect), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_pass), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_drop), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx_errors), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit_errors), ++}; ++ ++static const char * const mtk_clks_source_name[] = { ++ "ethif", "sgmiitop", "esw", "gp0", "gp1", "gp2", "fe", "trgpll", ++ "sgmii_tx250m", "sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", ++ "sgmii2_tx250m", "sgmii2_rx250m", "sgmii2_cdr_ref", "sgmii2_cdr_fb", ++ "sgmii_ck", "eth2pll", "wocpu0", "wocpu1", "netsys0", "netsys1" ++}; ++ ++void mtk_w32(struct mtk_eth *eth, u32 val, unsigned reg) ++{ ++ __raw_writel(val, eth->base + reg); ++} ++ ++u32 mtk_r32(struct mtk_eth *eth, unsigned reg) ++{ ++ return __raw_readl(eth->base + reg); ++} ++ ++static u32 mtk_m32(struct mtk_eth *eth, u32 mask, 
u32 set, unsigned reg) ++{ ++ u32 val; ++ ++ val = mtk_r32(eth, reg); ++ val &= ~mask; ++ val |= set; ++ mtk_w32(eth, val, reg); ++ return reg; ++} ++ ++static int mtk_mdio_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned long t_start = jiffies; ++ ++ while (1) { ++ if (!(mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_ACCESS)) ++ return 0; ++ if (time_after(jiffies, t_start + PHY_IAC_TIMEOUT)) ++ break; ++ cond_resched(); ++ } ++ ++ dev_err(eth->dev, "mdio: MDIO timeout\n"); ++ return -ETIMEDOUT; ++} ++ ++static int _mtk_mdio_write(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg, ++ u32 write_data) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static int _mtk_mdio_read(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_READ | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_C22_READ | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_DATA_MASK; ++} ++ ++static int mtk_mdio_write(struct mii_bus *bus, int phy_addr, ++ int phy_reg, u16 val) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_write(eth, phy_addr, phy_reg, val); ++} ++ ++static int mtk_mdio_read(struct mii_bus *bus, int phy_addr, int phy_reg) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_read(eth, phy_addr, phy_reg); ++} ++ ++static int mt7621_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface) ++{ ++ u32 val; ++ ++ /* Check DDR memory type. ++ * Currently TRGMII mode with DDR2 memory is not supported. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG, &val); ++ if (interface == PHY_INTERFACE_MODE_TRGMII && ++ val & SYSCFG_DRAM_TYPE_DDR2) { ++ dev_err(eth->dev, ++ "TRGMII mode with DDR2 memory is not supported!\n"); ++ return -EOPNOTSUPP; ++ } ++ ++ val = (interface == PHY_INTERFACE_MODE_TRGMII) ? 
++ ETHSYS_TRGMII_MT7621_DDR_PLL : 0; ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_MT7621_MASK, val); ++ ++ return 0; ++} ++ ++static void mtk_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface, int speed) ++{ ++ u32 val; ++ int ret; ++ ++ if (interface == PHY_INTERFACE_MODE_TRGMII) { ++ mtk_w32(eth, TRGMII_MODE, INTF_MODE); ++ val = 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ return; ++ } ++ ++ val = (speed == SPEED_1000) ? ++ INTF_MODE_RGMII_1000 : INTF_MODE_RGMII_10_100; ++ mtk_w32(eth, val, INTF_MODE); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_CLK_SEL362_5, ++ ETHSYS_TRGMII_CLK_SEL362_5); ++ ++ val = (speed == SPEED_1000) ? 250000000 : 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ ++ val = (speed == SPEED_1000) ? ++ RCK_CTRL_RGMII_1000 : RCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_RCK_CTRL); ++ ++ val = (speed == SPEED_1000) ? ++ TCK_CTRL_RGMII_1000 : TCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_TCK_CTRL); ++} ++ ++static struct phylink_pcs *mtk_mac_select_pcs(struct phylink_config *config, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ unsigned int sid; ++ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) { ++ sid = (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_SGMII)) ? ++ 0 : mac->id; ++ ++ return mtk_sgmii_select_pcs(eth->sgmii, sid); ++ } ++ ++ return NULL; ++} ++ ++static void mtk_mac_config(struct phylink_config *config, unsigned int mode, ++ const struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ int val, ge_mode, err = 0; ++ u32 i; ++ ++ /* MT76x8 has no hardware settings between for the MAC */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ mac->interface != state->interface) { ++ /* Setup soc pin functions */ ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_TRGMII: ++ if (mac->id) ++ goto err_phy; ++ if (!MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_GMAC1_TRGMII)) ++ goto err_phy; ++ fallthrough; ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_REVMII: ++ case PHY_INTERFACE_MODE_RMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RGMII)) { ++ err = mtk_gmac_rgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_1000BASEX: ++ case PHY_INTERFACE_MODE_2500BASEX: ++ case PHY_INTERFACE_MODE_SGMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ err = mtk_gmac_sgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_GMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_GEPHY)) { ++ err = mtk_gmac_gephy_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ default: ++ goto err_phy; ++ } ++ ++ /* Setup clock for 1st gmac */ ++ if (!mac->id && state->interface != PHY_INTERFACE_MODE_SGMII && ++ !phy_interface_mode_is_8023z(state->interface) && ++ MTK_HAS_CAPS(mac->hw->soc->caps, MTK_GMAC1_TRGMII)) { ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_TRGMII_MT7621_CLK)) { ++ if 
(mt7621_gmac0_rgmii_adjust(mac->hw, ++ state->interface)) ++ goto err_phy; ++ } else { ++ /* FIXME: this is incorrect. Not only does it ++ * use state->speed (which is not guaranteed ++ * to be correct) but it also makes use of it ++ * in a code path that will only be reachable ++ * when the PHY interface mode changes, not ++ * when the speed changes. Consequently, RGMII ++ * is probably broken. ++ */ ++ mtk_gmac0_rgmii_adjust(mac->hw, ++ state->interface, ++ state->speed); ++ ++ /* mt7623_pad_clk_setup */ ++ for (i = 0 ; i < NUM_TRGMII_CTRL; i++) ++ mtk_w32(mac->hw, ++ TD_DM_DRVP(8) | TD_DM_DRVN(8), ++ TRGMII_TD_ODT(i)); ++ ++ /* Assert/release MT7623 RXC reset */ ++ mtk_m32(mac->hw, 0, RXC_RST | RXC_DQSISEL, ++ TRGMII_RCK_CTRL); ++ mtk_m32(mac->hw, RXC_RST, 0, TRGMII_RCK_CTRL); ++ } ++ } ++ ++ ge_mode = 0; ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_GMII: ++ ge_mode = 1; ++ break; ++ case PHY_INTERFACE_MODE_REVMII: ++ ge_mode = 2; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ if (mac->id) ++ goto err_phy; ++ ge_mode = 3; ++ break; ++ default: ++ break; ++ } ++ ++ /* put the gmac into the right mode */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ val &= ~SYSCFG0_GE_MODE(SYSCFG0_GE_MASK, mac->id); ++ val |= SYSCFG0_GE_MODE(ge_mode, mac->id); ++ regmap_write(eth->ethsys, ETHSYS_SYSCFG0, val); ++ ++ mac->interface = state->interface; ++ } ++ ++ /* SGMII */ ++ if (state->interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(state->interface)) { ++ /* The path GMAC to SGMII will be enabled once the SGMIISYS is ++ * being setup done. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, ++ ~(u32)SYSCFG0_SGMII_MASK); ++ ++ /* Save the syscfg0 value for mac_finish */ ++ mac->syscfg0 = val; ++ } else if (phylink_autoneg_inband(mode)) { ++ dev_err(eth->dev, ++ "In-band mode not supported in non SGMII mode!\n"); ++ return; ++ } ++ ++ return; ++ ++err_phy: ++ dev_err(eth->dev, "%s: GMAC%d mode %s not supported!\n", __func__, ++ mac->id, phy_modes(state->interface)); ++ return; ++ ++init_err: ++ dev_err(eth->dev, "%s: GMAC%d mode %s err: %d!\n", __func__, ++ mac->id, phy_modes(state->interface), err); ++} ++ ++static int mtk_mac_finish(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ u32 mcr_cur, mcr_new; ++ ++ /* Enable SGMII */ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, mac->syscfg0); ++ ++ /* Setup gmac */ ++ mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur; ++ mcr_new |= MAC_MCR_IPG_CFG | MAC_MCR_FORCE_MODE | ++ MAC_MCR_BACKOFF_EN | MAC_MCR_BACKPR_EN | MAC_MCR_FORCE_LINK; ++ ++ /* Only update control register when needed! 
*/ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ ++ return 0; ++} ++ ++static void mtk_mac_pcs_get_state(struct phylink_config *config, ++ struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 pmsr = mtk_r32(mac->hw, MTK_MAC_MSR(mac->id)); ++ ++ state->link = (pmsr & MAC_MSR_LINK); ++ state->duplex = (pmsr & MAC_MSR_DPX) >> 1; ++ ++ switch (pmsr & (MAC_MSR_SPEED_1000 | MAC_MSR_SPEED_100)) { ++ case 0: ++ state->speed = SPEED_10; ++ break; ++ case MAC_MSR_SPEED_100: ++ state->speed = SPEED_100; ++ break; ++ case MAC_MSR_SPEED_1000: ++ state->speed = SPEED_1000; ++ break; ++ default: ++ state->speed = SPEED_UNKNOWN; ++ break; ++ } ++ ++ state->pause &= (MLO_PAUSE_RX | MLO_PAUSE_TX); ++ if (pmsr & MAC_MSR_RX_FC) ++ state->pause |= MLO_PAUSE_RX; ++ if (pmsr & MAC_MSR_TX_FC) ++ state->pause |= MLO_PAUSE_TX; ++} ++ ++static void mtk_mac_link_down(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ ++ mcr &= ~(MAC_MCR_TX_EN | MAC_MCR_RX_EN); ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static void mtk_mac_link_up(struct phylink_config *config, ++ struct phy_device *phy, ++ unsigned int mode, phy_interface_t interface, ++ int speed, int duplex, bool tx_pause, bool rx_pause) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr; ++ ++ mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr &= ~(MAC_MCR_SPEED_100 | MAC_MCR_SPEED_1000 | ++ MAC_MCR_FORCE_DPX | MAC_MCR_FORCE_TX_FC | ++ MAC_MCR_FORCE_RX_FC); ++ ++ /* Configure speed */ ++ switch (speed) { ++ case SPEED_2500: ++ case SPEED_1000: ++ mcr |= MAC_MCR_SPEED_1000; ++ break; ++ case SPEED_100: ++ mcr |= MAC_MCR_SPEED_100; ++ break; ++ } ++ ++ /* Configure duplex */ ++ if (duplex == DUPLEX_FULL) ++ mcr |= MAC_MCR_FORCE_DPX; ++ ++ /* Configure pause modes - phylink will avoid these for half duplex */ ++ if (tx_pause) ++ mcr |= MAC_MCR_FORCE_TX_FC; ++ if (rx_pause) ++ mcr |= MAC_MCR_FORCE_RX_FC; ++ ++ mcr |= MAC_MCR_TX_EN | MAC_MCR_RX_EN; ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static const struct phylink_mac_ops mtk_phylink_ops = { ++ .validate = phylink_generic_validate, ++ .mac_select_pcs = mtk_mac_select_pcs, ++ .mac_pcs_get_state = mtk_mac_pcs_get_state, ++ .mac_config = mtk_mac_config, ++ .mac_finish = mtk_mac_finish, ++ .mac_link_down = mtk_mac_link_down, ++ .mac_link_up = mtk_mac_link_up, ++}; ++ ++static int mtk_mdio_init(struct mtk_eth *eth) ++{ ++ struct device_node *mii_np; ++ int ret; ++ ++ mii_np = of_get_child_by_name(eth->dev->of_node, "mdio-bus"); ++ if (!mii_np) { ++ dev_err(eth->dev, "no %s child node found", "mdio-bus"); ++ return -ENODEV; ++ } ++ ++ if (!of_device_is_available(mii_np)) { ++ ret = -ENODEV; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus = devm_mdiobus_alloc(eth->dev); ++ if (!eth->mii_bus) { ++ ret = -ENOMEM; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus->name = "mdio"; ++ eth->mii_bus->read = mtk_mdio_read; ++ eth->mii_bus->write = mtk_mdio_write; ++ eth->mii_bus->probe_capabilities = MDIOBUS_C22_C45; ++ eth->mii_bus->priv = eth; ++ eth->mii_bus->parent = eth->dev; ++ ++ snprintf(eth->mii_bus->id, MII_BUS_ID_SIZE, "%pOFn", mii_np); ++ ret = of_mdiobus_register(eth->mii_bus, mii_np); ++ ++err_put_node: ++ of_node_put(mii_np); ++ return ret; ++} ++ ++static void 
mtk_mdio_cleanup(struct mtk_eth *eth) ++{ ++ if (!eth->mii_bus) ++ return; ++ ++ mdiobus_unregister(eth->mii_bus); ++} ++ ++static inline void mtk_tx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_tx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static int mtk_set_mac_address(struct net_device *dev, void *p) ++{ ++ int ret = eth_mac_addr(dev, p); ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ const char *macaddr = dev->dev_addr; ++ ++ if (ret) ++ return ret; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ spin_lock_bh(&mac->hw->page_lock); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MT7628_SDM_MAC_ADRH); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MT7628_SDM_MAC_ADRL); ++ } else { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MTK_GDMA_MAC_ADRH(mac->id)); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MTK_GDMA_MAC_ADRL(mac->id)); ++ } ++ spin_unlock_bh(&mac->hw->page_lock); ++ ++ return 0; ++} ++ ++void mtk_stats_update_mac(struct mtk_mac *mac) ++{ ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ hw_stats->tx_packets += mtk_r32(mac->hw, MT7628_SDM_TPCNT); ++ hw_stats->tx_bytes += mtk_r32(mac->hw, MT7628_SDM_TBCNT); ++ hw_stats->rx_packets += mtk_r32(mac->hw, MT7628_SDM_RPCNT); ++ hw_stats->rx_bytes += mtk_r32(mac->hw, MT7628_SDM_RBCNT); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, MT7628_SDM_CS_ERR); ++ } else { ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ unsigned int offs = hw_stats->reg_offset; ++ u64 stats; ++ ++ hw_stats->rx_bytes += mtk_r32(mac->hw, reg_map->gdm1_cnt + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x4 + offs); ++ if (stats) ++ hw_stats->rx_bytes += (stats << 32); ++ hw_stats->rx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x8 + offs); ++ hw_stats->rx_overflow += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x10 + offs); ++ hw_stats->rx_fcs_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x14 + offs); ++ hw_stats->rx_short_errors += ++ 
mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x18 + offs); ++ hw_stats->rx_long_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x1c + offs); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x20 + offs); ++ hw_stats->rx_flow_control_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x24 + offs); ++ hw_stats->tx_skip += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x28 + offs); ++ hw_stats->tx_collisions += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x2c + offs); ++ hw_stats->tx_bytes += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x30 + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x34 + offs); ++ if (stats) ++ hw_stats->tx_bytes += (stats << 32); ++ hw_stats->tx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x38 + offs); ++ } ++ ++ u64_stats_update_end(&hw_stats->syncp); ++} ++ ++static void mtk_stats_update(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->mac[i] || !eth->mac[i]->hw_stats) ++ continue; ++ if (spin_trylock(ð->mac[i]->hw_stats->stats_lock)) { ++ mtk_stats_update_mac(eth->mac[i]); ++ spin_unlock(ð->mac[i]->hw_stats->stats_lock); ++ } ++ } ++} ++ ++static void mtk_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ unsigned int start; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hw_stats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hw_stats->stats_lock); ++ } ++ } ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&hw_stats->syncp); ++ storage->rx_packets = hw_stats->rx_packets; ++ storage->tx_packets = hw_stats->tx_packets; ++ storage->rx_bytes = hw_stats->rx_bytes; ++ storage->tx_bytes = hw_stats->tx_bytes; ++ storage->collisions = hw_stats->tx_collisions; ++ storage->rx_length_errors = hw_stats->rx_short_errors + ++ hw_stats->rx_long_errors; ++ storage->rx_over_errors = hw_stats->rx_overflow; ++ storage->rx_crc_errors = hw_stats->rx_fcs_errors; ++ storage->rx_errors = hw_stats->rx_checksum_errors; ++ storage->tx_aborted_errors = hw_stats->tx_skip; ++ } while (u64_stats_fetch_retry_irq(&hw_stats->syncp, start)); ++ ++ storage->tx_errors = dev->stats.tx_errors; ++ storage->rx_dropped = dev->stats.rx_dropped; ++ storage->tx_dropped = dev->stats.tx_dropped; ++} ++ ++static inline int mtk_max_frag_size(int mtu) ++{ ++ /* make sure buf_size will be at least MTK_MAX_RX_LENGTH */ ++ if (mtu + MTK_RX_ETH_HLEN < MTK_MAX_RX_LENGTH_2K) ++ mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return SKB_DATA_ALIGN(MTK_RX_HLEN + mtu) + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++} ++ ++static inline int mtk_max_buf_size(int frag_size) ++{ ++ int buf_size = frag_size - NET_SKB_PAD - NET_IP_ALIGN - ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ WARN_ON(buf_size < MTK_MAX_RX_LENGTH_2K); ++ ++ return buf_size; ++} ++ ++static bool mtk_rx_get_desc(struct mtk_eth *eth, struct mtk_rx_dma_v2 *rxd, ++ struct mtk_rx_dma_v2 *dma_rxd) ++{ ++ rxd->rxd2 = READ_ONCE(dma_rxd->rxd2); ++ if (!(rxd->rxd2 & RX_DMA_DONE)) ++ return false; ++ ++ rxd->rxd1 = READ_ONCE(dma_rxd->rxd1); ++ rxd->rxd3 = READ_ONCE(dma_rxd->rxd3); ++ rxd->rxd4 = READ_ONCE(dma_rxd->rxd4); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = READ_ONCE(dma_rxd->rxd5); ++ rxd->rxd6 = READ_ONCE(dma_rxd->rxd6); ++ } ++ ++ return true; ++} ++ ++static void *mtk_max_lro_buf_alloc(gfp_t gfp_mask) ++{ ++ unsigned int size = mtk_max_frag_size(MTK_MAX_LRO_RX_LENGTH); ++ 
unsigned long data; ++ ++ data = __get_free_pages(gfp_mask | __GFP_COMP | __GFP_NOWARN, ++ get_order(size)); ++ ++ return (void *)data; ++} ++ ++/* the qdma core needs scratch memory to be setup */ ++static int mtk_init_fq_dma(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ dma_addr_t phy_ring_tail; ++ int cnt = MTK_DMA_SIZE; ++ dma_addr_t dma_addr; ++ int i; ++ ++ eth->scratch_ring = dma_alloc_coherent(eth->dma_dev, ++ cnt * soc->txrx.txd_size, ++ ð->phy_scratch_ring, ++ GFP_KERNEL); ++ if (unlikely(!eth->scratch_ring)) ++ return -ENOMEM; ++ ++ eth->scratch_head = kcalloc(cnt, MTK_QDMA_PAGE_SIZE, GFP_KERNEL); ++ if (unlikely(!eth->scratch_head)) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE, ++ DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) ++ return -ENOMEM; ++ ++ phy_ring_tail = eth->phy_scratch_ring + soc->txrx.txd_size * (cnt - 1); ++ ++ for (i = 0; i < cnt; i++) { ++ struct mtk_tx_dma_v2 *txd; ++ ++ txd = eth->scratch_ring + i * soc->txrx.txd_size; ++ txd->txd1 = dma_addr + i * MTK_QDMA_PAGE_SIZE; ++ if (i < cnt - 1) ++ txd->txd2 = eth->phy_scratch_ring + ++ (i + 1) * soc->txrx.txd_size; ++ ++ txd->txd3 = TX_DMA_PLEN0(MTK_QDMA_PAGE_SIZE); ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ mtk_w32(eth, eth->phy_scratch_ring, soc->reg_map->qdma.fq_head); ++ mtk_w32(eth, phy_ring_tail, soc->reg_map->qdma.fq_tail); ++ mtk_w32(eth, (cnt << 16) | cnt, soc->reg_map->qdma.fq_count); ++ mtk_w32(eth, MTK_QDMA_PAGE_SIZE << 16, soc->reg_map->qdma.fq_blen); ++ ++ return 0; ++} ++ ++static void *mtk_qdma_phys_to_virt(struct mtk_tx_ring *ring, u32 desc) ++{ ++ return ring->dma + (desc - ring->phys); ++} ++ ++static struct mtk_tx_buf *mtk_desc_to_tx_buf(struct mtk_tx_ring *ring, ++ void *txd, u32 txd_size) ++{ ++ int idx = (txd - ring->dma) / txd_size; ++ ++ return &ring->buf[idx]; ++} ++ ++static struct mtk_tx_dma *qdma_to_pdma(struct mtk_tx_ring *ring, ++ struct mtk_tx_dma *dma) ++{ ++ return ring->dma_pdma - (struct mtk_tx_dma *)ring->dma + dma; ++} ++ ++static int txd_to_idx(struct mtk_tx_ring *ring, void *dma, u32 txd_size) ++{ ++ return (dma - ring->dma) / txd_size; ++} ++ ++static void mtk_tx_unmap(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct xdp_frame_bulk *bq, bool napi) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) { ++ dma_unmap_single(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ } else { ++ if (dma_unmap_len(tx_buf, dma_len0)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ ++ if (dma_unmap_len(tx_buf, dma_len1)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr1), ++ dma_unmap_len(tx_buf, dma_len1), ++ DMA_TO_DEVICE); ++ } ++ } ++ ++ if (tx_buf->data && tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ if (napi) ++ napi_consume_skb(skb, napi); ++ else ++ dev_kfree_skb_any(skb); ++ } else { ++ struct xdp_frame *xdpf = tx_buf->data; ++ ++ if (napi && tx_buf->type == 
MTK_TYPE_XDP_TX) ++ xdp_return_frame_rx_napi(xdpf); ++ else if (bq) ++ xdp_return_frame_bulk(xdpf, bq); ++ else ++ xdp_return_frame(xdpf); ++ } ++ } ++ tx_buf->flags = 0; ++ tx_buf->data = NULL; ++} ++ ++static void setup_tx_buf(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct mtk_tx_dma *txd, dma_addr_t mapped_addr, ++ size_t size, int idx) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } else { ++ if (idx & 1) { ++ txd->txd3 = mapped_addr; ++ txd->txd2 |= TX_DMA_PLEN1(size); ++ dma_unmap_addr_set(tx_buf, dma_addr1, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len1, size); ++ } else { ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ txd->txd1 = mapped_addr; ++ txd->txd2 = TX_DMA_PLEN0(size); ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } ++ } ++} ++ ++static void mtk_tx_set_dma_desc_v1(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_dma *desc = txd; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_SWC | TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT; /* forward port */ ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM; ++ /* vlan header offload */ ++ if (info->vlan) ++ data |= TX_DMA_INS_VLAN | info->vlan_tci; ++ } ++ WRITE_ONCE(desc->txd4, data); ++} ++ ++static void mtk_tx_set_dma_desc_v2(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma_v2 *desc = txd; ++ struct mtk_eth *eth = mac->hw; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ if (!info->qid && mac->id) ++ info->qid = MTK_QDMA_GMAC2_QID; ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT_V2; /* forward port */ ++ data |= TX_DMA_SWC_V2 | QID_BITS_V2(info->qid); ++ WRITE_ONCE(desc->txd4, data); ++ ++ data = 0; ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO_V2; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM_V2; ++ } ++ WRITE_ONCE(desc->txd5, data); ++ ++ data = 0; ++ if (info->first && info->vlan) ++ data |= TX_DMA_INS_VLAN_V2 | info->vlan_tci; ++ WRITE_ONCE(desc->txd6, data); ++ ++ WRITE_ONCE(desc->txd7, 0); ++ WRITE_ONCE(desc->txd8, 0); ++} ++ ++static void mtk_tx_set_dma_desc(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mtk_tx_set_dma_desc_v2(dev, txd, info); ++ else ++ mtk_tx_set_dma_desc_v1(dev, txd, info); ++} ++ ++static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev, ++ int tx_num, struct mtk_tx_ring *ring, bool gso) ++{ ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = skb_headlen(skb), ++ .gso = gso, ++ .csum = skb->ip_summed == CHECKSUM_PARTIAL, ++ .vlan = skb_vlan_tag_present(skb), ++ .qid = skb->mark & MTK_QDMA_TX_MASK, ++ .vlan_tci = skb_vlan_tag_get(skb), ++ .first = true, ++ .last = !skb_is_nonlinear(skb), ++ }; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth 
= mac->hw; ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_dma *itxd, *txd; ++ struct mtk_tx_dma *itxd_pdma, *txd_pdma; ++ struct mtk_tx_buf *itx_buf, *tx_buf; ++ int i, n_desc = 1; ++ int k = 0; ++ ++ itxd = ring->next_free; ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ if (itxd == ring->last_free) ++ return -ENOMEM; ++ ++ itx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ memset(itx_buf, 0, sizeof(*itx_buf)); ++ ++ txd_info.addr = dma_map_single(eth->dma_dev, skb->data, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ return -ENOMEM; ++ ++ mtk_tx_set_dma_desc(dev, itxd, &txd_info); ++ ++ itx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ itx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ setup_tx_buf(eth, itx_buf, itxd_pdma, txd_info.addr, txd_info.size, ++ k++); ++ ++ /* TX SG offload */ ++ txd = itxd; ++ txd_pdma = qdma_to_pdma(ring, txd); ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ unsigned int offset = 0; ++ int frag_size = skb_frag_size(frag); ++ ++ while (frag_size) { ++ bool new_desc = true; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || ++ (i & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto err_dma; ++ ++ n_desc++; ++ } else { ++ new_desc = false; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = min_t(unsigned int, frag_size, ++ soc->txrx.dma_max_len); ++ txd_info.qid = skb->mark & MTK_QDMA_TX_MASK; ++ txd_info.last = i == skb_shinfo(skb)->nr_frags - 1 && ++ !(frag_size - txd_info.size); ++ txd_info.addr = skb_frag_dma_map(eth->dma_dev, frag, ++ offset, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ goto err_dma; ++ ++ mtk_tx_set_dma_desc(dev, txd, &txd_info); ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ if (new_desc) ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ tx_buf->flags |= MTK_TX_FLAGS_PAGE0; ++ tx_buf->flags |= (!mac->id) ? 
MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info.addr, ++ txd_info.size, k++); ++ ++ frag_size -= txd_info.size; ++ offset += txd_info.size; ++ } ++ } ++ ++ /* store skb to cleanup */ ++ itx_buf->type = MTK_TYPE_SKB; ++ itx_buf->data = skb; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (k & 0x1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ netdev_sent_queue(dev, skb->len); ++ skb_tx_timestamp(skb); ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) || ++ !netdev_xmit_more()) ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int next_idx; ++ ++ next_idx = NEXT_DESP_IDX(txd_to_idx(ring, txd, soc->txrx.txd_size), ++ ring->dma_size); ++ mtk_w32(eth, next_idx, MT7628_TX_CTX_IDX0); ++ } ++ ++ return 0; ++ ++err_dma: ++ do { ++ tx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ ++ /* unmap dma */ ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ itxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ itxd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ itxd = mtk_qdma_phys_to_virt(ring, itxd->txd2); ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ } while (itxd != txd); ++ ++ return -ENOMEM; ++} ++ ++static int mtk_cal_txd_req(struct mtk_eth *eth, struct sk_buff *skb) ++{ ++ int i, nfrags = 1; ++ skb_frag_t *frag; ++ ++ if (skb_is_gso(skb)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ frag = &skb_shinfo(skb)->frags[i]; ++ nfrags += DIV_ROUND_UP(skb_frag_size(frag), ++ eth->soc->txrx.dma_max_len); ++ } ++ } else { ++ nfrags += skb_shinfo(skb)->nr_frags; ++ } ++ ++ return nfrags; ++} ++ ++static int mtk_queue_stopped(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ if (netif_queue_stopped(eth->netdev[i])) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static void mtk_wake_queue(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ netif_wake_queue(eth->netdev[i]); ++ } ++} ++ ++static netdev_tx_t mtk_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct net_device_stats *stats = &dev->stats; ++ bool gso = false; ++ int tx_num; ++ ++ /* normally we can rely on the stack not calling this more than once, ++ * however we have 2 queues running on the same ring so we need to lock ++ * the ring access ++ */ ++ spin_lock(ð->page_lock); ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto drop; ++ ++ tx_num = mtk_cal_txd_req(eth, skb); ++ if (unlikely(atomic_read(&ring->free_count) <= tx_num)) { ++ netif_stop_queue(dev); ++ netif_err(eth, tx_queued, dev, ++ "Tx Ring full when queue awake!\n"); ++ spin_unlock(ð->page_lock); ++ return NETDEV_TX_BUSY; ++ } ++ ++ /* TSO: fill MSS info in tcp checksum field */ ++ if (skb_is_gso(skb)) { ++ if (skb_cow_head(skb, 0)) { ++ netif_warn(eth, tx_err, dev, ++ "GSO expand head fail.\n"); ++ goto drop; ++ } ++ ++ if (skb_shinfo(skb)->gso_type & ++ (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) { ++ gso = true; ++ tcp_hdr(skb)->check = htons(skb_shinfo(skb)->gso_size); ++ } ++ } ++ ++ if (mtk_tx_map(skb, 
dev, tx_num, ring, gso) < 0) ++ goto drop; ++ ++ if (unlikely(atomic_read(&ring->free_count) <= ring->thresh)) ++ netif_stop_queue(dev); ++ ++ spin_unlock(ð->page_lock); ++ ++ return NETDEV_TX_OK; ++ ++drop: ++ spin_unlock(ð->page_lock); ++ stats->tx_dropped++; ++ dev_kfree_skb_any(skb); ++ return NETDEV_TX_OK; ++} ++ ++static struct mtk_rx_ring *mtk_get_rx_ring(struct mtk_eth *eth) ++{ ++ int i; ++ struct mtk_rx_ring *ring; ++ int idx; ++ ++ if (!eth->hwlro) ++ return ð->rx_ring[0]; ++ ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ ring = ð->rx_ring[i]; ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ if (rxd->rxd2 & RX_DMA_DONE) { ++ ring->calc_idx_update = true; ++ return ring; ++ } ++ } ++ ++ return NULL; ++} ++ ++static void mtk_update_rx_cpu_idx(struct mtk_eth *eth) ++{ ++ struct mtk_rx_ring *ring; ++ int i; ++ ++ if (!eth->hwlro) { ++ ring = ð->rx_ring[0]; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } else { ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ ring = ð->rx_ring[i]; ++ if (ring->calc_idx_update) { ++ ring->calc_idx_update = false; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } ++ } ++ } ++} ++ ++static bool mtk_page_pool_enabled(struct mtk_eth *eth) ++{ ++ return MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2); ++} ++ ++static struct page_pool *mtk_create_page_pool(struct mtk_eth *eth, ++ struct xdp_rxq_info *xdp_q, ++ int id, int size) ++{ ++ struct page_pool_params pp_params = { ++ .order = 0, ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, ++ .pool_size = size, ++ .nid = NUMA_NO_NODE, ++ .dev = eth->dma_dev, ++ .offset = MTK_PP_HEADROOM, ++ .max_len = MTK_PP_MAX_BUF_SIZE, ++ }; ++ struct page_pool *pp; ++ int err; ++ ++ pp_params.dma_dir = rcu_access_pointer(eth->prog) ? 
DMA_BIDIRECTIONAL ++ : DMA_FROM_DEVICE; ++ pp = page_pool_create(&pp_params); ++ if (IS_ERR(pp)) ++ return pp; ++ ++ err = __xdp_rxq_info_reg(xdp_q, ð->dummy_dev, eth->rx_napi.napi_id, ++ id, PAGE_SIZE); ++ if (err < 0) ++ goto err_free_pp; ++ ++ err = xdp_rxq_info_reg_mem_model(xdp_q, MEM_TYPE_PAGE_POOL, pp); ++ if (err) ++ goto err_unregister_rxq; ++ ++ return pp; ++ ++err_unregister_rxq: ++ xdp_rxq_info_unreg(xdp_q); ++err_free_pp: ++ page_pool_destroy(pp); ++ ++ return ERR_PTR(err); ++} ++ ++static void *mtk_page_pool_get_buff(struct page_pool *pp, dma_addr_t *dma_addr, ++ gfp_t gfp_mask) ++{ ++ struct page *page; ++ ++ page = page_pool_alloc_pages(pp, gfp_mask | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ *dma_addr = page_pool_get_dma_addr(page) + MTK_PP_HEADROOM; ++ return page_address(page); ++} ++ ++static void mtk_rx_put_buff(struct mtk_rx_ring *ring, void *data, bool napi) ++{ ++ if (ring->page_pool) ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(data), napi); ++ else ++ skb_free_frag(data); ++} ++ ++static int mtk_xdp_frame_map(struct mtk_eth *eth, struct net_device *dev, ++ struct mtk_tx_dma_desc_info *txd_info, ++ struct mtk_tx_dma *txd, struct mtk_tx_buf *tx_buf, ++ void *data, u16 headroom, int index, bool dma_map) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma *txd_pdma; ++ ++ if (dma_map) { /* ndo_xdp_xmit */ ++ txd_info->addr = dma_map_single(eth->dma_dev, data, ++ txd_info->size, DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info->addr))) ++ return -ENOMEM; ++ ++ tx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ } else { ++ struct page *page = virt_to_head_page(data); ++ ++ txd_info->addr = page_pool_get_dma_addr(page) + ++ sizeof(struct xdp_frame) + headroom; ++ dma_sync_single_for_device(eth->dma_dev, txd_info->addr, ++ txd_info->size, DMA_BIDIRECTIONAL); ++ } ++ mtk_tx_set_dma_desc(dev, txd, txd_info); ++ ++ tx_buf->flags |= !mac->id ? MTK_TX_FLAGS_FPORT0 : MTK_TX_FLAGS_FPORT1; ++ tx_buf->type = dma_map ? MTK_TYPE_XDP_NDO : MTK_TYPE_XDP_TX; ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ ++ txd_pdma = qdma_to_pdma(ring, txd); ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info->addr, txd_info->size, ++ index); ++ ++ return 0; ++} ++ ++static int mtk_xdp_submit_frame(struct mtk_eth *eth, struct xdp_frame *xdpf, ++ struct net_device *dev, bool dma_map) ++{ ++ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = xdpf->len, ++ .first = true, ++ .last = !xdp_frame_has_frags(xdpf), ++ }; ++ int err, index = 0, n_desc = 1, nr_frags; ++ struct mtk_tx_dma *htxd, *txd, *txd_pdma; ++ struct mtk_tx_buf *htx_buf, *tx_buf; ++ void *data = xdpf->data; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ return -EBUSY; ++ ++ nr_frags = unlikely(xdp_frame_has_frags(xdpf)) ? 
sinfo->nr_frags : 0; ++ if (unlikely(atomic_read(&ring->free_count) <= 1 + nr_frags)) ++ return -EBUSY; ++ ++ spin_lock(ð->page_lock); ++ ++ txd = ring->next_free; ++ if (txd == ring->last_free) { ++ spin_unlock(ð->page_lock); ++ return -ENOMEM; ++ } ++ htxd = txd; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ htx_buf = tx_buf; ++ ++ for (;;) { ++ err = mtk_xdp_frame_map(eth, dev, &txd_info, txd, tx_buf, ++ data, xdpf->headroom, index, dma_map); ++ if (err < 0) ++ goto unmap; ++ ++ if (txd_info.last) ++ break; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || (index & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto unmap; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ n_desc++; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = skb_frag_size(&sinfo->frags[index]); ++ txd_info.last = index + 1 == nr_frags; ++ data = skb_frag_address(&sinfo->frags[index]); ++ ++ index++; ++ } ++ /* store xdpf for cleanup */ ++ htx_buf->data = xdpf; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (index & 1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int idx; ++ ++ idx = txd_to_idx(ring, txd, soc->txrx.txd_size); ++ mtk_w32(eth, NEXT_DESP_IDX(idx, ring->dma_size), ++ MT7628_TX_CTX_IDX0); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return 0; ++ ++unmap: ++ while (htxd != txd) { ++ txd_pdma = qdma_to_pdma(ring, htxd); ++ tx_buf = mtk_desc_to_tx_buf(ring, htxd, soc->txrx.txd_size); ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ htxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ txd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ htxd = mtk_qdma_phys_to_virt(ring, htxd->txd2); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return err; ++} ++ ++static int mtk_xdp_xmit(struct net_device *dev, int num_frame, ++ struct xdp_frame **frames, u32 flags) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ int i, nxmit = 0; ++ ++ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) ++ return -EINVAL; ++ ++ for (i = 0; i < num_frame; i++) { ++ if (mtk_xdp_submit_frame(eth, frames[i], dev, true)) ++ break; ++ nxmit++; ++ } ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ hw_stats->xdp_stats.tx_xdp_xmit += nxmit; ++ hw_stats->xdp_stats.tx_xdp_xmit_errors += num_frame - nxmit; ++ u64_stats_update_end(&hw_stats->syncp); ++ ++ return nxmit; ++} ++ ++static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring, ++ struct xdp_buff *xdp, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ u64 *count = &hw_stats->xdp_stats.rx_xdp_drop; ++ struct bpf_prog *prog; ++ u32 act = XDP_PASS; ++ ++ rcu_read_lock(); ++ ++ prog = rcu_dereference(eth->prog); ++ if (!prog) ++ goto out; ++ ++ act = bpf_prog_run_xdp(prog, xdp); ++ switch (act) { ++ case XDP_PASS: ++ count = &hw_stats->xdp_stats.rx_xdp_pass; ++ goto 
update_stats; ++ case XDP_REDIRECT: ++ if (unlikely(xdp_do_redirect(dev, xdp, prog))) { ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_redirect; ++ goto update_stats; ++ case XDP_TX: { ++ struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); ++ ++ if (!xdpf || mtk_xdp_submit_frame(eth, xdpf, dev, false)) { ++ count = &hw_stats->xdp_stats.rx_xdp_tx_errors; ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_tx; ++ goto update_stats; ++ } ++ default: ++ bpf_warn_invalid_xdp_action(dev, prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(dev, prog, act); ++ fallthrough; ++ case XDP_DROP: ++ break; ++ } ++ ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(xdp->data), true); ++ ++update_stats: ++ u64_stats_update_begin(&hw_stats->syncp); ++ *count = *count + 1; ++ u64_stats_update_end(&hw_stats->syncp); ++out: ++ rcu_read_unlock(); ++ ++ return act; ++} ++ ++static int mtk_poll_rx(struct napi_struct *napi, int budget, ++ struct mtk_eth *eth) ++{ ++ struct dim_sample dim_sample = {}; ++ struct mtk_rx_ring *ring; ++ bool xdp_flush = false; ++ int idx; ++ struct sk_buff *skb; ++ u8 *data, *new_data; ++ struct mtk_rx_dma_v2 *rxd, trxd; ++ int done = 0, bytes = 0; ++ ++ while (done < budget) { ++ unsigned int pktlen, *rxdcsum; ++ struct net_device *netdev; ++ dma_addr_t dma_addr; ++ u32 hash, reason; ++ int mac = 0; ++ ++ ring = mtk_get_rx_ring(eth); ++ if (unlikely(!ring)) ++ goto rx_done; ++ ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ data = ring->data[idx]; ++ ++ if (!mtk_rx_get_desc(eth, &trxd, rxd)) ++ break; ++ ++ /* find out which mac the packet come from. values start at 1 */ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mac = RX_DMA_GET_SPORT_V2(trxd.rxd5) - 1; ++ else if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ !(trxd.rxd4 & RX_DMA_SPECIAL_TAG)) ++ mac = RX_DMA_GET_SPORT(trxd.rxd4) - 1; ++ ++ if (unlikely(mac < 0 || mac >= MTK_MAC_COUNT || ++ !eth->netdev[mac])) ++ goto release_desc; ++ ++ netdev = eth->netdev[mac]; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto release_desc; ++ ++ pktlen = RX_DMA_GET_PLEN0(trxd.rxd2); ++ ++ /* alloc new buffer */ ++ if (ring->page_pool) { ++ struct page *page = virt_to_head_page(data); ++ struct xdp_buff xdp; ++ u32 ret; ++ ++ new_data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, ++ GFP_ATOMIC); ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_sync_single_for_cpu(eth->dma_dev, ++ page_pool_get_dma_addr(page) + MTK_PP_HEADROOM, ++ pktlen, page_pool_get_dma_dir(ring->page_pool)); ++ ++ xdp_init_buff(&xdp, PAGE_SIZE, &ring->xdp_q); ++ xdp_prepare_buff(&xdp, data, MTK_PP_HEADROOM, pktlen, ++ false); ++ xdp_buff_clear_frags_flag(&xdp); ++ ++ ret = mtk_xdp_run(eth, ring, &xdp, netdev); ++ if (ret == XDP_REDIRECT) ++ xdp_flush = true; ++ ++ if (ret != XDP_PASS) ++ goto skip_rx; ++ ++ skb = build_skb(data, PAGE_SIZE); ++ if (unlikely(!skb)) { ++ page_pool_put_full_page(ring->page_pool, ++ page, true); ++ netdev->stats.rx_dropped++; ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, xdp.data - xdp.data_hard_start); ++ skb_put(skb, xdp.data_end - xdp.data); ++ skb_mark_for_recycle(skb); ++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ new_data = napi_alloc_frag(ring->frag_size); ++ else ++ new_data = mtk_max_lro_buf_alloc(GFP_ATOMIC); ++ ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto 
release_desc; ++ } ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ new_data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(new_data); ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_unmap_single(eth->dma_dev, trxd.rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ ++ skb = build_skb(data, ring->frag_size); ++ if (unlikely(!skb)) { ++ netdev->stats.rx_dropped++; ++ skb_free_frag(data); ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb_put(skb, pktlen); ++ } ++ ++ skb->dev = netdev; ++ bytes += skb->len; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ hash = trxd.rxd5 & MTK_RXD5_FOE_ENTRY; ++ if (hash != MTK_RXD5_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd3; ++ } else { ++ hash = trxd.rxd4 & MTK_RXD4_FOE_ENTRY; ++ if (hash != MTK_RXD4_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd4; ++ } ++ ++ if (*rxdcsum & eth->soc->txrx.rx_dma_l4_valid) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ reason = FIELD_GET(MTK_RXD4_PPE_CPU_REASON, trxd.rxd4); ++ if (reason == MTK_PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) ++ mtk_ppe_check_skb(eth->ppe, skb, hash); ++ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ if (trxd.rxd3 & RX_DMA_VTAG_V2) ++ __vlan_hwaccel_put_tag(skb, ++ htons(RX_DMA_VPID(trxd.rxd4)), ++ RX_DMA_VID(trxd.rxd4)); ++ } else if (trxd.rxd2 & RX_DMA_VTAG) { ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ RX_DMA_VID(trxd.rxd3)); ++ } ++ ++ /* If the device is attached to a dsa switch, the special ++ * tag inserted in VLAN field by hw switch can * be offloaded ++ * by RX HW VLAN offload. Clear vlan info. 
++ */ ++ if (netdev_uses_dsa(netdev)) ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ skb_record_rx_queue(skb, 0); ++ napi_gro_receive(napi, skb); ++ ++skip_rx: ++ ring->data[idx] = new_data; ++ rxd->rxd1 = (unsigned int)dma_addr; ++release_desc: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ ring->calc_idx = idx; ++ done++; ++ } ++ ++rx_done: ++ if (done) { ++ /* make sure that all changes to the dma ring are flushed before ++ * we continue ++ */ ++ wmb(); ++ mtk_update_rx_cpu_idx(eth); ++ } ++ ++ eth->rx_packets += done; ++ eth->rx_bytes += bytes; ++ dim_update_sample(eth->rx_events, eth->rx_packets, eth->rx_bytes, ++ &dim_sample); ++ net_dim(ð->rx_dim, dim_sample); ++ ++ if (xdp_flush) ++ xdp_do_flush_map(); ++ ++ return done; ++} ++ ++static int mtk_poll_tx_qdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->last_free_ptr; ++ dma = mtk_r32(eth, reg_map->qdma.drx_ptr); ++ ++ desc = mtk_qdma_phys_to_virt(ring, cpu); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ u32 next_cpu = desc->txd2; ++ int mac = 0; ++ ++ desc = mtk_qdma_phys_to_virt(ring, desc->txd2); ++ if ((desc->txd3 & TX_DMA_OWNER_CPU) == 0) ++ break; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, desc, ++ eth->soc->txrx.txd_size); ++ if (tx_buf->flags & MTK_TX_FLAGS_FPORT1) ++ mac = 1; ++ ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[mac] += skb->len; ++ done[mac]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = next_cpu; ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->last_free_ptr = cpu; ++ mtk_w32(eth, cpu, reg_map->qdma.crx_ptr); ++ ++ return budget; ++} ++ ++static int mtk_poll_tx_pdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->cpu_idx; ++ dma = mtk_r32(eth, MT7628_TX_DTX_IDX0); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ tx_buf = &ring->buf[cpu]; ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[0] += skb->len; ++ done[0]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ desc = ring->dma + cpu * eth->soc->txrx.txd_size; ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = NEXT_DESP_IDX(cpu, ring->dma_size); ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->cpu_idx = cpu; ++ ++ return budget; ++} ++ ++static int mtk_poll_tx(struct mtk_eth *eth, int budget) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct dim_sample dim_sample = {}; ++ unsigned int done[MTK_MAX_DEVS]; ++ unsigned int bytes[MTK_MAX_DEVS]; ++ int total = 0, i; ++ ++ memset(done, 0, sizeof(done)); ++ memset(bytes, 0, sizeof(bytes)); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ budget = mtk_poll_tx_qdma(eth, budget, done, bytes); ++ else ++ budget = mtk_poll_tx_pdma(eth, 
budget, done, bytes); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i] || !done[i]) ++ continue; ++ netdev_completed_queue(eth->netdev[i], done[i], bytes[i]); ++ total += done[i]; ++ eth->tx_packets += done[i]; ++ eth->tx_bytes += bytes[i]; ++ } ++ ++ dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes, ++ &dim_sample); ++ net_dim(ð->tx_dim, dim_sample); ++ ++ if (mtk_queue_stopped(eth) && ++ (atomic_read(&ring->free_count) > ring->thresh)) ++ mtk_wake_queue(eth); ++ ++ return total; ++} ++ ++static void mtk_handle_status_irq(struct mtk_eth *eth) ++{ ++ u32 status2 = mtk_r32(eth, MTK_INT_STATUS2); ++ ++ if (unlikely(status2 & (MTK_GDM1_AF | MTK_GDM2_AF))) { ++ mtk_stats_update(eth); ++ mtk_w32(eth, (MTK_GDM1_AF | MTK_GDM2_AF), ++ MTK_INT_STATUS2); ++ } ++} ++ ++static int mtk_napi_tx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, tx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int tx_done = 0; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_handle_status_irq(eth); ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->tx_irq_status); ++ tx_done = mtk_poll_tx(eth, budget); ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done tx %d, intr 0x%08x/0x%x\n", tx_done, ++ mtk_r32(eth, reg_map->tx_irq_status), ++ mtk_r32(eth, reg_map->tx_irq_mask)); ++ } ++ ++ if (tx_done == budget) ++ return budget; ++ ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ return budget; ++ ++ if (napi_complete_done(napi, tx_done)) ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ ++ return tx_done; ++} ++ ++static int mtk_napi_rx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, rx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int rx_done_total = 0; ++ ++ mtk_handle_status_irq(eth); ++ ++ do { ++ int rx_done; ++ ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, ++ reg_map->pdma.irq_status); ++ rx_done = mtk_poll_rx(napi, budget - rx_done_total, eth); ++ rx_done_total += rx_done; ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done rx %d, intr 0x%08x/0x%x\n", rx_done, ++ mtk_r32(eth, reg_map->pdma.irq_status), ++ mtk_r32(eth, reg_map->pdma.irq_mask)); ++ } ++ ++ if (rx_done_total == budget) ++ return budget; ++ ++ } while (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask); ++ ++ if (napi_complete_done(napi, rx_done_total)) ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ ++ return rx_done_total; ++} ++ ++static int mtk_tx_alloc(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i, sz = soc->txrx.txd_size; ++ struct mtk_tx_dma_v2 *txd; ++ ++ ring->buf = kcalloc(MTK_DMA_SIZE, sizeof(*ring->buf), ++ GFP_KERNEL); ++ if (!ring->buf) ++ goto no_tx_mem; ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ int next = (i + 1) % MTK_DMA_SIZE; ++ u32 next_ptr = ring->phys + next * sz; ++ ++ txd = ring->dma + i * sz; ++ txd->txd2 = next_ptr; ++ txd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ /* On MT7688 (PDMA only) this driver uses the ring->dma structs ++ * only as the framework. 
The real HW descriptors are the PDMA ++ * descriptors in ring->dma_pdma. ++ */ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys_pdma, GFP_KERNEL); ++ if (!ring->dma_pdma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ ring->dma_pdma[i].txd2 = TX_DMA_DESP2_DEF; ++ ring->dma_pdma[i].txd4 = 0; ++ } ++ } ++ ++ ring->dma_size = MTK_DMA_SIZE; ++ atomic_set(&ring->free_count, MTK_DMA_SIZE - 2); ++ ring->next_free = ring->dma; ++ ring->last_free = (void *)txd; ++ ring->last_free_ptr = (u32)(ring->phys + ((MTK_DMA_SIZE - 1) * sz)); ++ ring->thresh = MAX_SKB_FRAGS; ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.ctx_ptr); ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.dtx_ptr); ++ mtk_w32(eth, ++ ring->phys + ((MTK_DMA_SIZE - 1) * sz), ++ soc->reg_map->qdma.crx_ptr); ++ mtk_w32(eth, ring->last_free_ptr, soc->reg_map->qdma.drx_ptr); ++ mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES, ++ soc->reg_map->qdma.qtx_cfg); ++ } else { ++ mtk_w32(eth, ring->phys_pdma, MT7628_TX_BASE_PTR0); ++ mtk_w32(eth, MTK_DMA_SIZE, MT7628_TX_MAX_CNT0); ++ mtk_w32(eth, 0, MT7628_TX_CTX_IDX0); ++ mtk_w32(eth, MT7628_PST_DTX_IDX0, soc->reg_map->pdma.rst_idx); ++ } ++ ++ return 0; ++ ++no_tx_mem: ++ return -ENOMEM; ++} ++ ++static void mtk_tx_clean(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i; ++ ++ if (ring->buf) { ++ for (i = 0; i < MTK_DMA_SIZE; i++) ++ mtk_tx_unmap(eth, &ring->buf[i], NULL, false); ++ kfree(ring->buf); ++ ring->buf = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->dma_pdma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma_pdma, ring->phys_pdma); ++ ring->dma_pdma = NULL; ++ } ++} ++ ++static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_rx_ring *ring; ++ int rx_data_len, rx_dma_size; ++ int i; ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ if (ring_no) ++ return -EINVAL; ++ ring = ð->rx_ring_qdma; ++ } else { ++ ring = ð->rx_ring[ring_no]; ++ } ++ ++ if (rx_flag == MTK_RX_FLAGS_HWLRO) { ++ rx_data_len = MTK_MAX_LRO_RX_LENGTH; ++ rx_dma_size = MTK_HW_LRO_DMA_SIZE; ++ } else { ++ rx_data_len = ETH_DATA_LEN; ++ rx_dma_size = MTK_DMA_SIZE; ++ } ++ ++ ring->frag_size = mtk_max_frag_size(rx_data_len); ++ ring->buf_size = mtk_max_buf_size(ring->frag_size); ++ ring->data = kcalloc(rx_dma_size, sizeof(*ring->data), ++ GFP_KERNEL); ++ if (!ring->data) ++ return -ENOMEM; ++ ++ if (mtk_page_pool_enabled(eth)) { ++ struct page_pool *pp; ++ ++ pp = mtk_create_page_pool(eth, &ring->xdp_q, ring_no, ++ rx_dma_size); ++ if (IS_ERR(pp)) ++ return PTR_ERR(pp); ++ ++ ring->page_pool = pp; ++ } ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, ++ rx_dma_size * eth->soc->txrx.rxd_size, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ return -ENOMEM; ++ ++ for (i = 0; i < rx_dma_size; i++) { ++ struct mtk_rx_dma_v2 *rxd; ++ dma_addr_t dma_addr; ++ void *data; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (ring->page_pool) { ++ data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, GFP_KERNEL); ++ if (!data) ++ return -ENOMEM; 
++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ data = netdev_alloc_frag(ring->frag_size); ++ else ++ data = mtk_max_lro_buf_alloc(GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(data); ++ return -ENOMEM; ++ } ++ } ++ rxd->rxd1 = (unsigned int)dma_addr; ++ ring->data[i] = data; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ rxd->rxd3 = 0; ++ rxd->rxd4 = 0; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = 0; ++ rxd->rxd6 = 0; ++ rxd->rxd7 = 0; ++ rxd->rxd8 = 0; ++ } ++ } ++ ++ ring->dma_size = rx_dma_size; ++ ring->calc_idx_update = false; ++ ring->calc_idx = rx_dma_size - 1; ++ if (rx_flag == MTK_RX_FLAGS_QDMA) ++ ring->crx_idx_reg = reg_map->qdma.qcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ else ++ ring->crx_idx_reg = reg_map->pdma.pcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ mtk_w32(eth, ring->phys, ++ reg_map->qdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->qdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->qdma.rst_idx); ++ } else { ++ mtk_w32(eth, ring->phys, ++ reg_map->pdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->pdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->pdma.rst_idx); ++ } ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ ++ return 0; ++} ++ ++static void mtk_rx_clean(struct mtk_eth *eth, struct mtk_rx_ring *ring) ++{ ++ int i; ++ ++ if (ring->data && ring->dma) { ++ for (i = 0; i < ring->dma_size; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ if (!ring->data[i]) ++ continue; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (!rxd->rxd1) ++ continue; ++ ++ dma_unmap_single(eth->dma_dev, rxd->rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ mtk_rx_put_buff(ring, ring->data[i], false); ++ } ++ kfree(ring->data); ++ ring->data = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ ring->dma_size * eth->soc->txrx.rxd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->page_pool) { ++ if (xdp_rxq_info_is_reg(&ring->xdp_q)) ++ xdp_rxq_info_unreg(&ring->xdp_q); ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int mtk_hwlro_rx_init(struct mtk_eth *eth) ++{ ++ int i; ++ u32 ring_ctrl_dw1 = 0, ring_ctrl_dw2 = 0, ring_ctrl_dw3 = 0; ++ u32 lro_ctrl_dw0 = 0, lro_ctrl_dw3 = 0; ++ ++ /* set LRO rings to auto-learn modes */ ++ ring_ctrl_dw2 |= MTK_RING_AUTO_LERAN_MODE; ++ ++ /* validate LRO ring */ ++ ring_ctrl_dw2 |= MTK_RING_VLD; ++ ++ /* set AGE timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_AGE_TIME_H; ++ ring_ctrl_dw1 |= MTK_RING_AGE_TIME_L; ++ ++ /* set max AGG timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_TIME; ++ ++ /* set max LRO AGG count */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_CNT_L; ++ ring_ctrl_dw3 |= MTK_RING_MAX_AGG_CNT_H; ++ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ mtk_w32(eth, ring_ctrl_dw1, MTK_LRO_CTRL_DW1_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw2, MTK_LRO_CTRL_DW2_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw3, MTK_LRO_CTRL_DW3_CFG(i)); ++ 
} ++ ++ /* IPv4 checksum update enable */ ++ lro_ctrl_dw0 |= MTK_L3_CKS_UPD_EN; ++ ++ /* switch priority comparison to packet count mode */ ++ lro_ctrl_dw0 |= MTK_LRO_ALT_PKT_CNT_MODE; ++ ++ /* bandwidth threshold setting */ ++ mtk_w32(eth, MTK_HW_LRO_BW_THRE, MTK_PDMA_LRO_CTRL_DW2); ++ ++ /* auto-learn score delta setting */ ++ mtk_w32(eth, MTK_HW_LRO_REPLACE_DELTA, MTK_PDMA_LRO_ALT_SCORE_DELTA); ++ ++ /* set refresh timer for altering flows to 1 sec. (unit: 20us) */ ++ mtk_w32(eth, (MTK_HW_LRO_TIMER_UNIT << 16) | MTK_HW_LRO_REFRESH_TIME, ++ MTK_PDMA_LRO_ALT_REFRESH_TIMER); ++ ++ /* set HW LRO mode & the max aggregation count for rx packets */ ++ lro_ctrl_dw3 |= MTK_ADMA_MODE | (MTK_HW_LRO_MAX_AGG_CNT & 0xff); ++ ++ /* the minimal remaining room of SDL0 in RXD for lro aggregation */ ++ lro_ctrl_dw3 |= MTK_LRO_MIN_RXD_SDL; ++ ++ /* enable HW LRO */ ++ lro_ctrl_dw0 |= MTK_LRO_EN; ++ ++ mtk_w32(eth, lro_ctrl_dw3, MTK_PDMA_LRO_CTRL_DW3); ++ mtk_w32(eth, lro_ctrl_dw0, MTK_PDMA_LRO_CTRL_DW0); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_rx_uninit(struct mtk_eth *eth) ++{ ++ int i; ++ u32 val; ++ ++ /* relinquish lro rings, flush aggregated packets */ ++ mtk_w32(eth, MTK_LRO_RING_RELINQUISH_REQ, MTK_PDMA_LRO_CTRL_DW0); ++ ++ /* wait for relinquishments done */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, MTK_PDMA_LRO_CTRL_DW0); ++ if (val & MTK_LRO_RING_RELINQUISH_DONE) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++ ++ /* invalidate lro rings */ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_w32(eth, 0, MTK_LRO_CTRL_DW2_CFG(i)); ++ ++ /* disable HW LRO */ ++ mtk_w32(eth, 0, MTK_PDMA_LRO_CTRL_DW0); ++} ++ ++static void mtk_hwlro_val_ipaddr(struct mtk_eth *eth, int idx, __be32 ip) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, ip, MTK_LRO_DIP_DW0_CFG(idx)); ++ ++ /* validate the IP setting */ ++ mtk_w32(eth, (reg_val | MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++} ++ ++static void mtk_hwlro_inval_ipaddr(struct mtk_eth *eth, int idx) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, 0, MTK_LRO_DIP_DW0_CFG(idx)); ++} ++ ++static int mtk_hwlro_get_ip_cnt(struct mtk_mac *mac) ++{ ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) ++ cnt++; ++ } ++ ++ return cnt; ++} ++ ++static int mtk_hwlro_add_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if ((fsp->flow_type != TCP_V4_FLOW) || ++ (!fsp->h_u.tcp_ip4_spec.ip4dst) || ++ (fsp->location > 1)) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = htonl(fsp->h_u.tcp_ip4_spec.ip4dst); ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_val_ipaddr(eth, hwlro_idx, mac->hwlro_ip[fsp->location]); ++ ++ return 0; ++} ++ ++static int mtk_hwlro_del_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if 
(fsp->location > 1) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_netdev_disable(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int i, hwlro_idx; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ mac->hwlro_ip[i] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + i; ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ } ++ ++ mac->hwlro_ip_cnt = 0; ++} ++ ++static int mtk_hwlro_get_fdir_entry(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ ++ if (fsp->location >= ARRAY_SIZE(mac->hwlro_ip)) ++ return -EINVAL; ++ ++ /* only tcp dst ipv4 is meaningful, others are meaningless */ ++ fsp->flow_type = TCP_V4_FLOW; ++ fsp->h_u.tcp_ip4_spec.ip4dst = ntohl(mac->hwlro_ip[fsp->location]); ++ fsp->m_u.tcp_ip4_spec.ip4dst = 0; ++ ++ fsp->h_u.tcp_ip4_spec.ip4src = 0; ++ fsp->m_u.tcp_ip4_spec.ip4src = 0xffffffff; ++ fsp->h_u.tcp_ip4_spec.psrc = 0; ++ fsp->m_u.tcp_ip4_spec.psrc = 0xffff; ++ fsp->h_u.tcp_ip4_spec.pdst = 0; ++ fsp->m_u.tcp_ip4_spec.pdst = 0xffff; ++ fsp->h_u.tcp_ip4_spec.tos = 0; ++ fsp->m_u.tcp_ip4_spec.tos = 0xff; ++ ++ return 0; ++} ++ ++static int mtk_hwlro_get_fdir_all(struct net_device *dev, ++ struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) { ++ rule_locs[cnt] = i; ++ cnt++; ++ } ++ } ++ ++ cmd->rule_cnt = cnt; ++ ++ return 0; ++} ++ ++static netdev_features_t mtk_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ if (!(features & NETIF_F_LRO)) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ int ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ if (ip_cnt) { ++ netdev_info(dev, "RX flow is programmed, LRO should keep on\n"); ++ ++ features |= NETIF_F_LRO; ++ } ++ } ++ ++ return features; ++} ++ ++static int mtk_set_features(struct net_device *dev, netdev_features_t features) ++{ ++ int err = 0; ++ ++ if (!((dev->features ^ features) & NETIF_F_LRO)) ++ return 0; ++ ++ if (!(features & NETIF_F_LRO)) ++ mtk_hwlro_netdev_disable(dev); ++ ++ return err; ++} ++ ++/* wait for DMA to finish whatever it is doing before we start using it again */ ++static int mtk_dma_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned int reg; ++ int ret; ++ u32 val; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ reg = eth->soc->reg_map->qdma.glo_cfg; ++ else ++ reg = eth->soc->reg_map->pdma.glo_cfg; ++ ++ ret = readx_poll_timeout_atomic(__raw_readl, eth->base + reg, val, ++ !(val & (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)), ++ 5, MTK_DMA_BUSY_TIMEOUT_US); ++ if (ret) ++ dev_err(eth->dev, "DMA init timeout\n"); ++ ++ return ret; ++} ++ ++static int mtk_dma_init(struct mtk_eth *eth) ++{ ++ int err; ++ u32 i; ++ ++ if (mtk_dma_busy_wait(eth)) ++ return -EBUSY; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* QDMA needs scratch memory for internal reordering of the ++ * descriptors ++ */ ++ err = mtk_init_fq_dma(eth); ++ if (err) ++ return err; ++ } ++ ++ err = mtk_tx_alloc(eth); ++ if (err) ++ return err; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_QDMA); ++ if (err) ++ return err; ++ } 
++ ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_NORMAL); ++ if (err) ++ return err; ++ ++ if (eth->hwlro) { ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ err = mtk_rx_alloc(eth, i, MTK_RX_FLAGS_HWLRO); ++ if (err) ++ return err; ++ } ++ err = mtk_hwlro_rx_init(eth); ++ if (err) ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* Enable random early drop and set drop threshold ++ * automatically ++ */ ++ mtk_w32(eth, FC_THRES_DROP_MODE | FC_THRES_DROP_EN | ++ FC_THRES_MIN, eth->soc->reg_map->qdma.fc_th); ++ mtk_w32(eth, 0x0, eth->soc->reg_map->qdma.hred); ++ } ++ ++ return 0; ++} ++ ++static void mtk_dma_free(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ if (eth->netdev[i]) ++ netdev_reset_queue(eth->netdev[i]); ++ if (eth->scratch_ring) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ eth->scratch_ring, eth->phy_scratch_ring); ++ eth->scratch_ring = NULL; ++ eth->phy_scratch_ring = 0; ++ } ++ mtk_tx_clean(eth); ++ mtk_rx_clean(eth, ð->rx_ring[0]); ++ mtk_rx_clean(eth, ð->rx_ring_qdma); ++ ++ if (eth->hwlro) { ++ mtk_hwlro_rx_uninit(eth); ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_rx_clean(eth, ð->rx_ring[i]); ++ } ++ ++ kfree(eth->scratch_head); ++} ++ ++static void mtk_tx_timeout(struct net_device *dev, unsigned int txqueue) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ eth->netdev[mac->id]->stats.tx_errors++; ++ netif_err(eth, tx_err, dev, ++ "transmit timed out\n"); ++ schedule_work(ð->pending_work); ++} ++ ++static irqreturn_t mtk_handle_irq_rx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->rx_events++; ++ if (likely(napi_schedule_prep(ð->rx_napi))) { ++ __napi_schedule(ð->rx_napi); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq_tx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->tx_events++; ++ if (likely(napi_schedule_prep(ð->tx_napi))) { ++ __napi_schedule(ð->tx_napi); ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ ++ if (mtk_r32(eth, reg_map->pdma.irq_mask) & ++ eth->soc->txrx.rx_irq_done_mask) { ++ if (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask) ++ mtk_handle_irq_rx(irq, _eth); ++ } ++ if (mtk_r32(eth, reg_map->tx_irq_mask) & MTK_TX_DONE_INT) { ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ mtk_handle_irq_tx(irq, _eth); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++#ifdef CONFIG_NET_POLL_CONTROLLER ++static void mtk_poll_controller(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ mtk_handle_irq_rx(eth->irq[2], dev); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++} ++#endif ++ ++static int mtk_start_dma(struct mtk_eth *eth) ++{ ++ u32 val, rx_2b_offset = (NET_IP_ALIGN == 2) ? 
MTK_RX_2B_OFFSET : 0; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int err; ++ ++ err = mtk_dma_init(eth); ++ if (err) { ++ mtk_dma_free(eth); ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ val = mtk_r32(eth, reg_map->qdma.glo_cfg); ++ val |= MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_TX_BT_32DWORDS | MTK_NDP_CO_PRO | ++ MTK_RX_2B_OFFSET | MTK_TX_WB_DDONE; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ val |= MTK_MUTLI_CNT | MTK_RESV_BUF | ++ MTK_WCOMP_EN | MTK_DMAD_WR_WDONE | ++ MTK_CHK_DDONE_EN; ++ else ++ val |= MTK_RX_BT_32DWORDS; ++ mtk_w32(eth, val, reg_map->qdma.glo_cfg); ++ ++ mtk_w32(eth, ++ MTK_RX_DMA_EN | rx_2b_offset | ++ MTK_RX_BT_32DWORDS | MTK_MULTI_EN, ++ reg_map->pdma.glo_cfg); ++ } else { ++ mtk_w32(eth, MTK_TX_WB_DDONE | MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_MULTI_EN | MTK_PDMA_SIZE_8DWORDS, ++ reg_map->pdma.glo_cfg); ++ } ++ ++ return 0; ++} ++ ++static void mtk_gdm_config(struct mtk_eth *eth, u32 config) ++{ ++ int i; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ return; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ u32 val = mtk_r32(eth, MTK_GDMA_FWD_CFG(i)); ++ ++ /* default setup the forward port to send frame to PDMA */ ++ val &= ~0xffff; ++ ++ /* Enable RX checksum */ ++ val |= MTK_GDMA_ICS_EN | MTK_GDMA_TCS_EN | MTK_GDMA_UCS_EN; ++ ++ val |= config; ++ ++ if (!i && eth->netdev[0] && netdev_uses_dsa(eth->netdev[0])) ++ val |= MTK_GDMA_SPECIAL_TAG; ++ ++ mtk_w32(eth, val, MTK_GDMA_FWD_CFG(i)); ++ } ++ /* Reset and enable PSE */ ++ mtk_w32(eth, RST_GL_PSE, MTK_RST_GL); ++ mtk_w32(eth, 0, MTK_RST_GL); ++} ++ ++static int mtk_open(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int err; ++ ++ err = phylink_of_phy_connect(mac->phylink, mac->of_node, 0); ++ if (err) { ++ netdev_err(dev, "%s: could not attach PHY: %d\n", __func__, ++ err); ++ return err; ++ } ++ ++ /* we run 2 netdevs on the same dma ring so we only bring it up once */ ++ if (!refcount_read(ð->dma_refcnt)) { ++ u32 gdm_config = MTK_GDMA_TO_PDMA; ++ ++ err = mtk_start_dma(eth); ++ if (err) { ++ phylink_disconnect_phy(mac->phylink); ++ return err; ++ } ++ ++ if (eth->soc->offload_version && mtk_ppe_start(eth->ppe) == 0) ++ gdm_config = MTK_GDMA_TO_PPE; ++ ++ mtk_gdm_config(eth, gdm_config); ++ ++ napi_enable(ð->tx_napi); ++ napi_enable(ð->rx_napi); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ refcount_set(ð->dma_refcnt, 1); ++ } ++ else ++ refcount_inc(ð->dma_refcnt); ++ ++ phylink_start(mac->phylink); ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static void mtk_stop_dma(struct mtk_eth *eth, u32 glo_cfg) ++{ ++ u32 val; ++ int i; ++ ++ /* stop the dma engine */ ++ spin_lock_bh(ð->page_lock); ++ val = mtk_r32(eth, glo_cfg); ++ mtk_w32(eth, val & ~(MTK_TX_WB_DDONE | MTK_RX_DMA_EN | MTK_TX_DMA_EN), ++ glo_cfg); ++ spin_unlock_bh(ð->page_lock); ++ ++ /* wait for dma stop */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, glo_cfg); ++ if (val & (MTK_TX_DMA_BUSY | MTK_RX_DMA_BUSY)) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++} ++ ++static int mtk_stop(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ phylink_stop(mac->phylink); ++ ++ netif_tx_disable(dev); ++ ++ phylink_disconnect_phy(mac->phylink); ++ ++ /* only shutdown DMA if this is the last user */ ++ if (!refcount_dec_and_test(ð->dma_refcnt)) ++ return 0; ++ ++ mtk_gdm_config(eth, 
MTK_GDMA_DROP_ALL); ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ napi_disable(ð->tx_napi); ++ napi_disable(ð->rx_napi); ++ ++ cancel_work_sync(ð->rx_dim.work); ++ cancel_work_sync(ð->tx_dim.work); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_stop_dma(eth, eth->soc->reg_map->qdma.glo_cfg); ++ mtk_stop_dma(eth, eth->soc->reg_map->pdma.glo_cfg); ++ ++ mtk_dma_free(eth); ++ ++ if (eth->soc->offload_version) ++ mtk_ppe_stop(eth->ppe); ++ ++ return 0; ++} ++ ++static int mtk_xdp_setup(struct net_device *dev, struct bpf_prog *prog, ++ struct netlink_ext_ack *extack) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct bpf_prog *old_prog; ++ bool need_update; ++ ++ if (eth->hwlro) { ++ NL_SET_ERR_MSG_MOD(extack, "XDP not supported with HWLRO"); ++ return -EOPNOTSUPP; ++ } ++ ++ if (dev->mtu > MTK_PP_MAX_BUF_SIZE) { ++ NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP"); ++ return -EOPNOTSUPP; ++ } ++ ++ need_update = !!eth->prog != !!prog; ++ if (netif_running(dev) && need_update) ++ mtk_stop(dev); ++ ++ old_prog = rcu_replace_pointer(eth->prog, prog, lockdep_rtnl_is_held()); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ ++ if (netif_running(dev) && need_update) ++ return mtk_open(dev); ++ ++ return 0; ++} ++ ++static int mtk_xdp(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ return mtk_xdp_setup(dev, xdp->prog, xdp->extack); ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void ethsys_reset(struct mtk_eth *eth, u32 reset_bits) ++{ ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ reset_bits); ++ ++ usleep_range(1000, 1100); ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ ~reset_bits); ++ mdelay(10); ++} ++ ++static void mtk_clk_disable(struct mtk_eth *eth) ++{ ++ int clk; ++ ++ for (clk = MTK_CLK_MAX - 1; clk >= 0; clk--) ++ clk_disable_unprepare(eth->clks[clk]); ++} ++ ++static int mtk_clk_enable(struct mtk_eth *eth) ++{ ++ int clk, ret; ++ ++ for (clk = 0; clk < MTK_CLK_MAX ; clk++) { ++ ret = clk_prepare_enable(eth->clks[clk]); ++ if (ret) ++ goto err_disable_clks; ++ } ++ ++ return 0; ++ ++err_disable_clks: ++ while (--clk >= 0) ++ clk_disable_unprepare(eth->clks[clk]); ++ ++ return ret; ++} ++ ++static void mtk_dim_rx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, rx_dim); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_rx_moderation(eth->rx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_TX_MASK; ++ val |= MTK_PDMA_DELAY_RX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void mtk_dim_tx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, tx_dim); ++ const struct 
mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_tx_moderation(eth->tx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_RX_MASK; ++ val |= MTK_PDMA_DELAY_TX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static int mtk_hw_init(struct mtk_eth *eth) ++{ ++ u32 dma_mask = ETHSYS_DMA_AG_MAP_PDMA | ETHSYS_DMA_AG_MAP_QDMA | ++ ETHSYS_DMA_AG_MAP_PPE; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int i, val, ret; ++ ++ if (test_and_set_bit(MTK_HW_INIT, ð->state)) ++ return 0; ++ ++ pm_runtime_enable(eth->dev); ++ pm_runtime_get_sync(eth->dev); ++ ++ ret = mtk_clk_enable(eth); ++ if (ret) ++ goto err_disable_pm; ++ ++ if (eth->ethsys) ++ regmap_update_bits(eth->ethsys, ETHSYS_DMA_AG_MAP, dma_mask, ++ of_dma_is_coherent(eth->dma_dev->of_node) * dma_mask); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ ret = device_reset(eth->dev); ++ if (ret) { ++ dev_err(eth->dev, "MAC reset failed!\n"); ++ goto err_disable_pm; ++ } ++ ++ /* set interrupt delays based on current Net DIM sample */ ++ mtk_dim_rx(ð->rx_dim.work); ++ mtk_dim_tx(ð->tx_dim.work); ++ ++ /* disable delay and normal interrupt */ ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++ ++ return 0; ++ } ++ ++ val = RSTCTRL_FE | RSTCTRL_PPE; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, 0); ++ ++ val |= RSTCTRL_ETH; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RSTCTRL_PPE1)) ++ val |= RSTCTRL_PPE1; ++ } ++ ++ ethsys_reset(eth, val); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, ++ 0x3ffffff); ++ ++ /* Set FE to PDMAv2 if necessary */ ++ val = mtk_r32(eth, MTK_FE_GLO_MISC); ++ mtk_w32(eth, val | BIT(4), MTK_FE_GLO_MISC); ++ } ++ ++ if (eth->pctl) { ++ /* Set GE2 driving and slew rate */ ++ regmap_write(eth->pctl, GPIO_DRV_SEL10, 0xa00); ++ ++ /* set GE2 TDSEL */ ++ regmap_write(eth->pctl, GPIO_OD33_CTRL8, 0x5); ++ ++ /* set GE2 TUNE */ ++ regmap_write(eth->pctl, GPIO_BIAS_CTRL, 0x0); ++ } ++ ++ /* Set linkdown as the default for each GMAC. Its own MCR would be set ++ * up with the more appropriate value when mtk_mac_config call is being ++ * invoked. ++ */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ mtk_w32(eth, MAC_MCR_FORCE_LINK_DOWN, MTK_MAC_MCR(i)); ++ ++ /* Indicates CDM to parse the MTK special tag from CPU ++ * which also is working out for untag packets. 
++	 */
++	val = mtk_r32(eth, MTK_CDMQ_IG_CTRL);
++	mtk_w32(eth, val | MTK_CDMQ_STAG_EN, MTK_CDMQ_IG_CTRL);
++
++	/* Enable RX VLan Offloading */
++	mtk_w32(eth, 1, MTK_CDMP_EG_CTRL);
++
++	/* set interrupt delays based on current Net DIM sample */
++	mtk_dim_rx(&eth->rx_dim.work);
++	mtk_dim_tx(&eth->tx_dim.work);
++
++	/* disable delay and normal interrupt */
++	mtk_tx_irq_disable(eth, ~0);
++	mtk_rx_irq_disable(eth, ~0);
++
++	/* FE int grouping */
++	mtk_w32(eth, MTK_TX_DONE_INT, reg_map->pdma.int_grp);
++	mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->pdma.int_grp + 4);
++	mtk_w32(eth, MTK_TX_DONE_INT, reg_map->qdma.int_grp);
++	mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->qdma.int_grp + 4);
++	mtk_w32(eth, 0x21021000, MTK_FE_INT_GRP);
++
++	if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) {
++		/* PSE should not drop port8 and port9 packets */
++		mtk_w32(eth, 0x00000300, PSE_DROP_CFG);
++
++		/* PSE Free Queue Flow Control */
++		mtk_w32(eth, 0x01fa01f4, PSE_FQFC_CFG2);
++
++		/* PSE config input queue threshold */
++		mtk_w32(eth, 0x001a000e, PSE_IQ_REV(1));
++		mtk_w32(eth, 0x01ff001a, PSE_IQ_REV(2));
++		mtk_w32(eth, 0x000e01ff, PSE_IQ_REV(3));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(4));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(5));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(6));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(7));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(8));
++
++		/* PSE config output queue threshold */
++		mtk_w32(eth, 0x000f000a, PSE_OQ_TH(1));
++		mtk_w32(eth, 0x001a000f, PSE_OQ_TH(2));
++		mtk_w32(eth, 0x000f001a, PSE_OQ_TH(3));
++		mtk_w32(eth, 0x01ff000f, PSE_OQ_TH(4));
++		mtk_w32(eth, 0x000f000f, PSE_OQ_TH(5));
++		mtk_w32(eth, 0x0006000f, PSE_OQ_TH(6));
++		mtk_w32(eth, 0x00060006, PSE_OQ_TH(7));
++		mtk_w32(eth, 0x00060006, PSE_OQ_TH(8));
++
++		/* GDM and CDM Threshold */
++		mtk_w32(eth, 0x00000004, MTK_GDM2_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDMW0_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDMW1_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDME0_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDME1_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDMM_THRES);
++	}
++
++	return 0;
++
++err_disable_pm:
++	pm_runtime_put_sync(eth->dev);
++	pm_runtime_disable(eth->dev);
++
++	return ret;
++}
++
++static int mtk_hw_deinit(struct mtk_eth *eth)
++{
++	if (!test_and_clear_bit(MTK_HW_INIT, &eth->state))
++		return 0;
++
++	mtk_clk_disable(eth);
++
++	pm_runtime_put_sync(eth->dev);
++	pm_runtime_disable(eth->dev);
++
++	return 0;
++}
++
++static int __init mtk_init(struct net_device *dev)
++{
++	struct mtk_mac *mac = netdev_priv(dev);
++	struct mtk_eth *eth = mac->hw;
++	int ret;
++
++	ret = of_get_ethdev_address(mac->of_node, dev);
++	if (ret) {
++		/* If the mac address is invalid, use random mac address */
++		eth_hw_addr_random(dev);
++		dev_err(eth->dev, "generated random MAC address %pM\n",
++			dev->dev_addr);
++	}
++
++	return 0;
++}
++
++static void mtk_uninit(struct net_device *dev)
++{
++	struct mtk_mac *mac = netdev_priv(dev);
++	struct mtk_eth *eth = mac->hw;
++
++	phylink_disconnect_phy(mac->phylink);
++	mtk_tx_irq_disable(eth, ~0);
++	mtk_rx_irq_disable(eth, ~0);
++}
++
++static int mtk_change_mtu(struct net_device *dev, int new_mtu)
++{
++	int length = new_mtu + MTK_RX_ETH_HLEN;
++	struct mtk_mac *mac = netdev_priv(dev);
++	struct mtk_eth *eth = mac->hw;
++	u32 mcr_cur, mcr_new;
++
++	if (rcu_access_pointer(eth->prog) &&
++	    length > MTK_PP_MAX_BUF_SIZE) {
++		netdev_err(dev, "Invalid MTU for XDP mode\n");
++		return -EINVAL;
++	}
++
++	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
++
mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur & ~MAC_MCR_MAX_RX_MASK; ++ ++ if (length <= 1518) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1518); ++ else if (length <= 1536) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1536); ++ else if (length <= 1552) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1552); ++ else ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_2048); ++ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ } ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static int mtk_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ switch (cmd) { ++ case SIOCGMIIPHY: ++ case SIOCGMIIREG: ++ case SIOCSMIIREG: ++ return phylink_mii_ioctl(mac->phylink, ifr, cmd); ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static void mtk_pending_work(struct work_struct *work) ++{ ++ struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work); ++ int err, i; ++ unsigned long restart = 0; ++ ++ rtnl_lock(); ++ ++ dev_dbg(eth->dev, "[%s][%d] reset\n", __func__, __LINE__); ++ ++ while (test_and_set_bit_lock(MTK_RESETTING, ð->state)) ++ cpu_relax(); ++ ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop starts\n", __func__, __LINE__); ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ __set_bit(i, &restart); ++ } ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop ends\n", __func__, __LINE__); ++ ++ /* restart underlying hardware such as power, clock, pin mux ++ * and the connected phy ++ */ ++ mtk_hw_deinit(eth); ++ ++ if (eth->dev->pins) ++ pinctrl_select_state(eth->dev->pins->p, ++ eth->dev->pins->default_state); ++ mtk_hw_init(eth); ++ ++ /* restart DMA and enable IRQs */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!test_bit(i, &restart)) ++ continue; ++ err = mtk_open(eth->netdev[i]); ++ if (err) { ++ netif_alert(eth, ifup, eth->netdev[i], ++ "Driver up/down cycle failed, closing device.\n"); ++ dev_close(eth->netdev[i]); ++ } ++ } ++ ++ dev_dbg(eth->dev, "[%s][%d] reset done\n", __func__, __LINE__); ++ ++ clear_bit_unlock(MTK_RESETTING, ð->state); ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_free_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ free_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_unreg_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ unregister_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_cleanup(struct mtk_eth *eth) ++{ ++ mtk_unreg_dev(eth); ++ mtk_free_dev(eth); ++ cancel_work_sync(ð->pending_work); ++ ++ return 0; ++} ++ ++static int mtk_get_link_ksettings(struct net_device *ndev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_get(mac->phylink, cmd); ++} ++ ++static int mtk_set_link_ksettings(struct net_device *ndev, ++ const struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_set(mac->phylink, cmd); ++} ++ ++static void mtk_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ 
++ strlcpy(info->driver, mac->hw->dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(mac->hw->dev), sizeof(info->bus_info)); ++ info->n_stats = ARRAY_SIZE(mtk_ethtool_stats); ++} ++ ++static u32 mtk_get_msglevel(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ return mac->hw->msg_enable; ++} ++ ++static void mtk_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ mac->hw->msg_enable = value; ++} ++ ++static int mtk_nway_reset(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ if (!mac->phylink) ++ return -ENOTSUPP; ++ ++ return phylink_ethtool_nway_reset(mac->phylink); ++} ++ ++static void mtk_get_strings(struct net_device *dev, u32 stringset, u8 *data) ++{ ++ int i; ++ ++ switch (stringset) { ++ case ETH_SS_STATS: { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) { ++ memcpy(data, mtk_ethtool_stats[i].str, ETH_GSTRING_LEN); ++ data += ETH_GSTRING_LEN; ++ } ++ if (mtk_page_pool_enabled(mac->hw)) ++ page_pool_ethtool_stats_get_strings(data); ++ break; ++ } ++ default: ++ break; ++ } ++} ++ ++static int mtk_get_sset_count(struct net_device *dev, int sset) ++{ ++ switch (sset) { ++ case ETH_SS_STATS: { ++ int count = ARRAY_SIZE(mtk_ethtool_stats); ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (mtk_page_pool_enabled(mac->hw)) ++ count += page_pool_ethtool_stats_get_count(); ++ return count; ++ } ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static void mtk_ethtool_pp_stats(struct mtk_eth *eth, u64 *data) ++{ ++ struct page_pool_stats stats = {}; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(eth->rx_ring); i++) { ++ struct mtk_rx_ring *ring = ð->rx_ring[i]; ++ ++ if (!ring->page_pool) ++ continue; ++ ++ page_pool_get_stats(ring->page_pool, &stats); ++ } ++ page_pool_ethtool_stats_get(data, &stats); ++} ++ ++static void mtk_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, u64 *data) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hwstats = mac->hw_stats; ++ u64 *data_src, *data_dst; ++ unsigned int start; ++ int i; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hwstats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hwstats->stats_lock); ++ } ++ } ++ ++ data_src = (u64 *)hwstats; ++ ++ do { ++ data_dst = data; ++ start = u64_stats_fetch_begin_irq(&hwstats->syncp); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) ++ *data_dst++ = *(data_src + mtk_ethtool_stats[i].offset); ++ if (mtk_page_pool_enabled(mac->hw)) ++ mtk_ethtool_pp_stats(mac->hw, data_dst); ++ } while (u64_stats_fetch_retry_irq(&hwstats->syncp, start)); ++} ++ ++static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_GRXRINGS: ++ if (dev->hw_features & NETIF_F_LRO) { ++ cmd->data = MTK_MAX_RX_RING_NUM; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRLCNT: ++ if (dev->hw_features & NETIF_F_LRO) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ cmd->rule_cnt = mac->hwlro_ip_cnt; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRULE: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_entry(dev, cmd); ++ break; ++ case ETHTOOL_GRXCLSRLALL: ++ if (dev->hw_features & 
NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_all(dev, cmd, ++ rule_locs); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int mtk_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_SRXCLSRLINS: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_add_ipaddr(dev, cmd); ++ break; ++ case ETHTOOL_SRXCLSRLDEL: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_del_ipaddr(dev, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static const struct ethtool_ops mtk_ethtool_ops = { ++ .get_link_ksettings = mtk_get_link_ksettings, ++ .set_link_ksettings = mtk_set_link_ksettings, ++ .get_drvinfo = mtk_get_drvinfo, ++ .get_msglevel = mtk_get_msglevel, ++ .set_msglevel = mtk_set_msglevel, ++ .nway_reset = mtk_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_strings = mtk_get_strings, ++ .get_sset_count = mtk_get_sset_count, ++ .get_ethtool_stats = mtk_get_ethtool_stats, ++ .get_rxnfc = mtk_get_rxnfc, ++ .set_rxnfc = mtk_set_rxnfc, ++}; ++ ++static const struct net_device_ops mtk_netdev_ops = { ++ .ndo_init = mtk_init, ++ .ndo_uninit = mtk_uninit, ++ .ndo_open = mtk_open, ++ .ndo_stop = mtk_stop, ++ .ndo_start_xmit = mtk_start_xmit, ++ .ndo_set_mac_address = mtk_set_mac_address, ++ .ndo_validate_addr = eth_validate_addr, ++ .ndo_eth_ioctl = mtk_do_ioctl, ++ .ndo_change_mtu = mtk_change_mtu, ++ .ndo_tx_timeout = mtk_tx_timeout, ++ .ndo_get_stats64 = mtk_get_stats64, ++ .ndo_fix_features = mtk_fix_features, ++ .ndo_set_features = mtk_set_features, ++#ifdef CONFIG_NET_POLL_CONTROLLER ++ .ndo_poll_controller = mtk_poll_controller, ++#endif ++ .ndo_setup_tc = mtk_eth_setup_tc, ++ .ndo_bpf = mtk_xdp, ++ .ndo_xdp_xmit = mtk_xdp_xmit, ++}; ++ ++static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) ++{ ++ const __be32 *_id = of_get_property(np, "reg", NULL); ++ phy_interface_t phy_mode; ++ struct phylink *phylink; ++ struct mtk_mac *mac; ++ int id, err; ++ ++ if (!_id) { ++ dev_err(eth->dev, "missing mac id\n"); ++ return -EINVAL; ++ } ++ ++ id = be32_to_cpup(_id); ++ if (id >= MTK_MAC_COUNT) { ++ dev_err(eth->dev, "%d is not a valid mac id\n", id); ++ return -EINVAL; ++ } ++ ++ if (eth->netdev[id]) { ++ dev_err(eth->dev, "duplicate mac id found: %d\n", id); ++ return -EINVAL; ++ } ++ ++ eth->netdev[id] = alloc_etherdev(sizeof(*mac)); ++ if (!eth->netdev[id]) { ++ dev_err(eth->dev, "alloc_etherdev failed\n"); ++ return -ENOMEM; ++ } ++ mac = netdev_priv(eth->netdev[id]); ++ eth->mac[id] = mac; ++ mac->id = id; ++ mac->hw = eth; ++ mac->of_node = np; ++ ++ memset(mac->hwlro_ip, 0, sizeof(mac->hwlro_ip)); ++ mac->hwlro_ip_cnt = 0; ++ ++ mac->hw_stats = devm_kzalloc(eth->dev, ++ sizeof(*mac->hw_stats), ++ GFP_KERNEL); ++ if (!mac->hw_stats) { ++ dev_err(eth->dev, "failed to allocate counter memory\n"); ++ err = -ENOMEM; ++ goto free_netdev; ++ } ++ spin_lock_init(&mac->hw_stats->stats_lock); ++ u64_stats_init(&mac->hw_stats->syncp); ++ mac->hw_stats->reg_offset = id * MTK_STAT_OFFSET; ++ ++ /* phylink create */ ++ err = of_get_phy_mode(np, &phy_mode); ++ if (err) { ++ dev_err(eth->dev, "incorrect phy-mode\n"); ++ goto free_netdev; ++ } ++ ++ /* mac config is not set */ ++ mac->interface = PHY_INTERFACE_MODE_NA; ++ mac->speed = SPEED_UNKNOWN; ++ ++ mac->phylink_config.dev = ð->netdev[id]->dev; ++ mac->phylink_config.type = PHYLINK_NETDEV; ++ /* This driver makes use of state->speed in mac_config */ ++ mac->phylink_config.legacy_pre_march2020 = true; ++ 
mac->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | ++ MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; ++ ++ __set_bit(PHY_INTERFACE_MODE_MII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_GMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_RGMII)) ++ phy_interface_set_rgmii(mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_TRGMII) && !mac->id) ++ __set_bit(PHY_INTERFACE_MODE_TRGMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_SGMII)) { ++ __set_bit(PHY_INTERFACE_MODE_SGMII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_1000BASEX, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_2500BASEX, ++ mac->phylink_config.supported_interfaces); ++ } ++ ++ phylink = phylink_create(&mac->phylink_config, ++ of_fwnode_handle(mac->of_node), ++ phy_mode, &mtk_phylink_ops); ++ if (IS_ERR(phylink)) { ++ err = PTR_ERR(phylink); ++ goto free_netdev; ++ } ++ ++ mac->phylink = phylink; ++ ++ SET_NETDEV_DEV(eth->netdev[id], eth->dev); ++ eth->netdev[id]->watchdog_timeo = 5 * HZ; ++ eth->netdev[id]->netdev_ops = &mtk_netdev_ops; ++ eth->netdev[id]->base_addr = (unsigned long)eth->base; ++ ++ eth->netdev[id]->hw_features = eth->soc->hw_features; ++ if (eth->hwlro) ++ eth->netdev[id]->hw_features |= NETIF_F_LRO; ++ ++ eth->netdev[id]->vlan_features = eth->soc->hw_features & ++ ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX); ++ eth->netdev[id]->features |= eth->soc->hw_features; ++ eth->netdev[id]->ethtool_ops = &mtk_ethtool_ops; ++ ++ eth->netdev[id]->irq = eth->irq[0]; ++ eth->netdev[id]->dev.of_node = np; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH - MTK_RX_ETH_HLEN; ++ else ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return 0; ++ ++free_netdev: ++ free_netdev(eth->netdev[id]); ++ return err; ++} ++ ++void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(dev_list); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ dev = eth->netdev[i]; ++ ++ if (!dev || !(dev->flags & IFF_UP)) ++ continue; ++ ++ list_add_tail(&dev->close_list, &dev_list); ++ } ++ ++ dev_close_many(&dev_list, false); ++ ++ eth->dma_dev = dma_dev; ++ ++ list_for_each_entry_safe(dev, tmp, &dev_list, close_list) { ++ list_del_init(&dev->close_list); ++ dev_open(dev, NULL); ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_probe(struct platform_device *pdev) ++{ ++ struct device_node *mac_np; ++ struct mtk_eth *eth; ++ int err, i; ++ ++ eth = devm_kzalloc(&pdev->dev, sizeof(*eth), GFP_KERNEL); ++ if (!eth) ++ return -ENOMEM; ++ ++ eth->soc = of_device_get_match_data(&pdev->dev); ++ ++ eth->dev = &pdev->dev; ++ eth->dma_dev = &pdev->dev; ++ eth->base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(eth->base)) ++ return PTR_ERR(eth->base); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->ip_align = NET_IP_ALIGN; ++ ++ spin_lock_init(ð->page_lock); ++ spin_lock_init(ð->tx_irq_lock); ++ spin_lock_init(ð->rx_irq_lock); ++ spin_lock_init(ð->dim_lock); ++ ++ eth->rx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->rx_dim.work, mtk_dim_rx); ++ ++ eth->tx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->tx_dim.work, mtk_dim_tx); ++ ++ if (!MTK_HAS_CAPS(eth->soc->caps, 
MTK_SOC_MT7628)) { ++ eth->ethsys = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,ethsys"); ++ if (IS_ERR(eth->ethsys)) { ++ dev_err(&pdev->dev, "no ethsys regmap found\n"); ++ return PTR_ERR(eth->ethsys); ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_INFRA)) { ++ eth->infra = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,infracfg"); ++ if (IS_ERR(eth->infra)) { ++ dev_err(&pdev->dev, "no infracfg regmap found\n"); ++ return PTR_ERR(eth->infra); ++ } ++ } ++ ++ if (of_dma_is_coherent(pdev->dev.of_node)) { ++ struct regmap *cci; ++ ++ cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "cci-control-port"); ++ /* enable CPU/bus coherency */ ++ if (!IS_ERR(cci)) ++ regmap_write(cci, 0, 3); ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ eth->sgmii = devm_kzalloc(eth->dev, sizeof(*eth->sgmii), ++ GFP_KERNEL); ++ if (!eth->sgmii) ++ return -ENOMEM; ++ ++ err = mtk_sgmii_init(eth->sgmii, pdev->dev.of_node, ++ eth->soc->ana_rgc3); ++ ++ if (err) ++ return err; ++ } ++ ++ if (eth->soc->required_pctl) { ++ eth->pctl = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,pctl"); ++ if (IS_ERR(eth->pctl)) { ++ dev_err(&pdev->dev, "no pctl regmap found\n"); ++ return PTR_ERR(eth->pctl); ++ } ++ } ++ ++ for (i = 0;; i++) { ++ struct device_node *np = of_parse_phandle(pdev->dev.of_node, ++ "mediatek,wed", i); ++ static const u32 wdma_regs[] = { ++ MTK_WDMA0_BASE, ++ MTK_WDMA1_BASE ++ }; ++ void __iomem *wdma; ++ ++ if (!np || i >= ARRAY_SIZE(wdma_regs)) ++ break; ++ ++ wdma = eth->base + wdma_regs[i]; ++ mtk_wed_add_hw(np, eth, wdma, i); ++ } ++ ++ for (i = 0; i < 3; i++) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT) && i > 0) ++ eth->irq[i] = eth->irq[0]; ++ else ++ eth->irq[i] = platform_get_irq(pdev, i); ++ if (eth->irq[i] < 0) { ++ dev_err(&pdev->dev, "no IRQ%d resource found\n", i); ++ err = -ENXIO; ++ goto err_wed_exit; ++ } ++ } ++ for (i = 0; i < ARRAY_SIZE(eth->clks); i++) { ++ eth->clks[i] = devm_clk_get(eth->dev, ++ mtk_clks_source_name[i]); ++ if (IS_ERR(eth->clks[i])) { ++ if (PTR_ERR(eth->clks[i]) == -EPROBE_DEFER) { ++ err = -EPROBE_DEFER; ++ goto err_wed_exit; ++ } ++ if (eth->soc->required_clks & BIT(i)) { ++ dev_err(&pdev->dev, "clock %s not found\n", ++ mtk_clks_source_name[i]); ++ err = -EINVAL; ++ goto err_wed_exit; ++ } ++ eth->clks[i] = NULL; ++ } ++ } ++ ++ eth->msg_enable = netif_msg_init(mtk_msg_level, MTK_DEFAULT_MSG_ENABLE); ++ INIT_WORK(ð->pending_work, mtk_pending_work); ++ ++ err = mtk_hw_init(eth); ++ if (err) ++ goto err_wed_exit; ++ ++ eth->hwlro = MTK_HAS_CAPS(eth->soc->caps, MTK_HWLRO); ++ ++ for_each_child_of_node(pdev->dev.of_node, mac_np) { ++ if (!of_device_is_compatible(mac_np, ++ "mediatek,eth-mac")) ++ continue; ++ ++ if (!of_device_is_available(mac_np)) ++ continue; ++ ++ err = mtk_add_mac(eth, mac_np); ++ if (err) { ++ of_node_put(mac_np); ++ goto err_deinit_hw; ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT)) { ++ err = devm_request_irq(eth->dev, eth->irq[0], ++ mtk_handle_irq, 0, ++ dev_name(eth->dev), eth); ++ } else { ++ err = devm_request_irq(eth->dev, eth->irq[1], ++ mtk_handle_irq_tx, 0, ++ dev_name(eth->dev), eth); ++ if (err) ++ goto err_free_dev; ++ ++ err = devm_request_irq(eth->dev, eth->irq[2], ++ mtk_handle_irq_rx, 0, ++ dev_name(eth->dev), eth); ++ } ++ if (err) ++ goto err_free_dev; ++ ++ /* No MT7628/88 support yet */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ err = mtk_mdio_init(eth); ++ if (err) ++ goto err_free_dev; ++ } ++ 
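The clock loop in mtk_probe() above distinguishes three outcomes of devm_clk_get(): probe deferral, a clock the SoC match data marks as required but that cannot be found, and a clock that is merely optional. A minimal sketch of that pattern is given here, under the assumption that the optional case may fall back to a NULL clk; mtk_demo_get_clks() and its parameters are illustrative names introduced for this note, not identifiers from the driver or from this patch.

#include <linux/bits.h>
#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>

/* Request every named clock; defer, fail hard, or fall back to NULL
 * depending on how the SoC match data classifies that clock. */
static int mtk_demo_get_clks(struct device *dev, struct clk **clks,
			     const char * const *names, int num,
			     unsigned long required_bitmap)
{
	int i;

	for (i = 0; i < num; i++) {
		clks[i] = devm_clk_get(dev, names[i]);
		if (!IS_ERR(clks[i]))
			continue;

		if (PTR_ERR(clks[i]) == -EPROBE_DEFER)
			return -EPROBE_DEFER;	/* provider not ready yet */

		if (required_bitmap & BIT(i)) {
			dev_err(dev, "clock %s not found\n", names[i]);
			return -EINVAL;
		}

		/* optional clock: the clk API accepts a NULL handle */
		clks[i] = NULL;
	}

	return 0;
}

Treating a missing optional clock as NULL lets one probe path serve SoC variants with different clock trees, since clk_prepare_enable(NULL) and clk_disable_unprepare(NULL) simply return without doing anything.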
++ if (eth->soc->offload_version) { ++ eth->ppe = mtk_ppe_init(eth, eth->base + MTK_ETH_PPE_BASE, 2); ++ if (!eth->ppe) { ++ err = -ENOMEM; ++ goto err_deinit_mdio; ++ } ++ ++ err = mtk_eth_offload_init(eth); ++ if (err) ++ goto err_deinit_mdio; ++ } ++ ++ for (i = 0; i < MTK_MAX_DEVS; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ ++ err = register_netdev(eth->netdev[i]); ++ if (err) { ++ dev_err(eth->dev, "error bringing up device\n"); ++ goto err_deinit_mdio; ++ } else ++ netif_info(eth, probe, eth->netdev[i], ++ "mediatek frame engine at 0x%08lx, irq %d\n", ++ eth->netdev[i]->base_addr, eth->irq[0]); ++ } ++ ++ /* we run 2 devices on the same DMA ring so we need a dummy device ++ * for NAPI to work ++ */ ++ init_dummy_netdev(ð->dummy_dev); ++ netif_napi_add(ð->dummy_dev, ð->tx_napi, mtk_napi_tx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add(ð->dummy_dev, ð->rx_napi, mtk_napi_rx, ++ NAPI_POLL_WEIGHT); ++ ++ platform_set_drvdata(pdev, eth); ++ ++ return 0; ++ ++err_deinit_mdio: ++ mtk_mdio_cleanup(eth); ++err_free_dev: ++ mtk_free_dev(eth); ++err_deinit_hw: ++ mtk_hw_deinit(eth); ++err_wed_exit: ++ mtk_wed_exit(); ++ ++ return err; ++} ++ ++static int mtk_remove(struct platform_device *pdev) ++{ ++ struct mtk_eth *eth = platform_get_drvdata(pdev); ++ struct mtk_mac *mac; ++ int i; ++ ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ mac = netdev_priv(eth->netdev[i]); ++ phylink_disconnect_phy(mac->phylink); ++ } ++ ++ mtk_wed_exit(); ++ mtk_hw_deinit(eth); ++ ++ netif_napi_del(ð->tx_napi); ++ netif_napi_del(ð->rx_napi); ++ mtk_cleanup(eth); ++ mtk_mdio_cleanup(eth); ++ ++ return 0; ++} ++ ++static const struct mtk_soc_data mt2701_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7621_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7621_CAPS, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7621_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7622_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x2028, ++ .caps = MT7622_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7622_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7623_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ 
.rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7629_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7629_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7629_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7986_data = { ++ .reg_map = &mt7986_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7986_CAPS, ++ .required_clks = MT7986_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma_v2), ++ .rxd_size = sizeof(struct mtk_rx_dma_v2), ++ .rx_irq_done_mask = MTK_RX_DONE_INT_V2, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_V2, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, ++ .dma_len_offset = 8, ++ }, ++}; ++ ++static const struct mtk_soc_data rt5350_data = { ++ .reg_map = &mt7628_reg_map, ++ .caps = MT7628_CAPS, ++ .hw_features = MTK_HW_FEATURES_MT7628, ++ .required_clks = MT7628_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_PDMA, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++const struct of_device_id of_mtk_match[] = { ++ { .compatible = "mediatek,mt2701-eth", .data = &mt2701_data}, ++ { .compatible = "mediatek,mt7621-eth", .data = &mt7621_data}, ++ { .compatible = "mediatek,mt7622-eth", .data = &mt7622_data}, ++ { .compatible = "mediatek,mt7623-eth", .data = &mt7623_data}, ++ { .compatible = "mediatek,mt7629-eth", .data = &mt7629_data}, ++ { .compatible = "mediatek,mt7986-eth", .data = &mt7986_data}, ++ { .compatible = "ralink,rt5350-eth", .data = &rt5350_data}, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, of_mtk_match); ++ ++static struct platform_driver mtk_driver = { ++ .probe = mtk_probe, ++ .remove = mtk_remove, ++ .driver = { ++ .name = "mtk_soc_eth", ++ .of_match_table = of_mtk_match, ++ }, ++}; ++ ++module_platform_driver(mtk_driver); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("John Crispin "); ++MODULE_DESCRIPTION("Ethernet driver for MediaTek SoC"); +diff -rupN linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +--- linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-04 10:40:26.692034106 -0500 +@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struc for_each_possible_cpu(i) { p = per_cpu_ptr(mlxsw_sp_port->pcpu_stats, i); do { @@ -2234,11 +12394,10 @@ index 30c7b0e157218..fa2753318cdf7 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c -index 9259a74eca40b..318dbbb482797 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_en.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c -@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c linux/drivers/net/ethernet/microsoft/mana/mana_en.c +--- 
linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-04 10:40:26.692034106 -0500 +@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_ rx_stats = &apc->rxqs[q]->stats; do { @@ -2251,7 +12410,7 @@ index 9259a74eca40b..318dbbb482797 100644 st->rx_packets += packets; st->rx_bytes += bytes; -@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_device *ndev, +@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_ tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2264,11 +12423,10 @@ index 9259a74eca40b..318dbbb482797 100644 st->tx_packets += packets; st->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -index c530db76880f0..96d55c91c9698 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +--- linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-04 10:40:26.692034106 -0500 +@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struc rx_stats = &apc->rxqs[q]->stats; do { @@ -2284,7 +12442,7 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struc tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2298,11 +12456,10 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -index 349a2b1a19a24..cf4d6f1129fa2 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_de unsigned int start; do { @@ -2328,11 +12485,10 @@ index 349a2b1a19a24..cf4d6f1129fa2 100644 stats->tx_packets += data[0]; stats->tx_bytes += data[1]; stats->tx_errors += data[2]; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -index b1b1b648e40cb..eeb1455a4e5db 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-04 10:40:26.692034106 -0500 
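The network-driver hunks in this stretch (mlxsw spectrum, mana, nfp, forcedeth, rmnet, 8139too, sni_ave) all make the same mechanical substitution: u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() become the plain u64_stats_fetch_begin()/u64_stats_fetch_retry(), while the surrounding snapshot-and-retry structure is left untouched. A minimal sketch of the reader/writer pairing those callers converge on is given here; struct demo_stats, demo_update() and demo_read() are illustrative names for this note only, not code carried by the patch.

#include <linux/types.h>
#include <linux/u64_stats_sync.h>

struct demo_stats {
	u64 packets;
	u64 bytes;
	struct u64_stats_sync syncp;	/* initialised once with u64_stats_init() */
};

/* writer side, e.g. a NAPI poll loop: publish one consistent update */
static void demo_update(struct demo_stats *s, u64 pkts, u64 len)
{
	u64_stats_update_begin(&s->syncp);
	s->packets += pkts;
	s->bytes += len;
	u64_stats_update_end(&s->syncp);
}

/* reader side, e.g. .ndo_get_stats64: retry until an untorn snapshot,
 * using the plain helpers these hunks switch the drivers to */
static void demo_read(const struct demo_stats *s, u64 *pkts, u64 *len)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin(&s->syncp);
		*pkts = s->packets;
		*len = s->bytes;
	} while (u64_stats_fetch_retry(&s->syncp, start));
}

The retry loop only does real work on configurations where a u64 cannot be read atomically; elsewhere the fetch helpers reduce to almost nothing, so the conversion does not change the cost of the readers shown in these hunks.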
+@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct unsigned int start; do { @@ -2341,7 +12497,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[0] = nn->r_vecs[i].rx_pkts; tmp[0] = nn->r_vecs[i].hw_csum_rx_ok; tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok; -@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[3] = nn->r_vecs[i].hw_csum_rx_error; tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail; tmp[5] = nn->r_vecs[i].hw_tls_rx; @@ -2354,7 +12510,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[1] = nn->r_vecs[i].tx_pkts; data[2] = nn->r_vecs[i].tx_busy; tmp[6] = nn->r_vecs[i].hw_csum_tx; -@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[10] = nn->r_vecs[i].hw_tls_tx; tmp[11] = nn->r_vecs[i].tls_tx_fallback; tmp[12] = nn->r_vecs[i].tls_tx_no_fallback; @@ -2363,11 +12519,10 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data += NN_RVEC_PER_Q_STATS; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -index 8b77582bdfa01..a6b6ca1fd55ee 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-04 10:40:26.692034106 -0500 +@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct n repr_stats = per_cpu_ptr(repr->stats, i); do { @@ -2383,11 +12538,10 @@ index 8b77582bdfa01..a6b6ca1fd55ee 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; -diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c -index 5116badaf0919..50ebbd7e91c48 100644 ---- a/drivers/net/ethernet/nvidia/forcedeth.c -+++ b/drivers/net/ethernet/nvidia/forcedeth.c -@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +diff -rupN linux.orig/drivers/net/ethernet/nvidia/forcedeth.c linux/drivers/net/ethernet/nvidia/forcedeth.c +--- linux.orig/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct u64 tx_packets, tx_bytes, tx_dropped; do { @@ -2402,7 +12556,7 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->rx_packets += rx_packets; storage->rx_bytes += rx_bytes; -@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct storage->rx_missed_errors += rx_missed_errors; do { @@ -2416,11 +12570,10 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->tx_packets += tx_packets; storage->tx_bytes += tx_bytes; -diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -index 1b2119b1d48aa..3f5e6572d20e7 100644 ---- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +--- linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-04 10:40:26.692034106 -0500 +@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu); do { @@ -2432,11 +12585,10 @@ index 1b2119b1d48aa..3f5e6572d20e7 100644 total_stats.rx_pkts += snapshot.rx_pkts; total_stats.rx_bytes += snapshot.rx_bytes; -diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c -index 15b40fd93cd2e..82bd0eb614634 100644 ---- a/drivers/net/ethernet/realtek/8139too.c -+++ b/drivers/net/ethernet/realtek/8139too.c -@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/realtek/8139too.c linux/drivers/net/ethernet/realtek/8139too.c +--- linux.orig/drivers/net/ethernet/realtek/8139too.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/realtek/8139too.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2457,11 +12609,10 @@ index 15b40fd93cd2e..82bd0eb614634 100644 } /* Set or clear the multicast filter for this adaptor. -diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c -index f0c8de2c60755..d4f7238333bb7 100644 ---- a/drivers/net/ethernet/socionext/sni_ave.c -+++ b/drivers/net/ethernet/socionext/sni_ave.c -@@ -1506,16 +1506,16 @@ static void ave_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c linux/drivers/net/ethernet/socionext/sni_ave.c +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1508,16 +1508,16 @@ static void ave_get_stats64(struct net_d unsigned int start; do { @@ -2482,11 +12633,2010 @@ index f0c8de2c60755..d4f7238333bb7 100644 stats->rx_errors = priv->stats_rx.errors; stats->tx_errors = priv->stats_tx.errors; -diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -index f4a6b590a1e39..1b62400c19049 100644 ---- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c -+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig linux/drivers/net/ethernet/socionext/sni_ave.c.orig +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c.orig 2022-12-04 10:40:18.168055947 -0500 +@@ -0,0 +1,1996 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * sni_ave.c - Socionext UniPhier AVE ethernet driver ++ * Copyright 2014 Panasonic Corporation ++ * Copyright 2015-2017 Socionext Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* General Register Group */ ++#define AVE_IDR 0x000 /* ID */ ++#define AVE_VR 0x004 /* Version */ ++#define AVE_GRR 0x008 /* Global Reset */ ++#define AVE_CFGR 0x00c /* Configuration */ ++ ++/* Interrupt Register Group */ ++#define AVE_GIMR 0x100 /* Global Interrupt Mask */ ++#define AVE_GISR 0x104 /* Global Interrupt Status */ ++ ++/* MAC Register Group */ ++#define AVE_TXCR 0x200 /* TX Setup */ ++#define AVE_RXCR 0x204 /* RX Setup */ ++#define AVE_RXMAC1R 0x208 /* MAC address (lower) */ ++#define AVE_RXMAC2R 0x20c /* MAC address (upper) */ ++#define AVE_MDIOCTR 0x214 /* MDIO Control */ ++#define AVE_MDIOAR 0x218 /* MDIO Address */ ++#define AVE_MDIOWDR 0x21c /* MDIO Data */ ++#define AVE_MDIOSR 0x220 /* MDIO Status */ ++#define AVE_MDIORDR 0x224 /* MDIO Rd Data */ ++ ++/* Descriptor Control Register Group */ ++#define AVE_DESCC 0x300 /* Descriptor Control */ ++#define AVE_TXDC 0x304 /* TX Descriptor Configuration */ ++#define AVE_RXDC0 0x308 /* RX Descriptor Ring0 Configuration */ ++#define AVE_IIRQC 0x34c /* Interval IRQ Control */ ++ ++/* Packet Filter Register Group */ ++#define AVE_PKTF_BASE 0x800 /* PF Base Address */ ++#define AVE_PFMBYTE_BASE 0xd00 /* PF Mask Byte Base Address */ ++#define AVE_PFMBIT_BASE 0xe00 /* PF Mask Bit Base Address */ ++#define AVE_PFSEL_BASE 0xf00 /* PF Selector Base Address */ ++#define AVE_PFEN 0xffc /* Packet Filter Enable */ ++#define AVE_PKTF(ent) (AVE_PKTF_BASE + (ent) * 0x40) ++#define AVE_PFMBYTE(ent) (AVE_PFMBYTE_BASE + (ent) * 8) ++#define AVE_PFMBIT(ent) (AVE_PFMBIT_BASE + (ent) * 4) ++#define AVE_PFSEL(ent) (AVE_PFSEL_BASE + (ent) * 4) ++ ++/* 64bit descriptor memory */ ++#define AVE_DESC_SIZE_64 12 /* Descriptor Size */ ++ ++#define AVE_TXDM_64 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_64 0x1c00 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_64 0x0ba0 /* Tx Descriptor Memory Size 3KB */ ++#define AVE_RXDM_SIZE_64 0x6000 /* Rx Descriptor Memory Size 24KB */ ++ ++/* 32bit descriptor memory */ ++#define AVE_DESC_SIZE_32 8 /* Descriptor Size */ ++ ++#define AVE_TXDM_32 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_32 0x1800 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_32 0x07c0 /* Tx Descriptor Memory Size 2KB */ ++#define AVE_RXDM_SIZE_32 0x4000 /* Rx Descriptor Memory Size 16KB */ ++ ++/* RMII Bridge Register Group */ ++#define AVE_RSTCTRL 0x8028 /* Reset control */ ++#define AVE_RSTCTRL_RMIIRST BIT(16) ++#define AVE_LINKSEL 0x8034 /* Link speed setting */ ++#define AVE_LINKSEL_100M BIT(0) ++ ++/* AVE_GRR */ ++#define AVE_GRR_RXFFR BIT(5) /* Reset RxFIFO */ ++#define AVE_GRR_PHYRST BIT(4) /* Reset external PHY */ ++#define AVE_GRR_GRST BIT(0) /* Reset all MAC */ ++ ++/* AVE_CFGR */ ++#define AVE_CFGR_FLE BIT(31) /* Filter Function */ ++#define AVE_CFGR_CHE BIT(30) /* Checksum Function */ ++#define AVE_CFGR_MII BIT(27) /* Func mode (1:MII/RMII, 0:RGMII) */ ++#define AVE_CFGR_IPFCEN BIT(24) /* IP fragment sum Enable */ ++ ++/* AVE_GISR (common with GIMR) */ ++#define AVE_GI_PHY BIT(24) /* PHY interrupt */ ++#define AVE_GI_TX BIT(16) /* Tx complete */ ++#define AVE_GI_RXERR BIT(8) /* Receive frame more than max size */ ++#define AVE_GI_RXOVF BIT(7) /* Overflow at the RxFIFO */ ++#define AVE_GI_RXDROP BIT(6) /* Drop packet */ ++#define AVE_GI_RXIINT BIT(5) /* Interval interrupt */ ++ ++/* AVE_TXCR */ 
++#define AVE_TXCR_FLOCTR BIT(18) /* Flow control */ ++#define AVE_TXCR_TXSPD_1G BIT(17) ++#define AVE_TXCR_TXSPD_100 BIT(16) ++ ++/* AVE_RXCR */ ++#define AVE_RXCR_RXEN BIT(30) /* Rx enable */ ++#define AVE_RXCR_FDUPEN BIT(22) /* Interface mode */ ++#define AVE_RXCR_FLOCTR BIT(21) /* Flow control */ ++#define AVE_RXCR_AFEN BIT(19) /* MAC address filter */ ++#define AVE_RXCR_DRPEN BIT(18) /* Drop pause frame */ ++#define AVE_RXCR_MPSIZ_MASK GENMASK(10, 0) ++ ++/* AVE_MDIOCTR */ ++#define AVE_MDIOCTR_RREQ BIT(3) /* Read request */ ++#define AVE_MDIOCTR_WREQ BIT(2) /* Write request */ ++ ++/* AVE_MDIOSR */ ++#define AVE_MDIOSR_STS BIT(0) /* access status */ ++ ++/* AVE_DESCC */ ++#define AVE_DESCC_STATUS_MASK GENMASK(31, 16) ++#define AVE_DESCC_RD0 BIT(8) /* Enable Rx descriptor Ring0 */ ++#define AVE_DESCC_RDSTP BIT(4) /* Pause Rx descriptor */ ++#define AVE_DESCC_TD BIT(0) /* Enable Tx descriptor */ ++ ++/* AVE_TXDC */ ++#define AVE_TXDC_SIZE GENMASK(27, 16) /* Size of Tx descriptor */ ++#define AVE_TXDC_ADDR GENMASK(11, 0) /* Start address */ ++#define AVE_TXDC_ADDR_START 0 ++ ++/* AVE_RXDC0 */ ++#define AVE_RXDC0_SIZE GENMASK(30, 16) /* Size of Rx descriptor */ ++#define AVE_RXDC0_ADDR GENMASK(14, 0) /* Start address */ ++#define AVE_RXDC0_ADDR_START 0 ++ ++/* AVE_IIRQC */ ++#define AVE_IIRQC_EN0 BIT(27) /* Enable interval interrupt Ring0 */ ++#define AVE_IIRQC_BSCK GENMASK(15, 0) /* Interval count unit */ ++ ++/* Command status for descriptor */ ++#define AVE_STS_OWN BIT(31) /* Descriptor ownership */ ++#define AVE_STS_INTR BIT(29) /* Request for interrupt */ ++#define AVE_STS_OK BIT(27) /* Normal transmit */ ++/* TX */ ++#define AVE_STS_NOCSUM BIT(28) /* No use HW checksum */ ++#define AVE_STS_1ST BIT(26) /* Head of buffer chain */ ++#define AVE_STS_LAST BIT(25) /* Tail of buffer chain */ ++#define AVE_STS_OWC BIT(21) /* Out of window,Late Collision */ ++#define AVE_STS_EC BIT(20) /* Excess collision occurred */ ++#define AVE_STS_PKTLEN_TX_MASK GENMASK(15, 0) ++/* RX */ ++#define AVE_STS_CSSV BIT(21) /* Checksum check performed */ ++#define AVE_STS_CSER BIT(20) /* Checksum error detected */ ++#define AVE_STS_PKTLEN_RX_MASK GENMASK(10, 0) ++ ++/* Packet filter */ ++#define AVE_PFMBYTE_MASK0 (GENMASK(31, 8) | GENMASK(5, 0)) ++#define AVE_PFMBYTE_MASK1 GENMASK(25, 0) ++#define AVE_PFMBIT_MASK GENMASK(15, 0) ++ ++#define AVE_PF_SIZE 17 /* Number of all packet filter */ ++#define AVE_PF_MULTICAST_SIZE 7 /* Number of multicast filter */ ++ ++#define AVE_PFNUM_FILTER 0 /* No.0 */ ++#define AVE_PFNUM_UNICAST 1 /* No.1 */ ++#define AVE_PFNUM_BROADCAST 2 /* No.2 */ ++#define AVE_PFNUM_MULTICAST 11 /* No.11-17 */ ++ ++/* NETIF Message control */ ++#define AVE_DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | \ ++ NETIF_MSG_PROBE | \ ++ NETIF_MSG_LINK | \ ++ NETIF_MSG_TIMER | \ ++ NETIF_MSG_IFDOWN | \ ++ NETIF_MSG_IFUP | \ ++ NETIF_MSG_RX_ERR | \ ++ NETIF_MSG_TX_ERR) ++ ++/* Parameter for descriptor */ ++#define AVE_NR_TXDESC 64 /* Tx descriptor */ ++#define AVE_NR_RXDESC 256 /* Rx descriptor */ ++ ++#define AVE_DESC_OFS_CMDSTS 0 ++#define AVE_DESC_OFS_ADDRL 4 ++#define AVE_DESC_OFS_ADDRU 8 ++ ++/* Parameter for ethernet frame */ ++#define AVE_MAX_ETHFRAME 1518 ++#define AVE_FRAME_HEADROOM 2 ++ ++/* Parameter for interrupt */ ++#define AVE_INTM_COUNT 20 ++#define AVE_FORCE_TXINTCNT 1 ++ ++/* SG */ ++#define SG_ETPINMODE 0x540 ++#define SG_ETPINMODE_EXTPHY BIT(1) /* for LD11 */ ++#define SG_ETPINMODE_RMII(ins) BIT(ins) ++ ++#define IS_DESC_64BIT(p) ((p)->data->is_desc_64bit) ++ ++#define AVE_MAX_CLKS 4 ++#define 
AVE_MAX_RSTS 2 ++ ++enum desc_id { ++ AVE_DESCID_RX, ++ AVE_DESCID_TX, ++}; ++ ++enum desc_state { ++ AVE_DESC_RX_PERMIT, ++ AVE_DESC_RX_SUSPEND, ++ AVE_DESC_START, ++ AVE_DESC_STOP, ++}; ++ ++struct ave_desc { ++ struct sk_buff *skbs; ++ dma_addr_t skbs_dma; ++ size_t skbs_dmalen; ++}; ++ ++struct ave_desc_info { ++ u32 ndesc; /* number of descriptor */ ++ u32 daddr; /* start address of descriptor */ ++ u32 proc_idx; /* index of processing packet */ ++ u32 done_idx; /* index of processed packet */ ++ struct ave_desc *desc; /* skb info related descriptor */ ++}; ++ ++struct ave_stats { ++ struct u64_stats_sync syncp; ++ u64 packets; ++ u64 bytes; ++ u64 errors; ++ u64 dropped; ++ u64 collisions; ++ u64 fifo_errors; ++}; ++ ++struct ave_private { ++ void __iomem *base; ++ int irq; ++ int phy_id; ++ unsigned int desc_size; ++ u32 msg_enable; ++ int nclks; ++ struct clk *clk[AVE_MAX_CLKS]; ++ int nrsts; ++ struct reset_control *rst[AVE_MAX_RSTS]; ++ phy_interface_t phy_mode; ++ struct phy_device *phydev; ++ struct mii_bus *mdio; ++ struct regmap *regmap; ++ unsigned int pinmode_mask; ++ unsigned int pinmode_val; ++ u32 wolopts; ++ ++ /* stats */ ++ struct ave_stats stats_rx; ++ struct ave_stats stats_tx; ++ ++ /* NAPI support */ ++ struct net_device *ndev; ++ struct napi_struct napi_rx; ++ struct napi_struct napi_tx; ++ ++ /* descriptor */ ++ struct ave_desc_info rx; ++ struct ave_desc_info tx; ++ ++ /* flow control */ ++ int pause_auto; ++ int pause_rx; ++ int pause_tx; ++ ++ const struct ave_soc_data *data; ++}; ++ ++struct ave_soc_data { ++ bool is_desc_64bit; ++ const char *clock_names[AVE_MAX_CLKS]; ++ const char *reset_names[AVE_MAX_RSTS]; ++ int (*get_pinmode)(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg); ++}; ++ ++static u32 ave_desc_read(struct net_device *ndev, enum desc_id id, int entry, ++ int offset) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ return readl(priv->base + addr); ++} ++ ++static u32 ave_desc_read_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry) ++{ ++ return ave_desc_read(ndev, id, entry, AVE_DESC_OFS_CMDSTS); ++} ++ ++static void ave_desc_write(struct net_device *ndev, enum desc_id id, ++ int entry, int offset, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? 
priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ writel(val, priv->base + addr); ++} ++ ++static void ave_desc_write_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry, u32 val) ++{ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_CMDSTS, val); ++} ++ ++static void ave_desc_write_addr(struct net_device *ndev, enum desc_id id, ++ int entry, dma_addr_t paddr) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_ADDRL, ++ lower_32_bits(paddr)); ++ if (IS_DESC_64BIT(priv)) ++ ave_desc_write(ndev, id, ++ entry, AVE_DESC_OFS_ADDRU, ++ upper_32_bits(paddr)); ++} ++ ++static u32 ave_irq_disable_all(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 ret; ++ ++ ret = readl(priv->base + AVE_GIMR); ++ writel(0, priv->base + AVE_GIMR); ++ ++ return ret; ++} ++ ++static void ave_irq_restore(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(val, priv->base + AVE_GIMR); ++} ++ ++static void ave_irq_enable(struct net_device *ndev, u32 bitflag) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(readl(priv->base + AVE_GIMR) | bitflag, priv->base + AVE_GIMR); ++ writel(bitflag, priv->base + AVE_GISR); ++} ++ ++static void ave_hw_write_macaddr(struct net_device *ndev, ++ const unsigned char *mac_addr, ++ int reg1, int reg2) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(mac_addr[0] | mac_addr[1] << 8 | ++ mac_addr[2] << 16 | mac_addr[3] << 24, priv->base + reg1); ++ writel(mac_addr[4] | mac_addr[5] << 8, priv->base + reg2); ++} ++ ++static void ave_hw_read_version(struct net_device *ndev, char *buf, int len) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 major, minor, vr; ++ ++ vr = readl(priv->base + AVE_VR); ++ major = (vr & GENMASK(15, 8)) >> 8; ++ minor = (vr & GENMASK(7, 0)); ++ snprintf(buf, len, "v%u.%u", major, minor); ++} ++ ++static void ave_ethtool_get_drvinfo(struct net_device *ndev, ++ struct ethtool_drvinfo *info) ++{ ++ struct device *dev = ndev->dev.parent; ++ ++ strlcpy(info->driver, dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(dev), sizeof(info->bus_info)); ++ ave_hw_read_version(ndev, info->fw_version, sizeof(info->fw_version)); ++} ++ ++static u32 ave_ethtool_get_msglevel(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ return priv->msg_enable; ++} ++ ++static void ave_ethtool_set_msglevel(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ priv->msg_enable = val; ++} ++ ++static void ave_ethtool_get_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ wol->supported = 0; ++ wol->wolopts = 0; ++ ++ if (ndev->phydev) ++ phy_ethtool_get_wol(ndev->phydev, wol); ++} ++ ++static int __ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ if (!ndev->phydev || ++ (wol->wolopts & (WAKE_ARP | WAKE_MAGICSECURE))) ++ return -EOPNOTSUPP; ++ ++ return phy_ethtool_set_wol(ndev->phydev, wol); ++} ++ ++static int ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ int ret; ++ ++ ret = __ave_ethtool_set_wol(ndev, wol); ++ if (!ret) ++ device_set_wakeup_enable(&ndev->dev, !!wol->wolopts); ++ ++ return ret; ++} ++ ++static void ave_ethtool_get_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ pause->autoneg = priv->pause_auto; ++ 
pause->rx_pause = priv->pause_rx; ++ pause->tx_pause = priv->pause_tx; ++} ++ ++static int ave_ethtool_set_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ ++ if (!phydev) ++ return -EINVAL; ++ ++ priv->pause_auto = pause->autoneg; ++ priv->pause_rx = pause->rx_pause; ++ priv->pause_tx = pause->tx_pause; ++ ++ phy_set_asym_pause(phydev, pause->rx_pause, pause->tx_pause); ++ ++ return 0; ++} ++ ++static const struct ethtool_ops ave_ethtool_ops = { ++ .get_link_ksettings = phy_ethtool_get_link_ksettings, ++ .set_link_ksettings = phy_ethtool_set_link_ksettings, ++ .get_drvinfo = ave_ethtool_get_drvinfo, ++ .nway_reset = phy_ethtool_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_msglevel = ave_ethtool_get_msglevel, ++ .set_msglevel = ave_ethtool_set_msglevel, ++ .get_wol = ave_ethtool_get_wol, ++ .set_wol = ave_ethtool_set_wol, ++ .get_pauseparam = ave_ethtool_get_pauseparam, ++ .set_pauseparam = ave_ethtool_set_pauseparam, ++}; ++ ++static int ave_mdiobus_read(struct mii_bus *bus, int phyid, int regnum) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* read request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_RREQ) & ~AVE_MDIOCTR_WREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) { ++ netdev_err(ndev, "failed to read (phy:%d reg:%x)\n", ++ phyid, regnum); ++ return ret; ++ } ++ ++ return readl(priv->base + AVE_MDIORDR) & GENMASK(15, 0); ++} ++ ++static int ave_mdiobus_write(struct mii_bus *bus, int phyid, int regnum, ++ u16 val) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* write data */ ++ writel(val, priv->base + AVE_MDIOWDR); ++ ++ /* write request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_WREQ) & ~AVE_MDIOCTR_RREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) ++ netdev_err(ndev, "failed to write (phy:%d reg:%x)\n", ++ phyid, regnum); ++ ++ return ret; ++} ++ ++static int ave_dma_map(struct net_device *ndev, struct ave_desc *desc, ++ void *ptr, size_t len, enum dma_data_direction dir, ++ dma_addr_t *paddr) ++{ ++ dma_addr_t map_addr; ++ ++ map_addr = dma_map_single(ndev->dev.parent, ptr, len, dir); ++ if (unlikely(dma_mapping_error(ndev->dev.parent, map_addr))) ++ return -ENOMEM; ++ ++ desc->skbs_dma = map_addr; ++ desc->skbs_dmalen = len; ++ *paddr = map_addr; ++ ++ return 0; ++} ++ ++static void ave_dma_unmap(struct net_device *ndev, struct ave_desc *desc, ++ enum dma_data_direction dir) ++{ ++ if (!desc->skbs_dma) ++ return; ++ ++ dma_unmap_single(ndev->dev.parent, ++ desc->skbs_dma, desc->skbs_dmalen, dir); ++ desc->skbs_dma = 0; ++} ++ ++/* Prepare Rx descriptor and memory */ ++static int ave_rxdesc_prepare(struct net_device *ndev, int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct sk_buff *skb; ++ dma_addr_t paddr; ++ int ret; ++ ++ skb = priv->rx.desc[entry].skbs; ++ if (!skb) { ++ 
skb = netdev_alloc_skb(ndev, AVE_MAX_ETHFRAME); ++ if (!skb) { ++ netdev_err(ndev, "can't allocate skb for Rx\n"); ++ return -ENOMEM; ++ } ++ skb->data += AVE_FRAME_HEADROOM; ++ skb->tail += AVE_FRAME_HEADROOM; ++ } ++ ++ /* set disable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_STS_OWN); ++ ++ /* map Rx buffer ++ * Rx buffer set to the Rx descriptor has two restrictions: ++ * - Rx buffer address is 4 byte aligned. ++ * - Rx buffer begins with 2 byte headroom, and data will be put from ++ * (buffer + 2). ++ * To satisfy this, specify the address to put back the buffer ++ * pointer advanced by AVE_FRAME_HEADROOM, and expand the map size ++ * by AVE_FRAME_HEADROOM. ++ */ ++ ret = ave_dma_map(ndev, &priv->rx.desc[entry], ++ skb->data - AVE_FRAME_HEADROOM, ++ AVE_MAX_ETHFRAME + AVE_FRAME_HEADROOM, ++ DMA_FROM_DEVICE, &paddr); ++ if (ret) { ++ netdev_err(ndev, "can't map skb for Rx\n"); ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ priv->rx.desc[entry].skbs = skb; ++ ++ /* set buffer pointer */ ++ ave_desc_write_addr(ndev, AVE_DESCID_RX, entry, paddr); ++ ++ /* set enable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_MAX_ETHFRAME); ++ ++ return ret; ++} ++ ++/* Switch state of descriptor */ ++static int ave_desc_switch(struct net_device *ndev, enum desc_state state) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ u32 val; ++ ++ switch (state) { ++ case AVE_DESC_START: ++ writel(AVE_DESCC_TD | AVE_DESCC_RD0, priv->base + AVE_DESCC); ++ break; ++ ++ case AVE_DESC_STOP: ++ writel(0, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, !val, ++ 150, 15000)) { ++ netdev_err(ndev, "can't stop descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_SUSPEND: ++ val = readl(priv->base + AVE_DESCC); ++ val |= AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, ++ val & (AVE_DESCC_RDSTP << 16), ++ 150, 150000)) { ++ netdev_err(ndev, "can't suspend descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_PERMIT: ++ val = readl(priv->base + AVE_DESCC); ++ val &= ~AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int ave_tx_complete(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ unsigned int nr_freebuf = 0; ++ unsigned int tx_packets = 0; ++ unsigned int tx_bytes = 0; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ ++ /* free pre-stored skb from done_idx to proc_idx */ ++ while (proc_idx != done_idx) { ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_TX, done_idx); ++ ++ /* do nothing if owner is HW (==1 for Tx) */ ++ if (cmdsts & AVE_STS_OWN) ++ break; ++ ++ /* check Tx status and updates statistics */ ++ if (cmdsts & AVE_STS_OK) { ++ tx_bytes += cmdsts & AVE_STS_PKTLEN_TX_MASK; ++ /* success */ ++ if (cmdsts & AVE_STS_LAST) ++ tx_packets++; ++ } else { ++ /* error */ ++ if (cmdsts & AVE_STS_LAST) { ++ priv->stats_tx.errors++; ++ if (cmdsts & (AVE_STS_OWC | AVE_STS_EC)) ++ priv->stats_tx.collisions++; ++ } ++ } ++ ++ /* release skb */ ++ if (priv->tx.desc[done_idx].skbs) { ++ ave_dma_unmap(ndev, &priv->tx.desc[done_idx], ++ DMA_TO_DEVICE); ++ 
dev_consume_skb_any(priv->tx.desc[done_idx].skbs); ++ priv->tx.desc[done_idx].skbs = NULL; ++ nr_freebuf++; ++ } ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->tx.done_idx = done_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_tx.syncp); ++ priv->stats_tx.packets += tx_packets; ++ priv->stats_tx.bytes += tx_bytes; ++ u64_stats_update_end(&priv->stats_tx.syncp); ++ ++ /* wake queue for freeing buffer */ ++ if (unlikely(netif_queue_stopped(ndev)) && nr_freebuf) ++ netif_wake_queue(ndev); ++ ++ return nr_freebuf; ++} ++ ++static int ave_rx_receive(struct net_device *ndev, int num) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int rx_packets = 0; ++ unsigned int rx_bytes = 0; ++ u32 proc_idx, done_idx; ++ struct sk_buff *skb; ++ unsigned int pktlen; ++ int restpkt, npkts; ++ u32 ndesc, cmdsts; ++ ++ proc_idx = priv->rx.proc_idx; ++ done_idx = priv->rx.done_idx; ++ ndesc = priv->rx.ndesc; ++ restpkt = ((proc_idx + ndesc - 1) - done_idx) % ndesc; ++ ++ for (npkts = 0; npkts < num; npkts++) { ++ /* we can't receive more packet, so fill desc quickly */ ++ if (--restpkt < 0) ++ break; ++ ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_RX, proc_idx); ++ ++ /* do nothing if owner is HW (==0 for Rx) */ ++ if (!(cmdsts & AVE_STS_OWN)) ++ break; ++ ++ if (!(cmdsts & AVE_STS_OK)) { ++ priv->stats_rx.errors++; ++ proc_idx = (proc_idx + 1) % ndesc; ++ continue; ++ } ++ ++ pktlen = cmdsts & AVE_STS_PKTLEN_RX_MASK; ++ ++ /* get skbuff for rx */ ++ skb = priv->rx.desc[proc_idx].skbs; ++ priv->rx.desc[proc_idx].skbs = NULL; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[proc_idx], DMA_FROM_DEVICE); ++ ++ skb->dev = ndev; ++ skb_put(skb, pktlen); ++ skb->protocol = eth_type_trans(skb, ndev); ++ ++ if ((cmdsts & AVE_STS_CSSV) && (!(cmdsts & AVE_STS_CSER))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++ rx_packets++; ++ rx_bytes += pktlen; ++ ++ netif_receive_skb(skb); ++ ++ proc_idx = (proc_idx + 1) % ndesc; ++ } ++ ++ priv->rx.proc_idx = proc_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_rx.syncp); ++ priv->stats_rx.packets += rx_packets; ++ priv->stats_rx.bytes += rx_bytes; ++ u64_stats_update_end(&priv->stats_rx.syncp); ++ ++ /* refill the Rx buffers */ ++ while (proc_idx != done_idx) { ++ if (ave_rxdesc_prepare(ndev, done_idx)) ++ break; ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->rx.done_idx = done_idx; ++ ++ return npkts; ++} ++ ++static int ave_napi_poll_rx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_rx); ++ ndev = priv->ndev; ++ ++ num = ave_rx_receive(ndev, budget); ++ if (num < budget) { ++ napi_complete_done(napi, num); ++ ++ /* enable Rx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_RXIINT); ++ } ++ ++ return num; ++} ++ ++static int ave_napi_poll_tx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_tx); ++ ndev = priv->ndev; ++ ++ num = ave_tx_complete(ndev); ++ napi_complete(napi); ++ ++ /* enable Tx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_TX); ++ ++ return num; ++} ++ ++static void ave_global_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ /* set config register */ ++ val = AVE_CFGR_FLE | AVE_CFGR_IPFCEN | AVE_CFGR_CHE; ++ if (!phy_interface_mode_is_rgmii(priv->phy_mode)) ++ val |= 
AVE_CFGR_MII; ++ writel(val, priv->base + AVE_CFGR); ++ ++ /* reset RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val &= ~AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ /* assert reset */ ++ writel(AVE_GRR_GRST | AVE_GRR_PHYRST, priv->base + AVE_GRR); ++ msleep(20); ++ ++ /* 1st, negate PHY reset only */ ++ writel(AVE_GRR_GRST, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val |= AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ ave_irq_disable_all(ndev); ++} ++ ++static void ave_rxfifo_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 rxcr_org; ++ ++ /* save and disable MAC receive op */ ++ rxcr_org = readl(priv->base + AVE_RXCR); ++ writel(rxcr_org & (~AVE_RXCR_RXEN), priv->base + AVE_RXCR); ++ ++ /* suspend Rx descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_SUSPEND); ++ ++ /* receive all packets before descriptor starts */ ++ ave_rx_receive(ndev, priv->rx.ndesc); ++ ++ /* assert reset */ ++ writel(AVE_GRR_RXFFR, priv->base + AVE_GRR); ++ udelay(50); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ udelay(20); ++ ++ /* negate interrupt status */ ++ writel(AVE_GI_RXOVF, priv->base + AVE_GISR); ++ ++ /* permit descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_PERMIT); ++ ++ /* restore MAC reccieve op */ ++ writel(rxcr_org, priv->base + AVE_RXCR); ++} ++ ++static irqreturn_t ave_irq_handler(int irq, void *netdev) ++{ ++ struct net_device *ndev = (struct net_device *)netdev; ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 gimr_val, gisr_val; ++ ++ gimr_val = ave_irq_disable_all(ndev); ++ ++ /* get interrupt status */ ++ gisr_val = readl(priv->base + AVE_GISR); ++ ++ /* PHY */ ++ if (gisr_val & AVE_GI_PHY) ++ writel(AVE_GI_PHY, priv->base + AVE_GISR); ++ ++ /* check exceeding packet */ ++ if (gisr_val & AVE_GI_RXERR) { ++ writel(AVE_GI_RXERR, priv->base + AVE_GISR); ++ netdev_err(ndev, "receive a packet exceeding frame buffer\n"); ++ } ++ ++ gisr_val &= gimr_val; ++ if (!gisr_val) ++ goto exit_isr; ++ ++ /* RxFIFO overflow */ ++ if (gisr_val & AVE_GI_RXOVF) { ++ priv->stats_rx.fifo_errors++; ++ ave_rxfifo_reset(ndev); ++ goto exit_isr; ++ } ++ ++ /* Rx drop */ ++ if (gisr_val & AVE_GI_RXDROP) { ++ priv->stats_rx.dropped++; ++ writel(AVE_GI_RXDROP, priv->base + AVE_GISR); ++ } ++ ++ /* Rx interval */ ++ if (gisr_val & AVE_GI_RXIINT) { ++ napi_schedule(&priv->napi_rx); ++ /* still force to disable Rx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_RXIINT; ++ } ++ ++ /* Tx completed */ ++ if (gisr_val & AVE_GI_TX) { ++ napi_schedule(&priv->napi_tx); ++ /* still force to disable Tx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_TX; ++ } ++ ++exit_isr: ++ ave_irq_restore(ndev, gimr_val); ++ ++ return IRQ_HANDLED; ++} ++ ++static int ave_pfsel_start(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val | BIT(entry), priv->base + AVE_PFEN); ++ ++ return 0; ++} ++ ++static int ave_pfsel_stop(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val & ~BIT(entry), priv->base + AVE_PFEN); 
++ ++ return 0; ++} ++ ++static int ave_pfsel_set_macaddr(struct net_device *ndev, ++ unsigned int entry, ++ const unsigned char *mac_addr, ++ unsigned int set_size) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ if (WARN_ON(set_size > 6)) ++ return -EINVAL; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set MAC address for the filter */ ++ ave_hw_write_macaddr(ndev, mac_addr, ++ AVE_PKTF(entry), AVE_PKTF(entry) + 4); ++ ++ /* set byte mask */ ++ writel(GENMASK(31, set_size) & AVE_PFMBYTE_MASK0, ++ priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to ring 0 */ ++ writel(0, priv->base + AVE_PFSEL(entry)); ++ ++ /* restart filter */ ++ ave_pfsel_start(ndev, entry); ++ ++ return 0; ++} ++ ++static void ave_pfsel_set_promisc(struct net_device *ndev, ++ unsigned int entry, u32 rxring) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set byte mask */ ++ writel(AVE_PFMBYTE_MASK0, priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to rxring */ ++ writel(rxring, priv->base + AVE_PFSEL(entry)); ++ ++ ave_pfsel_start(ndev, entry); ++} ++ ++static void ave_pfsel_init(struct net_device *ndev) ++{ ++ unsigned char bcast_mac[ETH_ALEN]; ++ int i; ++ ++ eth_broadcast_addr(bcast_mac); ++ ++ for (i = 0; i < AVE_PF_SIZE; i++) ++ ave_pfsel_stop(ndev, i); ++ ++ /* promiscious entry, select ring 0 */ ++ ave_pfsel_set_promisc(ndev, AVE_PFNUM_FILTER, 0); ++ ++ /* unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++ ++ /* broadcast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_BROADCAST, bcast_mac, 6); ++} ++ ++static void ave_phy_adjust_link(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ u32 val, txcr, rxcr, rxcr_org; ++ u16 rmt_adv = 0, lcl_adv = 0; ++ u8 cap; ++ ++ /* set RGMII speed */ ++ val = readl(priv->base + AVE_TXCR); ++ val &= ~(AVE_TXCR_TXSPD_100 | AVE_TXCR_TXSPD_1G); ++ ++ if (phy_interface_is_rgmii(phydev) && phydev->speed == SPEED_1000) ++ val |= AVE_TXCR_TXSPD_1G; ++ else if (phydev->speed == SPEED_100) ++ val |= AVE_TXCR_TXSPD_100; ++ ++ writel(val, priv->base + AVE_TXCR); ++ ++ /* set RMII speed (100M/10M only) */ ++ if (!phy_interface_is_rgmii(phydev)) { ++ val = readl(priv->base + AVE_LINKSEL); ++ if (phydev->speed == SPEED_10) ++ val &= ~AVE_LINKSEL_100M; ++ else ++ val |= AVE_LINKSEL_100M; ++ writel(val, priv->base + AVE_LINKSEL); ++ } ++ ++ /* check current RXCR/TXCR */ ++ rxcr = readl(priv->base + AVE_RXCR); ++ txcr = readl(priv->base + AVE_TXCR); ++ rxcr_org = rxcr; ++ ++ if (phydev->duplex) { ++ rxcr |= AVE_RXCR_FDUPEN; ++ ++ if (phydev->pause) ++ rmt_adv |= LPA_PAUSE_CAP; ++ if (phydev->asym_pause) ++ rmt_adv |= LPA_PAUSE_ASYM; ++ ++ lcl_adv = linkmode_adv_to_lcl_adv_t(phydev->advertising); ++ cap = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv); ++ if (cap & FLOW_CTRL_TX) ++ txcr |= AVE_TXCR_FLOCTR; ++ else ++ txcr &= ~AVE_TXCR_FLOCTR; ++ if (cap & FLOW_CTRL_RX) ++ rxcr |= AVE_RXCR_FLOCTR; ++ else ++ rxcr &= ~AVE_RXCR_FLOCTR; ++ } else { ++ rxcr &= ~AVE_RXCR_FDUPEN; ++ rxcr &= 
~AVE_RXCR_FLOCTR; ++ txcr &= ~AVE_TXCR_FLOCTR; ++ } ++ ++ if (rxcr_org != rxcr) { ++ /* disable Rx mac */ ++ writel(rxcr & ~AVE_RXCR_RXEN, priv->base + AVE_RXCR); ++ /* change and enable TX/Rx mac */ ++ writel(txcr, priv->base + AVE_TXCR); ++ writel(rxcr, priv->base + AVE_RXCR); ++ } ++ ++ phy_print_status(phydev); ++} ++ ++static void ave_macaddr_init(struct net_device *ndev) ++{ ++ ave_hw_write_macaddr(ndev, ndev->dev_addr, AVE_RXMAC1R, AVE_RXMAC2R); ++ ++ /* pfsel unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++} ++ ++static int ave_init(struct net_device *ndev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct ave_private *priv = netdev_priv(ndev); ++ struct device *dev = ndev->dev.parent; ++ struct device_node *np = dev->of_node; ++ struct device_node *mdio_np; ++ struct phy_device *phydev; ++ int nc, nr, ret; ++ ++ /* enable clk because of hw access until ndo_open */ ++ for (nc = 0; nc < priv->nclks; nc++) { ++ ret = clk_prepare_enable(priv->clk[nc]); ++ if (ret) { ++ dev_err(dev, "can't enable clock\n"); ++ goto out_clk_disable; ++ } ++ } ++ ++ for (nr = 0; nr < priv->nrsts; nr++) { ++ ret = reset_control_deassert(priv->rst[nr]); ++ if (ret) { ++ dev_err(dev, "can't deassert reset\n"); ++ goto out_reset_assert; ++ } ++ } ++ ++ ret = regmap_update_bits(priv->regmap, SG_ETPINMODE, ++ priv->pinmode_mask, priv->pinmode_val); ++ if (ret) ++ goto out_reset_assert; ++ ++ ave_global_reset(ndev); ++ ++ mdio_np = of_get_child_by_name(np, "mdio"); ++ if (!mdio_np) { ++ dev_err(dev, "mdio node not found\n"); ++ ret = -EINVAL; ++ goto out_reset_assert; ++ } ++ ret = of_mdiobus_register(priv->mdio, mdio_np); ++ of_node_put(mdio_np); ++ if (ret) { ++ dev_err(dev, "failed to register mdiobus\n"); ++ goto out_reset_assert; ++ } ++ ++ phydev = of_phy_get_and_connect(ndev, np, ave_phy_adjust_link); ++ if (!phydev) { ++ dev_err(dev, "could not attach to PHY\n"); ++ ret = -ENODEV; ++ goto out_mdio_unregister; ++ } ++ ++ priv->phydev = phydev; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ device_set_wakeup_capable(&ndev->dev, !!wol.supported); ++ ++ /* set wol initial state disabled */ ++ wol.wolopts = 0; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (!phy_interface_is_rgmii(phydev)) ++ phy_set_max_speed(phydev, SPEED_100); ++ ++ phy_support_asym_pause(phydev); ++ ++ phydev->mac_managed_pm = true; ++ ++ phy_attached_info(phydev); ++ ++ return 0; ++ ++out_mdio_unregister: ++ mdiobus_unregister(priv->mdio); ++out_reset_assert: ++ while (--nr >= 0) ++ reset_control_assert(priv->rst[nr]); ++out_clk_disable: ++ while (--nc >= 0) ++ clk_disable_unprepare(priv->clk[nc]); ++ ++ return ret; ++} ++ ++static void ave_uninit(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int i; ++ ++ phy_disconnect(priv->phydev); ++ mdiobus_unregister(priv->mdio); ++ ++ /* disable clk because of hw access after ndo_stop */ ++ for (i = 0; i < priv->nrsts; i++) ++ reset_control_assert(priv->rst[i]); ++ for (i = 0; i < priv->nclks; i++) ++ clk_disable_unprepare(priv->clk[i]); ++} ++ ++static int ave_open(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ int ret; ++ u32 val; ++ ++ ret = request_irq(priv->irq, ave_irq_handler, IRQF_SHARED, ndev->name, ++ ndev); ++ if (ret) ++ return ret; ++ ++ priv->tx.desc = kcalloc(priv->tx.ndesc, sizeof(*priv->tx.desc), ++ GFP_KERNEL); ++ if (!priv->tx.desc) { ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ priv->rx.desc = kcalloc(priv->rx.ndesc, 
sizeof(*priv->rx.desc), ++ GFP_KERNEL); ++ if (!priv->rx.desc) { ++ kfree(priv->tx.desc); ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ /* initialize Tx work and descriptor */ ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, entry, 0); ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, entry, 0); ++ } ++ writel(AVE_TXDC_ADDR_START | ++ (((priv->tx.ndesc * priv->desc_size) << 16) & AVE_TXDC_SIZE), ++ priv->base + AVE_TXDC); ++ ++ /* initialize Rx work and descriptor */ ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (ave_rxdesc_prepare(ndev, entry)) ++ break; ++ } ++ writel(AVE_RXDC0_ADDR_START | ++ (((priv->rx.ndesc * priv->desc_size) << 16) & AVE_RXDC0_SIZE), ++ priv->base + AVE_RXDC0); ++ ++ ave_desc_switch(ndev, AVE_DESC_START); ++ ++ ave_pfsel_init(ndev); ++ ave_macaddr_init(ndev); ++ ++ /* set Rx configuration */ ++ /* full duplex, enable pause drop, enalbe flow control */ ++ val = AVE_RXCR_RXEN | AVE_RXCR_FDUPEN | AVE_RXCR_DRPEN | ++ AVE_RXCR_FLOCTR | (AVE_MAX_ETHFRAME & AVE_RXCR_MPSIZ_MASK); ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set Tx configuration */ ++ /* enable flow control, disable loopback */ ++ writel(AVE_TXCR_FLOCTR, priv->base + AVE_TXCR); ++ ++ /* enable timer, clear EN,INTM, and mask interval unit(BSCK) */ ++ val = readl(priv->base + AVE_IIRQC) & AVE_IIRQC_BSCK; ++ val |= AVE_IIRQC_EN0 | (AVE_INTM_COUNT << 16); ++ writel(val, priv->base + AVE_IIRQC); ++ ++ val = AVE_GI_RXIINT | AVE_GI_RXOVF | AVE_GI_TX | AVE_GI_RXDROP; ++ ave_irq_restore(ndev, val); ++ ++ napi_enable(&priv->napi_rx); ++ napi_enable(&priv->napi_tx); ++ ++ phy_start(ndev->phydev); ++ phy_start_aneg(ndev->phydev); ++ netif_start_queue(ndev); ++ ++ return 0; ++ ++out_free_irq: ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ return ret; ++} ++ ++static int ave_stop(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ ++ ave_irq_disable_all(ndev); ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ netif_tx_disable(ndev); ++ phy_stop(ndev->phydev); ++ napi_disable(&priv->napi_tx); ++ napi_disable(&priv->napi_rx); ++ ++ ave_desc_switch(ndev, AVE_DESC_STOP); ++ ++ /* free Tx buffer */ ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ if (!priv->tx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->tx.desc[entry], DMA_TO_DEVICE); ++ dev_kfree_skb_any(priv->tx.desc[entry].skbs); ++ priv->tx.desc[entry].skbs = NULL; ++ } ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ ++ /* free Rx buffer */ ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (!priv->rx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[entry], DMA_FROM_DEVICE); ++ dev_kfree_skb_any(priv->rx.desc[entry].skbs); ++ priv->rx.desc[entry].skbs = NULL; ++ } ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ ++ kfree(priv->tx.desc); ++ kfree(priv->rx.desc); ++ ++ return 0; ++} ++ ++static netdev_tx_t ave_start_xmit(struct sk_buff *skb, struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ int ret, freepkt; ++ dma_addr_t paddr; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ freepkt = ((done_idx + ndesc - 1) - proc_idx) % ndesc; ++ ++ /* stop queue when not enough entry */ ++ if (unlikely(freepkt < 1)) { ++ netif_stop_queue(ndev); ++ return 
NETDEV_TX_BUSY; ++ } ++ ++ /* add padding for short packet */ ++ if (skb_put_padto(skb, ETH_ZLEN)) { ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ /* map Tx buffer ++ * Tx buffer set to the Tx descriptor doesn't have any restriction. ++ */ ++ ret = ave_dma_map(ndev, &priv->tx.desc[proc_idx], ++ skb->data, skb->len, DMA_TO_DEVICE, &paddr); ++ if (ret) { ++ dev_kfree_skb_any(skb); ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ priv->tx.desc[proc_idx].skbs = skb; ++ ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, proc_idx, paddr); ++ ++ cmdsts = AVE_STS_OWN | AVE_STS_1ST | AVE_STS_LAST | ++ (skb->len & AVE_STS_PKTLEN_TX_MASK); ++ ++ /* set interrupt per AVE_FORCE_TXINTCNT or when queue is stopped */ ++ if (!(proc_idx % AVE_FORCE_TXINTCNT) || netif_queue_stopped(ndev)) ++ cmdsts |= AVE_STS_INTR; ++ ++ /* disable checksum calculation when skb doesn't calurate checksum */ ++ if (skb->ip_summed == CHECKSUM_NONE || ++ skb->ip_summed == CHECKSUM_UNNECESSARY) ++ cmdsts |= AVE_STS_NOCSUM; ++ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, proc_idx, cmdsts); ++ ++ priv->tx.proc_idx = (proc_idx + 1) % ndesc; ++ ++ return NETDEV_TX_OK; ++} ++ ++static int ave_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) ++{ ++ return phy_mii_ioctl(ndev->phydev, ifr, cmd); ++} ++ ++static const u8 v4multi_macadr[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++static const u8 v6multi_macadr[] = { 0x33, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++ ++static void ave_set_rx_mode(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct netdev_hw_addr *hw_adr; ++ int count, mc_cnt; ++ u32 val; ++ ++ /* MAC addr filter enable for promiscious mode */ ++ mc_cnt = netdev_mc_count(ndev); ++ val = readl(priv->base + AVE_RXCR); ++ if (ndev->flags & IFF_PROMISC || !mc_cnt) ++ val &= ~AVE_RXCR_AFEN; ++ else ++ val |= AVE_RXCR_AFEN; ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set all multicast address */ ++ if ((ndev->flags & IFF_ALLMULTI) || mc_cnt > AVE_PF_MULTICAST_SIZE) { ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST, ++ v4multi_macadr, 1); ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + 1, ++ v6multi_macadr, 1); ++ } else { ++ /* stop all multicast filter */ ++ for (count = 0; count < AVE_PF_MULTICAST_SIZE; count++) ++ ave_pfsel_stop(ndev, AVE_PFNUM_MULTICAST + count); ++ ++ /* set multicast addresses */ ++ count = 0; ++ netdev_for_each_mc_addr(hw_adr, ndev) { ++ if (count == mc_cnt) ++ break; ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + count, ++ hw_adr->addr, 6); ++ count++; ++ } ++ } ++} ++ ++static void ave_get_stats64(struct net_device *ndev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_rx.syncp); ++ stats->rx_packets = priv->stats_rx.packets; ++ stats->rx_bytes = priv->stats_rx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_rx.syncp, start)); ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_tx.syncp); ++ stats->tx_packets = priv->stats_tx.packets; ++ stats->tx_bytes = priv->stats_tx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_tx.syncp, start)); ++ ++ stats->rx_errors = priv->stats_rx.errors; ++ stats->tx_errors = priv->stats_tx.errors; ++ stats->rx_dropped = priv->stats_rx.dropped; ++ stats->tx_dropped = priv->stats_tx.dropped; ++ stats->rx_fifo_errors = priv->stats_rx.fifo_errors; ++ stats->collisions = priv->stats_tx.collisions; ++} ++ ++static int ave_set_mac_address(struct 
net_device *ndev, void *p) ++{ ++ int ret = eth_mac_addr(ndev, p); ++ ++ if (ret) ++ return ret; ++ ++ ave_macaddr_init(ndev); ++ ++ return 0; ++} ++ ++static const struct net_device_ops ave_netdev_ops = { ++ .ndo_init = ave_init, ++ .ndo_uninit = ave_uninit, ++ .ndo_open = ave_open, ++ .ndo_stop = ave_stop, ++ .ndo_start_xmit = ave_start_xmit, ++ .ndo_eth_ioctl = ave_ioctl, ++ .ndo_set_rx_mode = ave_set_rx_mode, ++ .ndo_get_stats64 = ave_get_stats64, ++ .ndo_set_mac_address = ave_set_mac_address, ++}; ++ ++static int ave_probe(struct platform_device *pdev) ++{ ++ const struct ave_soc_data *data; ++ struct device *dev = &pdev->dev; ++ char buf[ETHTOOL_FWVERS_LEN]; ++ struct of_phandle_args args; ++ phy_interface_t phy_mode; ++ struct ave_private *priv; ++ struct net_device *ndev; ++ struct device_node *np; ++ void __iomem *base; ++ const char *name; ++ int i, irq, ret; ++ u64 dma_mask; ++ u32 ave_id; ++ ++ data = of_device_get_match_data(dev); ++ if (WARN_ON(!data)) ++ return -EINVAL; ++ ++ np = dev->of_node; ++ ret = of_get_phy_mode(np, &phy_mode); ++ if (ret) { ++ dev_err(dev, "phy-mode not found\n"); ++ return ret; ++ } ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(base)) ++ return PTR_ERR(base); ++ ++ ndev = devm_alloc_etherdev(dev, sizeof(struct ave_private)); ++ if (!ndev) { ++ dev_err(dev, "can't allocate ethernet device\n"); ++ return -ENOMEM; ++ } ++ ++ ndev->netdev_ops = &ave_netdev_ops; ++ ndev->ethtool_ops = &ave_ethtool_ops; ++ SET_NETDEV_DEV(ndev, dev); ++ ++ ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ndev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ++ ndev->max_mtu = AVE_MAX_ETHFRAME - (ETH_HLEN + ETH_FCS_LEN); ++ ++ ret = of_get_ethdev_address(np, ndev); ++ if (ret) { ++ /* if the mac address is invalid, use random mac address */ ++ eth_hw_addr_random(ndev); ++ dev_warn(dev, "Using random MAC address: %pM\n", ++ ndev->dev_addr); ++ } ++ ++ priv = netdev_priv(ndev); ++ priv->base = base; ++ priv->irq = irq; ++ priv->ndev = ndev; ++ priv->msg_enable = netif_msg_init(-1, AVE_DEFAULT_MSG_ENABLE); ++ priv->phy_mode = phy_mode; ++ priv->data = data; ++ ++ if (IS_DESC_64BIT(priv)) { ++ priv->desc_size = AVE_DESC_SIZE_64; ++ priv->tx.daddr = AVE_TXDM_64; ++ priv->rx.daddr = AVE_RXDM_64; ++ dma_mask = DMA_BIT_MASK(64); ++ } else { ++ priv->desc_size = AVE_DESC_SIZE_32; ++ priv->tx.daddr = AVE_TXDM_32; ++ priv->rx.daddr = AVE_RXDM_32; ++ dma_mask = DMA_BIT_MASK(32); ++ } ++ ret = dma_set_mask(dev, dma_mask); ++ if (ret) ++ return ret; ++ ++ priv->tx.ndesc = AVE_NR_TXDESC; ++ priv->rx.ndesc = AVE_NR_RXDESC; ++ ++ u64_stats_init(&priv->stats_tx.syncp); ++ u64_stats_init(&priv->stats_rx.syncp); ++ ++ for (i = 0; i < AVE_MAX_CLKS; i++) { ++ name = priv->data->clock_names[i]; ++ if (!name) ++ break; ++ priv->clk[i] = devm_clk_get(dev, name); ++ if (IS_ERR(priv->clk[i])) ++ return PTR_ERR(priv->clk[i]); ++ priv->nclks++; ++ } ++ ++ for (i = 0; i < AVE_MAX_RSTS; i++) { ++ name = priv->data->reset_names[i]; ++ if (!name) ++ break; ++ priv->rst[i] = devm_reset_control_get_shared(dev, name); ++ if (IS_ERR(priv->rst[i])) ++ return PTR_ERR(priv->rst[i]); ++ priv->nrsts++; ++ } ++ ++ ret = of_parse_phandle_with_fixed_args(np, ++ "socionext,syscon-phy-mode", ++ 1, 0, &args); ++ if (ret) { ++ dev_err(dev, "can't get syscon-phy-mode property\n"); ++ return ret; ++ } ++ priv->regmap = syscon_node_to_regmap(args.np); ++ of_node_put(args.np); ++ if (IS_ERR(priv->regmap)) { ++ 
dev_err(dev, "can't map syscon-phy-mode\n"); ++ return PTR_ERR(priv->regmap); ++ } ++ ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]); ++ if (ret) { ++ dev_err(dev, "invalid phy-mode setting\n"); ++ return ret; ++ } ++ ++ priv->mdio = devm_mdiobus_alloc(dev); ++ if (!priv->mdio) ++ return -ENOMEM; ++ priv->mdio->priv = ndev; ++ priv->mdio->parent = dev; ++ priv->mdio->read = ave_mdiobus_read; ++ priv->mdio->write = ave_mdiobus_write; ++ priv->mdio->name = "uniphier-mdio"; ++ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "%s-%x", ++ pdev->name, pdev->id); ++ ++ /* Register as a NAPI supported driver */ ++ netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add_tx(ndev, &priv->napi_tx, ave_napi_poll_tx); ++ ++ platform_set_drvdata(pdev, ndev); ++ ++ ret = register_netdev(ndev); ++ if (ret) { ++ dev_err(dev, "failed to register netdevice\n"); ++ goto out_del_napi; ++ } ++ ++ /* get ID and version */ ++ ave_id = readl(priv->base + AVE_IDR); ++ ave_hw_read_version(ndev, buf, sizeof(buf)); ++ ++ dev_info(dev, "Socionext %c%c%c%c Ethernet IP %s (irq=%d, phy=%s)\n", ++ (ave_id >> 24) & 0xff, (ave_id >> 16) & 0xff, ++ (ave_id >> 8) & 0xff, (ave_id >> 0) & 0xff, ++ buf, priv->irq, phy_modes(phy_mode)); ++ ++ return 0; ++ ++out_del_napi: ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return ret; ++} ++ ++static int ave_remove(struct platform_device *pdev) ++{ ++ struct net_device *ndev = platform_get_drvdata(pdev); ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ unregister_netdev(ndev); ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int ave_suspend(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ if (netif_running(ndev)) { ++ ret = ave_stop(ndev); ++ netif_device_detach(ndev); ++ } ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ priv->wolopts = wol.wolopts; ++ ++ return ret; ++} ++ ++static int ave_resume(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ ave_global_reset(ndev); ++ ++ ret = phy_init_hw(ndev->phydev); ++ if (ret) ++ return ret; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ wol.wolopts = priv->wolopts; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (ndev->phydev) { ++ ret = phy_resume(ndev->phydev); ++ if (ret) ++ return ret; ++ } ++ ++ if (netif_running(ndev)) { ++ ret = ave_open(ndev); ++ netif_device_attach(ndev); ++ } ++ ++ return ret; ++} ++ ++static SIMPLE_DEV_PM_OPS(ave_pm_ops, ave_suspend, ave_resume); ++#define AVE_PM_OPS (&ave_pm_ops) ++#else ++#define AVE_PM_OPS NULL ++#endif ++ ++static int ave_pro4_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld11_get_pinmode(struct ave_private *priv, ++ phy_interface_t 
phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_INTERNAL: ++ priv->pinmode_val = 0; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld20_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_pxs3_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 1) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(arg); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(arg); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct ave_soc_data ave_pro4_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "gio", "ether", "ether-gb", "ether-phy", ++ }, ++ .reset_names = { ++ "gio", "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs2_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld11_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld11_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld20_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld20_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs3_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_nx1_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct of_device_id of_ave_match[] = { ++ { ++ .compatible = "socionext,uniphier-pro4-ave4", ++ .data = &ave_pro4_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs2-ave4", ++ .data = &ave_pxs2_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld11-ave4", ++ .data = &ave_ld11_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld20-ave4", ++ .data = &ave_ld20_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs3-ave4", ++ .data = &ave_pxs3_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-nx1-ave4", ++ .data = &ave_nx1_data, ++ }, ++ { /* Sentinel */ } ++}; ++MODULE_DEVICE_TABLE(of, of_ave_match); ++ ++static struct platform_driver ave_driver = { ++ .probe = ave_probe, ++ .remove = ave_remove, ++ .driver = { ++ .name = "ave", ++ .pm = AVE_PM_OPS, ++ 
.of_match_table = of_ave_match, ++ }, ++}; ++module_platform_driver(ave_driver); ++ ++MODULE_AUTHOR("Kunihiko Hayashi "); ++MODULE_DESCRIPTION("Socionext UniPhier AVE ethernet driver"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c +--- linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats cpu_stats = per_cpu_ptr(ndev_priv->stats, cpu); do { @@ -2501,11 +14651,10 @@ index f4a6b590a1e39..1b62400c19049 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c -index b15d44261e766..68c7b2c05aab3 100644 ---- a/drivers/net/ethernet/ti/netcp_core.c -+++ b/drivers/net/ethernet/ti/netcp_core.c -@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/ti/netcp_core.c linux/drivers/net/ethernet/ti/netcp_core.c +--- linux.orig/drivers/net/ethernet/ti/netcp_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/netcp_core.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, unsigned int start; do { @@ -2526,11 +14675,10 @@ index b15d44261e766..68c7b2c05aab3 100644 stats->rx_packets = rxpackets; stats->rx_bytes = rxbytes; -diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c -index 509c5e9b29dfa..5301c907b5ae3 100644 ---- a/drivers/net/ethernet/via/via-rhine.c -+++ b/drivers/net/ethernet/via/via-rhine.c -@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/via/via-rhine.c linux/drivers/net/ethernet/via/via-rhine.c +--- linux.orig/drivers/net/ethernet/via/via-rhine.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/via/via-rhine.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2551,11 +14699,10 @@ index 509c5e9b29dfa..5301c907b5ae3 100644 } static void rhine_set_rx_mode(struct net_device *dev) -diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -index 9262988d26a32..2c233b59e7d93 100644 ---- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +--- linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2576,11 +14723,10 @@ index 9262988d26a32..2c233b59e7d93 100644 } static const struct net_device_ops axienet_netdev_ops = { -diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c -index 8113ac17ab70a..2fd8b9c51e839 100644 ---- a/drivers/net/hyperv/netvsc_drv.c -+++ b/drivers/net/hyperv/netvsc_drv.c -@@ 
-1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct net_device *net, +diff -rupN linux.orig/drivers/net/hyperv/netvsc_drv.c linux/drivers/net/hyperv/netvsc_drv.c +--- linux.orig/drivers/net/hyperv/netvsc_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/hyperv/netvsc_drv.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct n unsigned int start; do { @@ -2595,7 +14741,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct unsigned int start; do { @@ -2610,7 +14756,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_packets = this_tot->vf_rx_packets; this_tot->tx_packets = this_tot->vf_tx_packets; this_tot->rx_bytes = this_tot->vf_rx_bytes; -@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct tx_stats = &nvchan->tx_stats; do { @@ -2635,7 +14781,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_bytes += bytes; this_tot->rx_packets += packets; -@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct net_device *net, +@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct ne tx_stats = &nvchan->tx_stats; do { @@ -2661,7 +14807,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 t->rx_bytes += bytes; t->rx_packets += packets; -@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, +@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(str tx_stats = &nvdev->chan_table[j].tx_stats; do { @@ -2690,11 +14836,10 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 data[i++] = packets; data[i++] = bytes; data[i++] = xdp_drop; -diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c -index 1c64d5347b8e0..78253ad57b2ef 100644 ---- a/drivers/net/ifb.c -+++ b/drivers/net/ifb.c -@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ifb.c linux/drivers/net/ifb.c +--- linux.orig/drivers/net/ifb.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ifb.c 2022-12-04 10:40:26.696034096 -0500 +@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_devic for (i = 0; i < dev->num_tx_queues; i++,txp++) { do { @@ -2717,7 +14862,7 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **data, +@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **da int j; do { @@ -2732,11 +14877,10 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 *data += IFB_Q_STATS_LEN; } -diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c -index 49ba8a50dfb1e..8a58d74638cd8 100644 ---- a/drivers/net/ipvlan/ipvlan_main.c -+++ b/drivers/net/ipvlan/ipvlan_main.c -@@ -299,13 +299,13 @@ static void ipvlan_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c linux/drivers/net/ipvlan/ipvlan_main.c +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -301,13 +301,13 @@ static void ipvlan_get_stats64(struct ne for_each_possible_cpu(idx) { pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); do { @@ -2752,11 +14896,1096 @@ index 49ba8a50dfb1e..8a58d74638cd8 100644 strt)); s->rx_packets += rx_pkts; -diff 
--git a/drivers/net/loopback.c b/drivers/net/loopback.c -index 14e8d04cb4347..c4ad98d39ea60 100644 ---- a/drivers/net/loopback.c -+++ b/drivers/net/loopback.c -@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes) +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig linux/drivers/net/ipvlan/ipvlan_main.c.orig +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,1082 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* Copyright (c) 2014 Mahesh Bandewar ++ */ ++ ++#include ++ ++#include "ipvlan.h" ++ ++static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan; ++ unsigned int flags; ++ int err; ++ ++ ASSERT_RTNL(); ++ if (port->mode != nval) { ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { ++ err = dev_change_flags(ipvlan->dev, ++ flags | IFF_NOARP, ++ extack); ++ } else { ++ err = dev_change_flags(ipvlan->dev, ++ flags & ~IFF_NOARP, ++ extack); ++ } ++ if (unlikely(err)) ++ goto fail; ++ } ++ if (nval == IPVLAN_MODE_L3S) { ++ /* New mode is L3S */ ++ err = ipvlan_l3s_register(port); ++ if (err) ++ goto fail; ++ } else if (port->mode == IPVLAN_MODE_L3S) { ++ /* Old mode was L3S */ ++ ipvlan_l3s_unregister(port); ++ } ++ port->mode = nval; ++ } ++ return 0; ++ ++fail: ++ /* Undo the flags changes that have been done so far. */ ++ list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (port->mode == IPVLAN_MODE_L3 || ++ port->mode == IPVLAN_MODE_L3S) ++ dev_change_flags(ipvlan->dev, flags | IFF_NOARP, ++ NULL); ++ else ++ dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, ++ NULL); ++ } ++ ++ return err; ++} ++ ++static int ipvlan_port_create(struct net_device *dev) ++{ ++ struct ipvl_port *port; ++ int err, idx; ++ ++ port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); ++ if (!port) ++ return -ENOMEM; ++ ++ write_pnet(&port->pnet, dev_net(dev)); ++ port->dev = dev; ++ port->mode = IPVLAN_MODE_L3; ++ INIT_LIST_HEAD(&port->ipvlans); ++ for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) ++ INIT_HLIST_HEAD(&port->hlhead[idx]); ++ ++ skb_queue_head_init(&port->backlog); ++ INIT_WORK(&port->wq, ipvlan_process_multicast); ++ ida_init(&port->ida); ++ port->dev_id_start = 1; ++ ++ err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); ++ if (err) ++ goto err; ++ ++ netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); ++ return 0; ++ ++err: ++ kfree(port); ++ return err; ++} ++ ++static void ipvlan_port_destroy(struct net_device *dev) ++{ ++ struct ipvl_port *port = ipvlan_port_get_rtnl(dev); ++ struct sk_buff *skb; ++ ++ netdev_put(dev, &port->dev_tracker); ++ if (port->mode == IPVLAN_MODE_L3S) ++ ipvlan_l3s_unregister(port); ++ netdev_rx_handler_unregister(dev); ++ cancel_work_sync(&port->wq); ++ while ((skb = __skb_dequeue(&port->backlog)) != NULL) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ } ++ ida_destroy(&port->ida); ++ kfree(port); ++} ++ ++#define IPVLAN_ALWAYS_ON_OFLOADS \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | \ ++ NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) ++ ++#define IPVLAN_ALWAYS_ON \ ++ (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED) ++ ++#define IPVLAN_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ ++ 
NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ ++ NETIF_F_GRO | NETIF_F_RXCSUM | \ ++ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) ++ ++ /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */ ++ ++#define IPVLAN_STATE_MASK \ ++ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) ++ ++static int ipvlan_init(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ int err; ++ ++ dev->state = (dev->state & ~IPVLAN_STATE_MASK) | ++ (phy_dev->state & IPVLAN_STATE_MASK); ++ dev->features = phy_dev->features & IPVLAN_FEATURES; ++ dev->features |= IPVLAN_ALWAYS_ON; ++ dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; ++ dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; ++ dev->hw_enc_features |= dev->features; ++ netif_inherit_tso_max(dev, phy_dev); ++ dev->hard_header_len = phy_dev->hard_header_len; ++ ++ netdev_lockdep_set_classes(dev); ++ ++ ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); ++ if (!ipvlan->pcpu_stats) ++ return -ENOMEM; ++ ++ if (!netif_is_ipvlan_port(phy_dev)) { ++ err = ipvlan_port_create(phy_dev); ++ if (err < 0) { ++ free_percpu(ipvlan->pcpu_stats); ++ return err; ++ } ++ } ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count += 1; ++ return 0; ++} ++ ++static void ipvlan_uninit(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ ++ free_percpu(ipvlan->pcpu_stats); ++ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count -= 1; ++ if (!port->count) ++ ipvlan_port_destroy(port->dev); ++} ++ ++static int ipvlan_open(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr; ++ ++ if (ipvlan->port->mode == IPVLAN_MODE_L3 || ++ ipvlan->port->mode == IPVLAN_MODE_L3S) ++ dev->flags |= IFF_NOARP; ++ else ++ dev->flags &= ~IFF_NOARP; ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static int ipvlan_stop(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_addr *addr; ++ ++ dev_uc_unsync(phy_dev, dev); ++ dev_mc_unsync(phy_dev, dev); ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_del(addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ int skblen = skb->len; ++ int ret; ++ ++ ret = ipvlan_queue_xmit(skb, dev); ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct ipvl_pcpu_stats *pcptr; ++ ++ pcptr = this_cpu_ptr(ipvlan->pcpu_stats); ++ ++ u64_stats_update_begin(&pcptr->syncp); ++ u64_stats_inc(&pcptr->tx_pkts); ++ u64_stats_add(&pcptr->tx_bytes, skblen); ++ u64_stats_update_end(&pcptr->syncp); ++ } else { ++ this_cpu_inc(ipvlan->pcpu_stats->tx_drps); ++ } ++ return ret; ++} ++ ++static netdev_features_t ipvlan_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ features |= NETIF_F_ALL_FOR_ALL; ++ features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES); ++ features = netdev_increment_features(ipvlan->phy_dev->features, ++ features, features); ++ features |= IPVLAN_ALWAYS_ON; ++ features &= 
(IPVLAN_FEATURES | IPVLAN_ALWAYS_ON); ++ ++ return features; ++} ++ ++static void ipvlan_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); ++} ++ ++static void ipvlan_set_multicast_mac_filter(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { ++ bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); ++ } else { ++ struct netdev_hw_addr *ha; ++ DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ ++ bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ netdev_for_each_mc_addr(ha, dev) ++ __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); ++ ++ /* Turn-on broadcast bit irrespective of address family, ++ * since broadcast is deferred to a work-queue, hence no ++ * impact on fast-path processing. ++ */ ++ __set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters); ++ ++ bitmap_copy(ipvlan->mac_filters, mc_filters, ++ IPVLAN_MAC_FILTER_SIZE); ++ } ++ dev_uc_sync(ipvlan->phy_dev, dev); ++ dev_mc_sync(ipvlan->phy_dev, dev); ++} ++ ++static void ipvlan_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (ipvlan->pcpu_stats) { ++ struct ipvl_pcpu_stats *pcptr; ++ u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; ++ u32 rx_errs = 0, tx_drps = 0; ++ u32 strt; ++ int idx; ++ ++ for_each_possible_cpu(idx) { ++ pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); ++ do { ++ strt= u64_stats_fetch_begin_irq(&pcptr->syncp); ++ rx_pkts = u64_stats_read(&pcptr->rx_pkts); ++ rx_bytes = u64_stats_read(&pcptr->rx_bytes); ++ rx_mcast = u64_stats_read(&pcptr->rx_mcast); ++ tx_pkts = u64_stats_read(&pcptr->tx_pkts); ++ tx_bytes = u64_stats_read(&pcptr->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&pcptr->syncp, ++ strt)); ++ ++ s->rx_packets += rx_pkts; ++ s->rx_bytes += rx_bytes; ++ s->multicast += rx_mcast; ++ s->tx_packets += tx_pkts; ++ s->tx_bytes += tx_bytes; ++ ++ /* u32 values are updated without syncp protection. 
*/ ++ rx_errs += READ_ONCE(pcptr->rx_errs); ++ tx_drps += READ_ONCE(pcptr->tx_drps); ++ } ++ s->rx_errors = rx_errs; ++ s->rx_dropped = rx_errs; ++ s->tx_dropped = tx_drps; ++ } ++} ++ ++static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ return vlan_vid_add(phy_dev, proto, vid); ++} ++ ++static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, ++ u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ vlan_vid_del(phy_dev, proto, vid); ++ return 0; ++} ++ ++static int ipvlan_get_iflink(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->phy_dev->ifindex; ++} ++ ++static const struct net_device_ops ipvlan_netdev_ops = { ++ .ndo_init = ipvlan_init, ++ .ndo_uninit = ipvlan_uninit, ++ .ndo_open = ipvlan_open, ++ .ndo_stop = ipvlan_stop, ++ .ndo_start_xmit = ipvlan_start_xmit, ++ .ndo_fix_features = ipvlan_fix_features, ++ .ndo_change_rx_flags = ipvlan_change_rx_flags, ++ .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, ++ .ndo_get_stats64 = ipvlan_get_stats64, ++ .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, ++ .ndo_get_iflink = ipvlan_get_iflink, ++}; ++ ++static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, ++ unsigned short type, const void *daddr, ++ const void *saddr, unsigned len) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ /* TODO Probably use a different field than dev_addr so that the ++ * mac-address on the virtual device is portable and can be carried ++ * while the packets use the mac-addr on the physical device. ++ */ ++ return dev_hard_header(skb, phy_dev, type, daddr, ++ saddr ? 
: phy_dev->dev_addr, len); ++} ++ ++static const struct header_ops ipvlan_header_ops = { ++ .create = ipvlan_hard_header, ++ .parse = eth_header_parse, ++ .cache = eth_header_cache, ++ .cache_update = eth_header_cache_update, ++}; ++ ++static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) ++{ ++ ipvlan->dev->mtu = dev->mtu; ++} ++ ++static bool netif_is_ipvlan(const struct net_device *dev) ++{ ++ /* both ipvlan and ipvtap devices use the same netdev_ops */ ++ return dev->netdev_ops == &ipvlan_netdev_ops; ++} ++ ++static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd); ++} ++ ++static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *drvinfo) ++{ ++ strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); ++ strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); ++} ++ ++static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->msg_enable; ++} ++ ++static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ ipvlan->msg_enable = value; ++} ++ ++static const struct ethtool_ops ipvlan_ethtool_ops = { ++ .get_link = ethtool_op_get_link, ++ .get_link_ksettings = ipvlan_ethtool_get_link_ksettings, ++ .get_drvinfo = ipvlan_ethtool_get_drvinfo, ++ .get_msglevel = ipvlan_ethtool_get_msglevel, ++ .set_msglevel = ipvlan_ethtool_set_msglevel, ++}; ++ ++static int ipvlan_nl_changelink(struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int err = 0; ++ ++ if (!data) ++ return 0; ++ if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, nmode, extack); ++ } ++ ++ if (!err && data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (flags & IPVLAN_F_PRIVATE) ++ ipvlan_mark_private(port); ++ else ++ ipvlan_clear_private(port); ++ ++ if (flags & IPVLAN_F_VEPA) ++ ipvlan_mark_vepa(port); ++ else ++ ipvlan_clear_vepa(port); ++ } ++ ++ return err; ++} ++ ++static size_t ipvlan_nl_getsize(const struct net_device *dev) ++{ ++ return (0 ++ + nla_total_size(2) /* IFLA_IPVLAN_MODE */ ++ + nla_total_size(2) /* IFLA_IPVLAN_FLAGS */ ++ ); ++} ++ ++static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ if (mode >= IPVLAN_MODE_MAX) ++ return -EINVAL; ++ } ++ if (data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ /* Only two bits are used at this moment. */ ++ if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ /* Also both flags can't be active at the same time. 
*/ ++ if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) == ++ (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ipvlan_nl_fillinfo(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int ret = -EINVAL; ++ ++ if (!port) ++ goto err; ++ ++ ret = -EMSGSIZE; ++ if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) ++ goto err; ++ if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags)) ++ goto err; ++ ++ return 0; ++ ++err: ++ return ret; ++} ++ ++int ipvlan_link_new(struct net *src_net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port; ++ struct net_device *phy_dev; ++ int err; ++ u16 mode = IPVLAN_MODE_L3; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ ++ phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); ++ if (!phy_dev) ++ return -ENODEV; ++ ++ if (netif_is_ipvlan(phy_dev)) { ++ struct ipvl_dev *tmp = netdev_priv(phy_dev); ++ ++ phy_dev = tmp->phy_dev; ++ if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ } else if (!netif_is_ipvlan_port(phy_dev)) { ++ /* Exit early if the underlying link is invalid or busy */ ++ if (phy_dev->type != ARPHRD_ETHER || ++ phy_dev->flags & IFF_LOOPBACK) { ++ netdev_err(phy_dev, ++ "Master is either lo or non-ether device\n"); ++ return -EINVAL; ++ } ++ ++ if (netdev_is_rx_handler_busy(phy_dev)) { ++ netdev_err(phy_dev, "Device is already in use.\n"); ++ return -EBUSY; ++ } ++ } ++ ++ ipvlan->phy_dev = phy_dev; ++ ipvlan->dev = dev; ++ ipvlan->sfeatures = IPVLAN_FEATURES; ++ if (!tb[IFLA_MTU]) ++ ipvlan_adjust_mtu(ipvlan, phy_dev); ++ INIT_LIST_HEAD(&ipvlan->addrs); ++ spin_lock_init(&ipvlan->addrs_lock); ++ ++ /* TODO Probably put random address here to be presented to the ++ * world but keep using the physical-dev address for the outgoing ++ * packets. ++ */ ++ eth_hw_addr_set(dev, phy_dev->dev_addr); ++ ++ dev->priv_flags |= IFF_NO_RX_HANDLER; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ /* ipvlan_init() would have created the port, if required */ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ ipvlan->port = port; ++ ++ /* If the port-id base is at the MAX value, then wrap it around and ++ * begin from 0x1 again. This may be due to a busy system where lots ++ * of slaves are getting created and deleted. ++ */ ++ if (port->dev_id_start == 0xFFFE) ++ port->dev_id_start = 0x1; ++ ++ /* Since L2 address is shared among all IPvlan slaves including ++ * master, use unique 16 bit dev-ids to diffentiate among them. ++ * Assign IDs between 0x1 and 0xFFFE (used by the master) to each ++ * slave link [see addrconf_ifid_eui48()]. ++ */ ++ err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE, ++ GFP_KERNEL); ++ if (err < 0) ++ err = ida_simple_get(&port->ida, 0x1, port->dev_id_start, ++ GFP_KERNEL); ++ if (err < 0) ++ goto unregister_netdev; ++ dev->dev_id = err; ++ ++ /* Increment id-base to the next slot for the future assignment */ ++ port->dev_id_start = err + 1; ++ ++ err = netdev_upper_dev_link(phy_dev, dev, extack); ++ if (err) ++ goto remove_ida; ++ ++ /* Flags are per port and latest update overrides. User has ++ * to be consistent in setting it just like the mode attribute. 
++ */ ++ if (data && data[IFLA_IPVLAN_FLAGS]) ++ port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (data && data[IFLA_IPVLAN_MODE]) ++ mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, mode, extack); ++ if (err) ++ goto unlink_netdev; ++ ++ list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); ++ netif_stacked_transfer_operstate(phy_dev, dev); ++ return 0; ++ ++unlink_netdev: ++ netdev_upper_dev_unlink(phy_dev, dev); ++remove_ida: ++ ida_simple_remove(&port->ida, dev->dev_id); ++unregister_netdev: ++ unregister_netdevice(dev); ++ return err; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_new); ++ ++void ipvlan_link_delete(struct net_device *dev, struct list_head *head) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr, *next; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ kfree_rcu(addr, rcu); ++ } ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ ++ ida_simple_remove(&ipvlan->port->ida, dev->dev_id); ++ list_del_rcu(&ipvlan->pnode); ++ unregister_netdevice_queue(dev, head); ++ netdev_upper_dev_unlink(ipvlan->phy_dev, dev); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_delete); ++ ++void ipvlan_link_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); ++ dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; ++ dev->netdev_ops = &ipvlan_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->header_ops = &ipvlan_header_ops; ++ dev->ethtool_ops = &ipvlan_ethtool_ops; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_setup); ++ ++static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = ++{ ++ [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, ++ [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, ++}; ++ ++static struct net *ipvlan_get_link_net(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return dev_net(ipvlan->phy_dev); ++} ++ ++static struct rtnl_link_ops ipvlan_link_ops = { ++ .kind = "ipvlan", ++ .priv_size = sizeof(struct ipvl_dev), ++ ++ .setup = ipvlan_link_setup, ++ .newlink = ipvlan_link_new, ++ .dellink = ipvlan_link_delete, ++ .get_link_net = ipvlan_get_link_net, ++}; ++ ++int ipvlan_link_register(struct rtnl_link_ops *ops) ++{ ++ ops->get_size = ipvlan_nl_getsize; ++ ops->policy = ipvlan_nl_policy; ++ ops->validate = ipvlan_nl_validate; ++ ops->fill_info = ipvlan_nl_fillinfo; ++ ops->changelink = ipvlan_nl_changelink; ++ ops->maxtype = IFLA_IPVLAN_MAX; ++ return rtnl_link_register(ops); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_register); ++ ++static int ipvlan_device_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); ++ struct netdev_notifier_pre_changeaddr_info *prechaddr_info; ++ struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ struct ipvl_dev *ipvlan, *next; ++ struct ipvl_port *port; ++ LIST_HEAD(lst_kill); ++ int err; ++ ++ if (!netif_is_ipvlan_port(dev)) ++ return NOTIFY_DONE; ++ ++ port = ipvlan_port_get_rtnl(dev); ++ ++ switch (event) { ++ case NETDEV_UP: ++ case NETDEV_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ netif_stacked_transfer_operstate(ipvlan->phy_dev, ++ ipvlan->dev); ++ break; ++ ++ case NETDEV_REGISTER: { ++ struct net *oldnet, *newnet = dev_net(dev); ++ ++ oldnet = read_pnet(&port->pnet); ++ if (net_eq(newnet, oldnet)) ++ break; ++ ++ 
write_pnet(&port->pnet, newnet); ++ ++ ipvlan_migrate_l3s_hook(oldnet, newnet); ++ break; ++ } ++ case NETDEV_UNREGISTER: ++ if (dev->reg_state != NETREG_UNREGISTERING) ++ break; ++ ++ list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode) ++ ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, ++ &lst_kill); ++ unregister_netdevice_many(&lst_kill); ++ break; ++ ++ case NETDEV_FEAT_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ netif_inherit_tso_max(ipvlan->dev, dev); ++ netdev_update_features(ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_CHANGEMTU: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ ipvlan_adjust_mtu(ipvlan, dev); ++ break; ++ ++ case NETDEV_PRE_CHANGEADDR: ++ prechaddr_info = ptr; ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ err = dev_pre_changeaddr_notify(ipvlan->dev, ++ prechaddr_info->dev_addr, ++ extack); ++ if (err) ++ return notifier_from_errno(err); ++ } ++ break; ++ ++ case NETDEV_CHANGEADDR: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ eth_hw_addr_set(ipvlan->dev, dev->dev_addr); ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_PRE_TYPE_CHANGE: ++ /* Forbid underlying device to change its type. */ ++ return NOTIFY_BAD; ++ } ++ return NOTIFY_DONE; ++} ++ ++/* the caller must held the addrs lock */ ++static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); ++ if (!addr) ++ return -ENOMEM; ++ ++ addr->master = ipvlan; ++ if (!is_v6) { ++ memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); ++ addr->atype = IPVL_IPV4; ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); ++ addr->atype = IPVL_IPV6; ++#endif ++ } ++ ++ list_add_tail_rcu(&addr->anode, &ipvlan->addrs); ++ ++ /* If the interface is not up, the address will be added to the hash ++ * list by ipvlan_open. 
++ */ ++ if (netif_running(ipvlan->dev)) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ ++ return 0; ++} ++ ++static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); ++ if (!addr) { ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return; ++ } ++ ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ kfree_rcu(addr, rcu); ++} ++ ++static bool ipvlan_is_valid_dev(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!netif_is_ipvlan(dev)) ++ return false; ++ ++ if (!ipvlan || !ipvlan->port) ++ return false; ++ ++ return true; ++} ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv6=%pI6c addr for %s intf\n", ++ ip6_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip6_addr, true); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip6_addr, true); ++} ++ ++static int ipvlan_addr6_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if6->idev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_add_addr6(ipvlan, &if6->addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ipvlan_del_addr6(ipvlan, &if6->addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr6_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { ++ NL_SET_ERR_MSG(i6vi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++#endif ++ ++static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv4=%pI4 on %s intf.\n", ++ ip4_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip4_addr, false); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip4_addr, false); ++} ++ ++static int ipvlan_addr4_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct in_addr ip4_addr; ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch 
(event) { ++ case NETDEV_UP: ++ ip4_addr.s_addr = if4->ifa_address; ++ if (ipvlan_add_addr4(ipvlan, &ip4_addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ip4_addr.s_addr = if4->ifa_address; ++ ipvlan_del_addr4(ipvlan, &ip4_addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr4_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_validator_info *ivi = (struct in_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { ++ NL_SET_ERR_MSG(ivi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_event, ++}; ++ ++static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_validator_event, ++}; ++ ++static struct notifier_block ipvlan_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_device_event, ++}; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_event, ++}; ++ ++static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_validator_event, ++}; ++#endif ++ ++static int __init ipvlan_init_module(void) ++{ ++ int err; ++ ++ ipvlan_init_secret(); ++ register_netdevice_notifier(&ipvlan_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ register_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ register_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ register_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block); ++ ++ err = ipvlan_l3s_init(); ++ if (err < 0) ++ goto error; ++ ++ err = ipvlan_link_register(&ipvlan_link_ops); ++ if (err < 0) { ++ ipvlan_l3s_cleanup(); ++ goto error; ++ } ++ ++ return 0; ++error: ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ return err; ++} ++ ++static void __exit ipvlan_cleanup_module(void) ++{ ++ rtnl_link_unregister(&ipvlan_link_ops); ++ ipvlan_l3s_cleanup(); ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++} ++ ++module_init(ipvlan_init_module); ++module_exit(ipvlan_cleanup_module); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Mahesh Bandewar "); ++MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); ++MODULE_ALIAS_RTNL_LINK("ipvlan"); +diff -rupN linux.orig/drivers/net/loopback.c linux/drivers/net/loopback.c +--- 
linux.orig/drivers/net/loopback.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/loopback.c 2022-12-04 10:40:26.696034096 -0500 +@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device * lb_stats = per_cpu_ptr(dev->lstats, i); do { @@ -2769,11 +15998,10 @@ index 14e8d04cb4347..c4ad98d39ea60 100644 *bytes += tbytes; *packets += tpackets; } -diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c -index c6d271e5687e9..5056f3cd5699a 100644 ---- a/drivers/net/macsec.c -+++ b/drivers/net/macsec.c -@@ -2823,9 +2823,9 @@ static void get_rx_sc_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c linux/drivers/net/macsec.c +--- linux.orig/drivers/net/macsec.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macsec.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2821,9 +2821,9 @@ static void get_rx_sc_stats(struct net_d stats = per_cpu_ptr(rx_sc->stats, cpu); do { @@ -2785,7 +16013,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->InOctetsValidated += tmp.InOctetsValidated; sum->InOctetsDecrypted += tmp.InOctetsDecrypted; -@@ -2904,9 +2904,9 @@ static void get_tx_sc_stats(struct net_device *dev, +@@ -2902,9 +2902,9 @@ static void get_tx_sc_stats(struct net_d stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); do { @@ -2797,7 +16025,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsProtected += tmp.OutPktsProtected; sum->OutPktsEncrypted += tmp.OutPktsEncrypted; -@@ -2960,9 +2960,9 @@ static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) +@@ -2958,9 +2958,9 @@ static void get_secy_stats(struct net_de stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); do { @@ -2809,11 +16037,4431 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsUntagged += tmp.OutPktsUntagged; sum->InPktsUntagged += tmp.InPktsUntagged; -diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c -index 1080d6ebff63b..a1c7823f0ba66 100644 ---- a/drivers/net/macvlan.c -+++ b/drivers/net/macvlan.c -@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c.orig linux/drivers/net/macsec.c.orig +--- linux.orig/drivers/net/macsec.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/macsec.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,4417 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * drivers/net/macsec.c - MACsec device ++ * ++ * Copyright (c) 2015 Sabrina Dubroca ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define MACSEC_SCI_LEN 8 ++ ++/* SecTAG length = macsec_eth_header without the optional SCI */ ++#define MACSEC_TAG_LEN 6 ++ ++struct macsec_eth_header { ++ struct ethhdr eth; ++ /* SecTAG */ ++ u8 tci_an; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ u8 short_length:6, ++ unused:2; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ u8 unused:2, ++ short_length:6; ++#else ++#error "Please fix " ++#endif ++ __be32 packet_number; ++ u8 secure_channel_id[8]; /* optional */ ++} __packed; ++ ++#define MACSEC_TCI_VERSION 0x80 ++#define MACSEC_TCI_ES 0x40 /* end station */ ++#define MACSEC_TCI_SC 0x20 /* SCI present */ ++#define MACSEC_TCI_SCB 0x10 /* epon */ ++#define MACSEC_TCI_E 0x08 /* encryption */ ++#define MACSEC_TCI_C 0x04 /* changed text */ ++#define MACSEC_AN_MASK 0x03 /* association number */ ++#define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) ++ ++/* minimum secure data length deemed 
"not short", see IEEE 802.1AE-2006 9.7 */ ++#define MIN_NON_SHORT_LEN 48 ++ ++#define GCM_AES_IV_LEN 12 ++#define DEFAULT_ICV_LEN 16 ++ ++#define for_each_rxsc(secy, sc) \ ++ for (sc = rcu_dereference_bh(secy->rx_sc); \ ++ sc; \ ++ sc = rcu_dereference_bh(sc->next)) ++#define for_each_rxsc_rtnl(secy, sc) \ ++ for (sc = rtnl_dereference(secy->rx_sc); \ ++ sc; \ ++ sc = rtnl_dereference(sc->next)) ++ ++#define pn_same_half(pn1, pn2) (!(((pn1) >> 31) ^ ((pn2) >> 31))) ++ ++struct gcm_iv_xpn { ++ union { ++ u8 short_secure_channel_id[4]; ++ ssci_t ssci; ++ }; ++ __be64 pn; ++} __packed; ++ ++struct gcm_iv { ++ union { ++ u8 secure_channel_id[8]; ++ sci_t sci; ++ }; ++ __be32 pn; ++}; ++ ++#define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT ++ ++struct pcpu_secy_stats { ++ struct macsec_dev_stats stats; ++ struct u64_stats_sync syncp; ++}; ++ ++/** ++ * struct macsec_dev - private data ++ * @secy: SecY config ++ * @real_dev: pointer to underlying netdevice ++ * @dev_tracker: refcount tracker for @real_dev reference ++ * @stats: MACsec device stats ++ * @secys: linked list of SecY's on the underlying device ++ * @gro_cells: pointer to the Generic Receive Offload cell ++ * @offload: status of offloading on the MACsec device ++ */ ++struct macsec_dev { ++ struct macsec_secy secy; ++ struct net_device *real_dev; ++ netdevice_tracker dev_tracker; ++ struct pcpu_secy_stats __percpu *stats; ++ struct list_head secys; ++ struct gro_cells gro_cells; ++ enum macsec_offload offload; ++}; ++ ++/** ++ * struct macsec_rxh_data - rx_handler private argument ++ * @secys: linked list of SecY's on this underlying device ++ */ ++struct macsec_rxh_data { ++ struct list_head secys; ++}; ++ ++static struct macsec_dev *macsec_priv(const struct net_device *dev) ++{ ++ return (struct macsec_dev *)netdev_priv(dev); ++} ++ ++static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev) ++{ ++ return rcu_dereference_bh(dev->rx_handler_data); ++} ++ ++static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev) ++{ ++ return rtnl_dereference(dev->rx_handler_data); ++} ++ ++struct macsec_cb { ++ struct aead_request *req; ++ union { ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_rx_sa *rx_sa; ++ }; ++ u8 assoc_num; ++ bool valid; ++ bool has_sci; ++}; ++ ++static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) ++{ ++ struct macsec_rx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static struct macsec_rx_sa *macsec_active_rxsa_get(struct macsec_rx_sc *rx_sc) ++{ ++ struct macsec_rx_sa *sa = NULL; ++ int an; ++ ++ for (an = 0; an < MACSEC_NUM_AN; an++) { ++ sa = macsec_rxsa_get(rx_sc->sa[an]); ++ if (sa) ++ break; ++ } ++ return sa; ++} ++ ++static void free_rx_sc_rcu(struct rcu_head *head) ++{ ++ struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); ++ ++ free_percpu(rx_sc->stats); ++ kfree(rx_sc); ++} ++ ++static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc) ++{ ++ return refcount_inc_not_zero(&sc->refcnt) ? 
sc : NULL; ++} ++ ++static void macsec_rxsc_put(struct macsec_rx_sc *sc) ++{ ++ if (refcount_dec_and_test(&sc->refcnt)) ++ call_rcu(&sc->rcu_head, free_rx_sc_rcu); ++} ++ ++static void free_rxsa(struct rcu_head *head) ++{ ++ struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_rxsa_put(struct macsec_rx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_rxsa); ++} ++ ++static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) ++{ ++ struct macsec_tx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static void free_txsa(struct rcu_head *head) ++{ ++ struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_txsa_put(struct macsec_tx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_txsa); ++} ++ ++static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) ++{ ++ BUILD_BUG_ON(sizeof(struct macsec_cb) > sizeof(skb->cb)); ++ return (struct macsec_cb *)skb->cb; ++} ++ ++#define MACSEC_PORT_ES (htons(0x0001)) ++#define MACSEC_PORT_SCB (0x0000) ++#define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) ++#define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff) ++ ++#define MACSEC_GCM_AES_128_SAK_LEN 16 ++#define MACSEC_GCM_AES_256_SAK_LEN 32 ++ ++#define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN ++#define DEFAULT_XPN false ++#define DEFAULT_SEND_SCI true ++#define DEFAULT_ENCRYPT false ++#define DEFAULT_ENCODING_SA 0 ++#define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) ++ ++static bool send_sci(const struct macsec_secy *secy) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ return tx_sc->send_sci || ++ (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); ++} ++ ++static sci_t make_sci(const u8 *addr, __be16 port) ++{ ++ sci_t sci; ++ ++ memcpy(&sci, addr, ETH_ALEN); ++ memcpy(((char *)&sci) + ETH_ALEN, &port, sizeof(port)); ++ ++ return sci; ++} ++ ++static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present) ++{ ++ sci_t sci; ++ ++ if (sci_present) ++ memcpy(&sci, hdr->secure_channel_id, ++ sizeof(hdr->secure_channel_id)); ++ else ++ sci = make_sci(hdr->eth.h_source, MACSEC_PORT_ES); ++ ++ return sci; ++} ++ ++static unsigned int macsec_sectag_len(bool sci_present) ++{ ++ return MACSEC_TAG_LEN + (sci_present ? 
MACSEC_SCI_LEN : 0); ++} ++ ++static unsigned int macsec_hdr_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + ETH_HLEN; ++} ++ ++static unsigned int macsec_extra_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + sizeof(__be16); ++} ++ ++/* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ ++static void macsec_fill_sectag(struct macsec_eth_header *h, ++ const struct macsec_secy *secy, u32 pn, ++ bool sci_present) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); ++ h->eth.h_proto = htons(ETH_P_MACSEC); ++ ++ if (sci_present) { ++ h->tci_an |= MACSEC_TCI_SC; ++ memcpy(&h->secure_channel_id, &secy->sci, ++ sizeof(h->secure_channel_id)); ++ } else { ++ if (tx_sc->end_station) ++ h->tci_an |= MACSEC_TCI_ES; ++ if (tx_sc->scb) ++ h->tci_an |= MACSEC_TCI_SCB; ++ } ++ ++ h->packet_number = htonl(pn); ++ ++ /* with GCM, C/E clear for !encrypt, both set for encrypt */ ++ if (tx_sc->encrypt) ++ h->tci_an |= MACSEC_TCI_CONFID; ++ else if (secy->icv_len != DEFAULT_ICV_LEN) ++ h->tci_an |= MACSEC_TCI_C; ++ ++ h->tci_an |= tx_sc->encoding_sa; ++} ++ ++static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) ++{ ++ if (data_len < MIN_NON_SHORT_LEN) ++ h->short_length = data_len; ++} ++ ++/* Checks if a MACsec interface is being offloaded to an hardware engine */ ++static bool macsec_is_offloaded(struct macsec_dev *macsec) ++{ ++ if (macsec->offload == MACSEC_OFFLOAD_MAC || ++ macsec->offload == MACSEC_OFFLOAD_PHY) ++ return true; ++ ++ return false; ++} ++ ++/* Checks if underlying layers implement MACsec offloading functions. */ ++static bool macsec_check_offload(enum macsec_offload offload, ++ struct macsec_dev *macsec) ++{ ++ if (!macsec || !macsec->real_dev) ++ return false; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev && ++ macsec->real_dev->phydev->macsec_ops; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ return macsec->real_dev->features & NETIF_F_HW_MACSEC && ++ macsec->real_dev->macsec_ops; ++ ++ return false; ++} ++ ++static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, ++ struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (ctx) { ++ memset(ctx, 0, sizeof(*ctx)); ++ ctx->offload = offload; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ ctx->phydev = macsec->real_dev->phydev; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ ctx->netdev = macsec->real_dev; ++ } ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev->macsec_ops; ++ else ++ return macsec->real_dev->macsec_ops; ++} ++ ++/* Returns a pointer to the MACsec ops struct if any and updates the MACsec ++ * context device reference if provided. 
++ */ ++static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (!macsec_check_offload(macsec->offload, macsec)) ++ return NULL; ++ ++ return __macsec_get_ops(macsec->offload, macsec, ctx); ++} ++ ++/* validate MACsec packet according to IEEE 802.1AE-2018 9.12 */ ++static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn) ++{ ++ struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data; ++ int len = skb->len - 2 * ETH_ALEN; ++ int extra_len = macsec_extra_len(!!(h->tci_an & MACSEC_TCI_SC)) + icv_len; ++ ++ /* a) It comprises at least 17 octets */ ++ if (skb->len <= 16) ++ return false; ++ ++ /* b) MACsec EtherType: already checked */ ++ ++ /* c) V bit is clear */ ++ if (h->tci_an & MACSEC_TCI_VERSION) ++ return false; ++ ++ /* d) ES or SCB => !SC */ ++ if ((h->tci_an & MACSEC_TCI_ES || h->tci_an & MACSEC_TCI_SCB) && ++ (h->tci_an & MACSEC_TCI_SC)) ++ return false; ++ ++ /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */ ++ if (h->unused) ++ return false; ++ ++ /* rx.pn != 0 if not XPN (figure 10-5 with 802.11AEbw-2013 amendment) */ ++ if (!h->packet_number && !xpn) ++ return false; ++ ++ /* length check, f) g) h) i) */ ++ if (h->short_length) ++ return len == extra_len + h->short_length; ++ return len >= extra_len + MIN_NON_SHORT_LEN; ++} ++ ++#define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true)) ++#define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN ++ ++static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn, ++ salt_t salt) ++{ ++ struct gcm_iv_xpn *gcm_iv = (struct gcm_iv_xpn *)iv; ++ ++ gcm_iv->ssci = ssci ^ salt.ssci; ++ gcm_iv->pn = cpu_to_be64(pn) ^ salt.pn; ++} ++ ++static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn) ++{ ++ struct gcm_iv *gcm_iv = (struct gcm_iv *)iv; ++ ++ gcm_iv->sci = sci; ++ gcm_iv->pn = htonl(pn); ++} ++ ++static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) ++{ ++ return (struct macsec_eth_header *)skb_mac_header(skb); ++} ++ ++static void __macsec_pn_wrapped(struct macsec_secy *secy, ++ struct macsec_tx_sa *tx_sa) ++{ ++ pr_debug("PN wrapped, transitioning to !oper\n"); ++ tx_sa->active = false; ++ if (secy->protect_frames) ++ secy->operational = false; ++} ++ ++void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) ++{ ++ spin_lock_bh(&tx_sa->lock); ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++} ++EXPORT_SYMBOL_GPL(macsec_pn_wrapped); ++ ++static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa, ++ struct macsec_secy *secy) ++{ ++ pn_t pn; ++ ++ spin_lock_bh(&tx_sa->lock); ++ ++ pn = tx_sa->next_pn_halves; ++ if (secy->xpn) ++ tx_sa->next_pn++; ++ else ++ tx_sa->next_pn_halves.lower++; ++ ++ if (tx_sa->next_pn == 0) ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ return pn; ++} ++ ++static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ ++ skb->dev = macsec->real_dev; ++ skb_reset_mac_header(skb); ++ skb->protocol = eth_hdr(skb)->h_proto; ++} ++ ++static unsigned int macsec_msdu_len(struct sk_buff *skb) ++{ ++ struct macsec_dev *macsec = macsec_priv(skb->dev); ++ struct macsec_secy *secy = &macsec->secy; ++ bool sci_present = macsec_skb_cb(skb)->has_sci; ++ ++ return skb->len - macsec_hdr_len(sci_present) - secy->icv_len; ++} ++ ++static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc, ++ struct macsec_tx_sa *tx_sa) ++{ ++ unsigned int msdu_len = 
macsec_msdu_len(skb); ++ struct pcpu_tx_sc_stats *txsc_stats = this_cpu_ptr(tx_sc->stats); ++ ++ u64_stats_update_begin(&txsc_stats->syncp); ++ if (tx_sc->encrypt) { ++ txsc_stats->stats.OutOctetsEncrypted += msdu_len; ++ txsc_stats->stats.OutPktsEncrypted++; ++ this_cpu_inc(tx_sa->stats->OutPktsEncrypted); ++ } else { ++ txsc_stats->stats.OutOctetsProtected += msdu_len; ++ txsc_stats->stats.OutPktsProtected++; ++ this_cpu_inc(tx_sa->stats->OutPktsProtected); ++ } ++ u64_stats_update_end(&txsc_stats->syncp); ++} ++ ++static void count_tx(struct net_device *dev, int ret, int len) ++{ ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_add(&stats->tx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++ } ++} ++ ++static void macsec_encrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; ++ int len, ret; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ rcu_read_lock_bh(); ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ /* packet is encrypted/protected so tx_bytes must be calculated */ ++ len = macsec_msdu_len(skb) + 2 * ETH_ALEN; ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ rcu_read_unlock_bh(); ++ ++ macsec_txsa_put(sa); ++ dev_put(dev); ++} ++ ++static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm, ++ unsigned char **iv, ++ struct scatterlist **sg, ++ int num_frags) ++{ ++ size_t size, iv_offset, sg_offset; ++ struct aead_request *req; ++ void *tmp; ++ ++ size = sizeof(struct aead_request) + crypto_aead_reqsize(tfm); ++ iv_offset = size; ++ size += GCM_AES_IV_LEN; ++ ++ size = ALIGN(size, __alignof__(struct scatterlist)); ++ sg_offset = size; ++ size += sizeof(struct scatterlist) * num_frags; ++ ++ tmp = kmalloc(size, GFP_ATOMIC); ++ if (!tmp) ++ return NULL; ++ ++ *iv = (unsigned char *)(tmp + iv_offset); ++ *sg = (struct scatterlist *)(tmp + sg_offset); ++ req = tmp; ++ ++ aead_request_set_tfm(req, tfm); ++ ++ return req; ++} ++ ++static struct sk_buff *macsec_encrypt(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; ++ unsigned char *iv; ++ struct ethhdr *eth; ++ struct macsec_eth_header *hh; ++ size_t unprotected_len; ++ struct aead_request *req; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ bool sci_present; ++ pn_t pn; ++ ++ secy = &macsec->secy; ++ tx_sc = &secy->tx_sc; ++ ++ /* 10.5.1 TX SA assignment */ ++ tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]); ++ if (!tx_sa) { ++ secy->operational = false; ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || ++ skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { ++ struct sk_buff *nskb = skb_copy_expand(skb, ++ MACSEC_NEEDED_HEADROOM, ++ MACSEC_NEEDED_TAILROOM, ++ GFP_ATOMIC); ++ if (likely(nskb)) { ++ consume_skb(skb); ++ skb = nskb; ++ } else { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-ENOMEM); ++ } ++ } ++ 
++ unprotected_len = skb->len; ++ eth = eth_hdr(skb); ++ sci_present = send_sci(secy); ++ hh = skb_push(skb, macsec_extra_len(sci_present)); ++ memmove(hh, eth, 2 * ETH_ALEN); ++ ++ pn = tx_sa_update_pn(tx_sa, secy); ++ if (pn.full64 == 0) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOLINK); ++ } ++ macsec_fill_sectag(hh, secy, pn.lower, sci_present); ++ macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); ++ ++ skb_put(skb, secy->icv_len); ++ ++ if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) { ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsTooLong++; ++ u64_stats_update_end(&secy_stats->syncp); ++ ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (secy->xpn) ++ macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt); ++ else ++ macsec_fill_iv(iv, secy->sci, pn.lower); ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (tx_sc->encrypt) { ++ int len = skb->len - macsec_hdr_len(sci_present) - ++ secy->icv_len; ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(sci_present)); ++ } else { ++ aead_request_set_crypt(req, sg, sg, 0, iv); ++ aead_request_set_ad(req, skb->len - secy->icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ macsec_skb_cb(skb)->tx_sa = tx_sa; ++ macsec_skb_cb(skb)->has_sci = sci_present; ++ aead_request_set_callback(req, 0, macsec_encrypt_done, skb); ++ ++ dev_hold(skb->dev); ++ ret = crypto_aead_encrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ dev_put(skb->dev); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ ++ return skb; ++} ++ ++static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn) ++{ ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats); ++ struct macsec_eth_header *hdr = macsec_ethhdr(skb); ++ u32 lowest_pn = 0; ++ ++ spin_lock(&rx_sa->lock); ++ if (rx_sa->next_pn_halves.lower >= secy->replay_window) ++ lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window; ++ ++ /* Now perform replay protection check again ++ * (see IEEE 802.1AE-2006 figure 10-5) ++ */ ++ if (secy->replay_protect && pn < lowest_pn && ++ (!secy->xpn || pn_same_half(pn, lowest_pn))) { ++ spin_unlock(&rx_sa->lock); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_dropped++; ++ return false; ++ } ++ ++ if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) { ++ unsigned int msdu_len = macsec_msdu_len(skb); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (hdr->tci_an & MACSEC_TCI_E) ++ rxsc_stats->stats.InOctetsDecrypted += msdu_len; ++ else ++ rxsc_stats->stats.InOctetsValidated += msdu_len; ++ 
u64_stats_update_end(&rxsc_stats->syncp); ++ } ++ ++ if (!macsec_skb_cb(skb)->valid) { ++ spin_unlock(&rx_sa->lock); ++ ++ /* 10.6.5 */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotValid++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ this_cpu_inc(rx_sa->stats->InPktsNotValid); ++ secy->netdev->stats.rx_errors++; ++ return false; ++ } ++ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (secy->validate_frames == MACSEC_VALIDATE_CHECK) { ++ rxsc_stats->stats.InPktsInvalid++; ++ this_cpu_inc(rx_sa->stats->InPktsInvalid); ++ } else if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsUnchecked++; ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ } else { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsOK++; ++ this_cpu_inc(rx_sa->stats->InPktsOK); ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ ++ // Instead of "pn >=" - to support pn overflow in xpn ++ if (pn + 1 > rx_sa->next_pn_halves.lower) { ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } else if (secy->xpn && ++ !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { ++ rx_sa->next_pn_halves.upper++; ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } ++ ++ spin_unlock(&rx_sa->lock); ++ } ++ ++ return true; ++} ++ ++static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, dev); ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++} ++ ++static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) ++{ ++ skb->ip_summed = CHECKSUM_NONE; ++ memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); ++ skb_pull(skb, hdr_len); ++ pskb_trim_unique(skb, skb->len - icv_len); ++} ++ ++static void count_rx(struct net_device *dev, int len) ++{ ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_add(&stats->rx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++} ++ ++static void macsec_decrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct macsec_rx_sc *rx_sc = rx_sa->sc; ++ int len; ++ u32 pn; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ if (!err) ++ macsec_skb_cb(skb)->valid = true; ++ ++ rcu_read_lock_bh(); ++ pn = ntohl(macsec_ethhdr(skb)->packet_number); ++ if (!macsec_post_decrypt(skb, &macsec->secy, pn)) { ++ rcu_read_unlock_bh(); ++ kfree_skb(skb); ++ goto out; ++ } ++ ++ macsec_finalize_skb(skb, macsec->secy.icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, macsec->secy.netdev); ++ ++ if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ ++ rcu_read_unlock_bh(); ++ ++out: ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ dev_put(dev); ++} ++ ++static struct sk_buff *macsec_decrypt(struct sk_buff *skb, ++ struct net_device *dev, ++ struct macsec_rx_sa *rx_sa, ++ sci_t sci, ++ struct macsec_secy *secy) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; 
++ unsigned char *iv; ++ struct aead_request *req; ++ struct macsec_eth_header *hdr; ++ u32 hdr_pn; ++ u16 icv_len = secy->icv_len; ++ ++ macsec_skb_cb(skb)->valid = false; ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (!skb) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ hdr = (struct macsec_eth_header *)skb->data; ++ hdr_pn = ntohl(hdr->packet_number); ++ ++ if (secy->xpn) { ++ pn_t recovered_pn = rx_sa->next_pn_halves; ++ ++ recovered_pn.lower = hdr_pn; ++ if (hdr_pn < rx_sa->next_pn_halves.lower && ++ !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower)) ++ recovered_pn.upper++; ++ ++ macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64, ++ rx_sa->key.salt); ++ } else { ++ macsec_fill_iv(iv, sci, hdr_pn); ++ } ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (hdr->tci_an & MACSEC_TCI_E) { ++ /* confidentiality: ethernet + macsec header ++ * authenticated, encrypted payload ++ */ ++ int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci); ++ ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ aead_request_free(req); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ /* integrity only: all headers + data authenticated */ ++ aead_request_set_crypt(req, sg, sg, icv_len, iv); ++ aead_request_set_ad(req, skb->len - icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ skb->dev = dev; ++ aead_request_set_callback(req, 0, macsec_decrypt_done, skb); ++ ++ dev_hold(dev); ++ ret = crypto_aead_decrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ /* decryption/authentication failed ++ * 10.6 if validateFrames is disabled, deliver anyway ++ */ ++ if (ret != -EBADMSG) { ++ kfree_skb(skb); ++ skb = ERR_PTR(ret); ++ } ++ } else { ++ macsec_skb_cb(skb)->valid = true; ++ } ++ dev_put(dev); ++ ++ aead_request_free(req); ++ ++ return skb; ++} ++ ++static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) ++{ ++ /* Deliver to the uncontrolled port by default */ ++ enum rx_handler_result ret = RX_HANDLER_PASS; ++ struct ethhdr *hdr = eth_hdr(skb); ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ struct net_device *ndev = macsec->secy.netdev; ++ ++ /* If h/w offloading is enabled, HW decodes frames and strips ++ * the SecTAG, so we have to deduce which port to deliver to. 
++ */ ++ if (macsec_is_offloaded(macsec) && netif_running(ndev)) { ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->dev_addr)) { ++ /* exact match, divert skb to this port */ ++ skb->dev = ndev; ++ skb->pkt_type = PACKET_HOST; ++ ret = RX_HANDLER_ANOTHER; ++ goto out; ++ } else if (is_multicast_ether_addr_64bits( ++ hdr->h_dest)) { ++ /* multicast frame, deliver on this port too */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->broadcast)) ++ nskb->pkt_type = PACKET_BROADCAST; ++ else ++ nskb->pkt_type = PACKET_MULTICAST; ++ ++ __netif_rx(nskb); ++ } ++ continue; ++ } ++ ++ /* 10.6 If the management control validateFrames is not ++ * Strict, frames without a SecTAG are received, counted, and ++ * delivered to the Controlled Port ++ */ ++ if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ continue; ++ } ++ ++ /* deliver on this port */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ ++ if (__netif_rx(nskb) == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } ++ } ++ ++out: ++ rcu_read_unlock(); ++ return ret; ++} ++ ++static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) ++{ ++ struct sk_buff *skb = *pskb; ++ struct net_device *dev = skb->dev; ++ struct macsec_eth_header *hdr; ++ struct macsec_secy *secy = NULL; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ unsigned int len; ++ sci_t sci; ++ u32 hdr_pn; ++ bool cbit; ++ struct pcpu_rx_sc_stats *rxsc_stats; ++ struct pcpu_secy_stats *secy_stats; ++ bool pulled_sci; ++ int ret; ++ ++ if (skb_headroom(skb) < ETH_HLEN) ++ goto drop_direct; ++ ++ hdr = macsec_ethhdr(skb); ++ if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) ++ return handle_not_macsec(skb); ++ ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ *pskb = skb; ++ if (!skb) ++ return RX_HANDLER_CONSUMED; ++ ++ pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); ++ if (!pulled_sci) { ++ if (!pskb_may_pull(skb, macsec_extra_len(false))) ++ goto drop_direct; ++ } ++ ++ hdr = macsec_ethhdr(skb); ++ ++ /* Frames with a SecTAG that has the TCI E bit set but the C ++ * bit clear are discarded, as this reserved encoding is used ++ * to identify frames with a SecTAG that are not to be ++ * delivered to the Controlled Port. ++ */ ++ if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E) ++ return RX_HANDLER_PASS; ++ ++ /* now, pull the extra length */ ++ if (hdr->tci_an & MACSEC_TCI_SC) { ++ if (!pulled_sci) ++ goto drop_direct; ++ } ++ ++ /* ethernet header is part of crypto processing */ ++ skb_push(skb, ETH_HLEN); ++ ++ macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC); ++ macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK; ++ sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci); ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci); ++ ++ sc = sc ? 
macsec_rxsc_get(sc) : NULL; ++ ++ if (sc) { ++ secy = &macsec->secy; ++ rx_sc = sc; ++ break; ++ } ++ } ++ ++ if (!secy) ++ goto nosci; ++ ++ dev = secy->netdev; ++ macsec = macsec_priv(dev); ++ secy_stats = this_cpu_ptr(macsec->stats); ++ rxsc_stats = this_cpu_ptr(rx_sc->stats); ++ ++ if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsBadTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ goto drop_nosa; ++ } ++ ++ rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]); ++ if (!rx_sa) { ++ /* 10.6.1 if the SA is not in use */ ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ struct macsec_rx_sa *active_rx_sa = macsec_active_rxsa_get(rx_sc); ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotUsingSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA); ++ goto drop_nosa; ++ } ++ ++ /* not Strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsUnusedSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsUnusedSA); ++ goto deliver; ++ } ++ ++ /* First, PN check to avoid decrypting obviously wrong packets */ ++ hdr_pn = ntohl(hdr->packet_number); ++ if (secy->replay_protect) { ++ bool late; ++ ++ spin_lock(&rx_sa->lock); ++ late = rx_sa->next_pn_halves.lower >= secy->replay_window && ++ hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window); ++ ++ if (secy->xpn) ++ late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn); ++ spin_unlock(&rx_sa->lock); ++ ++ if (late) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ goto drop; ++ } ++ } ++ ++ macsec_skb_cb(skb)->rx_sa = rx_sa; ++ ++ /* Disabled && !changed text => skip validation */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames != MACSEC_VALIDATE_DISABLED) ++ skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); ++ ++ if (IS_ERR(skb)) { ++ /* the decrypt callback needs the reference */ ++ if (PTR_ERR(skb) != -EINPROGRESS) { ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ } ++ rcu_read_unlock(); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ } ++ ++ if (!macsec_post_decrypt(skb, secy, hdr_pn)) ++ goto drop; ++ ++deliver: ++ macsec_finalize_skb(skb, secy->icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, secy->netdev); ++ ++ if (rx_sa) ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ ++ skb_orphan(skb); ++ ret = gro_cells_receive(&macsec->gro_cells, skb); ++ if (ret == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ else ++ macsec->secy.netdev->stats.rx_dropped++; ++ ++ rcu_read_unlock(); ++ ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++drop: ++ macsec_rxsa_put(rx_sa); ++drop_nosa: ++ macsec_rxsc_put(rx_sc); ++ rcu_read_unlock(); ++drop_direct: ++ kfree_skb(skb); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++nosci: ++ /* 10.6.1 if the SC is not found */ ++ cbit = !!(hdr->tci_an & MACSEC_TCI_C); ++ if (!cbit) ++ macsec_finalize_skb(skb, 
DEFAULT_ICV_LEN, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ ++ secy_stats = this_cpu_ptr(macsec->stats); ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ if (cbit || ++ macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_errors++; ++ continue; ++ } ++ ++ /* not strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ macsec_reset_skb(nskb, macsec->secy.netdev); ++ ++ ret = __netif_rx(nskb); ++ if (ret == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUnknownSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } else { ++ macsec->secy.netdev->stats.rx_dropped++; ++ } ++ } ++ ++ rcu_read_unlock(); ++ *pskb = skb; ++ return RX_HANDLER_PASS; ++} ++ ++static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) ++{ ++ struct crypto_aead *tfm; ++ int ret; ++ ++ /* Pick a sync gcm(aes) cipher to ensure order is preserved. */ ++ tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); ++ ++ if (IS_ERR(tfm)) ++ return tfm; ++ ++ ret = crypto_aead_setkey(tfm, key, key_len); ++ if (ret < 0) ++ goto fail; ++ ++ ret = crypto_aead_setauthsize(tfm, icv_len); ++ if (ret < 0) ++ goto fail; ++ ++ return tfm; ++fail: ++ crypto_free_aead(tfm); ++ return ERR_PTR(ret); ++} ++ ++static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats); ++ if (!rx_sa->stats) ++ return -ENOMEM; ++ ++ rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(rx_sa->key.tfm)) { ++ free_percpu(rx_sa->stats); ++ return PTR_ERR(rx_sa->key.tfm); ++ } ++ ++ rx_sa->ssci = MACSEC_UNDEF_SSCI; ++ rx_sa->active = false; ++ rx_sa->next_pn = 1; ++ refcount_set(&rx_sa->refcnt, 1); ++ spin_lock_init(&rx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_rx_sa(struct macsec_rx_sa *rx_sa) ++{ ++ rx_sa->active = false; ++ ++ macsec_rxsa_put(rx_sa); ++} ++ ++static void free_rx_sc(struct macsec_rx_sc *rx_sc) ++{ ++ int i; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *sa = rtnl_dereference(rx_sc->sa[i]); ++ ++ RCU_INIT_POINTER(rx_sc->sa[i], NULL); ++ if (sa) ++ clear_rx_sa(sa); ++ } ++ ++ macsec_rxsc_put(rx_sc); ++} ++ ++static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc, __rcu **rx_scp; ++ ++ for (rx_scp = &secy->rx_sc, rx_sc = rtnl_dereference(*rx_scp); ++ rx_sc; ++ rx_scp = &rx_sc->next, rx_sc = rtnl_dereference(*rx_scp)) { ++ if (rx_sc->sci == sci) { ++ if (rx_sc->active) ++ secy->n_rx_sc--; ++ rcu_assign_pointer(*rx_scp, rx_sc->next); ++ return rx_sc; ++ } ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci, ++ bool active) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_dev *macsec; ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ struct macsec_secy *secy; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (find_rx_sc_rtnl(&macsec->secy, sci)) ++ return ERR_PTR(-EEXIST); ++ } ++ ++ rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); ++ if 
(!rx_sc) ++ return ERR_PTR(-ENOMEM); ++ ++ rx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats); ++ if (!rx_sc->stats) { ++ kfree(rx_sc); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ rx_sc->sci = sci; ++ rx_sc->active = active; ++ refcount_set(&rx_sc->refcnt, 1); ++ ++ secy = &macsec_priv(dev)->secy; ++ rcu_assign_pointer(rx_sc->next, secy->rx_sc); ++ rcu_assign_pointer(secy->rx_sc, rx_sc); ++ ++ if (rx_sc->active) ++ secy->n_rx_sc++; ++ ++ return rx_sc; ++} ++ ++static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats); ++ if (!tx_sa->stats) ++ return -ENOMEM; ++ ++ tx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(tx_sa->key.tfm)) { ++ free_percpu(tx_sa->stats); ++ return PTR_ERR(tx_sa->key.tfm); ++ } ++ ++ tx_sa->ssci = MACSEC_UNDEF_SSCI; ++ tx_sa->active = false; ++ refcount_set(&tx_sa->refcnt, 1); ++ spin_lock_init(&tx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_tx_sa(struct macsec_tx_sa *tx_sa) ++{ ++ tx_sa->active = false; ++ ++ macsec_txsa_put(tx_sa); ++} ++ ++static struct genl_family macsec_fam; ++ ++static struct net_device *get_dev_from_nl(struct net *net, ++ struct nlattr **attrs) ++{ ++ int ifindex = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]); ++ struct net_device *dev; ++ ++ dev = __dev_get_by_index(net, ifindex); ++ if (!dev) ++ return ERR_PTR(-ENODEV); ++ ++ if (!netif_is_macsec(dev)) ++ return ERR_PTR(-ENODEV); ++ ++ return dev; ++} ++ ++static enum macsec_offload nla_get_offload(const struct nlattr *nla) ++{ ++ return (__force enum macsec_offload)nla_get_u8(nla); ++} ++ ++static sci_t nla_get_sci(const struct nlattr *nla) ++{ ++ return (__force sci_t)nla_get_u64(nla); ++} ++ ++static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value, ++ int padattr) ++{ ++ return nla_put_u64_64bit(skb, attrtype, (__force u64)value, padattr); ++} ++ ++static ssci_t nla_get_ssci(const struct nlattr *nla) ++{ ++ return (__force ssci_t)nla_get_u32(nla); ++} ++ ++static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value) ++{ ++ return nla_put_u32(skb, attrtype, (__force u64)value); ++} ++ ++static struct macsec_tx_sa *get_txsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_tx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[*assoc_num]); ++ if (!tx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *devp = dev; ++ *scp = tx_sc; ++ *secyp = secy; ++ return tx_sa; ++} ++ ++static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct net_device **devp, ++ struct macsec_secy **secyp) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ secy = &macsec_priv(dev)->secy; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return ERR_PTR(-EINVAL); ++ ++ sci = 
nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ rx_sc = find_rx_sc_rtnl(secy, sci); ++ if (!rx_sc) ++ return ERR_PTR(-ENODEV); ++ ++ *secyp = secy; ++ *devp = dev; ++ ++ return rx_sc; ++} ++ ++static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_rx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ rx_sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp); ++ if (IS_ERR(rx_sc)) ++ return ERR_CAST(rx_sc); ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[*assoc_num]); ++ if (!rx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *scp = rx_sc; ++ return rx_sa; ++} ++ ++static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { ++ [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, ++ [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, ++}; ++ ++static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { ++ [MACSEC_RXSC_ATTR_SCI] = { .type = NLA_U64 }, ++ [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 }, ++}; ++ ++static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { ++ [MACSEC_SA_ATTR_AN] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_PN] = NLA_POLICY_MIN_LEN(4), ++ [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, ++ .len = MACSEC_KEYID_LEN, }, ++ [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, ++ .len = MACSEC_MAX_KEY_LEN, }, ++ [MACSEC_SA_ATTR_SSCI] = { .type = NLA_U32 }, ++ [MACSEC_SA_ATTR_SALT] = { .type = NLA_BINARY, ++ .len = MACSEC_SALT_LEN, }, ++}; ++ ++static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { ++ [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, ++}; ++ ++/* Offloads an operation to a device driver */ ++static int macsec_offload(int (* const func)(struct macsec_context *), ++ struct macsec_context *ctx) ++{ ++ int ret; ++ ++ if (unlikely(!func)) ++ return 0; ++ ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_lock(&ctx->phydev->lock); ++ ++ /* Phase I: prepare. The drive should fail here if there are going to be ++ * issues in the commit phase. ++ */ ++ ctx->prepare = true; ++ ret = (*func)(ctx); ++ if (ret) ++ goto phy_unlock; ++ ++ /* Phase II: commit. This step cannot fail. 
*/ ++ ctx->prepare = false; ++ ret = (*func)(ctx); ++ /* This should never happen: commit is not allowed to fail */ ++ if (unlikely(ret)) ++ WARN(1, "MACsec offloading commit failed (%d)\n", ret); ++ ++phy_unlock: ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_unlock(&ctx->phydev->lock); ++ ++ return ret; ++} ++ ++static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) ++{ ++ if (!attrs[MACSEC_ATTR_SA_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, attrs[MACSEC_ATTR_SA_CONFIG], macsec_genl_sa_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc) ++{ ++ if (!attrs[MACSEC_ATTR_RXSC_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, attrs[MACSEC_ATTR_RXSC_CONFIG], macsec_genl_rxsc_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static bool validate_add_rxsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && ++ nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (tb_sa[MACSEC_SA_ATTR_PN] && ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]); ++ if (rx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL); ++ if (!rx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(rx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ rx_sa->sc = rx_sc; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_rxsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ macsec_rxsa_put(rx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static bool validate_add_rxsc(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_RXSC_ATTR_SCI]) ++ return false; ++ ++ if (attrs[MACSEC_RXSC_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_RXSC_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ sci_t sci = MACSEC_UNDEF_SCI; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct macsec_secy *secy; ++ bool active = true; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) ++ active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ rx_sc = create_rx_sc(dev, sci, active); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return 
PTR_ERR(rx_sc); ++ } ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_add_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ del_rx_sc(secy, sci); ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_add_txsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_PN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_add_txsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]); ++ if (tx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL); ++ if (!tx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(tx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa && tx_sa->active) ++ secy->operational = true; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_txsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->operational = was_operational; ++ macsec_txsa_put(tx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (rx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ 
ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); ++ clear_rx_sa(rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), info->attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ rx_sc = del_rx_sc(secy, sci); ++ if (!rx_sc) { ++ rtnl_unlock(); ++ return -ENODEV; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ret = macsec_offload(ops->mdo_del_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); ++ clear_tx_sa(tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_upd_sa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ attrs[MACSEC_SA_ATTR_KEY] || ++ attrs[MACSEC_SA_ATTR_KEYID] || ++ attrs[MACSEC_SA_ATTR_SSCI] || ++ attrs[MACSEC_SA_ATTR_SALT]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int 
macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational, was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ prev_pn = tx_sa->next_pn_halves; ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ ++ was_active = tx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa) ++ secy->operational = tx_sa->active; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ tx_sa->active = was_active; ++ secy->operational = was_operational; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&rx_sa->lock); ++ prev_pn = rx_sa->next_pn_halves; ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ was_active = rx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ rx_sa->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ unsigned int prev_n_rx_sc; ++ bool was_active; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ was_active = rx_sc->active; ++ prev_n_rx_sc = secy->n_rx_sc; ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { ++ bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ if (rx_sc->active != new) ++ secy->n_rx_sc += new ? 
1 : -1; ++ ++ rx_sc->active = new; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->n_rx_sc = prev_n_rx_sc; ++ rx_sc->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool macsec_is_configured(struct macsec_dev *macsec) ++{ ++ struct macsec_secy *secy = &macsec->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ int i; ++ ++ if (secy->rx_sc) ++ return true; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) ++ if (tx_sc->sa[i]) ++ return true; ++ ++ return false; ++} ++ ++static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; ++ enum macsec_offload offload, prev_offload; ++ int (*func)(struct macsec_context *ctx); ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ struct macsec_dev *macsec; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (!attrs[MACSEC_ATTR_OFFLOAD]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, ++ attrs[MACSEC_ATTR_OFFLOAD], ++ macsec_genl_offload_policy, NULL)) ++ return -EINVAL; ++ ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) ++ return PTR_ERR(dev); ++ macsec = macsec_priv(dev); ++ ++ if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) ++ return -EINVAL; ++ ++ offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); ++ if (macsec->offload == offload) ++ return 0; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* Check if the net device is busy. */ ++ if (netif_running(dev)) ++ return -EBUSY; ++ ++ rtnl_lock(); ++ ++ prev_offload = macsec->offload; ++ macsec->offload = offload; ++ ++ /* Check if the device already has rules configured: we do not support ++ * rules migration. ++ */ ++ if (macsec_is_configured(macsec)) { ++ ret = -EBUSY; ++ goto rollback; ++ } ++ ++ ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? 
prev_offload : offload, ++ macsec, &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto rollback; ++ } ++ ++ if (prev_offload == MACSEC_OFFLOAD_OFF) ++ func = ops->mdo_add_secy; ++ else ++ func = ops->mdo_del_secy; ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(func, &ctx); ++ if (ret) ++ goto rollback; ++ ++ rtnl_unlock(); ++ return 0; ++ ++rollback: ++ macsec->offload = prev_offload; ++ ++ rtnl_unlock(); ++ return ret; ++} ++ ++static void get_tx_sa_stats(struct net_device *dev, int an, ++ struct macsec_tx_sa *tx_sa, ++ struct macsec_tx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.stats.tx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_tx_sa_stats *stats = ++ per_cpu_ptr(tx_sa->stats, cpu); ++ ++ sum->OutPktsProtected += stats->OutPktsProtected; ++ sum->OutPktsEncrypted += stats->OutPktsEncrypted; ++ } ++} ++ ++static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sa_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, int an, ++ struct macsec_rx_sa *rx_sa, ++ struct macsec_rx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.stats.rx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_rx_sa_stats *stats = ++ per_cpu_ptr(rx_sa->stats, cpu); ++ ++ sum->InPktsOK += stats->InPktsOK; ++ sum->InPktsInvalid += stats->InPktsInvalid; ++ sum->InPktsNotValid += stats->InPktsNotValid; ++ sum->InPktsNotUsingSA += stats->InPktsNotUsingSA; ++ sum->InPktsUnusedSA += stats->InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sa_stats(struct sk_buff *skb, ++ struct macsec_rx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sc_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, ++ struct macsec_rx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = 
macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.rx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_rx_sc_stats *stats; ++ struct macsec_rx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(rx_sc->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->InOctetsValidated += tmp.InOctetsValidated; ++ sum->InOctetsDecrypted += tmp.InOctetsDecrypted; ++ sum->InPktsUnchecked += tmp.InPktsUnchecked; ++ sum->InPktsDelayed += tmp.InPktsDelayed; ++ sum->InPktsOK += tmp.InPktsOK; ++ sum->InPktsInvalid += tmp.InPktsInvalid; ++ sum->InPktsLate += tmp.InPktsLate; ++ sum->InPktsNotValid += tmp.InPktsNotValid; ++ sum->InPktsNotUsingSA += tmp.InPktsNotUsingSA; ++ sum->InPktsUnusedSA += tmp.InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, ++ sum->InOctetsValidated, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, ++ sum->InOctetsDecrypted, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, ++ sum->InPktsUnchecked, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, ++ sum->InPktsDelayed, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, ++ sum->InPktsOK, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, ++ sum->InPktsLate, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA, ++ MACSEC_RXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_tx_sc_stats(struct net_device *dev, ++ struct macsec_tx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.tx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_tx_sc_stats *stats; ++ struct macsec_tx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsProtected += tmp.OutPktsProtected; ++ sum->OutPktsEncrypted += tmp.OutPktsEncrypted; ++ sum->OutOctetsProtected += tmp.OutOctetsProtected; ++ sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted; ++ } ++} ++ ++static int 
copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, ++ sum->OutOctetsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, ++ sum->OutOctetsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.dev_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_dev_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_secy_stats *stats; ++ struct macsec_dev_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsUntagged += tmp.OutPktsUntagged; ++ sum->InPktsUntagged += tmp.InPktsUntagged; ++ sum->OutPktsTooLong += tmp.OutPktsTooLong; ++ sum->InPktsNoTag += tmp.InPktsNoTag; ++ sum->InPktsBadTag += tmp.InPktsBadTag; ++ sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI; ++ sum->InPktsNoSCI += tmp.InPktsNoSCI; ++ sum->InPktsOverrun += tmp.InPktsOverrun; ++ } ++} ++ ++static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, ++ sum->OutPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, ++ sum->InPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, ++ sum->OutPktsTooLong, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, ++ sum->InPktsNoTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, ++ sum->InPktsBadTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, ++ sum->InPktsUnknownSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, ++ sum->InPktsNoSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, ++ sum->InPktsOverrun, ++ MACSEC_SECY_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) ++{ ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *secy_nest = nla_nest_start_noflag(skb, ++ MACSEC_ATTR_SECY); ++ u64 csid; ++ ++ if (!secy_nest) ++ return 1; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? 
MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto cancel; ++ } ++ ++ if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, ++ MACSEC_SECY_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, ++ csid, MACSEC_SECY_ATTR_PAD) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa)) ++ goto cancel; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window)) ++ goto cancel; ++ } ++ ++ nla_nest_end(skb, secy_nest); ++ return 0; ++ ++cancel: ++ nla_nest_cancel(skb, secy_nest); ++ return 1; ++} ++ ++static noinline_for_stack int ++dump_secy(struct macsec_secy *secy, struct net_device *dev, ++ struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct macsec_tx_sc_stats tx_sc_stats = {0, }; ++ struct macsec_tx_sa_stats tx_sa_stats = {0, }; ++ struct macsec_rx_sc_stats rx_sc_stats = {0, }; ++ struct macsec_rx_sa_stats rx_sa_stats = {0, }; ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_dev_stats dev_stats = {0, }; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *txsa_list, *rxsc_list; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *attr; ++ void *hdr; ++ int i, j; ++ ++ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); ++ if (!hdr) ++ return -EMSGSIZE; ++ ++ genl_dump_check_consistent(cb, hdr); ++ ++ if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); ++ if (!attr) ++ goto nla_put_failure; ++ if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) ++ goto nla_put_failure; ++ nla_nest_end(skb, attr); ++ ++ if (nla_put_secy(secy, skb)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ ++ get_tx_sc_stats(dev, &tx_sc_stats); ++ if (copy_tx_sc_stats(skb, &tx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ get_secy_stats(dev, &dev_stats); ++ if (copy_secy_stats(skb, &dev_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST); ++ if (!txsa_list) ++ goto nla_put_failure; ++ for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]); ++ struct nlattr *txsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!tx_sa) ++ continue; ++ ++ txsa_nest = nla_nest_start_noflag(skb, j++); ++ if (!txsa_nest) { ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto 
nla_put_failure; ++ } ++ memset(&tx_sa_stats, 0, sizeof(tx_sa_stats)); ++ get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats); ++ if (copy_tx_sa_stats(skb, &tx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = tx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = tx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ nla_nest_end(skb, txsa_nest); ++ } ++ nla_nest_end(skb, txsa_list); ++ ++ rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST); ++ if (!rxsc_list) ++ goto nla_put_failure; ++ ++ j = 1; ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ int k; ++ struct nlattr *rxsa_list; ++ struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++); ++ ++ if (!rxsc_nest) { ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) || ++ nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci, ++ MACSEC_RXSC_ATTR_PAD)) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sc_stats, 0, sizeof(rx_sc_stats)); ++ get_rx_sc_stats(dev, rx_sc, &rx_sc_stats); ++ if (copy_rx_sc_stats(skb, &rx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ rxsa_list = nla_nest_start_noflag(skb, ++ MACSEC_RXSC_ATTR_SA_LIST); ++ if (!rxsa_list) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]); ++ struct nlattr *rxsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!rx_sa) ++ continue; ++ ++ rxsa_nest = nla_nest_start_noflag(skb, k++); ++ if (!rxsa_nest) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, ++ MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sa_stats, 0, sizeof(rx_sa_stats)); ++ get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats); ++ if (copy_rx_sa_stats(skb, &rx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = rx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = rx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, 
MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) { ++ nla_nest_cancel(skb, rxsa_nest); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, rxsa_nest); ++ } ++ ++ nla_nest_end(skb, rxsa_list); ++ nla_nest_end(skb, rxsc_nest); ++ } ++ ++ nla_nest_end(skb, rxsc_list); ++ ++ genlmsg_end(skb, hdr); ++ ++ return 0; ++ ++nla_put_failure: ++ genlmsg_cancel(skb, hdr); ++ return -EMSGSIZE; ++} ++ ++static int macsec_generation = 1; /* protected by RTNL */ ++ ++static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct net_device *dev; ++ int dev_idx, d; ++ ++ dev_idx = cb->args[0]; ++ ++ d = 0; ++ rtnl_lock(); ++ ++ cb->seq = macsec_generation; ++ ++ for_each_netdev(net, dev) { ++ struct macsec_secy *secy; ++ ++ if (d < dev_idx) ++ goto next; ++ ++ if (!netif_is_macsec(dev)) ++ goto next; ++ ++ secy = &macsec_priv(dev)->secy; ++ if (dump_secy(secy, dev, skb, cb) < 0) ++ goto done; ++next: ++ d++; ++ } ++ ++done: ++ rtnl_unlock(); ++ cb->args[0] = d; ++ return skb->len; ++} ++ ++static const struct genl_small_ops macsec_genl_ops[] = { ++ { ++ .cmd = MACSEC_CMD_GET_TXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .dumpit = macsec_dump_txsc, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_OFFLOAD, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_offload, ++ .flags = GENL_ADMIN_PERM, ++ }, ++}; ++ ++static struct genl_family macsec_fam __ro_after_init = { ++ .name = MACSEC_GENL_NAME, ++ .hdrsize = 0, ++ .version = MACSEC_GENL_VERSION, ++ .maxattr = MACSEC_ATTR_MAX, ++ .policy = macsec_genl_policy, ++ .netnsok = true, ++ .module = THIS_MODULE, ++ .small_ops = macsec_genl_ops, ++ .n_small_ops = ARRAY_SIZE(macsec_genl_ops), ++}; ++ ++static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, ++ 
struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ struct pcpu_secy_stats *secy_stats; ++ int ret, len; ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ skb->dev = macsec->real_dev; ++ return dev_queue_xmit(skb); ++ } ++ ++ /* 10.5 */ ++ if (!secy->protect_frames) { ++ secy_stats = this_cpu_ptr(macsec->stats); ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ skb->dev = macsec->real_dev; ++ len = skb->len; ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++ } ++ ++ if (!secy->operational) { ++ kfree_skb(skb); ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ len = skb->len; ++ skb = macsec_encrypt(skb, dev); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) != -EINPROGRESS) ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++} ++ ++#define MACSEC_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST) ++ ++static int macsec_dev_init(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!dev->tstats) ++ return -ENOMEM; ++ ++ err = gro_cells_init(&macsec->gro_cells, dev); ++ if (err) { ++ free_percpu(dev->tstats); ++ return err; ++ } ++ ++ dev->features = real_dev->features & MACSEC_FEATURES; ++ dev->features |= NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE; ++ ++ dev->needed_headroom = real_dev->needed_headroom + ++ MACSEC_NEEDED_HEADROOM; ++ dev->needed_tailroom = real_dev->needed_tailroom + ++ MACSEC_NEEDED_TAILROOM; ++ ++ if (is_zero_ether_addr(dev->dev_addr)) ++ eth_hw_addr_inherit(dev, real_dev); ++ if (is_zero_ether_addr(dev->broadcast)) ++ memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); ++ ++ /* Get macsec's reference to real_dev */ ++ netdev_hold(real_dev, &macsec->dev_tracker, GFP_KERNEL); ++ ++ return 0; ++} ++ ++static void macsec_dev_uninit(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ gro_cells_destroy(&macsec->gro_cells); ++ free_percpu(dev->tstats); ++} ++ ++static netdev_features_t macsec_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ features &= (real_dev->features & MACSEC_FEATURES) | ++ NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; ++ features |= NETIF_F_LLTX; ++ ++ return features; ++} ++ ++static int macsec_dev_open(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ err = dev_uc_add(real_dev, dev->dev_addr); ++ if (err < 0) ++ return err; ++ ++ if (dev->flags & IFF_ALLMULTI) { ++ err = dev_set_allmulti(real_dev, 1); ++ if (err < 0) ++ goto del_unicast; ++ } ++ ++ if (dev->flags & IFF_PROMISC) { ++ err = dev_set_promiscuity(real_dev, 1); ++ if (err < 0) ++ goto clear_allmulti; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto clear_allmulti; ++ } 
++ ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_dev_open, &ctx); ++ if (err) ++ goto clear_allmulti; ++ } ++ ++ if (netif_carrier_ok(real_dev)) ++ netif_carrier_on(dev); ++ ++ return 0; ++clear_allmulti: ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++del_unicast: ++ dev_uc_del(real_dev, dev->dev_addr); ++ netif_carrier_off(dev); ++ return err; ++} ++ ++static int macsec_dev_stop(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ netif_carrier_off(dev); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_dev_stop, &ctx); ++ } ++ } ++ ++ dev_mc_unsync(real_dev, dev); ++ dev_uc_unsync(real_dev, dev); ++ ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++ ++ if (dev->flags & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, -1); ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++ return 0; ++} ++ ++static void macsec_dev_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ if (!(dev->flags & IFF_UP)) ++ return; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); ++ ++ if (change & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, ++ dev->flags & IFF_PROMISC ? 1 : -1); ++} ++ ++static void macsec_dev_set_rx_mode(struct net_device *dev) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ dev_mc_sync(real_dev, dev); ++ dev_uc_sync(real_dev, dev); ++} ++ ++static int macsec_set_mac_address(struct net_device *dev, void *p) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct sockaddr *addr = p; ++ int err; ++ ++ if (!is_valid_ether_addr(addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (!(dev->flags & IFF_UP)) ++ goto out; ++ ++ err = dev_uc_add(real_dev, addr->sa_data); ++ if (err < 0) ++ return err; ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++out: ++ eth_hw_addr_set(dev, addr->sa_data); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_upd_secy, &ctx); ++ } ++ } ++ ++ return 0; ++} ++ ++static int macsec_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true); ++ ++ if (macsec->real_dev->mtu - extra < new_mtu) ++ return -ERANGE; ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static void macsec_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ if (!dev->tstats) ++ return; ++ ++ dev_fetch_sw_netstats(s, dev->tstats); ++ ++ s->rx_dropped = dev->stats.rx_dropped; ++ s->tx_dropped = dev->stats.tx_dropped; ++ s->rx_errors = dev->stats.rx_errors; ++} ++ ++static int macsec_get_iflink(const struct net_device *dev) ++{ ++ return macsec_priv(dev)->real_dev->ifindex; ++} ++ ++static const struct net_device_ops macsec_netdev_ops = { ++ .ndo_init = macsec_dev_init, ++ .ndo_uninit = macsec_dev_uninit, ++ .ndo_open = macsec_dev_open, ++ .ndo_stop = macsec_dev_stop, ++ .ndo_fix_features = 
macsec_fix_features, ++ .ndo_change_mtu = macsec_change_mtu, ++ .ndo_set_rx_mode = macsec_dev_set_rx_mode, ++ .ndo_change_rx_flags = macsec_dev_change_rx_flags, ++ .ndo_set_mac_address = macsec_set_mac_address, ++ .ndo_start_xmit = macsec_start_xmit, ++ .ndo_get_stats64 = macsec_get_stats64, ++ .ndo_get_iflink = macsec_get_iflink, ++}; ++ ++static const struct device_type macsec_type = { ++ .name = "macsec", ++}; ++ ++static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { ++ [IFLA_MACSEC_SCI] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_PORT] = { .type = NLA_U16 }, ++ [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 }, ++ [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ES] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, ++}; ++ ++static void macsec_free_netdev(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ free_percpu(macsec->stats); ++ free_percpu(macsec->secy.tx_sc.stats); ++ ++ /* Get rid of the macsec's reference to real_dev */ ++ netdev_put(macsec->real_dev, &macsec->dev_tracker); ++} ++ ++static void macsec_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ dev->min_mtu = 0; ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->netdev_ops = &macsec_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->priv_destructor = macsec_free_netdev; ++ SET_NETDEV_DEVTYPE(dev, &macsec_type); ++ ++ eth_zero_addr(dev->broadcast); ++} ++ ++static int macsec_changelink_common(struct net_device *dev, ++ struct nlattr *data[]) ++{ ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ struct macsec_tx_sa *tx_sa; ++ ++ tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]); ++ tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]); ++ ++ secy->operational = tx_sa && tx_sa->active; ++ } ++ ++ if (data[IFLA_MACSEC_ENCRYPT]) ++ tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); ++ ++ if (data[IFLA_MACSEC_PROTECT]) ++ secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]); ++ ++ if (data[IFLA_MACSEC_INC_SCI]) ++ tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (data[IFLA_MACSEC_ES]) ++ tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]); ++ ++ if (data[IFLA_MACSEC_SCB]) ++ tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]); ++ ++ if (data[IFLA_MACSEC_REPLAY_PROTECT]) ++ secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]); ++ ++ if (data[IFLA_MACSEC_VALIDATION]) ++ secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) { ++ switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = true; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ secy->key_len = 
MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = true; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (data[IFLA_MACSEC_WINDOW]) { ++ secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); ++ ++ /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window ++ * for XPN cipher suites */ ++ if (secy->xpn && ++ secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], ++ struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sc tx_sc; ++ struct macsec_secy secy; ++ int ret; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE] || ++ data[IFLA_MACSEC_ICV_LEN] || ++ data[IFLA_MACSEC_SCI] || ++ data[IFLA_MACSEC_PORT]) ++ return -EINVAL; ++ ++ /* Keep a copy of unmodified secy and tx_sc, in case the offload ++ * propagation fails, to revert macsec_changelink_common. ++ */ ++ memcpy(&secy, &macsec->secy, sizeof(secy)); ++ memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); ++ ++ ret = macsec_changelink_common(dev, data); ++ if (ret) ++ goto cleanup; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(ops->mdo_upd_secy, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ return 0; ++ ++cleanup: ++ memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); ++ memcpy(&macsec->secy, &secy, sizeof(secy)); ++ ++ return ret; ++} ++ ++static void macsec_del_dev(struct macsec_dev *macsec) ++{ ++ int i; ++ ++ while (macsec->secy.rx_sc) { ++ struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc); ++ ++ rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next); ++ free_rx_sc(rx_sc); ++ } ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]); ++ ++ if (sa) { ++ RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL); ++ clear_tx_sa(sa); ++ } ++ } ++} ++ ++static void macsec_common_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_del_secy, &ctx); ++ } ++ } ++ ++ unregister_netdevice_queue(dev, head); ++ list_del_rcu(&macsec->secys); ++ macsec_del_dev(macsec); ++ netdev_upper_dev_unlink(real_dev, dev); ++ ++ macsec_generation++; ++} ++ ++static void macsec_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ macsec_common_dellink(dev, head); ++ ++ if (list_empty(&rxd->secys)) { ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ } ++} ++ ++static int register_macsec_dev(struct net_device *real_dev, ++ struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ if (!rxd) { ++ int err; ++ ++ rxd = kmalloc(sizeof(*rxd), 
GFP_KERNEL); ++ if (!rxd) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&rxd->secys); ++ ++ err = netdev_rx_handler_register(real_dev, macsec_handle_frame, ++ rxd); ++ if (err < 0) { ++ kfree(rxd); ++ return err; ++ } ++ } ++ ++ list_add_tail_rcu(&macsec->secys, &rxd->secys); ++ return 0; ++} ++ ++static bool sci_exists(struct net_device *dev, sci_t sci) ++{ ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(dev); ++ struct macsec_dev *macsec; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (macsec->secy.sci == sci) ++ return true; ++ } ++ ++ return false; ++} ++ ++static sci_t dev_to_sci(struct net_device *dev, __be16 port) ++{ ++ return make_sci(dev->dev_addr, port); ++} ++ ++static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ ++ macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats); ++ if (!macsec->stats) ++ return -ENOMEM; ++ ++ secy->tx_sc.stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats); ++ if (!secy->tx_sc.stats) { ++ free_percpu(macsec->stats); ++ return -ENOMEM; ++ } ++ ++ if (sci == MACSEC_UNDEF_SCI) ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ secy->netdev = dev; ++ secy->operational = true; ++ secy->key_len = DEFAULT_SAK_LEN; ++ secy->icv_len = icv_len; ++ secy->validate_frames = MACSEC_VALIDATE_DEFAULT; ++ secy->protect_frames = true; ++ secy->replay_protect = false; ++ secy->xpn = DEFAULT_XPN; ++ ++ secy->sci = sci; ++ secy->tx_sc.active = true; ++ secy->tx_sc.encoding_sa = DEFAULT_ENCODING_SA; ++ secy->tx_sc.encrypt = DEFAULT_ENCRYPT; ++ secy->tx_sc.send_sci = DEFAULT_SEND_SCI; ++ secy->tx_sc.end_station = false; ++ secy->tx_sc.scb = false; ++ ++ return 0; ++} ++ ++static struct lock_class_key macsec_netdev_addr_lock_key; ++ ++static int macsec_newlink(struct net *net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ rx_handler_func_t *rx_handler; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ struct net_device *real_dev; ++ int err, mtu; ++ sci_t sci; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK])); ++ if (!real_dev) ++ return -ENODEV; ++ if (real_dev->type != ARPHRD_ETHER) ++ return -EINVAL; ++ ++ dev->priv_flags |= IFF_MACSEC; ++ ++ macsec->real_dev = real_dev; ++ ++ if (data && data[IFLA_MACSEC_OFFLOAD]) ++ macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); ++ else ++ /* MACsec offloading is off by default */ ++ macsec->offload = MACSEC_OFFLOAD_OFF; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (macsec->offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(macsec->offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* send_sci must be set to true when transmit sci explicitly is set */ ++ if ((data && data[IFLA_MACSEC_SCI]) && ++ (data && data[IFLA_MACSEC_INC_SCI])) { ++ u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (!send_sci) ++ return -EINVAL; ++ } ++ ++ if (data && data[IFLA_MACSEC_ICV_LEN]) ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ mtu = real_dev->mtu - icv_len - macsec_extra_len(true); ++ if (mtu < 0) ++ dev->mtu = 0; ++ else ++ dev->mtu = mtu; ++ ++ rx_handler = rtnl_dereference(real_dev->rx_handler); ++ if (rx_handler && rx_handler != macsec_handle_frame) ++ return -EBUSY; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ 
netdev_lockdep_set_classes(dev); ++ lockdep_set_class(&dev->addr_list_lock, ++ &macsec_netdev_addr_lock_key); ++ ++ err = netdev_upper_dev_link(real_dev, dev, extack); ++ if (err < 0) ++ goto unregister; ++ ++ /* need to be already registered so that ->init has run and ++ * the MAC addr is set ++ */ ++ if (data && data[IFLA_MACSEC_SCI]) ++ sci = nla_get_sci(data[IFLA_MACSEC_SCI]); ++ else if (data && data[IFLA_MACSEC_PORT]) ++ sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT])); ++ else ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ if (rx_handler && sci_exists(real_dev, sci)) { ++ err = -EBUSY; ++ goto unlink; ++ } ++ ++ err = macsec_add_dev(dev, sci, icv_len); ++ if (err) ++ goto unlink; ++ ++ if (data) { ++ err = macsec_changelink_common(dev, data); ++ if (err) ++ goto del_dev; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_add_secy, &ctx); ++ if (err) ++ goto del_dev; ++ } ++ } ++ ++ err = register_macsec_dev(real_dev, dev); ++ if (err < 0) ++ goto del_dev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ linkwatch_fire_event(dev); ++ ++ macsec_generation++; ++ ++ return 0; ++ ++del_dev: ++ macsec_del_dev(macsec); ++unlink: ++ netdev_upper_dev_unlink(real_dev, dev); ++unregister: ++ unregister_netdevice(dev); ++ return err; ++} ++ ++static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ u64 csid = MACSEC_DEFAULT_CIPHER_ID; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ int flag; ++ bool es, scb, sci; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) ++ csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]); ++ ++ if (data[IFLA_MACSEC_ICV_LEN]) { ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ if (icv_len != DEFAULT_ICV_LEN) { ++ char dummy_key[DEFAULT_SAK_LEN] = { 0 }; ++ struct crypto_aead *dummy_tfm; ++ ++ dummy_tfm = macsec_alloc_tfm(dummy_key, ++ DEFAULT_SAK_LEN, ++ icv_len); ++ if (IS_ERR(dummy_tfm)) ++ return PTR_ERR(dummy_tfm); ++ crypto_free_aead(dummy_tfm); ++ } ++ } ++ ++ switch (csid) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ if (icv_len < MACSEC_MIN_ICV_LEN || ++ icv_len > MACSEC_STD_ICV_LEN) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ if (nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN) ++ return -EINVAL; ++ } ++ ++ for (flag = IFLA_MACSEC_ENCODING_SA + 1; ++ flag < IFLA_MACSEC_VALIDATION; ++ flag++) { ++ if (data[flag]) { ++ if (nla_get_u8(data[flag]) > 1) ++ return -EINVAL; ++ } ++ } ++ ++ es = data[IFLA_MACSEC_ES] ? nla_get_u8(data[IFLA_MACSEC_ES]) : false; ++ sci = data[IFLA_MACSEC_INC_SCI] ? nla_get_u8(data[IFLA_MACSEC_INC_SCI]) : false; ++ scb = data[IFLA_MACSEC_SCB] ? 
nla_get_u8(data[IFLA_MACSEC_SCB]) : false; ++ ++ if ((sci && (scb || es)) || (scb && es)) ++ return -EINVAL; ++ ++ if (data[IFLA_MACSEC_VALIDATION] && ++ nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX) ++ return -EINVAL; ++ ++ if ((data[IFLA_MACSEC_REPLAY_PROTECT] && ++ nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT])) && ++ !data[IFLA_MACSEC_WINDOW]) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static struct net *macsec_get_link_net(const struct net_device *dev) ++{ ++ return dev_net(macsec_priv(dev)->real_dev); ++} ++ ++static size_t macsec_get_size(const struct net_device *dev) ++{ ++ return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ ++ nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ ++ nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ ++ nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ES */ ++ nla_total_size(1) + /* IFLA_MACSEC_SCB */ ++ nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ ++ 0; ++} ++ ++static int macsec_fill_info(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct macsec_secy *secy = &macsec_priv(dev)->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ u64 csid; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, ++ IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || ++ nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, ++ csid, IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) || ++ nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) || ++ nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) || ++ nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) || ++ 0) ++ goto nla_put_failure; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window)) ++ goto nla_put_failure; ++ } ++ ++ return 0; ++ ++nla_put_failure: ++ return -EMSGSIZE; ++} ++ ++static struct rtnl_link_ops macsec_link_ops __read_mostly = { ++ .kind = "macsec", ++ .priv_size = sizeof(struct macsec_dev), ++ .maxtype = IFLA_MACSEC_MAX, ++ .policy = macsec_rtnl_policy, ++ .setup = macsec_setup, ++ .validate = macsec_validate_attr, ++ .newlink = macsec_newlink, ++ .changelink = macsec_changelink, ++ .dellink = macsec_dellink, ++ .get_size = macsec_get_size, ++ .fill_info = macsec_fill_info, ++ .get_link_net = macsec_get_link_net, ++}; ++ ++static bool is_macsec_master(struct net_device *dev) ++{ ++ return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame; ++} ++ ++static int macsec_notify(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++ struct net_device *real_dev = netdev_notifier_info_to_dev(ptr); ++ LIST_HEAD(head); ++ ++ if (!is_macsec_master(real_dev)) ++ return NOTIFY_DONE; 
++ ++ switch (event) { ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ case NETDEV_CHANGE: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ } ++ break; ++ } ++ case NETDEV_UNREGISTER: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ macsec_common_dellink(m->secy.netdev, &head); ++ } ++ ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ ++ unregister_netdevice_many(&head); ++ break; ++ } ++ case NETDEV_CHANGEMTU: { ++ struct macsec_dev *m; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry(m, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ unsigned int mtu = real_dev->mtu - (m->secy.icv_len + ++ macsec_extra_len(true)); ++ ++ if (dev->mtu > mtu) ++ dev_set_mtu(dev, mtu); ++ } ++ } ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block macsec_notifier = { ++ .notifier_call = macsec_notify, ++}; ++ ++static int __init macsec_init(void) ++{ ++ int err; ++ ++ pr_info("MACsec IEEE 802.1AE\n"); ++ err = register_netdevice_notifier(&macsec_notifier); ++ if (err) ++ return err; ++ ++ err = rtnl_link_register(&macsec_link_ops); ++ if (err) ++ goto notifier; ++ ++ err = genl_register_family(&macsec_fam); ++ if (err) ++ goto rtnl; ++ ++ return 0; ++ ++rtnl: ++ rtnl_link_unregister(&macsec_link_ops); ++notifier: ++ unregister_netdevice_notifier(&macsec_notifier); ++ return err; ++} ++ ++static void __exit macsec_exit(void) ++{ ++ genl_unregister_family(&macsec_fam); ++ rtnl_link_unregister(&macsec_link_ops); ++ unregister_netdevice_notifier(&macsec_notifier); ++ rcu_barrier(); ++} ++ ++module_init(macsec_init); ++module_exit(macsec_exit); ++ ++MODULE_ALIAS_RTNL_LINK("macsec"); ++MODULE_ALIAS_GENL_FAMILY("macsec"); ++ ++MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/macvlan.c linux/drivers/net/macvlan.c +--- linux.orig/drivers/net/macvlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macvlan.c 2022-12-04 10:40:26.696034096 -0500 +@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(stru for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { @@ -2829,11 +20477,10 @@ index 1080d6ebff63b..a1c7823f0ba66 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c -index 0b1b6f650104b..ff302144029de 100644 ---- a/drivers/net/mhi_net.c -+++ b/drivers/net/mhi_net.c -@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/mhi_net.c linux/drivers/net/mhi_net.c +--- linux.orig/drivers/net/mhi_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/mhi_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct n unsigned int start; do { @@ -2857,11 +20504,10 @@ index 0b1b6f650104b..ff302144029de 100644 } static const struct net_device_ops mhi_netdev_ops = { -diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c -index 9a1a5b2036240..e470e3398abc2 100644 ---- a/drivers/net/netdevsim/netdev.c -+++ b/drivers/net/netdevsim/netdev.c -@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) 
+diff -rupN linux.orig/drivers/net/netdevsim/netdev.c linux/drivers/net/netdevsim/netdev.c +--- linux.orig/drivers/net/netdevsim/netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/netdevsim/netdev.c 2022-12-04 10:40:26.696034096 -0500 +@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, unsigned int start; do { @@ -2874,11 +20520,10 @@ index 9a1a5b2036240..e470e3398abc2 100644 } static int -diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c -index 154a3c0a6dfd8..3de937141c168 100644 ---- a/drivers/net/team/team.c -+++ b/drivers/net/team/team.c -@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/team/team.c linux/drivers/net/team/team.c +--- linux.orig/drivers/net/team/team.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { @@ -2894,11 +20539,10 @@ index 154a3c0a6dfd8..3de937141c168 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c -index b095a4b4957bb..18d99fda997cf 100644 ---- a/drivers/net/team/team_mode_loadbalance.c -+++ b/drivers/net/team/team_mode_loadbalance.c -@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struct lb_stats *acc_stats, +diff -rupN linux.orig/drivers/net/team/team_mode_loadbalance.c linux/drivers/net/team/team_mode_loadbalance.c +--- linux.orig/drivers/net/team/team_mode_loadbalance.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team_mode_loadbalance.c 2022-12-04 10:40:26.696034096 -0500 +@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struc struct lb_stats tmp; do { @@ -2910,11 +20554,10 @@ index b095a4b4957bb..18d99fda997cf 100644 acc_stats->tx_bytes += tmp.tx_bytes; } -diff --git a/drivers/net/veth.c b/drivers/net/veth.c -index 466da01ba2e3e..2da7cfcfe1c31 100644 ---- a/drivers/net/veth.c -+++ b/drivers/net/veth.c -@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/veth.c linux/drivers/net/veth.c +--- linux.orig/drivers/net/veth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/veth.c 2022-12-04 10:40:26.696034096 -0500 +@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struc size_t offset; do { @@ -2929,7 +20572,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 idx += VETH_RQ_STATS_LEN; } -@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struc tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { @@ -2944,7 +20587,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 } } -@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) +@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_st unsigned int start; do { @@ -2960,11 +20603,10 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; -diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c -index 9cce7dec7366d..a94d9d8f67fd0 100644 ---- a/drivers/net/virtio_net.c -+++ b/drivers/net/virtio_net.c -@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/virtio_net.c 
linux/drivers/net/virtio_net.c +--- linux.orig/drivers/net/virtio_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/virtio_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_dev struct send_queue *sq = &vi->sq[i]; do { @@ -2987,7 +20629,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 tot->rx_packets += rpackets; tot->tx_packets += tpackets; -@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&rq->stats; do { @@ -3002,7 +20644,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_RQ_STATS_LEN; } -@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&sq->stats; do { @@ -3017,11 +20659,10 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_SQ_STATS_LEN; } } -diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c -index 5df7a0abc39d5..191ebc482f0c1 100644 ---- a/drivers/net/vrf.c -+++ b/drivers/net/vrf.c -@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/vrf.c linux/drivers/net/vrf.c +--- linux.orig/drivers/net/vrf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vrf.c 2022-12-04 10:40:26.696034096 -0500 +@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_d dstats = per_cpu_ptr(dev->dstats, i); do { @@ -3037,11 +20678,10 @@ index 5df7a0abc39d5..191ebc482f0c1 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; stats->tx_dropped += tdrops; -diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c -index 3e04af4c5daa1..a3de081cda5ee 100644 ---- a/drivers/net/vxlan/vxlan_vnifilter.c -+++ b/drivers/net/vxlan/vxlan_vnifilter.c -@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, +diff -rupN linux.orig/drivers/net/vxlan/vxlan_vnifilter.c linux/drivers/net/vxlan/vxlan_vnifilter.c +--- linux.orig/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-04 10:40:26.696034096 -0500 +@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(co pstats = per_cpu_ptr(vninode->stats, i); do { @@ -3053,11 +20693,10 @@ index 3e04af4c5daa1..a3de081cda5ee 100644 dest->rx_packets += temp.rx_packets; dest->rx_bytes += temp.rx_bytes; -diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c -index 6872782e8dd89..22b5939a42bb3 100644 ---- a/drivers/net/wwan/mhi_wwan_mbim.c -+++ b/drivers/net/wwan/mhi_wwan_mbim.c -@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/wwan/mhi_wwan_mbim.c linux/drivers/net/wwan/mhi_wwan_mbim.c +--- linux.orig/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-04 10:40:26.696034096 -0500 +@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(str unsigned int start; do { @@ -3081,11 +20720,10 @@ index 6872782e8dd89..22b5939a42bb3 100644 } static void mhi_mbim_ul_callback(struct mhi_device *mhi_dev, -diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c -index 27a11cc08c61e..df4dc02638a00 100644 ---- a/drivers/net/xen-netfront.c -+++ b/drivers/net/xen-netfront.c -@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/xen-netfront.c linux/drivers/net/xen-netfront.c +--- linux.orig/drivers/net/xen-netfront.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/xen-netfront.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct ne unsigned int start; do { @@ -3106,11 +20744,10 @@ index 27a11cc08c61e..df4dc02638a00 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c -index 2a4b3efb7e12b..9f6ed09538cd0 100644 ---- a/drivers/pinctrl/pinctrl-amd.c -+++ b/drivers/pinctrl/pinctrl-amd.c -@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/pinctrl/pinctrl-amd.c linux/drivers/pinctrl/pinctrl-amd.c +--- linux.orig/drivers/pinctrl/pinctrl-amd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/pinctrl/pinctrl-amd.c 2022-12-04 10:40:26.696034096 -0500 +@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int if (!(regval & PIN_IRQ_PENDING) || !(regval & BIT(INTERRUPT_MASK_OFF))) continue; @@ -3119,11 +20756,10 @@ index 2a4b3efb7e12b..9f6ed09538cd0 100644 /* Clear interrupt. * We must read the pin register again, in case the -diff --git a/drivers/platform/x86/intel/int0002_vgpio.c b/drivers/platform/x86/intel/int0002_vgpio.c -index 617dbf98980ec..97cfbc520a02c 100644 ---- a/drivers/platform/x86/intel/int0002_vgpio.c -+++ b/drivers/platform/x86/intel/int0002_vgpio.c -@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, void *data) +diff -rupN linux.orig/drivers/platform/x86/intel/int0002_vgpio.c linux/drivers/platform/x86/intel/int0002_vgpio.c +--- linux.orig/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-04 10:40:26.696034096 -0500 +@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, if (!(gpe_sts_reg & GPE0A_PME_B0_STS_BIT)) return IRQ_NONE; @@ -3133,10 +20769,9 @@ index 617dbf98980ec..97cfbc520a02c 100644 pm_wakeup_hard_event(chip->parent); -diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c -index 4b42f2302a8a8..d4f77f6688cf7 100644 ---- a/drivers/spi/spi.c -+++ b/drivers/spi/spi.c +diff -rupN linux.orig/drivers/spi/spi.c linux/drivers/spi/spi.c +--- linux.orig/drivers/spi/spi.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/spi/spi.c 2022-12-04 10:40:26.700034085 -0500 @@ -127,10 +127,10 @@ do { \ unsigned int start; \ pcpu_stats = per_cpu_ptr(in, i); \ @@ -3150,11 +20785,10 @@ index 4b42f2302a8a8..d4f77f6688cf7 100644 &pcpu_stats->syncp, start)); \ ret += inc; \ } \ -diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c -index 2de3896489c84..897cb8db5084f 100644 ---- a/drivers/ssb/driver_gpio.c -+++ b/drivers/ssb/driver_gpio.c -@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/ssb/driver_gpio.c linux/drivers/ssb/driver_gpio.c +--- linux.orig/drivers/ssb/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/ssb/driver_gpio.c 2022-12-04 10:40:26.700034085 -0500 +@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_h return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3164,7 +20798,7 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_chipco_gpio_polarity(chipco, irqs, val & irqs); return IRQ_HANDLED; -@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_handler(int irq, void *dev_id) +@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_ha 
return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3174,11 +20808,207 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_extif_gpio_polarity(extif, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h -index 287153d325365..81f5fce6e895f 100644 ---- a/drivers/tty/serial/8250/8250.h -+++ b/drivers/tty/serial/8250/8250.h -@@ -177,12 +177,74 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c linux/drivers/tty/serial/8250/8250_aspeed_vuart.c +--- linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-04 10:40:26.700034085 -0500 +@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle( + up->ier &= ~irqs; + if (!throttle) + up->ier |= irqs; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) + { +diff -rupN linux.orig/drivers/tty/serial/8250/8250_bcm7271.c linux/drivers/tty/serial/8250/8250_bcm7271.c +--- linux.orig/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-04 10:40:26.700034085 -0500 +@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_ + * will handle this. + */ + up->ier &= ~UART_IER_RDI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + priv->tx_running = false; + priv->dma.rx_dma = NULL; +@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct ua + unsigned int iir = serial_port_in(p, UART_IIR); + struct brcmuart_priv *priv = p->private_data; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; + unsigned int ier; + unsigned int mcr; ++ bool is_console; + int handled = 0; + + /* +@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct ua + spin_lock_irqsave(&p->lock, flags); + status = serial_port_in(p, UART_LSR); + if ((status & UART_LSR_DR) == 0) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); + + ier = serial_port_in(p, UART_IER); + /* +@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct ua + serial_port_in(p, UART_RX); + } + ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); ++ + handled = 1; + } + spin_unlock_irqrestore(&p->lock, flags); +@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrt + struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); + struct uart_port *p = priv->up; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; ++ bool is_console; + + if (priv->shutdown) + return HRTIMER_NORESTART; +@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrt + /* re-enable receive unless upper layer has disabled it */ + if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == + (UART_IER_RLSI | UART_IER_RDI)) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + status = serial_port_in(p, UART_IER); + status |= (UART_IER_RLSI | UART_IER_RDI); + serial_port_out(p, UART_IER, status); + status = serial_port_in(p, UART_MCR); + status |= UART_MCR_RTS; + serial_port_out(p, UART_MCR, status); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + spin_unlock_irqrestore(&p->lock, flags); + 
return HRTIMER_NORESTART; +diff -rupN linux.orig/drivers/tty/serial/8250/8250_core.c linux/drivers/tty/serial/8250/8250_core.c +--- linux.orig/drivers/tty/serial/8250/8250_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_core.c 2022-12-04 10:40:26.700034085 -0500 +@@ -255,8 +255,11 @@ static void serial8250_timeout(struct ti + static void serial8250_backup_timeout(struct timer_list *t) + { + struct uart_8250_port *up = from_timer(up, t, timer); ++ struct uart_port *port = &up->port; + unsigned int iir, ier = 0, lsr; ++ unsigned long cs_flags; + unsigned long flags; ++ bool is_console; + + spin_lock_irqsave(&up->port.lock, flags); + +@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(st + * based handler. + */ + if (up->port.irq) { ++ is_console = uart_console(port); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, 0); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + + iir = serial_in(up, UART_IIR); +@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(st + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_dr + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -668,6 +687,7 @@ static int univ8250_console_match(struct + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_ + spin_lock_irqsave(&port->lock, flags); + up->ier |= UART_IER_RLSI | UART_IER_RDI; + up->port.read_status_mask |= UART_LSR_DR; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + spin_unlock_irqrestore(&port->lock, flags); + } + +diff -rupN linux.orig/drivers/tty/serial/8250/8250_exar.c linux/drivers/tty/serial/8250/8250_exar.c +--- linux.orig/drivers/tty/serial/8250/8250_exar.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_exar.c 2022-12-04 10:40:26.700034085 -0500 +@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct + + static int xr17v35x_startup(struct uart_port *port) + { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ + /* + * First enable access to IER [7:5], ISR [5:4], FCR [5:4], + * MCR [7:5] and MSR [7:0] +@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_ + * Make sure all interrups are masked until initialization is + * complete and the FIFOs are cleared + */ +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + + return serial8250_do_startup(port); + } +diff -rupN linux.orig/drivers/tty/serial/8250/8250_fsl.c linux/drivers/tty/serial/8250/8250_fsl.c +--- linux.orig/drivers/tty/serial/8250/8250_fsl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_fsl.c 2022-12-04 10:40:26.700034085 -0500 +@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned long delay; + 
+- up->ier = port->serial_in(port, UART_IER); ++ up->ier = serial8250_in_IER(up); ++ + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff -rupN linux.orig/drivers/tty/serial/8250/8250.h linux/drivers/tty/serial/8250/8250.h +--- linux.orig/drivers/tty/serial/8250/8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250.h 2022-12-04 10:40:26.700034085 -0500 +@@ -177,12 +177,74 @@ static inline void serial_dl_write(struc up->dl_write(up, value); } @@ -3254,7 +21084,7 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) +@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; @@ -3263,213 +21093,10 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c -index 9d2a7856784f7..7cc6b527c088b 100644 ---- a/drivers/tty/serial/8250/8250_aspeed_vuart.c -+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c -@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up, - up->ier &= ~irqs; - if (!throttle) - up->ier |= irqs; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - } - static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) - { -diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c -index 8efdc271eb75f..d30c74618411f 100644 ---- a/drivers/tty/serial/8250/8250_bcm7271.c -+++ b/drivers/tty/serial/8250/8250_bcm7271.c -@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_port *port) - * will handle this. 
- */ - up->ier &= ~UART_IER_RDI; -- serial_port_out(port, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - - priv->tx_running = false; - priv->dma.rx_dma = NULL; -@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct uart_port *p) - unsigned int iir = serial_port_in(p, UART_IIR); - struct brcmuart_priv *priv = p->private_data; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; - unsigned int ier; - unsigned int mcr; -+ bool is_console; - int handled = 0; - - /* -@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct uart_port *p) - spin_lock_irqsave(&p->lock, flags); - status = serial_port_in(p, UART_LSR); - if ((status & UART_LSR_DR) == 0) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); - - ier = serial_port_in(p, UART_IER); - /* -@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct uart_port *p) - serial_port_in(p, UART_RX); - } - -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); -+ - handled = 1; - } - spin_unlock_irqrestore(&p->lock, flags); -@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); - struct uart_port *p = priv->up; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; -+ bool is_console; - - if (priv->shutdown) - return HRTIMER_NORESTART; -@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - /* re-enable receive unless upper layer has disabled it */ - if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == - (UART_IER_RLSI | UART_IER_RDI)) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - status = serial_port_in(p, UART_IER); - status |= (UART_IER_RLSI | UART_IER_RDI); - serial_port_out(p, UART_IER, status); - status = serial_port_in(p, UART_MCR); - status |= UART_MCR_RTS; - serial_port_out(p, UART_MCR, status); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - spin_unlock_irqrestore(&p->lock, flags); - return HRTIMER_NORESTART; -diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c -index 94fbf0add2ce2..196d0c55dfe99 100644 ---- a/drivers/tty/serial/8250/8250_core.c -+++ b/drivers/tty/serial/8250/8250_core.c -@@ -255,8 +255,11 @@ static void serial8250_timeout(struct timer_list *t) - static void serial8250_backup_timeout(struct timer_list *t) - { - struct uart_8250_port *up = from_timer(up, t, timer); -+ struct uart_port *port = &up->port; - unsigned int iir, ier = 0, lsr; -+ unsigned long cs_flags; - unsigned long flags; -+ bool is_console; - - spin_lock_irqsave(&up->port.lock, flags); - -@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(struct timer_list *t) - * based handler. 
- */ - if (up->port.irq) { -+ is_console = uart_console(port); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - - iir = serial_in(up, UART_IIR); -@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(struct timer_list *t) - serial8250_tx_chars(up); - - if (up->port.irq) -- serial_out(up, UART_IER, ier); -+ serial8250_set_IER(up, ier); - - spin_unlock_irqrestore(&up->port.lock, flags); - -@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) - - #ifdef CONFIG_SERIAL_8250_CONSOLE - -+static void univ8250_console_write_atomic(struct console *co, const char *s, -+ unsigned int count) -+{ -+ struct uart_8250_port *up = &serial8250_ports[co->index]; -+ -+ serial8250_console_write_atomic(up, s, count); -+} -+ - static void univ8250_console_write(struct console *co, const char *s, - unsigned int count) - { -@@ -668,6 +687,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, - - static struct console univ8250_console = { - .name = "ttyS", -+ .write_atomic = univ8250_console_write_atomic, - .write = univ8250_console_write, - .device = uart_console_device, - .setup = univ8250_console_setup, -@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_work(struct work_struct *work) - spin_lock_irqsave(&port->lock, flags); - up->ier |= UART_IER_RLSI | UART_IER_RDI; - up->port.read_status_mask |= UART_LSR_DR; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - spin_unlock_irqrestore(&port->lock, flags); - } - -diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c -index 314a05e009df9..9809517de8270 100644 ---- a/drivers/tty/serial/8250/8250_exar.c -+++ b/drivers/tty/serial/8250/8250_exar.c -@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud, - - static int xr17v35x_startup(struct uart_port *port) - { -+ struct uart_8250_port *up = up_to_u8250p(port); -+ - /* - * First enable access to IER [7:5], ISR [5:4], FCR [5:4], - * MCR [7:5] and MSR [7:0] -@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_port *port) - * Make sure all interrups are masked until initialization is - * complete and the FIFOs are cleared - */ -- serial_port_out(port, UART_IER, 0); -+ serial8250_set_IER(up, 0); - - return serial8250_do_startup(port); - } -diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c -index 8aad15622a2e5..74bb85b705e7f 100644 ---- a/drivers/tty/serial/8250/8250_fsl.c -+++ b/drivers/tty/serial/8250/8250_fsl.c -@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port *port) - if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { - unsigned long delay; - -- up->ier = port->serial_in(port, UART_IER); -+ up->ier = serial8250_in_IER(up); -+ - if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { - port->ops->stop_rx(port); - } else { -diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c -index 2b2f5d8d24b91..2b78e6c394fb9 100644 ---- a/drivers/tty/serial/8250/8250_ingenic.c -+++ b/drivers/tty/serial/8250/8250_ingenic.c -@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", +diff -rupN linux.orig/drivers/tty/serial/8250/8250_ingenic.c linux/drivers/tty/serial/8250/8250_ingenic.c +--- linux.orig/drivers/tty/serial/8250/8250_ingenic.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/drivers/tty/serial/8250/8250_ingenic.c 2022-12-04 10:40:26.700034085 -0500 +@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { @@ -3477,7 +21104,7 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 int ier; switch (offset) { -@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) +@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(stru * If we have enabled modem status IRQs we should enable * modem mode. */ @@ -3486,11 +21113,10 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; -diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c -index 54051ec7b4992..6092c75808fb9 100644 ---- a/drivers/tty/serial/8250/8250_mtk.c -+++ b/drivers/tty/serial/8250/8250_mtk.c -@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart_port *port) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_mtk.c linux/drivers/tty/serial/8250/8250_mtk.c +--- linux.orig/drivers/tty/serial/8250/8250_mtk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_mtk.c 2022-12-04 10:40:26.700034085 -0500 +@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { @@ -3533,20 +21159,19 @@ index 54051ec7b4992..6092c75808fb9 100644 } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) -diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c -index 38ee3e42251af..8dc983a8cad15 100644 ---- a/drivers/tty/serial/8250/8250_omap.c -+++ b/drivers/tty/serial/8250/8250_omap.c -@@ -325,7 +325,7 @@ static void omap8250_restore_regs(struct uart_8250_port *up) - +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c linux/drivers/tty/serial/8250/8250_omap.c +--- linux.orig/drivers/tty/serial/8250/8250_omap.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c 2022-12-04 10:41:15.271907054 -0500 +@@ -328,7 +328,7 @@ static void omap8250_restore_regs(struct /* drop TCR + TLR access, we setup XON/XOFF later */ - serial8250_out_MCR(up, up->mcr); + serial8250_out_MCR(up, mcr); + - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_dl_write(up, priv->quot); -@@ -515,7 +515,7 @@ static void omap_8250_pm(struct uart_port *port, unsigned int state, +@@ -518,7 +518,7 @@ static void omap_8250_pm(struct uart_por serial_out(up, UART_EFR, efr | UART_EFR_ECB); serial_out(up, UART_LCR, 0); @@ -3555,7 +21180,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, efr); serial_out(up, UART_LCR, 0); -@@ -636,7 +636,7 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id) +@@ -639,7 +639,7 @@ static irqreturn_t omap8250_irq(int irq, if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { unsigned long delay; @@ -3564,7 +21189,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { -@@ -696,7 +696,7 @@ static int omap_8250_startup(struct uart_port *port) +@@ -698,7 +698,7 @@ static int omap_8250_startup(struct uart goto err; up->ier = UART_IER_RLSI | UART_IER_RDI; @@ -3573,7 +21198,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 #ifdef CONFIG_PM up->capabilities |= UART_CAP_RPM; -@@ -737,7 +737,7 @@ static void 
omap_8250_shutdown(struct uart_port *port) +@@ -739,7 +739,7 @@ static void omap_8250_shutdown(struct ua serial_out(up, UART_OMAP_EFR2, 0x0); up->ier = 0; @@ -3582,7 +21207,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->dma) serial8250_release_dma(up); -@@ -785,7 +785,7 @@ static void omap_8250_unthrottle(struct uart_port *port) +@@ -787,7 +787,7 @@ static void omap_8250_unthrottle(struct up->dma->rx_dma(up); up->ier |= UART_IER_RLSI | UART_IER_RDI; port->read_status_mask |= UART_LSR_DR; @@ -3591,7 +21216,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 spin_unlock_irqrestore(&port->lock, flags); pm_runtime_mark_last_busy(port->dev); -@@ -876,7 +876,7 @@ static void __dma_rx_complete(void *param) +@@ -878,7 +878,7 @@ static void __dma_rx_complete(void *para __dma_rx_do_complete(p); if (!priv->throttled) { p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3600,7 +21225,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (!(priv->habit & UART_HAS_EFR2)) omap_8250_rx_dma(p); } -@@ -933,7 +933,7 @@ static int omap_8250_rx_dma(struct uart_8250_port *p) +@@ -935,7 +935,7 @@ static int omap_8250_rx_dma(struct uart_ * callback to run. */ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3609,7 +21234,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 } goto out; } -@@ -1148,12 +1148,12 @@ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, +@@ -1150,12 +1150,12 @@ static void am654_8250_handle_rx_dma(str * periodic timeouts, re-enable interrupts. */ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3624,11 +21249,1731 @@ index 38ee3e42251af..8dc983a8cad15 100644 } } -diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c -index 2030a92ac66e7..326549603740d 100644 ---- a/drivers/tty/serial/8250/8250_port.c -+++ b/drivers/tty/serial/8250/8250_port.c -@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c.orig linux/drivers/tty/serial/8250/8250_omap.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_omap.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,1716 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * 8250-core based driver for the OMAP internal UART ++ * ++ * based on omap-serial.c, Copyright (C) 2010 Texas Instruments. ++ * ++ * Copyright (C) 2014 Sebastian Andrzej Siewior ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "8250.h" ++ ++#define DEFAULT_CLK_SPEED 48000000 ++ ++#define UART_ERRATA_i202_MDR1_ACCESS (1 << 0) ++#define OMAP_UART_WER_HAS_TX_WAKEUP (1 << 1) ++#define OMAP_DMA_TX_KICK (1 << 2) ++/* ++ * See Advisory 21 in AM437x errata SPRZ408B, updated April 2015. ++ * The same errata is applicable to AM335x and DRA7x processors too. 
++ */ ++#define UART_ERRATA_CLOCK_DISABLE (1 << 3) ++#define UART_HAS_EFR2 BIT(4) ++#define UART_HAS_RHR_IT_DIS BIT(5) ++#define UART_RX_TIMEOUT_QUIRK BIT(6) ++ ++#define OMAP_UART_FCR_RX_TRIG 6 ++#define OMAP_UART_FCR_TX_TRIG 4 ++ ++/* SCR register bitmasks */ ++#define OMAP_UART_SCR_RX_TRIG_GRANU1_MASK (1 << 7) ++#define OMAP_UART_SCR_TX_TRIG_GRANU1_MASK (1 << 6) ++#define OMAP_UART_SCR_TX_EMPTY (1 << 3) ++#define OMAP_UART_SCR_DMAMODE_MASK (3 << 1) ++#define OMAP_UART_SCR_DMAMODE_1 (1 << 1) ++#define OMAP_UART_SCR_DMAMODE_CTL (1 << 0) ++ ++/* MVR register bitmasks */ ++#define OMAP_UART_MVR_SCHEME_SHIFT 30 ++#define OMAP_UART_LEGACY_MVR_MAJ_MASK 0xf0 ++#define OMAP_UART_LEGACY_MVR_MAJ_SHIFT 4 ++#define OMAP_UART_LEGACY_MVR_MIN_MASK 0x0f ++#define OMAP_UART_MVR_MAJ_MASK 0x700 ++#define OMAP_UART_MVR_MAJ_SHIFT 8 ++#define OMAP_UART_MVR_MIN_MASK 0x3f ++ ++/* SYSC register bitmasks */ ++#define OMAP_UART_SYSC_SOFTRESET (1 << 1) ++ ++/* SYSS register bitmasks */ ++#define OMAP_UART_SYSS_RESETDONE (1 << 0) ++ ++#define UART_TI752_TLR_TX 0 ++#define UART_TI752_TLR_RX 4 ++ ++#define TRIGGER_TLR_MASK(x) ((x & 0x3c) >> 2) ++#define TRIGGER_FCR_MASK(x) (x & 3) ++ ++/* Enable XON/XOFF flow control on output */ ++#define OMAP_UART_SW_TX 0x08 ++/* Enable XON/XOFF flow control on input */ ++#define OMAP_UART_SW_RX 0x02 ++ ++#define OMAP_UART_WER_MOD_WKUP 0x7f ++#define OMAP_UART_TX_WAKEUP_EN (1 << 7) ++ ++#define TX_TRIGGER 1 ++#define RX_TRIGGER 48 ++ ++#define OMAP_UART_TCR_RESTORE(x) ((x / 4) << 4) ++#define OMAP_UART_TCR_HALT(x) ((x / 4) << 0) ++ ++#define UART_BUILD_REVISION(x, y) (((x) << 8) | (y)) ++ ++#define OMAP_UART_REV_46 0x0406 ++#define OMAP_UART_REV_52 0x0502 ++#define OMAP_UART_REV_63 0x0603 ++ ++/* Interrupt Enable Register 2 */ ++#define UART_OMAP_IER2 0x1B ++#define UART_OMAP_IER2_RHR_IT_DIS BIT(2) ++ ++/* Enhanced features register 2 */ ++#define UART_OMAP_EFR2 0x23 ++#define UART_OMAP_EFR2_TIMEOUT_BEHAVE BIT(6) ++ ++/* RX FIFO occupancy indicator */ ++#define UART_OMAP_RX_LVL 0x19 ++ ++struct omap8250_priv { ++ int line; ++ u8 habit; ++ u8 mdr1; ++ u8 efr; ++ u8 scr; ++ u8 wer; ++ u8 xon; ++ u8 xoff; ++ u8 delayed_restore; ++ u16 quot; ++ ++ u8 tx_trigger; ++ u8 rx_trigger; ++ bool is_suspending; ++ int wakeirq; ++ int wakeups_enabled; ++ u32 latency; ++ u32 calc_latency; ++ struct pm_qos_request pm_qos_request; ++ struct work_struct qos_work; ++ struct uart_8250_dma omap8250_dma; ++ spinlock_t rx_dma_lock; ++ bool rx_dma_broken; ++ bool throttled; ++}; ++ ++struct omap8250_dma_params { ++ u32 rx_size; ++ u8 rx_trigger; ++ u8 tx_trigger; ++}; ++ ++struct omap8250_platdata { ++ struct omap8250_dma_params *dma_params; ++ u8 habit; ++}; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p); ++#else ++static inline void omap_8250_rx_dma_flush(struct uart_8250_port *p) { } ++#endif ++ ++static u32 uart_read(struct uart_8250_port *up, u32 reg) ++{ ++ return readl(up->port.membase + (reg << up->port.regshift)); ++} ++ ++/* ++ * Called on runtime PM resume path from omap8250_restore_regs(), and ++ * omap8250_set_mctrl(). 
++ */ ++static void __omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u8 lcr; ++ ++ serial8250_do_set_mctrl(port, mctrl); ++ ++ if (!mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS)) { ++ /* ++ * Turn off autoRTS if RTS is lowered and restore autoRTS ++ * setting if RTS is raised ++ */ ++ lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS)) ++ priv->efr |= UART_EFR_RTS; ++ else ++ priv->efr &= ~UART_EFR_RTS; ++ serial_out(up, UART_EFR, priv->efr); ++ serial_out(up, UART_LCR, lcr); ++ } ++} ++ ++static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ int err; ++ ++ err = pm_runtime_resume_and_get(port->dev); ++ if (err) ++ return; ++ ++ __omap8250_set_mctrl(port, mctrl); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++/* ++ * Work Around for Errata i202 (2430, 3430, 3630, 4430 and 4460) ++ * The access to uart register after MDR1 Access ++ * causes UART to corrupt data. ++ * ++ * Need a delay = ++ * 5 L4 clock cycles + 5 UART functional clock cycle (@48MHz = ~0.2uS) ++ * give 10 times as much ++ */ ++static void omap_8250_mdr1_errataset(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++ udelay(2); ++ serial_out(up, UART_FCR, up->fcr | UART_FCR_CLEAR_XMIT | ++ UART_FCR_CLEAR_RCVR); ++} ++ ++static void omap_8250_get_divisor(struct uart_port *port, unsigned int baud, ++ struct omap8250_priv *priv) ++{ ++ unsigned int uartclk = port->uartclk; ++ unsigned int div_13, div_16; ++ unsigned int abs_d13, abs_d16; ++ ++ /* ++ * Old custom speed handling. ++ */ ++ if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) { ++ priv->quot = port->custom_divisor & UART_DIV_MAX; ++ /* ++ * I assume that nobody is using this. But hey, if somebody ++ * would like to specify the divisor _and_ the mode then the ++ * driver is ready and waiting for it. ++ */ ++ if (port->custom_divisor & (1 << 16)) ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ else ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ return; ++ } ++ div_13 = DIV_ROUND_CLOSEST(uartclk, 13 * baud); ++ div_16 = DIV_ROUND_CLOSEST(uartclk, 16 * baud); ++ ++ if (!div_13) ++ div_13 = 1; ++ if (!div_16) ++ div_16 = 1; ++ ++ abs_d13 = abs(baud - uartclk / 13 / div_13); ++ abs_d16 = abs(baud - uartclk / 16 / div_16); ++ ++ if (abs_d13 >= abs_d16) { ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ priv->quot = div_16; ++ } else { ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ priv->quot = div_13; ++ } ++} ++ ++static void omap8250_update_scr(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ u8 old_scr; ++ ++ old_scr = serial_in(up, UART_OMAP_SCR); ++ if (old_scr == priv->scr) ++ return; ++ ++ /* ++ * The manual recommends not to enable the DMA mode selector in the SCR ++ * (instead of the FCR) register _and_ selecting the DMA mode as one ++ * register write because this may lead to malfunction. 
++ */ ++ if (priv->scr & OMAP_UART_SCR_DMAMODE_MASK) ++ serial_out(up, UART_OMAP_SCR, ++ priv->scr & ~OMAP_UART_SCR_DMAMODE_MASK); ++ serial_out(up, UART_OMAP_SCR, priv->scr); ++} ++ ++static void omap8250_update_mdr1(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ if (priv->habit & UART_ERRATA_i202_MDR1_ACCESS) ++ omap_8250_mdr1_errataset(up, priv); ++ else ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++} ++ ++static void omap8250_restore_regs(struct uart_8250_port *up) ++{ ++ struct omap8250_priv *priv = up->port.private_data; ++ struct uart_8250_dma *dma = up->dma; ++ u8 mcr = serial8250_in_MCR(up); ++ ++ if (dma && dma->tx_running) { ++ /* ++ * TCSANOW requests the change to occur immediately however if ++ * we have a TX-DMA operation in progress then it has been ++ * observed that it might stall and never complete. Therefore we ++ * delay DMA completes to prevent this hang from happen. ++ */ ++ priv->delayed_restore = 1; ++ return; ++ } ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial8250_out_MCR(up, mcr | UART_MCR_TCRTLR); ++ serial_out(up, UART_FCR, up->fcr); ++ ++ omap8250_update_scr(up, priv); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ serial_out(up, UART_TI752_TCR, OMAP_UART_TCR_RESTORE(16) | ++ OMAP_UART_TCR_HALT(52)); ++ serial_out(up, UART_TI752_TLR, ++ TRIGGER_TLR_MASK(priv->tx_trigger) << UART_TI752_TLR_TX | ++ TRIGGER_TLR_MASK(priv->rx_trigger) << UART_TI752_TLR_RX); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ /* drop TCR + TLR access, we setup XON/XOFF later */ ++ serial8250_out_MCR(up, mcr); ++ ++ serial_out(up, UART_IER, up->ier); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_dl_write(up, priv->quot); ++ ++ serial_out(up, UART_EFR, priv->efr); ++ ++ /* Configure flow control */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_XON1, priv->xon); ++ serial_out(up, UART_XOFF1, priv->xoff); ++ ++ serial_out(up, UART_LCR, up->lcr); ++ ++ omap8250_update_mdr1(up, priv); ++ ++ __omap8250_set_mctrl(&up->port, up->port.mctrl); ++ ++ if (up->port.rs485.flags & SER_RS485_ENABLED) ++ serial8250_em485_stop_tx(up); ++} ++ ++/* ++ * OMAP can use "CLK / (16 or 13) / div" for baud rate. And then we have have ++ * some differences in how we want to handle flow control. ++ */ ++static void omap_8250_set_termios(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ unsigned char cval = 0; ++ unsigned int baud; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(termios->c_cflag)); ++ ++ if (termios->c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (termios->c_cflag & PARENB) ++ cval |= UART_LCR_PARITY; ++ if (!(termios->c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (termios->c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ */ ++ baud = uart_get_baud_rate(port, termios, old, ++ port->uartclk / 16 / UART_DIV_MAX, ++ port->uartclk / 13); ++ omap_8250_get_divisor(port, baud, priv); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ pm_runtime_get_sync(port->dev); ++ spin_lock_irq(&port->lock); ++ ++ /* ++ * Update the per-port timeout. 
++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ up->port.read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ up->port.read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | PARMRK)) ++ up->port.read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ up->port.ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ up->port.ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ up->port.ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * Modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ ++ up->lcr = cval; ++ /* Up to here it was mostly serial8250_do_set_termios() */ ++ ++ /* ++ * We enable TRIG_GRANU for RX and TX and additionally we set ++ * SCR_TX_EMPTY bit. The result is the following: ++ * - RX_TRIGGER amount of bytes in the FIFO will cause an interrupt. ++ * - less than RX_TRIGGER number of bytes will also cause an interrupt ++ * once the UART decides that there no new bytes arriving. ++ * - Once THRE is enabled, the interrupt will be fired once the FIFO is ++ * empty - the trigger level is ignored here. ++ * ++ * Once DMA is enabled: ++ * - UART will assert the TX DMA line once there is room for TX_TRIGGER ++ * bytes in the TX FIFO. On each assert the DMA engine will move ++ * TX_TRIGGER bytes into the FIFO. ++ * - UART will assert the RX DMA line once there are RX_TRIGGER bytes in ++ * the FIFO and move RX_TRIGGER bytes. ++ * This is because threshold and trigger values are the same. ++ */ ++ up->fcr = UART_FCR_ENABLE_FIFO; ++ up->fcr |= TRIGGER_FCR_MASK(priv->tx_trigger) << OMAP_UART_FCR_TX_TRIG; ++ up->fcr |= TRIGGER_FCR_MASK(priv->rx_trigger) << OMAP_UART_FCR_RX_TRIG; ++ ++ priv->scr = OMAP_UART_SCR_RX_TRIG_GRANU1_MASK | OMAP_UART_SCR_TX_EMPTY | ++ OMAP_UART_SCR_TX_TRIG_GRANU1_MASK; ++ ++ if (up->dma) ++ priv->scr |= OMAP_UART_SCR_DMAMODE_1 | ++ OMAP_UART_SCR_DMAMODE_CTL; ++ ++ priv->xon = termios->c_cc[VSTART]; ++ priv->xoff = termios->c_cc[VSTOP]; ++ ++ priv->efr = 0; ++ up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF); ++ ++ if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS) && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_CTS)) { ++ /* Enable AUTOCTS (autoRTS is enabled when RTS is raised) */ ++ up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; ++ priv->efr |= UART_EFR_CTS; ++ } else if (up->port.flags & UPF_SOFT_FLOW) { ++ /* ++ * OMAP rx s/w flow control is borked; the transmitter remains ++ * stuck off even if rx flow control is subsequently disabled ++ */ ++ ++ /* ++ * IXOFF Flag: ++ * Enable XON/XOFF flow control on output. 
++ * Transmit XON1, XOFF1 ++ */ ++ if (termios->c_iflag & IXOFF) { ++ up->port.status |= UPSTAT_AUTOXOFF; ++ priv->efr |= OMAP_UART_SW_TX; ++ } ++ } ++ omap8250_restore_regs(up); ++ ++ spin_unlock_irq(&up->port.lock); ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ ++ /* calculate wakeup latency constraint */ ++ priv->calc_latency = USEC_PER_SEC * 64 * 8 / baud; ++ priv->latency = priv->calc_latency; ++ ++ schedule_work(&priv->qos_work); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++ ++/* same as 8250 except that we may have extra flow bits set in EFR */ ++static void omap_8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ u8 efr; ++ ++ pm_runtime_get_sync(port->dev); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ efr = serial_in(up, UART_EFR); ++ serial_out(up, UART_EFR, efr | UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, efr); ++ serial_out(up, UART_LCR, 0); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_serial_fill_features_erratas(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ static const struct soc_device_attribute k3_soc_devices[] = { ++ { .family = "AM65X", }, ++ { .family = "J721E", .revision = "SR1.0" }, ++ { /* sentinel */ } ++ }; ++ u32 mvr, scheme; ++ u16 revision, major, minor; ++ ++ mvr = uart_read(up, UART_OMAP_MVER); ++ ++ /* Check revision register scheme */ ++ scheme = mvr >> OMAP_UART_MVR_SCHEME_SHIFT; ++ ++ switch (scheme) { ++ case 0: /* Legacy Scheme: OMAP2/3 */ ++ /* MINOR_REV[0:4], MAJOR_REV[4:7] */ ++ major = (mvr & OMAP_UART_LEGACY_MVR_MAJ_MASK) >> ++ OMAP_UART_LEGACY_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_LEGACY_MVR_MIN_MASK); ++ break; ++ case 1: ++ /* New Scheme: OMAP4+ */ ++ /* MINOR_REV[0:5], MAJOR_REV[8:10] */ ++ major = (mvr & OMAP_UART_MVR_MAJ_MASK) >> ++ OMAP_UART_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_MVR_MIN_MASK); ++ break; ++ default: ++ dev_warn(up->port.dev, ++ "Unknown revision, defaulting to highest\n"); ++ /* highest possible revision */ ++ major = 0xff; ++ minor = 0xff; ++ } ++ /* normalize revision for the driver */ ++ revision = UART_BUILD_REVISION(major, minor); ++ ++ switch (revision) { ++ case OMAP_UART_REV_46: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS; ++ break; ++ case OMAP_UART_REV_52: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ case OMAP_UART_REV_63: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * AM65x SR1.0, AM65x SR2.0 and J721e SR1.0 don't ++ * don't have RHR_IT_DIS bit in IER2 register. So drop to flag ++ * to enable errata workaround. 
++ */ ++ if (soc_device_match(k3_soc_devices)) ++ priv->habit &= ~UART_HAS_RHR_IT_DIS; ++} ++ ++static void omap8250_uart_qos_work(struct work_struct *work) ++{ ++ struct omap8250_priv *priv; ++ ++ priv = container_of(work, struct omap8250_priv, qos_work); ++ cpu_latency_qos_update_request(&priv->pm_qos_request, priv->latency); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_dma_handle_irq(struct uart_port *port); ++#endif ++ ++static irqreturn_t omap8250_irq(int irq, void *dev_id) ++{ ++ struct uart_port *port = dev_id; ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir, lsr; ++ int ret; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++ if (up->dma) { ++ ret = omap_8250_dma_handle_irq(port); ++ return IRQ_RETVAL(ret); ++ } ++#endif ++ ++ serial8250_rpm_get(up); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ /* ++ * On K3 SoCs, it is observed that RX TIMEOUT is signalled after ++ * FIFO has been drained, in which case a dummy read of RX FIFO ++ * is required to clear RX TIMEOUT condition. ++ */ ++ if (priv->habit & UART_RX_TIMEOUT_QUIRK && ++ (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT && ++ serial_port_in(port, UART_OMAP_RX_LVL) == 0) { ++ serial_port_in(port, UART_RX); ++ } ++ ++ /* Stop processing interrupts on input overrun */ ++ if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { ++ unsigned long delay; ++ ++ up->ier = port->serial_in(port, UART_IER); ++ if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { ++ port->ops->stop_rx(port); ++ } else { ++ /* Keep restarting the timer until ++ * the input overrun subsides. ++ */ ++ cancel_delayed_work(&up->overrun_backoff); ++ } ++ ++ delay = msecs_to_jiffies(up->overrun_backoff_time_ms); ++ schedule_delayed_work(&up->overrun_backoff, delay); ++ } ++ ++ serial8250_rpm_put(up); ++ ++ return IRQ_RETVAL(ret); ++} ++ ++static int omap_8250_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ int ret; ++ ++ if (priv->wakeirq) { ++ ret = dev_pm_set_dedicated_wake_irq(port->dev, priv->wakeirq); ++ if (ret) ++ return ret; ++ } ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* Disable DMA for console UART */ ++ if (uart_console(port)) ++ up->dma = NULL; ++ ++ if (up->dma) { ++ ret = serial8250_request_dma(up); ++ if (ret) { ++ dev_warn_ratelimited(port->dev, ++ "failed to request DMA\n"); ++ up->dma = NULL; ++ } ++ } ++ ++ ret = request_irq(port->irq, omap8250_irq, IRQF_SHARED, ++ dev_name(port->dev), port); ++ if (ret < 0) ++ goto err; ++ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ ++#ifdef CONFIG_PM ++ up->capabilities |= UART_CAP_RPM; ++#endif ++ ++ /* Enable module level wake up */ ++ priv->wer = OMAP_UART_WER_MOD_WKUP; ++ if (priv->habit & OMAP_UART_WER_HAS_TX_WAKEUP) ++ priv->wer |= OMAP_UART_TX_WAKEUP_EN; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ ++ if (up->dma && !(priv->habit & UART_HAS_EFR2)) ++ up->dma->rx_dma(up); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ return 0; ++err: ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ dev_pm_clear_wake_irq(port->dev); ++ return ret; ++} ++ 
++static void omap_8250_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ ++ flush_work(&priv->qos_work); ++ if (up->dma) ++ omap_8250_rx_dma_flush(up); ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_OMAP_WER, 0); ++ if (priv->habit & UART_HAS_EFR2) ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ ++ up->ier = 0; ++ serial_out(up, UART_IER, 0); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ if (up->lcr & UART_LCR_SBC) ++ serial_out(up, UART_LCR, up->lcr & ~UART_LCR_SBC); ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ free_irq(port->irq, port); ++ dev_pm_clear_wake_irq(port->dev); ++} ++ ++static void omap_8250_throttle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ port->ops->stop_rx(port); ++ priv->throttled = true; ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_8250_unthrottle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ priv->throttled = false; ++ if (up->dma) ++ up->dma->rx_dma(up); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ port->read_status_mask |= UART_LSR_DR; ++ serial_out(up, UART_IER, up->ier); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_rx_dma(struct uart_8250_port *p); ++ ++/* Must be called while priv->rx_dma_lock is held */ ++static void __dma_rx_do_complete(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct tty_port *tty_port = &p->port.state->port; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct dma_chan *rxchan = dma->rxchan; ++ dma_cookie_t cookie; ++ struct dma_tx_state state; ++ int count; ++ int ret; ++ u32 reg; ++ ++ if (!dma->rx_running) ++ goto out; ++ ++ cookie = dma->rx_cookie; ++ dma->rx_running = 0; ++ ++ /* Re-enable RX FIFO interrupt now that transfer is complete */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg &= ~UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dmaengine_tx_status(rxchan, cookie, &state); ++ ++ count = dma->rx_size - state.residue + state.in_flight_bytes; ++ if (count < dma->rx_size) { ++ dmaengine_terminate_async(rxchan); ++ ++ /* ++ * Poll for teardown to complete which guarantees in ++ * flight data is drained. 
++ */ ++ if (state.in_flight_bytes) { ++ int poll_count = 25; ++ ++ while (dmaengine_tx_status(rxchan, cookie, NULL) && ++ poll_count--) ++ cpu_relax(); ++ ++ if (poll_count == -1) ++ dev_err(p->port.dev, "teardown incomplete\n"); ++ } ++ } ++ if (!count) ++ goto out; ++ ret = tty_insert_flip_string(tty_port, dma->rx_buf, count); ++ ++ p->port.icount.rx += ret; ++ p->port.icount.buf_overrun += count - ret; ++out: ++ ++ tty_flip_buffer_push(tty_port); ++} ++ ++static void __dma_rx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ /* ++ * If the tx status is not DMA_COMPLETE, then this is a delayed ++ * completion callback. A previous RX timeout flush would have ++ * already pushed the data, so exit. ++ */ ++ if (dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state) != ++ DMA_COMPLETE) { ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ return; ++ } ++ __dma_rx_do_complete(p); ++ if (!priv->throttled) { ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(p, UART_IER, p->ier); ++ if (!(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (!dma->rx_running) { ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return; ++ } ++ ++ ret = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); ++ if (ret == DMA_IN_PROGRESS) { ++ ret = dmaengine_pause(dma->rxchan); ++ if (WARN_ON_ONCE(ret)) ++ priv->rx_dma_broken = true; ++ } ++ __dma_rx_do_complete(p); ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++} ++ ++static int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ int err = 0; ++ struct dma_async_tx_descriptor *desc; ++ unsigned long flags; ++ u32 reg; ++ ++ if (priv->rx_dma_broken) ++ return -EINVAL; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (dma->rx_running) { ++ enum dma_status state; ++ ++ state = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, NULL); ++ if (state == DMA_COMPLETE) { ++ /* ++ * Disable RX interrupts to allow RX DMA completion ++ * callback to run. ++ */ ++ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(p, UART_IER, p->ier); ++ } ++ goto out; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->rxchan, dma->rx_addr, ++ dma->rx_size, DMA_DEV_TO_MEM, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ dma->rx_running = 1; ++ desc->callback = __dma_rx_complete; ++ desc->callback_param = p; ++ ++ dma->rx_cookie = dmaengine_submit(desc); ++ ++ /* ++ * Disable RX FIFO interrupt while RX DMA is enabled, else ++ * spurious interrupt may be raised when data is in the RX FIFO ++ * but is yet to be drained by DMA. 
++ */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg |= UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dma_async_issue_pending(dma->rxchan); ++out: ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return err; ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p); ++ ++static void omap_8250_dma_tx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct uart_8250_dma *dma = p->dma; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ unsigned long flags; ++ bool en_thri = false; ++ struct omap8250_priv *priv = p->port.private_data; ++ ++ dma_sync_single_for_cpu(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ dma->tx_running = 0; ++ ++ xmit->tail += dma->tx_size; ++ xmit->tail &= UART_XMIT_SIZE - 1; ++ p->port.icount.tx += dma->tx_size; ++ ++ if (priv->delayed_restore) { ++ priv->delayed_restore = 0; ++ omap8250_restore_regs(p); ++ } ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(&p->port); ++ ++ if (!uart_circ_empty(xmit) && !uart_tx_stopped(&p->port)) { ++ int ret; ++ ++ ret = omap_8250_tx_dma(p); ++ if (ret) ++ en_thri = true; ++ } else if (p->capabilities & UART_CAP_RPM) { ++ en_thri = true; ++ } ++ ++ if (en_thri) { ++ dma->tx_err = 1; ++ serial8250_set_THRI(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ struct dma_async_tx_descriptor *desc; ++ unsigned int skip_byte = 0; ++ int ret; ++ ++ if (dma->tx_running) ++ return 0; ++ if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) { ++ ++ /* ++ * Even if no data, we need to return an error for the two cases ++ * below so serial8250_tx_chars() is invoked and properly clears ++ * THRI and/or runtime suspend. ++ */ ++ if (dma->tx_err || p->capabilities & UART_CAP_RPM) { ++ ret = -EBUSY; ++ goto err; ++ } ++ serial8250_clear_THRI(p); ++ return 0; ++ } ++ ++ dma->tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); ++ if (priv->habit & OMAP_DMA_TX_KICK) { ++ u8 tx_lvl; ++ ++ /* ++ * We need to put the first byte into the FIFO in order to start ++ * the DMA transfer. For transfers smaller than four bytes we ++ * don't bother doing DMA at all. It seem not matter if there ++ * are still bytes in the FIFO from the last transfer (in case ++ * we got here directly from omap_8250_dma_tx_complete()). Bytes ++ * leaving the FIFO seem not to trigger the DMA transfer. It is ++ * really the byte that we put into the FIFO. ++ * If the FIFO is already full then we most likely got here from ++ * omap_8250_dma_tx_complete(). And this means the DMA engine ++ * just completed its work. We don't have to wait the complete ++ * 86us at 115200,8n1 but around 60us (not to mention lower ++ * baudrates). So in that case we take the interrupt and try ++ * again with an empty FIFO. 
++ */ ++ tx_lvl = serial_in(p, UART_OMAP_TX_LVL); ++ if (tx_lvl == p->tx_loadsz) { ++ ret = -EBUSY; ++ goto err; ++ } ++ if (dma->tx_size < 4) { ++ ret = -EINVAL; ++ goto err; ++ } ++ skip_byte = 1; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->txchan, ++ dma->tx_addr + xmit->tail + skip_byte, ++ dma->tx_size - skip_byte, DMA_MEM_TO_DEV, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ dma->tx_running = 1; ++ ++ desc->callback = omap_8250_dma_tx_complete; ++ desc->callback_param = p; ++ ++ dma->tx_cookie = dmaengine_submit(desc); ++ ++ dma_sync_single_for_device(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ dma_async_issue_pending(dma->txchan); ++ if (dma->tx_err) ++ dma->tx_err = 0; ++ ++ serial8250_clear_THRI(p); ++ if (skip_byte) ++ serial_out(p, UART_TX, xmit->buf[xmit->tail]); ++ return 0; ++err: ++ dma->tx_err = 1; ++ return ret; ++} ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ case UART_IIR_RDI: ++ omap_8250_rx_dma_flush(up); ++ return true; ++ } ++ return omap_8250_rx_dma(up); ++} ++ ++static u16 omap_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, u16 status) ++{ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (iir & UART_IIR_RDI)) { ++ if (handle_rx_dma(up, iir)) { ++ status = serial8250_rx_chars(up, status); ++ omap_8250_rx_dma(up); ++ } ++ } ++ ++ return status; ++} ++ ++static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, ++ u16 status) ++{ ++ /* ++ * Queue a new transfer if FIFO has data. ++ */ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (up->ier & UART_IER_RDI)) { ++ omap_8250_rx_dma(up); ++ serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE); ++ } else if ((iir & 0x3f) == UART_IIR_RX_TIMEOUT) { ++ /* ++ * Disable RX timeout, read IIR to clear ++ * current timeout condition, clear EFR2 to ++ * periodic timeouts, re-enable interrupts. ++ */ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(up, UART_IER, up->ier); ++ omap_8250_rx_dma_flush(up); ++ serial_in(up, UART_IIR); ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ } ++} ++ ++/* ++ * This is mostly serial8250_handle_irq(). We have a slightly different DMA ++ * hoook for RX/TX and need different logic for them in the ISR. Therefore we ++ * use the default routine in the non-DMA case and this one for with DMA. ++ */ ++static int omap_8250_dma_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u16 status; ++ u8 iir; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ if (iir & UART_IIR_NO_INT) { ++ serial8250_rpm_put(up); ++ return IRQ_HANDLED; ++ } ++ ++ spin_lock(&port->lock); ++ ++ status = serial_port_in(port, UART_LSR); ++ ++ if (priv->habit & UART_HAS_EFR2) ++ am654_8250_handle_rx_dma(up, iir, status); ++ else ++ status = omap_8250_handle_rx_dma(up, iir, status); ++ ++ serial8250_modem_status(up); ++ if (status & UART_LSR_THRE && up->dma->tx_err) { ++ if (uart_tx_stopped(&up->port) || ++ uart_circ_empty(&up->port.state->xmit)) { ++ up->dma->tx_err = 0; ++ serial8250_tx_chars(up); ++ } else { ++ /* ++ * try again due to an earlier failer which ++ * might have been resolved by now. 
++ */ ++ if (omap_8250_tx_dma(up)) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ uart_unlock_and_check_sysrq(port); ++ ++ serial8250_rpm_put(up); ++ return 1; ++} ++ ++static bool the_no_dma_filter_fn(struct dma_chan *chan, void *param) ++{ ++ return false; ++} ++ ++#else ++ ++static inline int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ return -EINVAL; ++} ++#endif ++ ++static int omap8250_no_handle_irq(struct uart_port *port) ++{ ++ /* IRQ has not been requested but handling irq? */ ++ WARN_ONCE(1, "Unexpected irq handling before port startup\n"); ++ return 0; ++} ++ ++static struct omap8250_dma_params am654_dma = { ++ .rx_size = SZ_2K, ++ .rx_trigger = 1, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_dma_params am33xx_dma = { ++ .rx_size = RX_TRIGGER, ++ .rx_trigger = RX_TRIGGER, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_platdata am654_platdata = { ++ .dma_params = &am654_dma, ++ .habit = UART_HAS_EFR2 | UART_HAS_RHR_IT_DIS | ++ UART_RX_TIMEOUT_QUIRK, ++}; ++ ++static struct omap8250_platdata am33xx_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = OMAP_DMA_TX_KICK | UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static struct omap8250_platdata omap4_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static const struct of_device_id omap8250_dt_ids[] = { ++ { .compatible = "ti,am654-uart", .data = &am654_platdata, }, ++ { .compatible = "ti,omap2-uart" }, ++ { .compatible = "ti,omap3-uart" }, ++ { .compatible = "ti,omap4-uart", .data = &omap4_platdata, }, ++ { .compatible = "ti,am3352-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,am4372-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,dra742-uart", .data = &omap4_platdata, }, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, omap8250_dt_ids); ++ ++static int omap8250_probe(struct platform_device *pdev) ++{ ++ struct device_node *np = pdev->dev.of_node; ++ struct omap8250_priv *priv; ++ const struct omap8250_platdata *pdata; ++ struct uart_8250_port up; ++ struct resource *regs; ++ void __iomem *membase; ++ int irq, ret; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!regs) { ++ dev_err(&pdev->dev, "missing registers\n"); ++ return -EINVAL; ++ } ++ ++ priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ membase = devm_ioremap(&pdev->dev, regs->start, ++ resource_size(regs)); ++ if (!membase) ++ return -ENODEV; ++ ++ memset(&up, 0, sizeof(up)); ++ up.port.dev = &pdev->dev; ++ up.port.mapbase = regs->start; ++ up.port.membase = membase; ++ up.port.irq = irq; ++ /* ++ * It claims to be 16C750 compatible however it is a little different. ++ * It has EFR and has no FCR7_64byte bit. The AFE (which it claims to ++ * have) is enabled via EFR instead of MCR. The type is set here 8250 ++ * just to get things going. UNKNOWN does not work for a few reasons and ++ * we don't need our own type since we don't use 8250's set_termios() ++ * or pm callback. ++ */ ++ up.port.type = PORT_8250; ++ up.port.iotype = UPIO_MEM; ++ up.port.flags = UPF_FIXED_PORT | UPF_FIXED_TYPE | UPF_SOFT_FLOW | ++ UPF_HARD_FLOW; ++ up.port.private_data = priv; ++ ++ up.port.regshift = 2; ++ up.port.fifosize = 64; ++ up.tx_loadsz = 64; ++ up.capabilities = UART_CAP_FIFO; ++#ifdef CONFIG_PM ++ /* ++ * Runtime PM is mostly transparent. However to do it right we need to a ++ * TX empty interrupt before we can put the device to auto idle. 
So if ++ * PM is not enabled we don't add that flag and can spare that one extra ++ * interrupt in the TX path. ++ */ ++ up.capabilities |= UART_CAP_RPM; ++#endif ++ up.port.set_termios = omap_8250_set_termios; ++ up.port.set_mctrl = omap8250_set_mctrl; ++ up.port.pm = omap_8250_pm; ++ up.port.startup = omap_8250_startup; ++ up.port.shutdown = omap_8250_shutdown; ++ up.port.throttle = omap_8250_throttle; ++ up.port.unthrottle = omap_8250_unthrottle; ++ up.port.rs485_config = serial8250_em485_config; ++ up.port.rs485_supported = serial8250_em485_supported; ++ up.rs485_start_tx = serial8250_em485_start_tx; ++ up.rs485_stop_tx = serial8250_em485_stop_tx; ++ up.port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ ret = of_alias_get_id(np, "serial"); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "failed to get alias\n"); ++ return ret; ++ } ++ up.port.line = ret; ++ ++ if (of_property_read_u32(np, "clock-frequency", &up.port.uartclk)) { ++ struct clk *clk; ++ ++ clk = devm_clk_get(&pdev->dev, NULL); ++ if (IS_ERR(clk)) { ++ if (PTR_ERR(clk) == -EPROBE_DEFER) ++ return -EPROBE_DEFER; ++ } else { ++ up.port.uartclk = clk_get_rate(clk); ++ } ++ } ++ ++ if (of_property_read_u32(np, "overrun-throttle-ms", ++ &up.overrun_backoff_time_ms) != 0) ++ up.overrun_backoff_time_ms = 0; ++ ++ priv->wakeirq = irq_of_parse_and_map(np, 1); ++ ++ pdata = of_device_get_match_data(&pdev->dev); ++ if (pdata) ++ priv->habit |= pdata->habit; ++ ++ if (!up.port.uartclk) { ++ up.port.uartclk = DEFAULT_CLK_SPEED; ++ dev_warn(&pdev->dev, ++ "No clock speed specified: using default: %d\n", ++ DEFAULT_CLK_SPEED); ++ } ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ priv->calc_latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ cpu_latency_qos_add_request(&priv->pm_qos_request, priv->latency); ++ INIT_WORK(&priv->qos_work, omap8250_uart_qos_work); ++ ++ spin_lock_init(&priv->rx_dma_lock); ++ ++ device_init_wakeup(&pdev->dev, true); ++ pm_runtime_enable(&pdev->dev); ++ pm_runtime_use_autosuspend(&pdev->dev); ++ ++ /* ++ * Disable runtime PM until autosuspend delay unless specifically ++ * enabled by the user via sysfs. This is the historic way to ++ * prevent an unsafe default policy with lossy characters on wake-up. ++ * For serdev devices this is not needed, the policy can be managed by ++ * the serdev driver. ++ */ ++ if (!of_get_available_child_count(pdev->dev.of_node)) ++ pm_runtime_set_autosuspend_delay(&pdev->dev, -1); ++ ++ pm_runtime_irq_safe(&pdev->dev); ++ ++ pm_runtime_get_sync(&pdev->dev); ++ ++ omap_serial_fill_features_erratas(&up, priv); ++ up.port.handle_irq = omap8250_no_handle_irq; ++ priv->rx_trigger = RX_TRIGGER; ++ priv->tx_trigger = TX_TRIGGER; ++#ifdef CONFIG_SERIAL_8250_DMA ++ /* ++ * Oh DMA support. If there are no DMA properties in the DT then ++ * we will fall back to a generic DMA channel which does not ++ * really work here. To ensure that we do not get a generic DMA ++ * channel assigned, we have the the_no_dma_filter_fn() here. ++ * To avoid "failed to request DMA" messages we check for DMA ++ * properties in DT. 
++ */ ++ ret = of_property_count_strings(np, "dma-names"); ++ if (ret == 2) { ++ struct omap8250_dma_params *dma_params = NULL; ++ ++ up.dma = &priv->omap8250_dma; ++ up.dma->fn = the_no_dma_filter_fn; ++ up.dma->tx_dma = omap_8250_tx_dma; ++ up.dma->rx_dma = omap_8250_rx_dma; ++ if (pdata) ++ dma_params = pdata->dma_params; ++ ++ if (dma_params) { ++ up.dma->rx_size = dma_params->rx_size; ++ up.dma->rxconf.src_maxburst = dma_params->rx_trigger; ++ up.dma->txconf.dst_maxburst = dma_params->tx_trigger; ++ priv->rx_trigger = dma_params->rx_trigger; ++ priv->tx_trigger = dma_params->tx_trigger; ++ } else { ++ up.dma->rx_size = RX_TRIGGER; ++ up.dma->rxconf.src_maxburst = RX_TRIGGER; ++ up.dma->txconf.dst_maxburst = TX_TRIGGER; ++ } ++ } ++#endif ++ ret = serial8250_register_8250_port(&up); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "unable to register 8250 port\n"); ++ goto err; ++ } ++ priv->line = ret; ++ platform_set_drvdata(pdev, priv); ++ pm_runtime_mark_last_busy(&pdev->dev); ++ pm_runtime_put_autosuspend(&pdev->dev); ++ return 0; ++err: ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ pm_runtime_disable(&pdev->dev); ++ return ret; ++} ++ ++static int omap8250_remove(struct platform_device *pdev) ++{ ++ struct omap8250_priv *priv = platform_get_drvdata(pdev); ++ int err; ++ ++ err = pm_runtime_resume_and_get(&pdev->dev); ++ if (err) ++ return err; ++ ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ flush_work(&priv->qos_work); ++ pm_runtime_disable(&pdev->dev); ++ serial8250_unregister_port(priv->line); ++ cpu_latency_qos_remove_request(&priv->pm_qos_request); ++ device_init_wakeup(&pdev->dev, false); ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int omap8250_prepare(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return 0; ++ priv->is_suspending = true; ++ return 0; ++} ++ ++static void omap8250_complete(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return; ++ priv->is_suspending = false; ++} ++ ++static int omap8250_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ ++ serial8250_suspend_port(priv->line); ++ ++ pm_runtime_get_sync(dev); ++ if (!device_may_wakeup(dev)) ++ priv->wer = 0; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ pm_runtime_mark_last_busy(dev); ++ pm_runtime_put_autosuspend(dev); ++ ++ flush_work(&priv->qos_work); ++ return 0; ++} ++ ++static int omap8250_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ serial8250_resume_port(priv->line); ++ return 0; ++} ++#else ++#define omap8250_prepare NULL ++#define omap8250_complete NULL ++#endif ++ ++#ifdef CONFIG_PM ++static int omap8250_lost_context(struct uart_8250_port *up) ++{ ++ u32 val; ++ ++ val = serial_in(up, UART_OMAP_SCR); ++ /* ++ * If we lose context, then SCR is set to its reset value of zero. ++ * After set_termios() we set bit 3 of SCR (TX_EMPTY_CTL_IT) to 1, ++ * among other bits, to never set the register back to zero again. 
++ */ ++ if (!val) ++ return 1; ++ return 0; ++} ++ ++/* TODO: in future, this should happen via API in drivers/reset/ */ ++static int omap8250_soft_reset(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ int timeout = 100; ++ int sysc; ++ int syss; ++ ++ /* ++ * At least on omap4, unused uarts may not idle after reset without ++ * a basic scr dma configuration even with no dma in use. The ++ * module clkctrl status bits will be 1 instead of 3 blocking idle ++ * for the whole clockdomain. The softreset below will clear scr, ++ * and we restore it on resume so this is safe to do on all SoCs ++ * needing omap8250_soft_reset() quirk. Do it in two writes as ++ * recommended in the comment for omap8250_update_scr(). ++ */ ++ serial_out(up, UART_OMAP_SCR, OMAP_UART_SCR_DMAMODE_1); ++ serial_out(up, UART_OMAP_SCR, ++ OMAP_UART_SCR_DMAMODE_1 | OMAP_UART_SCR_DMAMODE_CTL); ++ ++ sysc = serial_in(up, UART_OMAP_SYSC); ++ ++ /* softreset the UART */ ++ sysc |= OMAP_UART_SYSC_SOFTRESET; ++ serial_out(up, UART_OMAP_SYSC, sysc); ++ ++ /* By experiments, 1us enough for reset complete on AM335x */ ++ do { ++ udelay(1); ++ syss = serial_in(up, UART_OMAP_SYSS); ++ } while (--timeout && !(syss & OMAP_UART_SYSS_RESETDONE)); ++ ++ if (!timeout) { ++ dev_err(dev, "timed out waiting for reset done\n"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++static int omap8250_runtime_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ /* ++ * When using 'no_console_suspend', the console UART must not be ++ * suspended. Since driver suspend is managed by runtime suspend, ++ * preventing runtime suspend (by returning error) will keep device ++ * active during suspend. 
++ */ ++ if (priv->is_suspending && !console_suspend_enabled) { ++ if (uart_console(&up->port)) ++ return -EBUSY; ++ } ++ ++ if (priv->habit & UART_ERRATA_CLOCK_DISABLE) { ++ int ret; ++ ++ ret = omap8250_soft_reset(dev); ++ if (ret) ++ return ret; ++ ++ /* Restore to UART mode after reset (for wakeup) */ ++ omap8250_update_mdr1(up, priv); ++ /* Restore wakeup enable register */ ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ } ++ ++ if (up->dma && up->dma->rxchan) ++ omap_8250_rx_dma_flush(up); ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ schedule_work(&priv->qos_work); ++ ++ return 0; ++} ++ ++static int omap8250_runtime_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ ++ if (omap8250_lost_context(up)) ++ omap8250_restore_regs(up); ++ ++ if (up->dma && up->dma->rxchan && !(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(up); ++ ++ priv->latency = priv->calc_latency; ++ schedule_work(&priv->qos_work); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SERIAL_8250_OMAP_TTYO_FIXUP ++static int __init omap8250_console_fixup(void) ++{ ++ char *omap_str; ++ char *options; ++ u8 idx; ++ ++ if (strstr(boot_command_line, "console=ttyS")) ++ /* user set a ttyS based name for the console */ ++ return 0; ++ ++ omap_str = strstr(boot_command_line, "console=ttyO"); ++ if (!omap_str) ++ /* user did not set ttyO based console, so we don't care */ ++ return 0; ++ ++ omap_str += 12; ++ if ('0' <= *omap_str && *omap_str <= '9') ++ idx = *omap_str - '0'; ++ else ++ return 0; ++ ++ omap_str++; ++ if (omap_str[0] == ',') { ++ omap_str++; ++ options = omap_str; ++ } else { ++ options = NULL; ++ } ++ ++ add_preferred_console("ttyS", idx, options); ++ pr_err("WARNING: Your 'console=ttyO%d' has been replaced by 'ttyS%d'\n", ++ idx, idx); ++ pr_err("This ensures that you still see kernel messages. 
Please\n"); ++ pr_err("update your kernel commandline.\n"); ++ return 0; ++} ++console_initcall(omap8250_console_fixup); ++#endif ++ ++static const struct dev_pm_ops omap8250_dev_pm_ops = { ++ SET_SYSTEM_SLEEP_PM_OPS(omap8250_suspend, omap8250_resume) ++ SET_RUNTIME_PM_OPS(omap8250_runtime_suspend, ++ omap8250_runtime_resume, NULL) ++ .prepare = omap8250_prepare, ++ .complete = omap8250_complete, ++}; ++ ++static struct platform_driver omap8250_platform_driver = { ++ .driver = { ++ .name = "omap8250", ++ .pm = &omap8250_dev_pm_ops, ++ .of_match_table = omap8250_dt_ids, ++ }, ++ .probe = omap8250_probe, ++ .remove = omap8250_remove, ++}; ++module_platform_driver(omap8250_platform_driver); ++ ++MODULE_AUTHOR("Sebastian Andrzej Siewior"); ++MODULE_DESCRIPTION("OMAP 8250 Driver"); ++MODULE_LICENSE("GPL v2"); +Binary files linux.orig/drivers/tty/serial/8250/.8250_omap.c.rej.swp and linux/drivers/tty/serial/8250/.8250_omap.c.rej.swp differ +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c linux/drivers/tty/serial/8250/8250_port.c +--- linux.orig/drivers/tty/serial/8250/8250_port.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c 2022-12-04 10:40:26.700034085 -0500 +@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } @@ -3637,7 +22982,7 @@ index 2030a92ac66e7..326549603740d 100644 if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); -@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_port *up) +@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_p */ static void autoconfig_16550a(struct uart_8250_port *up) { @@ -3649,7 +22994,7 @@ index 2030a92ac66e7..326549603740d 100644 up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; -@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uar return; } @@ -3661,7 +23006,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's -@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uar } serial_out(up, UART_IER, iersave); @@ -3671,7 +23016,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. -@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_ unsigned char status1, scratch, scratch2, scratch3; unsigned char save_lcr, save_mcr; struct uart_port *port = &up->port; @@ -3682,7 +23027,7 @@ index 2030a92ac66e7..326549603740d 100644 if (!port->iobase && !port->mapbase && !port->membase) return; -@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_ up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { @@ -3694,7 +23039,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. 
-@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_ #endif scratch3 = serial_in(up, UART_IER) & 0x0f; serial_out(up, UART_IER, scratch); @@ -3705,7 +23050,7 @@ index 2030a92ac66e7..326549603740d 100644 if (scratch2 != 0 || scratch3 != 0x0F) { /* * We failed; there's nothing here -@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_ serial8250_out_MCR(up, save_mcr); serial8250_clear_fifos(up); serial_in(up, UART_RX); @@ -3717,7 +23062,7 @@ index 2030a92ac66e7..326549603740d 100644 out_unlock: spin_unlock_irqrestore(&port->lock, flags); -@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8 unsigned char save_mcr, save_ier; unsigned char save_ICP = 0; unsigned int ICP = 0; @@ -3727,7 +23072,7 @@ index 2030a92ac66e7..326549603740d 100644 int irq; if (port->flags & UPF_FOURPORT) { -@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8 inb_p(ICP); } @@ -3741,7 +23086,7 @@ index 2030a92ac66e7..326549603740d 100644 /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); -@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8 if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); @@ -3753,7 +23098,7 @@ index 2030a92ac66e7..326549603740d 100644 port->irq = (irq > 0) ? irq : 0; } -@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct uart_port *port) +@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct ua up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; @@ -3762,7 +23107,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) +@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uar serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3771,7 +23116,7 @@ index 2030a92ac66e7..326549603740d 100644 } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); -@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct uart_port *port) +@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; @@ -3780,7 +23125,7 @@ index 2030a92ac66e7..326549603740d 100644 } static void serial8250_enable_ms(struct uart_port *port) -@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct uart_port *port) +@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct up->ier |= UART_IER_MSI; serial8250_rpm_get(up); @@ -3789,7 +23134,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -2144,14 +2171,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2147,14 +2174,7 @@ static void serial8250_put_poll_char(str struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); @@ -3805,7 +23150,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* -@@ -2164,7 +2184,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2167,7 +2187,7 @@ static void serial8250_put_poll_char(str * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); @@ -3814,7 +23159,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } 
-@@ -2173,8 +2193,10 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2176,8 +2196,10 @@ static void serial8250_put_poll_char(str int serial8250_do_startup(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); @@ -3825,7 +23170,7 @@ index 2030a92ac66e7..326549603740d 100644 int retval; u16 lsr; -@@ -2195,7 +2217,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2198,7 +2220,7 @@ int serial8250_do_startup(struct uart_po up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); @@ -3834,7 +23179,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); -@@ -2205,7 +2227,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2208,7 +2230,7 @@ int serial8250_do_startup(struct uart_po if (port->type == PORT_DA830) { /* Reset the port */ @@ -3843,7 +23188,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); mdelay(10); -@@ -2304,6 +2326,8 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2307,6 +2329,8 @@ int serial8250_do_startup(struct uart_po if (retval) goto out; @@ -3852,7 +23197,7 @@ index 2030a92ac66e7..326549603740d 100644 if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; -@@ -2320,6 +2344,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2323,6 +2347,9 @@ int serial8250_do_startup(struct uart_po */ spin_lock_irqsave(&port->lock, flags); @@ -3862,7 +23207,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ -@@ -2330,6 +2357,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2333,6 +2360,9 @@ int serial8250_do_startup(struct uart_po iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); @@ -3872,7 +23217,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); if (port->irqflags & IRQF_SHARED) -@@ -2384,10 +2414,14 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2387,10 +2417,14 @@ int serial8250_do_startup(struct uart_po * Do a quick test to see if we receive an interrupt when we enable * the TX irq. 
*/ @@ -3887,7 +23232,7 @@ index 2030a92ac66e7..326549603740d 100644 if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { if (!(up->bugs & UART_BUG_TXEN)) { -@@ -2419,7 +2453,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2422,7 +2456,7 @@ dont_test_tx_en: if (up->dma) { const char *msg = NULL; @@ -3896,7 +23241,7 @@ index 2030a92ac66e7..326549603740d 100644 msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; -@@ -2470,7 +2504,7 @@ void serial8250_do_shutdown(struct uart_port *port) +@@ -2473,7 +2507,7 @@ void serial8250_do_shutdown(struct uart_ */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; @@ -3905,7 +23250,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); -@@ -2836,7 +2870,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, +@@ -2839,7 +2873,7 @@ serial8250_do_set_termios(struct uart_po if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; @@ -3914,7 +23259,7 @@ index 2030a92ac66e7..326549603740d 100644 if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; -@@ -3301,7 +3335,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); +@@ -3304,7 +3338,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_default #ifdef CONFIG_SERIAL_8250_CONSOLE @@ -3923,7 +23268,7 @@ index 2030a92ac66e7..326549603740d 100644 { struct uart_8250_port *up = up_to_u8250p(port); -@@ -3309,6 +3343,18 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) +@@ -3312,6 +3346,18 @@ static void serial8250_console_putchar(s serial_port_out(port, UART_TX, ch); } @@ -3942,7 +23287,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Restore serial console when h/w power-off detected */ -@@ -3335,6 +3381,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) +@@ -3338,6 +3384,32 @@ static void serial8250_console_restore(s serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } @@ -3975,7 +23320,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Print a string to the serial port using the device FIFO * -@@ -3380,24 +3452,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3383,24 +3455,12 @@ void serial8250_console_write(struct uar struct uart_port *port = &up->port; unsigned long flags; unsigned int ier, use_fifo; @@ -3987,13 +23332,13 @@ index 2030a92ac66e7..326549603740d 100644 - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); -+ spin_lock_irqsave(&port->lock, flags); - +- - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); -- ++ spin_lock_irqsave(&port->lock, flags); + - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else @@ -4002,7 +23347,7 @@ index 2030a92ac66e7..326549603740d 100644 /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { -@@ -3431,10 +3491,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3434,10 +3494,12 @@ void serial8250_console_write(struct uar */ !(up->port.flags & UPF_CONS_FLOW); @@ -4015,7 +23360,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Finally, wait for transmitter to become empty -@@ -3447,8 +3509,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3450,8 +3512,7 @@ void serial8250_console_write(struct uar if (em485->tx_stopped) 
up->rs485_stop_tx(up); } @@ -4025,7 +23370,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * The receive handling will happen properly because the -@@ -3460,8 +3521,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3463,8 +3524,7 @@ void serial8250_console_write(struct uar if (up->msr_saved_flags) serial8250_modem_status(up); @@ -4035,7 +23380,7 @@ index 2030a92ac66e7..326549603740d 100644 } static unsigned int probe_baud(struct uart_port *port) -@@ -3481,6 +3541,7 @@ static unsigned int probe_baud(struct uart_port *port) +@@ -3484,6 +3544,7 @@ static unsigned int probe_baud(struct ua int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { @@ -4043,7 +23388,7 @@ index 2030a92ac66e7..326549603740d 100644 int baud = 9600; int bits = 8; int parity = 'n'; -@@ -3490,6 +3551,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) +@@ -3493,6 +3554,8 @@ int serial8250_console_setup(struct uart if (!port->iobase && !port->membase) return -ENODEV; @@ -4052,10 +23397,3534 @@ index 2030a92ac66e7..326549603740d 100644 if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) -diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig -index d0b49e15fbf5e..02c308467339c 100644 ---- a/drivers/tty/serial/8250/Kconfig -+++ b/drivers/tty/serial/8250/Kconfig +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c.orig linux/drivers/tty/serial/8250/8250_port.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_port.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,3521 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Base port operations for 8250/16550-type serial ports ++ * ++ * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. ++ * Split from 8250_core.c, Copyright (C) 2001 Russell King. ++ * ++ * A note about mapbase / membase ++ * ++ * mapbase is the physical address of the IO port. ++ * membase is an 'ioremapped' cookie. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "8250.h" ++ ++/* Nuvoton NPCM timeout register */ ++#define UART_NPCM_TOR 7 ++#define UART_NPCM_TOIE BIT(7) /* Timeout Interrupt Enable */ ++ ++/* ++ * Debugging. ++ */ ++#if 0 ++#define DEBUG_AUTOCONF(fmt...) printk(fmt) ++#else ++#define DEBUG_AUTOCONF(fmt...) do { } while (0) ++#endif ++ ++/* ++ * Here we define the default xmit fifo size used for each type of UART. 
++ */ ++static const struct serial8250_config uart_config[] = { ++ [PORT_UNKNOWN] = { ++ .name = "unknown", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_8250] = { ++ .name = "8250", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16450] = { ++ .name = "16450", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550] = { ++ .name = "16550", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550A] = { ++ .name = "16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_CIRRUS] = { ++ .name = "Cirrus", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16650] = { ++ .name = "ST16650", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16650V2] = { ++ .name = "ST16650V2", ++ .fifo_size = 32, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_00, ++ .rxtrig_bytes = {8, 16, 24, 28}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16750] = { ++ .name = "TI16750", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .rxtrig_bytes = {1, 16, 32, 56}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE, ++ }, ++ [PORT_STARTECH] = { ++ .name = "Startech", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16C950] = { ++ .name = "16C950/954", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, ++ .rxtrig_bytes = {16, 32, 112, 120}, ++ /* UART_CAP_EFR breaks billionon CF bluetooth card. */ ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_16654] = { ++ .name = "ST16654", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_10, ++ .rxtrig_bytes = {8, 16, 56, 60}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16850] = { ++ .name = "XR16850", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_RSA] = { ++ .name = "RSA", ++ .fifo_size = 2048, ++ .tx_loadsz = 2048, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NS16550A] = { ++ .name = "NS16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_NATSEMI, ++ }, ++ [PORT_XSCALE] = { ++ .name = "XScale", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE, ++ }, ++ [PORT_OCTEON] = { ++ .name = "OCTEON", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_AR7] = { ++ .name = "AR7", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .flags = UART_CAP_FIFO /* | UART_CAP_AFE */, ++ }, ++ [PORT_U6_16550A] = { ++ .name = "U6_16550A", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_TEGRA] = { ++ .name = "Tegra", ++ .fifo_size = 32, ++ .tx_loadsz = 8, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_01, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_RTOIE, ++ }, ++ [PORT_XR17D15X] = { 
++ .name = "XR17D15X", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_XR17V35X] = { ++ .name = "XR17V35X", ++ .fifo_size = 256, ++ .tx_loadsz = 256, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 | ++ UART_FCR_T_TRIG_11, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_LPC3220] = { ++ .name = "LPC3220", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_BRCM_TRUMANAGE] = { ++ .name = "TruManage", ++ .fifo_size = 1, ++ .tx_loadsz = 1024, ++ .flags = UART_CAP_HFIFO, ++ }, ++ [PORT_8250_CIR] = { ++ .name = "CIR port" ++ }, ++ [PORT_ALTR_16550_F32] = { ++ .name = "Altera 16550 FIFO32", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 8, 16, 30}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F64] = { ++ .name = "Altera 16550 FIFO64", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 16, 32, 62}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F128] = { ++ .name = "Altera 16550 FIFO128", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 126}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ /* ++ * tx_loadsz is set to 63-bytes instead of 64-bytes to implement ++ * workaround of errata A-008006 which states that tx_loadsz should ++ * be configured less than Maximum supported fifo bytes. ++ */ ++ [PORT_16550A_FSL64] = { ++ .name = "16550A_FSL64", ++ .fifo_size = 64, ++ .tx_loadsz = 63, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .flags = UART_CAP_FIFO | UART_CAP_NOTEMT, ++ }, ++ [PORT_RT2880] = { ++ .name = "Palmchip BK-3103", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_DA830] = { ++ .name = "TI DA8xx/66AK2x", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_MTK_BTIF] = { ++ .name = "MediaTek BTIF", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NPCM] = { ++ .name = "Nuvoton 16550", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_SUNIX] = { ++ .name = "Sunix", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 112}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_ASPEED_VUART] = { ++ .name = "ASPEED VUART", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++}; ++ ++/* Uart divisor latch read */ ++static int default_serial_dl_read(struct uart_8250_port *up) ++{ ++ /* Assign these in pieces to truncate any bits above 7. 
*/ ++ unsigned char dll = serial_in(up, UART_DLL); ++ unsigned char dlm = serial_in(up, UART_DLM); ++ ++ return dll | dlm << 8; ++} ++ ++/* Uart divisor latch write */ ++static void default_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ serial_out(up, UART_DLL, value & 0xff); ++ serial_out(up, UART_DLM, value >> 8 & 0xff); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ ++#define UART_REG_UNMAPPED -1 ++ ++/* Au1x00/RT288x UART hardware has a weird register layout */ ++static const s8 au_io_in_map[8] = { ++ [UART_RX] = 0, ++ [UART_IER] = 2, ++ [UART_IIR] = 3, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = 7, ++ [UART_MSR] = 8, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++static const s8 au_io_out_map[8] = { ++ [UART_TX] = 1, ++ [UART_IER] = 2, ++ [UART_FCR] = 4, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = UART_REG_UNMAPPED, ++ [UART_MSR] = UART_REG_UNMAPPED, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++unsigned int au_serial_in(struct uart_port *p, int offset) ++{ ++ if (offset >= ARRAY_SIZE(au_io_in_map)) ++ return UINT_MAX; ++ offset = au_io_in_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return UINT_MAX; ++ return __raw_readl(p->membase + (offset << p->regshift)); ++} ++ ++void au_serial_out(struct uart_port *p, int offset, int value) ++{ ++ if (offset >= ARRAY_SIZE(au_io_out_map)) ++ return; ++ offset = au_io_out_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return; ++ __raw_writel(value, p->membase + (offset << p->regshift)); ++} ++ ++/* Au1x00 haven't got a standard divisor latch */ ++static int au_serial_dl_read(struct uart_8250_port *up) ++{ ++ return __raw_readl(up->port.membase + 0x28); ++} ++ ++static void au_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ __raw_writel(value, up->port.membase + 0x28); ++} ++ ++#endif ++ ++static unsigned int hub6_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ return inb(p->iobase + 1); ++} ++ ++static void hub6_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ outb(value, p->iobase + 1); ++} ++ ++static unsigned int mem_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readb(p->membase + offset); ++} ++ ++static void mem_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writeb(value, p->membase + offset); ++} ++ ++static void mem16_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writew(value, p->membase + offset); ++} ++ ++static unsigned int mem16_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readw(p->membase + offset); ++} ++ ++static void mem32_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writel(value, p->membase + offset); ++} ++ ++static unsigned int mem32_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readl(p->membase + offset); ++} ++ ++static void mem32be_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ iowrite32be(value, p->membase + offset); ++} ++ ++static unsigned int mem32be_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return ioread32be(p->membase + offset); ++} ++ ++static unsigned int io_serial_in(struct uart_port *p, int offset) ++{ ++ offset = 
offset << p->regshift; ++ return inb(p->iobase + offset); ++} ++ ++static void io_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(value, p->iobase + offset); ++} ++ ++static int serial8250_default_handle_irq(struct uart_port *port); ++ ++static void set_io_from_upio(struct uart_port *p) ++{ ++ struct uart_8250_port *up = up_to_u8250p(p); ++ ++ up->dl_read = default_serial_dl_read; ++ up->dl_write = default_serial_dl_write; ++ ++ switch (p->iotype) { ++ case UPIO_HUB6: ++ p->serial_in = hub6_serial_in; ++ p->serial_out = hub6_serial_out; ++ break; ++ ++ case UPIO_MEM: ++ p->serial_in = mem_serial_in; ++ p->serial_out = mem_serial_out; ++ break; ++ ++ case UPIO_MEM16: ++ p->serial_in = mem16_serial_in; ++ p->serial_out = mem16_serial_out; ++ break; ++ ++ case UPIO_MEM32: ++ p->serial_in = mem32_serial_in; ++ p->serial_out = mem32_serial_out; ++ break; ++ ++ case UPIO_MEM32BE: ++ p->serial_in = mem32be_serial_in; ++ p->serial_out = mem32be_serial_out; ++ break; ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ case UPIO_AU: ++ p->serial_in = au_serial_in; ++ p->serial_out = au_serial_out; ++ up->dl_read = au_serial_dl_read; ++ up->dl_write = au_serial_dl_write; ++ break; ++#endif ++ ++ default: ++ p->serial_in = io_serial_in; ++ p->serial_out = io_serial_out; ++ break; ++ } ++ /* Remember loaded iotype */ ++ up->cur_iotype = p->iotype; ++ p->handle_irq = serial8250_default_handle_irq; ++} ++ ++static void ++serial_port_out_sync(struct uart_port *p, int offset, int value) ++{ ++ switch (p->iotype) { ++ case UPIO_MEM: ++ case UPIO_MEM16: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_AU: ++ p->serial_out(p, offset, value); ++ p->serial_in(p, UART_LCR); /* safe, no side-effects */ ++ break; ++ default: ++ p->serial_out(p, offset, value); ++ } ++} ++ ++/* ++ * FIFO support. ++ */ ++static void serial8250_clear_fifos(struct uart_8250_port *p) ++{ ++ if (p->capabilities & UART_CAP_FIFO) { ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial_out(p, UART_FCR, 0); ++ } ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); ++ ++void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) ++{ ++ serial8250_clear_fifos(p); ++ serial_out(p, UART_FCR, p->fcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos); ++ ++void serial8250_rpm_get(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get); ++ ++void serial8250_rpm_put(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put); ++ ++/** ++ * serial8250_em485_init() - put uart_8250_port into rs485 emulating ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to start rs485 software emulating on the ++ * &struct uart_8250_port* @p. Namely, RTS is switched before/after ++ * transmission. The function is idempotent, so it is safe to call it ++ * multiple times. ++ * ++ * The caller MUST enable interrupt on empty shift register before ++ * calling serial8250_em485_init(). This interrupt is not a part of ++ * 8250 standard, but implementation defined. 
++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_destroy() ++ * ++ * Return 0 - success, -errno - otherwise ++ */ ++static int serial8250_em485_init(struct uart_8250_port *p) ++{ ++ if (p->em485) ++ goto deassert_rts; ++ ++ p->em485 = kmalloc(sizeof(struct uart_8250_em485), GFP_ATOMIC); ++ if (!p->em485) ++ return -ENOMEM; ++ ++ hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; ++ p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; ++ p->em485->port = p; ++ p->em485->active_timer = NULL; ++ p->em485->tx_stopped = true; ++ ++deassert_rts: ++ if (p->em485->tx_stopped) ++ p->rs485_stop_tx(p); ++ ++ return 0; ++} ++ ++/** ++ * serial8250_em485_destroy() - put uart_8250_port into normal state ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to stop rs485 software emulating on the ++ * &struct uart_8250_port* @p. The function is idempotent, so it is safe to ++ * call it multiple times. ++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_init() ++ */ ++void serial8250_em485_destroy(struct uart_8250_port *p) ++{ ++ if (!p->em485) ++ return; ++ ++ hrtimer_cancel(&p->em485->start_tx_timer); ++ hrtimer_cancel(&p->em485->stop_tx_timer); ++ ++ kfree(p->em485); ++ p->em485 = NULL; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_destroy); ++ ++struct serial_rs485 serial8250_em485_supported = { ++ .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | ++ SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX, ++ .delay_rts_before_send = 1, ++ .delay_rts_after_send = 1, ++}; ++EXPORT_SYMBOL_GPL(serial8250_em485_supported); ++ ++/** ++ * serial8250_em485_config() - generic ->rs485_config() callback ++ * @port: uart port ++ * @rs485: rs485 settings ++ * ++ * Generic callback usable by 8250 uart drivers to activate rs485 settings ++ * if the uart is incapable of driving RTS as a Transmit Enable signal in ++ * hardware, relying on software emulation instead. ++ */ ++int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, ++ struct serial_rs485 *rs485) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* pick sane settings if the user hasn't */ ++ if (!!(rs485->flags & SER_RS485_RTS_ON_SEND) == ++ !!(rs485->flags & SER_RS485_RTS_AFTER_SEND)) { ++ rs485->flags |= SER_RS485_RTS_ON_SEND; ++ rs485->flags &= ~SER_RS485_RTS_AFTER_SEND; ++ } ++ ++ /* ++ * Both serial8250_em485_init() and serial8250_em485_destroy() ++ * are idempotent. ++ */ ++ if (rs485->flags & SER_RS485_ENABLED) ++ return serial8250_em485_init(up); ++ ++ serial8250_em485_destroy(up); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_config); ++ ++/* ++ * These two wrappers ensure that enable_runtime_pm_tx() can be called more than ++ * once and disable_runtime_pm_tx() will still disable RPM because the fifo is ++ * empty and the HW can idle again. 
++ */ ++void serial8250_rpm_get_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 1); ++ if (rpm_active) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get_tx); ++ ++void serial8250_rpm_put_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 0); ++ if (!rpm_active) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put_tx); ++ ++/* ++ * IER sleep support. UARTs which have EFRs need the "extended ++ * capability" bit enabled. Note that on XR16C850s, we need to ++ * reset LCR to write to IER. ++ */ ++static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) ++{ ++ unsigned char lcr = 0, efr = 0; ++ ++ serial8250_rpm_get(p); ++ ++ if (p->capabilities & UART_CAP_SLEEP) { ++ if (p->capabilities & UART_CAP_EFR) { ++ lcr = serial_in(p, UART_LCR); ++ efr = serial_in(p, UART_EFR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, UART_EFR_ECB); ++ serial_out(p, UART_LCR, 0); ++ } ++ serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ if (p->capabilities & UART_CAP_EFR) { ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, efr); ++ serial_out(p, UART_LCR, lcr); ++ } ++ } ++ ++ serial8250_rpm_put(p); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++/* ++ * Attempts to turn on the RSA FIFO. Returns zero on failure. ++ * We set the port uart clock rate if we succeed. ++ */ ++static int __enable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16; ++ ++ return result; ++} ++ ++static void enable_rsa(struct uart_8250_port *up) ++{ ++ if (up->port.type == PORT_RSA) { ++ if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ __enable_rsa(up); ++ spin_unlock_irq(&up->port.lock); ++ } ++ if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) ++ serial_out(up, UART_RSA_FRR, 0); ++ } ++} ++ ++/* ++ * Attempts to turn off the RSA FIFO. Returns zero on failure. ++ * It is unknown why interrupts were disabled in here. However, ++ * the caller is expected to preserve this behaviour by grabbing ++ * the spinlock before calling this function. ++ */ ++static void disable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ if (up->port.type == PORT_RSA && ++ up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; ++ spin_unlock_irq(&up->port.lock); ++ } ++} ++#endif /* CONFIG_SERIAL_8250_RSA */ ++ ++/* ++ * This is a quickie test to see how big the FIFO is. ++ * It doesn't work at all the time, more's the pity. 
++ */ ++static int size_fifo(struct uart_8250_port *up) ++{ ++ unsigned char old_fcr, old_mcr, old_lcr; ++ unsigned short old_dl; ++ int count; ++ ++ old_lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, 0); ++ old_fcr = serial_in(up, UART_FCR); ++ old_mcr = serial8250_in_MCR(up); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial8250_out_MCR(up, UART_MCR_LOOP); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(up); ++ serial_dl_write(up, 0x0001); ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ for (count = 0; count < 256; count++) ++ serial_out(up, UART_TX, count); ++ mdelay(20);/* FIXME - schedule_timeout */ ++ for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) && ++ (count < 256); count++) ++ serial_in(up, UART_RX); ++ serial_out(up, UART_FCR, old_fcr); ++ serial8250_out_MCR(up, old_mcr); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_dl_write(up, old_dl); ++ serial_out(up, UART_LCR, old_lcr); ++ ++ return count; ++} ++ ++/* ++ * Read UART ID using the divisor method - set DLL and DLM to zero ++ * and the revision will be in DLL and device type in DLM. We ++ * preserve the device state across this. ++ */ ++static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p) ++{ ++ unsigned char old_lcr; ++ unsigned int id, old_dl; ++ ++ old_lcr = serial_in(p, UART_LCR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(p); ++ serial_dl_write(p, 0); ++ id = serial_dl_read(p); ++ serial_dl_write(p, old_dl); ++ ++ serial_out(p, UART_LCR, old_lcr); ++ ++ return id; ++} ++ ++/* ++ * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's. ++ * When this function is called we know it is at least a StarTech ++ * 16650 V2, but it might be one of several StarTech UARTs, or one of ++ * its clones. (We treat the broken original StarTech 16650 V1 as a ++ * 16550, and why not? Startech doesn't seem to even acknowledge its ++ * existence.) ++ * ++ * What evil have men's minds wrought... ++ */ ++static void autoconfig_has_efr(struct uart_8250_port *up) ++{ ++ unsigned int id1, id2, id3, rev; ++ ++ /* ++ * Everything with an EFR has SLEEP ++ */ ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ ++ /* ++ * First we check to see if it's an Oxford Semiconductor UART. ++ * ++ * If we have to do this here because some non-National ++ * Semiconductor clone chips lock up if you try writing to the ++ * LSR register (which serial_icr_read does) ++ */ ++ ++ /* ++ * Check for Oxford Semiconductor 16C950. ++ * ++ * EFR [4] must be set else this test fails. ++ * ++ * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca) ++ * claims that it's needed for 952 dual UART's (which are not ++ * recommended for new designs). ++ */ ++ up->acr = 0; ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0x00); ++ id1 = serial_icr_read(up, UART_ID1); ++ id2 = serial_icr_read(up, UART_ID2); ++ id3 = serial_icr_read(up, UART_ID3); ++ rev = serial_icr_read(up, UART_REV); ++ ++ DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev); ++ ++ if (id1 == 0x16 && id2 == 0xC9 && ++ (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { ++ up->port.type = PORT_16C950; ++ ++ /* ++ * Enable work around for the Oxford Semiconductor 952 rev B ++ * chip which causes it to seriously miscalculate baud rates ++ * when DLL is 0. 
++ */ ++ if (id3 == 0x52 && rev == 0x01) ++ up->bugs |= UART_BUG_QUOT; ++ return; ++ } ++ ++ /* ++ * We check for a XR16C850 by setting DLL and DLM to 0, and then ++ * reading back DLL and DLM. The chip type depends on the DLM ++ * value read back: ++ * 0x10 - XR16C850 and the DLL contains the chip revision. ++ * 0x12 - XR16C2850. ++ * 0x14 - XR16C854. ++ */ ++ id1 = autoconfig_read_divisor_id(up); ++ DEBUG_AUTOCONF("850id=%04x ", id1); ++ ++ id2 = id1 >> 8; ++ if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { ++ up->port.type = PORT_16850; ++ return; ++ } ++ ++ /* ++ * It wasn't an XR16C850. ++ * ++ * We distinguish between the '654 and the '650 by counting ++ * how many bytes are in the FIFO. I'm using this for now, ++ * since that's the technique that was sent to me in the ++ * serial driver update, but I'm not convinced this works. ++ * I've had problems doing this in the past. -TYT ++ */ ++ if (size_fifo(up) == 64) ++ up->port.type = PORT_16654; ++ else ++ up->port.type = PORT_16650V2; ++} ++ ++/* ++ * We detected a chip without a FIFO. Only two fall into ++ * this category - the original 8250 and the 16450. The ++ * 16450 has a scratch register (accessible with LCR=0) ++ */ ++static void autoconfig_8250(struct uart_8250_port *up) ++{ ++ unsigned char scratch, status1, status2; ++ ++ up->port.type = PORT_8250; ++ ++ scratch = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0xa5); ++ status1 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0x5a); ++ status2 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, scratch); ++ ++ if (status1 == 0xa5 && status2 == 0x5a) ++ up->port.type = PORT_16450; ++} ++ ++static int broken_efr(struct uart_8250_port *up) ++{ ++ /* ++ * Exar ST16C2550 "A2" devices incorrectly detect as ++ * having an EFR, and report an ID of 0x0201. See ++ * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html ++ */ ++ if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * We know that the chip has FIFOs. Does it have an EFR? The ++ * EFR is located in the same register position as the IIR and ++ * we know the top two bits of the IIR are currently set. The ++ * EFR should contain zero. Try to read the EFR. ++ */ ++static void autoconfig_16550a(struct uart_8250_port *up) ++{ ++ unsigned char status1, status2; ++ unsigned int iersave; ++ ++ up->port.type = PORT_16550A; ++ up->capabilities |= UART_CAP_FIFO; ++ ++ if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) && ++ !(up->port.flags & UPF_FULL_PROBE)) ++ return; ++ ++ /* ++ * Check for presence of the EFR when DLAB is set. ++ * Only ST16C650V1 UARTs pass this test. ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ if (serial_in(up, UART_EFR) == 0) { ++ serial_out(up, UART_EFR, 0xA8); ++ if (serial_in(up, UART_EFR) != 0) { ++ DEBUG_AUTOCONF("EFRv1 "); ++ up->port.type = PORT_16650; ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ } else { ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ if (status1 == 7) ++ up->port.type = PORT_16550A_FSL64; ++ else ++ DEBUG_AUTOCONF("Motorola 8xxx DUART "); ++ } ++ serial_out(up, UART_EFR, 0); ++ return; ++ } ++ ++ /* ++ * Maybe it requires 0xbf to be written to the LCR. 
++ * (other ST16C650V2 UARTs, TI16C752A, etc) ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) { ++ DEBUG_AUTOCONF("EFRv2 "); ++ autoconfig_has_efr(up); ++ return; ++ } ++ ++ /* ++ * Check for a National Semiconductor SuperIO chip. ++ * Attempt to switch to bank 2, read the value of the LOOP bit ++ * from EXCR1. Switch back to bank 0, change it in MCR. Then ++ * switch back to bank 2, read it from EXCR1 again and check ++ * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2 ++ */ ++ serial_out(up, UART_LCR, 0); ++ status1 = serial8250_in_MCR(up); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ ++ if (!((status2 ^ status1) & UART_MCR_LOOP)) { ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1); ++ ++ if ((status2 ^ status1) & UART_MCR_LOOP) { ++ unsigned short quot; ++ ++ serial_out(up, UART_LCR, 0xE0); ++ ++ quot = serial_dl_read(up); ++ quot <<= 3; ++ ++ if (ns16550a_goto_highspeed(up)) ++ serial_dl_write(up, quot); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ up->port.uartclk = 921600*16; ++ up->port.type = PORT_NS16550A; ++ up->capabilities |= UART_NATSEMI; ++ return; ++ } ++ } ++ ++ /* ++ * No EFR. Try to detect a TI16750, which only sets bit 5 of ++ * the IIR when 64 byte FIFO mode is enabled when DLAB is set. ++ * Try setting it with and without DLAB set. Cheap clones ++ * set bit 5 without DLAB set. ++ */ ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status2 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, 0); ++ ++ DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2); ++ ++ if (status1 == 6 && status2 == 7) { ++ up->port.type = PORT_16750; ++ up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP; ++ return; ++ } ++ ++ /* ++ * Try writing and reading the UART_IER_UUE bit (b6). ++ * If it works, this is probably one of the Xscale platform's ++ * internal UARTs. ++ * We're going to explicitly set the UUE bit to 0 before ++ * trying to write and read a 1 just to make sure it's not ++ * already a 1 and maybe locked there before we even start start. ++ */ ++ iersave = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, iersave & ~UART_IER_UUE); ++ if (!(serial_in(up, UART_IER) & UART_IER_UUE)) { ++ /* ++ * OK it's in a known zero state, try writing and reading ++ * without disturbing the current state of the other bits. ++ */ ++ serial_out(up, UART_IER, iersave | UART_IER_UUE); ++ if (serial_in(up, UART_IER) & UART_IER_UUE) { ++ /* ++ * It's an Xscale. ++ * We'll leave the UART_IER_UUE bit set to 1 (enabled). ++ */ ++ DEBUG_AUTOCONF("Xscale "); ++ up->port.type = PORT_XSCALE; ++ up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE; ++ return; ++ } ++ } else { ++ /* ++ * If we got here we couldn't force the IER_UUE bit to 0. ++ * Log it and continue. ++ */ ++ DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 "); ++ } ++ serial_out(up, UART_IER, iersave); ++ ++ /* ++ * We distinguish between 16550A and U6 16550A by counting ++ * how many bytes are in the FIFO. 
++ */ ++ if (up->port.type == PORT_16550A && size_fifo(up) == 64) { ++ up->port.type = PORT_U6_16550A; ++ up->capabilities |= UART_CAP_AFE; ++ } ++} ++ ++/* ++ * This routine is called by rs_init() to initialize a specific serial ++ * port. It determines what type of UART chip this serial port is ++ * using: 8250, 16450, 16550, 16550A. The important question is ++ * whether or not this UART is a 16550A or not, since this will ++ * determine whether or not we can use its FIFO features or not. ++ */ ++static void autoconfig(struct uart_8250_port *up) ++{ ++ unsigned char status1, scratch, scratch2, scratch3; ++ unsigned char save_lcr, save_mcr; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int old_capabilities; ++ ++ if (!port->iobase && !port->mapbase && !port->membase) ++ return; ++ ++ DEBUG_AUTOCONF("%s: autoconf (0x%04lx, 0x%p): ", ++ port->name, port->iobase, port->membase); ++ ++ /* ++ * We really do need global IRQs disabled here - we're going to ++ * be frobbing the chips IRQ enable register to see if it exists. ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->capabilities = 0; ++ up->bugs = 0; ++ ++ if (!(port->flags & UPF_BUGGY_UART)) { ++ /* ++ * Do a simple existence test first; if we fail this, ++ * there's no point trying anything else. ++ * ++ * 0x80 is used as a nonsense port to prevent against ++ * false positives due to ISA bus float. The ++ * assumption is that 0x80 is a non-existent port; ++ * which should be safe since include/asm/io.h also ++ * makes this assumption. ++ * ++ * Note: this is safe as long as MCR bit 4 is clear ++ * and the device is in "PC" mode. ++ */ ++ scratch = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, 0); ++#ifdef __i386__ ++ outb(0xff, 0x080); ++#endif ++ /* ++ * Mask out IER[7:4] bits for test as some UARTs (e.g. TL ++ * 16C754B) allow only to modify them if an EFR bit is set. ++ */ ++ scratch2 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, 0x0F); ++#ifdef __i386__ ++ outb(0, 0x080); ++#endif ++ scratch3 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, scratch); ++ if (scratch2 != 0 || scratch3 != 0x0F) { ++ /* ++ * We failed; there's nothing here ++ */ ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("IER test failed (%02x, %02x) ", ++ scratch2, scratch3); ++ goto out; ++ } ++ } ++ ++ save_mcr = serial8250_in_MCR(up); ++ save_lcr = serial_in(up, UART_LCR); ++ ++ /* ++ * Check to see if a UART is really there. Certain broken ++ * internal modems based on the Rockwell chipset fail this ++ * test, because they apparently don't implement the loopback ++ * test mode. So this test is skipped on the COM 1 through ++ * COM 4 ports. This *should* be safe, since no board ++ * manufacturer would be stupid enough to design a board ++ * that conflicts with COM 1-4 --- we hope! ++ */ ++ if (!(port->flags & UPF_SKIP_TEST)) { ++ serial8250_out_MCR(up, UART_MCR_LOOP | 0x0A); ++ status1 = serial_in(up, UART_MSR) & 0xF0; ++ serial8250_out_MCR(up, save_mcr); ++ if (status1 != 0x90) { ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("LOOP test failed (%02x) ", ++ status1); ++ goto out; ++ } ++ } ++ ++ /* ++ * We're pretty sure there's a port here. Lets find out what ++ * type of port it is. The IIR top two bits allows us to find ++ * out if it's 8250 or 16450, 16550, 16550A or later. This ++ * determines what we test for next. ++ * ++ * We also initialise the EFR (if any) to zero for later. The ++ * EFR occupies the same register location as the FCR and IIR. 
++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ ++ /* Assign this as it is to truncate any bits above 7. */ ++ scratch = serial_in(up, UART_IIR); ++ ++ switch (scratch >> 6) { ++ case 0: ++ autoconfig_8250(up); ++ break; ++ case 1: ++ port->type = PORT_UNKNOWN; ++ break; ++ case 2: ++ port->type = PORT_16550; ++ break; ++ case 3: ++ autoconfig_16550a(up); ++ break; ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Only probe for RSA ports if we got the region. ++ */ ++ if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA && ++ __enable_rsa(up)) ++ port->type = PORT_RSA; ++#endif ++ ++ serial_out(up, UART_LCR, save_lcr); ++ ++ port->fifosize = uart_config[up->port.type].fifo_size; ++ old_capabilities = up->capabilities; ++ up->capabilities = uart_config[port->type].flags; ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ ++ if (port->type == PORT_UNKNOWN) ++ goto out_unlock; ++ ++ /* ++ * Reset the UART. ++ */ ++#ifdef CONFIG_SERIAL_8250_RSA ++ if (port->type == PORT_RSA) ++ serial_out(up, UART_RSA_FRR, 0); ++#endif ++ serial8250_out_MCR(up, save_mcr); ++ serial8250_clear_fifos(up); ++ serial_in(up, UART_RX); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_out(up, UART_IER, UART_IER_UUE); ++ else ++ serial_out(up, UART_IER, 0); ++ ++out_unlock: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Check if the device is a Fintek F81216A ++ */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_PORT) ++ fintek_8250_probe(up); ++ ++ if (up->capabilities != old_capabilities) { ++ dev_warn(port->dev, "detected caps %08x should be %08x\n", ++ old_capabilities, up->capabilities); ++ } ++out: ++ DEBUG_AUTOCONF("iir=%d ", scratch); ++ DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name); ++} ++ ++static void autoconfig_irq(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char save_mcr, save_ier; ++ unsigned char save_ICP = 0; ++ unsigned int ICP = 0; ++ unsigned long irqs; ++ int irq; ++ ++ if (port->flags & UPF_FOURPORT) { ++ ICP = (port->iobase & 0xfe0) | 0x1f; ++ save_ICP = inb_p(ICP); ++ outb_p(0x80, ICP); ++ inb_p(ICP); ++ } ++ ++ if (uart_console(port)) ++ console_lock(); ++ ++ /* forget possible initially masked and pending IRQ */ ++ probe_irq_off(probe_irq_on()); ++ save_mcr = serial8250_in_MCR(up); ++ save_ier = serial_in(up, UART_IER); ++ serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2); ++ ++ irqs = probe_irq_on(); ++ serial8250_out_MCR(up, 0); ++ udelay(10); ++ if (port->flags & UPF_FOURPORT) { ++ serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); ++ } else { ++ serial8250_out_MCR(up, ++ UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2); ++ } ++ serial_out(up, UART_IER, 0x0f); /* enable all intrs */ ++ serial_in(up, UART_LSR); ++ serial_in(up, UART_RX); ++ serial_in(up, UART_IIR); ++ serial_in(up, UART_MSR); ++ serial_out(up, UART_TX, 0xFF); ++ udelay(20); ++ irq = probe_irq_off(irqs); ++ ++ serial8250_out_MCR(up, save_mcr); ++ serial_out(up, UART_IER, save_ier); ++ ++ if (port->flags & UPF_FOURPORT) ++ outb_p(save_ICP, ICP); ++ ++ if (uart_console(port)) ++ console_unlock(); ++ ++ port->irq = (irq > 0) ? 
irq : 0; ++} ++ ++static void serial8250_stop_rx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ up->port.read_status_mask &= ~UART_LSR_DR; ++ serial_port_out(port, UART_IER, up->ier); ++ ++ serial8250_rpm_put(up); ++} ++ ++/** ++ * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback ++ * @p: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to stop rs485 transmission. ++ */ ++void serial8250_em485_stop_tx(struct uart_8250_port *p) ++{ ++ unsigned char mcr = serial8250_in_MCR(p); ++ ++ if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(p, mcr); ++ ++ /* ++ * Empty the RX FIFO, we are not interested in anything ++ * received during the half-duplex transmission. ++ * Enable previously disabled RX interrupts. ++ */ ++ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { ++ serial8250_clear_and_reinit_fifos(p); ++ ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_port_out(&p->port, UART_IER, p->ier); ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); ++ ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ stop_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ serial8250_rpm_get(p); ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->stop_tx_timer) { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ serial8250_rpm_put(p); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) ++{ ++ hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL); ++} ++ ++static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC; ++ ++ /* ++ * rs485_stop_tx() is going to set RTS according to config ++ * AND flush RX FIFO if required. ++ */ ++ if (stop_delay > 0) { ++ em485->active_timer = &em485->stop_tx_timer; ++ hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL); ++ } else { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++} ++ ++static inline void __stop_tx(struct uart_8250_port *p) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ if (em485) { ++ u16 lsr = serial_lsr_in(p); ++ u64 stop_delay = 0; ++ ++ p->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS; ++ ++ if (!(lsr & UART_LSR_THRE)) ++ return; ++ /* ++ * To provide required timing and allow FIFO transfer, ++ * __stop_tx_rs485() must be called only when both FIFO and ++ * shift register are empty. The device driver should either ++ * enable interrupt on TEMT or set UART_CAP_NOTEMT that will ++ * enlarge stop_tx_timer by the tx time of one frame to cover ++ * for emptying of the shift register. ++ */ ++ if (!(lsr & UART_LSR_TEMT)) { ++ if (!(p->capabilities & UART_CAP_NOTEMT)) ++ return; ++ /* ++ * RTS might get deasserted too early with the normal ++ * frame timing formula. It seems to suggest THRE might ++ * get asserted already during tx of the stop bit ++ * rather than after it is fully sent. ++ * Roughly estimate 1 extra bit here with / 7. 
++ */ ++ stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7); ++ } ++ ++ __stop_tx_rs485(p, stop_delay); ++ } ++ ++ if (serial8250_clear_THRI(p)) ++ serial8250_rpm_put_tx(p); ++} ++ ++static void serial8250_stop_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ __stop_tx(up); ++ ++ /* ++ * We really want to stop the transmitter from sending. ++ */ ++ if (port->type == PORT_16C950) { ++ up->acr |= UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++ serial8250_rpm_put(up); ++} ++ ++static inline void __start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ if (up->dma && !up->dma->tx_dma(up)) ++ return; ++ ++ if (serial8250_set_THRI(up)) { ++ if (up->bugs & UART_BUG_TXEN) { ++ u16 lsr = serial_lsr_in(up); ++ ++ if (lsr & UART_LSR_THRE) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ /* ++ * Re-enable the transmitter if we disabled it. ++ */ ++ if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) { ++ up->acr &= ~UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++} ++ ++/** ++ * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback ++ * @up: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to start rs485 transmission. ++ * Assumes that setting the RTS bit in the MCR register means RTS is high. ++ * (Some chips use inverse semantics.) Further assumes that reception is ++ * stoppable by disabling the UART_IER_RDI interrupt. (Some chips set the ++ * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.) ++ */ ++void serial8250_em485_start_tx(struct uart_8250_port *up) ++{ ++ unsigned char mcr = serial8250_in_MCR(up); ++ ++ if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX)) ++ serial8250_stop_rx(&up->port); ++ ++ if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_start_tx); ++ ++/* Returns false, if start_tx_timer was setup to defer TX start */ ++static bool start_tx_rs485(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ /* ++ * While serial8250_em485_handle_stop_tx() is a noop if ++ * em485->active_timer != &em485->stop_tx_timer, it might happen that ++ * the timer is still armed and triggers only after the current bunch of ++ * chars is send and em485->active_timer == &em485->stop_tx_timer again. ++ * So cancel the timer. There is still a theoretical race condition if ++ * the timer is already running and only comes around to check for ++ * em485->active_timer when &em485->stop_tx_timer is armed again. 
++ */ ++ if (em485->active_timer == &em485->stop_tx_timer) ++ hrtimer_try_to_cancel(&em485->stop_tx_timer); ++ ++ em485->active_timer = NULL; ++ ++ if (em485->tx_stopped) { ++ em485->tx_stopped = false; ++ ++ up->rs485_start_tx(up); ++ ++ if (up->port.rs485.delay_rts_before_send > 0) { ++ em485->active_timer = &em485->start_tx_timer; ++ start_hrtimer_ms(&em485->start_tx_timer, ++ up->port.rs485.delay_rts_before_send); ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ start_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->start_tx_timer) { ++ __start_tx(&p->port); ++ em485->active_timer = NULL; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void serial8250_start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ if (!port->x_char && uart_circ_empty(&port->state->xmit)) ++ return; ++ ++ serial8250_rpm_get_tx(up); ++ ++ if (em485) { ++ if ((em485->active_timer == &em485->start_tx_timer) || ++ !start_tx_rs485(port)) ++ return; ++ } ++ __start_tx(port); ++} ++ ++static void serial8250_throttle(struct uart_port *port) ++{ ++ port->throttle(port); ++} ++ ++static void serial8250_unthrottle(struct uart_port *port) ++{ ++ port->unthrottle(port); ++} ++ ++static void serial8250_disable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_disable_ms(up->gpios); ++ ++ up->ier &= ~UART_IER_MSI; ++ serial_port_out(port, UART_IER, up->ier); ++} ++ ++static void serial8250_enable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_enable_ms(up->gpios); ++ ++ up->ier |= UART_IER_MSI; ++ ++ serial8250_rpm_get(up); ++ serial_port_out(port, UART_IER, up->ier); ++ serial8250_rpm_put(up); ++} ++ ++void serial8250_read_char(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char ch; ++ char flag = TTY_NORMAL; ++ ++ if (likely(lsr & UART_LSR_DR)) ++ ch = serial_in(up, UART_RX); ++ else ++ /* ++ * Intel 82571 has a Serial Over Lan device that will ++ * set UART_LSR_BI without setting UART_LSR_DR when ++ * it receives a break. To avoid reading from the ++ * receive buffer without UART_LSR_DR bit set, we ++ * just force the read character to be 0 ++ */ ++ ch = 0; ++ ++ port->icount.rx++; ++ ++ lsr |= up->lsr_saved_flags; ++ up->lsr_saved_flags = 0; ++ ++ if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) { ++ if (lsr & UART_LSR_BI) { ++ lsr &= ~(UART_LSR_FE | UART_LSR_PE); ++ port->icount.brk++; ++ /* ++ * We do the SysRQ and SAK checking ++ * here because otherwise the break ++ * may get masked by ignore_status_mask ++ * or read_status_mask. ++ */ ++ if (uart_handle_break(port)) ++ return; ++ } else if (lsr & UART_LSR_PE) ++ port->icount.parity++; ++ else if (lsr & UART_LSR_FE) ++ port->icount.frame++; ++ if (lsr & UART_LSR_OE) ++ port->icount.overrun++; ++ ++ /* ++ * Mask off conditions which should be ignored. 
++ */ ++ lsr &= port->read_status_mask; ++ ++ if (lsr & UART_LSR_BI) { ++ dev_dbg(port->dev, "handling break\n"); ++ flag = TTY_BREAK; ++ } else if (lsr & UART_LSR_PE) ++ flag = TTY_PARITY; ++ else if (lsr & UART_LSR_FE) ++ flag = TTY_FRAME; ++ } ++ if (uart_prepare_sysrq_char(port, ch)) ++ return; ++ ++ uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); ++} ++EXPORT_SYMBOL_GPL(serial8250_read_char); ++ ++/* ++ * serial8250_rx_chars - Read characters. The first LSR value must be passed in. ++ * ++ * Returns LSR bits. The caller should rely only on non-Rx related LSR bits ++ * (such as THRE) because the LSR value might come from an already consumed ++ * character. ++ */ ++u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ int max_count = 256; ++ ++ do { ++ serial8250_read_char(up, lsr); ++ if (--max_count == 0) ++ break; ++ lsr = serial_in(up, UART_LSR); ++ } while (lsr & (UART_LSR_DR | UART_LSR_BI)); ++ ++ tty_flip_buffer_push(&port->state->port); ++ return lsr; ++} ++EXPORT_SYMBOL_GPL(serial8250_rx_chars); ++ ++void serial8250_tx_chars(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct circ_buf *xmit = &port->state->xmit; ++ int count; ++ ++ if (port->x_char) { ++ uart_xchar_out(port, UART_TX); ++ return; ++ } ++ if (uart_tx_stopped(port)) { ++ serial8250_stop_tx(port); ++ return; ++ } ++ if (uart_circ_empty(xmit)) { ++ __stop_tx(up); ++ return; ++ } ++ ++ count = up->tx_loadsz; ++ do { ++ serial_out(up, UART_TX, xmit->buf[xmit->tail]); ++ if (up->bugs & UART_BUG_TXRACE) { ++ /* ++ * The Aspeed BMC virtual UARTs have a bug where data ++ * may get stuck in the BMC's Tx FIFO from bursts of ++ * writes on the APB interface. ++ * ++ * Delay back-to-back writes by a read cycle to avoid ++ * stalling the VUART. Read a register that won't have ++ * side-effects and discard the result. ++ */ ++ serial_in(up, UART_SCR); ++ } ++ xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); ++ port->icount.tx++; ++ if (uart_circ_empty(xmit)) ++ break; ++ if ((up->capabilities & UART_CAP_HFIFO) && ++ !uart_lsr_tx_empty(serial_in(up, UART_LSR))) ++ break; ++ /* The BCM2835 MINI UART THRE bit is really a not-full bit. */ ++ if ((up->capabilities & UART_CAP_MINI) && ++ !(serial_in(up, UART_LSR) & UART_LSR_THRE)) ++ break; ++ } while (--count > 0); ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(port); ++ ++ /* ++ * With RPM enabled, we have to wait until the FIFO is empty before the ++ * HW can go idle. 
So we get here once again with empty FIFO and disable ++ * the interrupt and RPM in __stop_tx() ++ */ ++ if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM)) ++ __stop_tx(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_tx_chars); ++ ++/* Caller holds uart port lock */ ++unsigned int serial8250_modem_status(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int status = serial_in(up, UART_MSR); ++ ++ status |= up->msr_saved_flags; ++ up->msr_saved_flags = 0; ++ if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI && ++ port->state != NULL) { ++ if (status & UART_MSR_TERI) ++ port->icount.rng++; ++ if (status & UART_MSR_DDSR) ++ port->icount.dsr++; ++ if (status & UART_MSR_DDCD) ++ uart_handle_dcd_change(port, status & UART_MSR_DCD); ++ if (status & UART_MSR_DCTS) ++ uart_handle_cts_change(port, status & UART_MSR_CTS); ++ ++ wake_up_interruptible(&port->state->port.delta_msr_wait); ++ } ++ ++ return status; ++} ++EXPORT_SYMBOL_GPL(serial8250_modem_status); ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RDI: ++ if (!up->dma->rx_running) ++ break; ++ fallthrough; ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ serial8250_rx_dma_flush(up); ++ return true; ++ } ++ return up->dma->rx_dma(up); ++} ++ ++/* ++ * This handles the interrupt from one port. ++ */ ++int serial8250_handle_irq(struct uart_port *port, unsigned int iir) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ bool skip_rx = false; ++ unsigned long flags; ++ u16 status; ++ ++ if (iir & UART_IIR_NO_INT) ++ return 0; ++ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ status = serial_lsr_in(up); ++ ++ /* ++ * If port is stopped and there are no error conditions in the ++ * FIFO, then don't drain the FIFO, as this may lead to TTY buffer ++ * overflow. Not servicing, RX FIFO would trigger auto HW flow ++ * control when FIFO occupancy reaches preset threshold, thus ++ * halting RX. This only works when auto HW flow control is ++ * available. ++ */ ++ if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && ++ (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && ++ !(port->read_status_mask & UART_LSR_DR)) ++ skip_rx = true; ++ ++ if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { ++ if (!up->dma || handle_rx_dma(up, iir)) ++ status = serial8250_rx_chars(up, status); ++ } ++ serial8250_modem_status(up); ++ if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { ++ if (!up->dma || up->dma->tx_err) ++ serial8250_tx_chars(up); ++ else if (!up->dma->tx_running) ++ __stop_tx(up); ++ } ++ ++ uart_unlock_and_check_sysrq_irqrestore(port, flags); ++ ++ return 1; ++} ++EXPORT_SYMBOL_GPL(serial8250_handle_irq); ++ ++static int serial8250_default_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir; ++ int ret; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ serial8250_rpm_put(up); ++ return ret; ++} ++ ++/* ++ * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP ++ * have a programmable TX threshold that triggers the THRE interrupt in ++ * the IIR register. In this case, the THRE interrupt indicates the FIFO ++ * has space available. Load it up with tx_loadsz bytes. 
++ */ ++static int serial8250_tx_threshold_handle_irq(struct uart_port *port) ++{ ++ unsigned long flags; ++ unsigned int iir = serial_port_in(port, UART_IIR); ++ ++ /* TX Threshold IRQ triggered so load up FIFO */ ++ if ((iir & UART_IIR_ID) == UART_IIR_THRI) { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ serial8250_tx_chars(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ } ++ ++ iir = serial_port_in(port, UART_IIR); ++ return serial8250_handle_irq(port, iir); ++} ++ ++static unsigned int serial8250_tx_empty(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ lsr = serial_lsr_in(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ serial8250_rpm_put(up); ++ ++ return uart_lsr_tx_empty(lsr) ? TIOCSER_TEMT : 0; ++} ++ ++unsigned int serial8250_do_get_mctrl(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int status; ++ unsigned int val; ++ ++ serial8250_rpm_get(up); ++ status = serial8250_modem_status(up); ++ serial8250_rpm_put(up); ++ ++ val = serial8250_MSR_to_TIOCM(status); ++ if (up->gpios) ++ return mctrl_gpio_get(up->gpios, &val); ++ ++ return val; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); ++ ++static unsigned int serial8250_get_mctrl(struct uart_port *port) ++{ ++ if (port->get_mctrl) ++ return port->get_mctrl(port); ++ return serial8250_do_get_mctrl(port); ++} ++ ++void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char mcr; ++ ++ mcr = serial8250_TIOCM_to_MCR(mctrl); ++ ++ mcr |= up->mcr; ++ ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); ++ ++static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ if (port->rs485.flags & SER_RS485_ENABLED) ++ return; ++ ++ if (port->set_mctrl) ++ port->set_mctrl(port, mctrl); ++ else ++ serial8250_do_set_mctrl(port, mctrl); ++} ++ ++static void serial8250_break_ctl(struct uart_port *port, int break_state) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ if (break_state == -1) ++ up->lcr |= UART_LCR_SBC; ++ else ++ up->lcr &= ~UART_LCR_SBC; ++ serial_port_out(port, UART_LCR, up->lcr); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++} ++ ++static void wait_for_lsr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int status, tmout = 10000; ++ ++ /* Wait up to 10ms for the character(s) to be sent. 
*/ ++ for (;;) { ++ status = serial_lsr_in(up); ++ ++ if ((status & bits) == bits) ++ break; ++ if (--tmout == 0) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++} ++ ++/* ++ * Wait for transmitter & holding register to empty ++ */ ++static void wait_for_xmitr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int tmout; ++ ++ wait_for_lsr(up, bits); ++ ++ /* Wait up to 1s for flow control if necessary */ ++ if (up->port.flags & UPF_CONS_FLOW) { ++ for (tmout = 1000000; tmout; tmout--) { ++ unsigned int msr = serial_in(up, UART_MSR); ++ up->msr_saved_flags |= msr & MSR_SAVE_FLAGS; ++ if (msr & UART_MSR_CTS) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ ++#ifdef CONFIG_CONSOLE_POLL ++/* ++ * Console polling routines for writing and reading from the uart while ++ * in an interrupt or debug context. ++ */ ++ ++static int serial8250_get_poll_char(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int status; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ lsr = serial_port_in(port, UART_LSR); ++ ++ if (!(lsr & UART_LSR_DR)) { ++ status = NO_POLL_CHAR; ++ goto out; ++ } ++ ++ status = serial_port_in(port, UART_RX); ++out: ++ serial8250_rpm_put(up); ++ return status; ++} ++ ++ ++static void serial8250_put_poll_char(struct uart_port *port, ++ unsigned char c) ++{ ++ unsigned int ier; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ /* ++ * Send the character out. ++ */ ++ serial_port_out(port, UART_TX, c); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ serial_port_out(port, UART_IER, ier); ++ serial8250_rpm_put(up); ++} ++ ++#endif /* CONFIG_CONSOLE_POLL */ ++ ++int serial8250_do_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ unsigned char iir; ++ int retval; ++ u16 lsr; ++ ++ if (!port->fifosize) ++ port->fifosize = uart_config[port->type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[port->type].flags; ++ up->mcr = 0; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ serial8250_rpm_get(up); ++ if (port->type == PORT_16C950) { ++ /* Wake up and initialize UART */ ++ up->acr = 0; ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_LCR, 0); ++ serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ if (port->type == PORT_DA830) { ++ /* Reset the port */ ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); ++ mdelay(10); ++ ++ /* Enable Tx, Rx and free run mode */ ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, ++ UART_DA830_PWREMU_MGMT_UTRST | ++ UART_DA830_PWREMU_MGMT_URRST | ++ UART_DA830_PWREMU_MGMT_FREE); ++ } ++ ++ if (port->type == PORT_NPCM) { ++ /* ++ * Nuvoton calls the scratch register 'UART_TOR' (timeout ++ * 
register). Enable it, and set TIOC (timeout interrupt ++ * comparator) to be 0x20 for correct operation. ++ */ ++ serial_port_out(port, UART_NPCM_TOR, UART_NPCM_TOIE | 0x20); ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * If this is an RSA port, see if we can kick it up to the ++ * higher speed clock. ++ */ ++ enable_rsa(up); ++#endif ++ ++ /* ++ * Clear the FIFO buffers and disable them. ++ * (they will be reenabled in set_termios()) ++ */ ++ serial8250_clear_fifos(up); ++ ++ /* ++ * Clear the interrupt registers. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ ++ /* ++ * At this point, there's no way the LSR could still be 0xff; ++ * if it is, then bail out, because there's likely no UART ++ * here. ++ */ ++ if (!(port->flags & UPF_BUGGY_UART) && ++ (serial_port_in(port, UART_LSR) == 0xff)) { ++ dev_info_ratelimited(port->dev, "LSR safety check engaged!\n"); ++ retval = -ENODEV; ++ goto out; ++ } ++ ++ /* ++ * For a XR16C850, we need to set the trigger levels ++ */ ++ if (port->type == PORT_16850) { ++ unsigned char fctr; ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_RX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_TX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ /* ++ * For the Altera 16550 variants, set TX threshold trigger level. ++ */ ++ if (((port->type == PORT_ALTR_16550_F32) || ++ (port->type == PORT_ALTR_16550_F64) || ++ (port->type == PORT_ALTR_16550_F128)) && (port->fifosize > 1)) { ++ /* Bounds checking of TX threshold (valid 0 to fifosize-2) */ ++ if ((up->tx_loadsz < 2) || (up->tx_loadsz > port->fifosize)) { ++ dev_err(port->dev, "TX FIFO Threshold errors, skipping\n"); ++ } else { ++ serial_port_out(port, UART_ALTR_AFR, ++ UART_ALTR_EN_TXFIFO_LW); ++ serial_port_out(port, UART_ALTR_TX_LOW, ++ port->fifosize - up->tx_loadsz); ++ port->handle_irq = serial8250_tx_threshold_handle_irq; ++ } ++ } ++ ++ /* Check if we need to have shared IRQs */ ++ if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) ++ up->port.irqflags |= IRQF_SHARED; ++ ++ retval = up->ops->setup_irq(up); ++ if (retval) ++ goto out; ++ ++ if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { ++ unsigned char iir1; ++ ++ if (port->irqflags & IRQF_SHARED) ++ disable_irq_nosync(port->irq); ++ ++ /* ++ * Test for UARTs that do not reassert THRE when the ++ * transmitter is idle and the interrupt has already ++ * been cleared. Real 16550s should always reassert ++ * this interrupt whenever the transmitter is idle and ++ * the interrupt is enabled. Delays are necessary to ++ * allow register changes to become visible. 
++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow THRE to set */ ++ iir1 = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow a working UART time to re-assert THRE */ ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ if (port->irqflags & IRQF_SHARED) ++ enable_irq(port->irq); ++ ++ /* ++ * If the interrupt is not reasserted, or we otherwise ++ * don't trust the iir, setup a timer to kick the UART ++ * on a regular basis. ++ */ ++ if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) || ++ up->port.flags & UPF_BUG_THRE) { ++ up->bugs |= UART_BUG_THRE; ++ } ++ } ++ ++ up->ops->setup_timer(up); ++ ++ /* ++ * Now, initialize the UART ++ */ ++ serial_port_out(port, UART_LCR, UART_LCR_WLEN8); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (up->port.flags & UPF_FOURPORT) { ++ if (!up->port.irq) ++ up->port.mctrl |= TIOCM_OUT1; ++ } else ++ /* ++ * Most PC uarts need OUT2 raised to enable interrupts. ++ */ ++ if (port->irq) ++ up->port.mctrl |= TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ ++ /* ++ * Serial over Lan (SoL) hack: ++ * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be ++ * used for Serial Over Lan. Those chips take a longer time than a ++ * normal serial device to signalize that a transmission data was ++ * queued. Due to that, the above test generally fails. One solution ++ * would be to delay the reading of iir. However, this is not ++ * reliable, since the timeout is variable. So, let's just don't ++ * test if we receive TX irq. This way, we'll never enable ++ * UART_BUG_TXEN. ++ */ ++ if (up->port.quirks & UPQ_NO_TXEN_TEST) ++ goto dont_test_tx_en; ++ ++ /* ++ * Do a quick test to see if we receive an interrupt when we enable ++ * the TX irq. ++ */ ++ serial_port_out(port, UART_IER, UART_IER_THRI); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { ++ if (!(up->bugs & UART_BUG_TXEN)) { ++ up->bugs |= UART_BUG_TXEN; ++ dev_dbg(port->dev, "enabling bad tx status workarounds\n"); ++ } ++ } else { ++ up->bugs &= ~UART_BUG_TXEN; ++ } ++ ++dont_test_tx_en: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Clear the interrupt registers again for luck, and clear the ++ * saved flags to avoid getting false values from polling ++ * routines or the previous session. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* ++ * Request DMA channels for both RX and TX. ++ */ ++ if (up->dma) { ++ const char *msg = NULL; ++ ++ if (uart_console(port)) ++ msg = "forbid DMA for kernel console"; ++ else if (serial8250_request_dma(up)) ++ msg = "failed to request DMA"; ++ if (msg) { ++ dev_warn_ratelimited(port->dev, "%s\n", msg); ++ up->dma = NULL; ++ } ++ } ++ ++ /* ++ * Set the IER shadow for rx interrupts but defer actual interrupt ++ * enable until after the FIFOs are enabled; otherwise, an already- ++ * active sender can swamp the interrupt handler with "too much work". 
++ */ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ ++ if (port->flags & UPF_FOURPORT) { ++ unsigned int icp; ++ /* ++ * Enable interrupts on the AST Fourport board ++ */ ++ icp = (port->iobase & 0xfe0) | 0x01f; ++ outb_p(0x80, icp); ++ inb_p(icp); ++ } ++ retval = 0; ++out: ++ serial8250_rpm_put(up); ++ return retval; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_startup); ++ ++static int serial8250_startup(struct uart_port *port) ++{ ++ if (port->startup) ++ return port->startup(port); ++ return serial8250_do_startup(port); ++} ++ ++void serial8250_do_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ /* ++ * Disable interrupts from this port ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ up->ier = 0; ++ serial_port_out(port, UART_IER, 0); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ synchronize_irq(port->irq); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (port->flags & UPF_FOURPORT) { ++ /* reset interrupts on the AST Fourport board */ ++ inb((port->iobase & 0xfe0) | 0x1f); ++ port->mctrl |= TIOCM_OUT1; ++ } else ++ port->mctrl &= ~TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ serial_port_out(port, UART_LCR, ++ serial_port_in(port, UART_LCR) & ~UART_LCR_SBC); ++ serial8250_clear_fifos(up); ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Reset the RSA board back to 115kbps compat mode. ++ */ ++ disable_rsa(up); ++#endif ++ ++ /* ++ * Read data port to reset things, and then unlink from ++ * the IRQ chain. ++ */ ++ serial_port_in(port, UART_RX); ++ serial8250_rpm_put(up); ++ ++ up->ops->release_irq(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_shutdown); ++ ++static void serial8250_shutdown(struct uart_port *port) ++{ ++ if (port->shutdown) ++ port->shutdown(port); ++ else ++ serial8250_do_shutdown(port); ++} ++ ++/* Nuvoton NPCM UARTs have a custom divisor calculation */ ++static unsigned int npcm_get_divisor(struct uart_8250_port *up, ++ unsigned int baud) ++{ ++ struct uart_port *port = &up->port; ++ ++ return DIV_ROUND_CLOSEST(port->uartclk, 16 * baud + 2) - 2; ++} ++ ++static unsigned int serial8250_do_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int quot; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. We clamp custom rates from clk/6 and clk/12 ++ * up to clk/4 (0x8001) and clk/8 (0x8002) respectively. These ++ * magic divisors actually reprogram the baud rate generator's ++ * reference clock derived from chips's 14.318MHz clock input. ++ * ++ * Documentation claims that with these magic divisors the base ++ * frequencies of 7.3728MHz and 3.6864MHz are used respectively ++ * for the extra baud rates of 460800bps and 230400bps rather ++ * than the usual base frequency of 1.8462MHz. However empirical ++ * evidence contradicts that. ++ * ++ * Instead bit 7 of the DLM register (bit 15 of the divisor) is ++ * effectively used as a clock prescaler selection bit for the ++ * base frequency of 7.3728MHz, always used. If set to 0, then ++ * the base frequency is divided by 4 for use by the Baud Rate ++ * Generator, for the usual arrangement where the value of 1 of ++ * the divisor produces the baud rate of 115200bps. 
Conversely, ++ * if set to 1 and high-speed operation has been enabled with the ++ * Serial Port Mode Register in the Device Configuration Space, ++ * then the base frequency is supplied directly to the Baud Rate ++ * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003, ++ * 0x8004, etc. the respective baud rates produced are 460800bps, ++ * 230400bps, 153600bps, 115200bps, etc. ++ * ++ * In all cases only low 15 bits of the divisor are used to divide ++ * the baud base and therefore 32767 is the maximum divisor value ++ * possible, even though documentation says that the programmable ++ * Baud Rate Generator is capable of dividing the internal PLL ++ * clock by any divisor from 1 to 65535. ++ */ ++ if (magic_multiplier && baud >= port->uartclk / 6) ++ quot = 0x8001; ++ else if (magic_multiplier && baud >= port->uartclk / 12) ++ quot = 0x8002; ++ else if (up->port.type == PORT_NPCM) ++ quot = npcm_get_divisor(up, baud); ++ else ++ quot = uart_get_divisor(port, baud); ++ ++ /* ++ * Oxford Semi 952 rev B workaround ++ */ ++ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) ++ quot++; ++ ++ return quot; ++} ++ ++static unsigned int serial8250_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ if (port->get_divisor) ++ return port->get_divisor(port, baud, frac); ++ ++ return serial8250_do_get_divisor(port, baud, frac); ++} ++ ++static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, ++ tcflag_t c_cflag) ++{ ++ unsigned char cval; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(c_cflag)); ++ ++ if (c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (c_cflag & PARENB) { ++ cval |= UART_LCR_PARITY; ++ if (up->bugs & UART_BUG_PARITY) ++ up->fifo_bug = true; ++ } ++ if (!(c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ return cval; ++} ++ ++void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* Workaround to enable 115200 baud on OMAP1510 internal ports */ ++ if (is_omap1510_8250(up)) { ++ if (baud == 115200) { ++ quot = 1; ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1); ++ } else ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0); ++ } ++ ++ /* ++ * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2, ++ * otherwise just set DLAB ++ */ ++ if (up->capabilities & UART_NATSEMI) ++ serial_port_out(port, UART_LCR, 0xe0); ++ else ++ serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB); ++ ++ serial_dl_write(up, quot); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_divisor); ++ ++static void serial8250_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ if (port->set_divisor) ++ port->set_divisor(port, baud, quot, quot_frac); ++ else ++ serial8250_do_set_divisor(port, baud, quot, quot_frac); ++} ++ ++static unsigned int serial8250_get_baud_rate(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ unsigned int tolerance = port->uartclk / 100; ++ unsigned int min; ++ unsigned int max; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. Enable custom rates of clk/4 and clk/8, but ++ * disable divisor values beyond 32767, which are unavailable. 
++ */ ++ if (port->flags & UPF_MAGIC_MULTIPLIER) { ++ min = port->uartclk / 16 / UART_DIV_MAX >> 1; ++ max = (port->uartclk + tolerance) / 4; ++ } else { ++ min = port->uartclk / 16 / UART_DIV_MAX; ++ max = (port->uartclk + tolerance) / 16; ++ } ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ * Allow 1% tolerance at the upper limit so uart clks marginally ++ * slower than nominal still match standard baud rates without ++ * causing transmission errors. ++ */ ++ return uart_get_baud_rate(port, termios, old, min, max); ++} ++ ++/* ++ * Note in order to avoid the tty port mutex deadlock don't use the next method ++ * within the uart port callbacks. Primarily it's supposed to be utilized to ++ * handle a sudden reference clock rate change. ++ */ ++void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct tty_port *tport = &port->state->port; ++ unsigned int baud, quot, frac = 0; ++ struct ktermios *termios; ++ struct tty_struct *tty; ++ unsigned long flags; ++ ++ tty = tty_port_tty_get(tport); ++ if (!tty) { ++ mutex_lock(&tport->mutex); ++ port->uartclk = uartclk; ++ mutex_unlock(&tport->mutex); ++ return; ++ } ++ ++ down_write(&tty->termios_rwsem); ++ mutex_lock(&tport->mutex); ++ ++ if (port->uartclk == uartclk) ++ goto out_unlock; ++ ++ port->uartclk = uartclk; ++ ++ if (!tty_port_initialized(tport)) ++ goto out_unlock; ++ ++ termios = &tty->termios; ++ ++ baud = serial8250_get_baud_rate(port, termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++out_unlock: ++ mutex_unlock(&tport->mutex); ++ up_write(&tty->termios_rwsem); ++ tty_kref_put(tty); ++} ++EXPORT_SYMBOL_GPL(serial8250_update_uartclk); ++ ++void ++serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char cval; ++ unsigned long flags; ++ unsigned int baud, quot, frac = 0; ++ ++ if (up->capabilities & UART_CAP_MINI) { ++ termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR); ++ if ((termios->c_cflag & CSIZE) == CS5 || ++ (termios->c_cflag & CSIZE) == CS6) ++ termios->c_cflag = (termios->c_cflag & ~CSIZE) | CS7; ++ } ++ cval = serial8250_compute_lcr(up, termios->c_cflag); ++ ++ baud = serial8250_get_baud_rate(port, termios, old); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->lcr = cval; /* Save computed LCR */ ++ ++ if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) { ++ /* NOTE: If fifo_bug is not set, a user can set RX_trigger. */ ++ if ((baud < 2400 && !up->dma) || up->fifo_bug) { ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= UART_FCR_TRIGGER_1; ++ } ++ } ++ ++ /* ++ * MCR-based auto flow control. When AFE is enabled, RTS will be ++ * deasserted when the receive FIFO contains more characters than ++ * the trigger, or the MCR RTS bit is cleared. 
++ */ ++ if (up->capabilities & UART_CAP_AFE) { ++ up->mcr &= ~UART_MCR_AFE; ++ if (termios->c_cflag & CRTSCTS) ++ up->mcr |= UART_MCR_AFE; ++ } ++ ++ /* ++ * Update the per-port timeout. ++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) ++ port->read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ port->ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ port->ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ port->ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * CTS flow control flag and modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (!(up->bugs & UART_BUG_NOMSR) && ++ UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ if (up->capabilities & UART_CAP_UUE) ++ up->ier |= UART_IER_UUE; ++ if (up->capabilities & UART_CAP_RTOIE) ++ up->ier |= UART_IER_RTOIE; ++ ++ serial_port_out(port, UART_IER, up->ier); ++ ++ if (up->capabilities & UART_CAP_EFR) { ++ unsigned char efr = 0; ++ /* ++ * TI16C752/Startech hardware flow control. FIXME: ++ * - TI16C752 requires control thresholds to be set. ++ * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled. ++ */ ++ if (termios->c_cflag & CRTSCTS) ++ efr |= UART_EFR_CTS; ++ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (port->flags & UPF_EXAR_EFR) ++ serial_port_out(port, UART_XR_EFR, efr); ++ else ++ serial_port_out(port, UART_EFR, efr); ++ } ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ ++ /* ++ * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR ++ * is written without DLAB set, this mode will be disabled. 
++ */ ++ if (port->type == PORT_16750) ++ serial_port_out(port, UART_FCR, up->fcr); ++ ++ serial_port_out(port, UART_LCR, up->lcr); /* reset DLAB */ ++ if (port->type != PORT_16750) { ++ /* emulated UARTs (Lucent Venus 167x) need two steps */ ++ if (up->fcr & UART_FCR_ENABLE_FIFO) ++ serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_port_out(port, UART_FCR, up->fcr); /* set fcr */ ++ } ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++EXPORT_SYMBOL(serial8250_do_set_termios); ++ ++static void ++serial8250_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ if (port->set_termios) ++ port->set_termios(port, termios, old); ++ else ++ serial8250_do_set_termios(port, termios, old); ++} ++ ++void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (termios->c_line == N_PPS) { ++ port->flags |= UPF_HARDPPS_CD; ++ spin_lock_irq(&port->lock); ++ serial8250_enable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } else { ++ port->flags &= ~UPF_HARDPPS_CD; ++ if (!UART_ENABLE_MS(port, termios->c_cflag)) { ++ spin_lock_irq(&port->lock); ++ serial8250_disable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc); ++ ++static void ++serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (port->set_ldisc) ++ port->set_ldisc(port, termios); ++ else ++ serial8250_do_set_ldisc(port, termios); ++} ++ ++void serial8250_do_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *p = up_to_u8250p(port); ++ ++ serial8250_set_sleep(p, state != 0); ++} ++EXPORT_SYMBOL(serial8250_do_pm); ++ ++static void ++serial8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ if (port->pm) ++ port->pm(port, state, oldstate); ++ else ++ serial8250_do_pm(port, state, oldstate); ++} ++ ++static unsigned int serial8250_port_size(struct uart_8250_port *pt) ++{ ++ if (pt->port.mapsize) ++ return pt->port.mapsize; ++ if (pt->port.iotype == UPIO_AU) { ++ if (pt->port.type == PORT_RT2880) ++ return 0x100; ++ return 0x1000; ++ } ++ if (is_omap1_8250(pt)) ++ return 0x16 << pt->port.regshift; ++ ++ return 8 << pt->port.regshift; ++} ++ ++/* ++ * Resource handling. 
++ */ ++static int serial8250_request_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ int ret = 0; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (!request_mem_region(port->mapbase, size, "serial")) { ++ ret = -EBUSY; ++ break; ++ } ++ ++ if (port->flags & UPF_IOREMAP) { ++ port->membase = ioremap(port->mapbase, size); ++ if (!port->membase) { ++ release_mem_region(port->mapbase, size); ++ ret = -ENOMEM; ++ } ++ } ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ if (!request_region(port->iobase, size, "serial")) ++ ret = -EBUSY; ++ break; ++ } ++ return ret; ++} ++ ++static void serial8250_release_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) ++ break; ++ ++ if (port->flags & UPF_IOREMAP) { ++ iounmap(port->membase); ++ port->membase = NULL; ++ } ++ ++ release_mem_region(port->mapbase, size); ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ release_region(port->iobase, size); ++ break; ++ } ++} ++ ++static void serial8250_release_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_release_std_resource(up); ++} ++ ++static int serial8250_request_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ return serial8250_request_std_resource(up); ++} ++ ++static int fcr_get_rxtrig_bytes(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ unsigned char bytes; ++ ++ bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)]; ++ ++ return bytes ? 
bytes : -EOPNOTSUPP; ++} ++ ++static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ int i; ++ ++ if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)]) ++ return -EOPNOTSUPP; ++ ++ for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) { ++ if (bytes < conf_type->rxtrig_bytes[i]) ++ /* Use the nearest lower value */ ++ return (--i) << UART_FCR_R_TRIG_SHIFT; ++ } ++ ++ return UART_FCR_R_TRIG_11; ++} ++ ++static int do_get_rxtrig(struct tty_port *port) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) ++ return -EINVAL; ++ ++ return fcr_get_rxtrig_bytes(up); ++} ++ ++static int do_serial8250_get_rxtrig(struct tty_port *port) ++{ ++ int rxtrig_bytes; ++ ++ mutex_lock(&port->mutex); ++ rxtrig_bytes = do_get_rxtrig(port); ++ mutex_unlock(&port->mutex); ++ ++ return rxtrig_bytes; ++} ++ ++static ssize_t rx_trig_bytes_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ int rxtrig_bytes; ++ ++ rxtrig_bytes = do_serial8250_get_rxtrig(port); ++ if (rxtrig_bytes < 0) ++ return rxtrig_bytes; ++ ++ return sysfs_emit(buf, "%d\n", rxtrig_bytes); ++} ++ ++static int do_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ int rxtrig; ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1 || ++ up->fifo_bug) ++ return -EINVAL; ++ ++ rxtrig = bytes_to_fcr_rxtrig(up, bytes); ++ if (rxtrig < 0) ++ return rxtrig; ++ ++ serial8250_clear_fifos(up); ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= (unsigned char)rxtrig; ++ serial_out(up, UART_FCR, up->fcr); ++ return 0; ++} ++ ++static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ int ret; ++ ++ mutex_lock(&port->mutex); ++ ret = do_set_rxtrig(port, bytes); ++ mutex_unlock(&port->mutex); ++ ++ return ret; ++} ++ ++static ssize_t rx_trig_bytes_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ unsigned char bytes; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ ret = kstrtou8(buf, 10, &bytes); ++ if (ret < 0) ++ return ret; ++ ++ ret = do_serial8250_set_rxtrig(port, bytes); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(rx_trig_bytes); ++ ++static struct attribute *serial8250_dev_attrs[] = { ++ &dev_attr_rx_trig_bytes.attr, ++ NULL ++}; ++ ++static struct attribute_group serial8250_dev_attr_group = { ++ .attrs = serial8250_dev_attrs, ++}; ++ ++static void register_dev_spec_attr_grp(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ ++ if (conf_type->rxtrig_bytes[0]) ++ up->port.attr_group = &serial8250_dev_attr_group; ++} ++ ++static void serial8250_config_port(struct uart_port *port, int flags) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int ret; ++ ++ /* ++ * Find the region that we can probe for. This in turn ++ * tells us whether we can probe for the type of port. 
++ */ ++ ret = serial8250_request_std_resource(up); ++ if (ret < 0) ++ return; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ if (flags & UART_CONFIG_TYPE) ++ autoconfig(up); ++ ++ /* if access method is AU, it is a 16550 with a quirk */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_AU) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ /* HW bugs may trigger IRQ while IIR == NO_INT */ ++ if (port->type == PORT_TEGRA) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) ++ autoconfig_irq(up); ++ ++ if (port->type == PORT_UNKNOWN) ++ serial8250_release_std_resource(up); ++ ++ register_dev_spec_attr_grp(up); ++ up->fcr = uart_config[up->port.type].fcr; ++} ++ ++static int ++serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) ++{ ++ if (ser->irq >= nr_irqs || ser->irq < 0 || ++ ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ++ ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ++ ser->type == PORT_STARTECH) ++ return -EINVAL; ++ return 0; ++} ++ ++static const char *serial8250_type(struct uart_port *port) ++{ ++ int type = port->type; ++ ++ if (type >= ARRAY_SIZE(uart_config)) ++ type = 0; ++ return uart_config[type].name; ++} ++ ++static const struct uart_ops serial8250_pops = { ++ .tx_empty = serial8250_tx_empty, ++ .set_mctrl = serial8250_set_mctrl, ++ .get_mctrl = serial8250_get_mctrl, ++ .stop_tx = serial8250_stop_tx, ++ .start_tx = serial8250_start_tx, ++ .throttle = serial8250_throttle, ++ .unthrottle = serial8250_unthrottle, ++ .stop_rx = serial8250_stop_rx, ++ .enable_ms = serial8250_enable_ms, ++ .break_ctl = serial8250_break_ctl, ++ .startup = serial8250_startup, ++ .shutdown = serial8250_shutdown, ++ .set_termios = serial8250_set_termios, ++ .set_ldisc = serial8250_set_ldisc, ++ .pm = serial8250_pm, ++ .type = serial8250_type, ++ .release_port = serial8250_release_port, ++ .request_port = serial8250_request_port, ++ .config_port = serial8250_config_port, ++ .verify_port = serial8250_verify_port, ++#ifdef CONFIG_CONSOLE_POLL ++ .poll_get_char = serial8250_get_poll_char, ++ .poll_put_char = serial8250_put_poll_char, ++#endif ++}; ++ ++void serial8250_init_port(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ spin_lock_init(&port->lock); ++ port->ops = &serial8250_pops; ++ port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ up->cur_iotype = 0xFF; ++} ++EXPORT_SYMBOL_GPL(serial8250_init_port); ++ ++void serial8250_set_defaults(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ if (up->port.flags & UPF_FIXED_TYPE) { ++ unsigned int type = up->port.type; ++ ++ if (!up->port.fifosize) ++ up->port.fifosize = uart_config[type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[type].flags; ++ } ++ ++ set_io_from_upio(port); ++ ++ /* default dma handlers */ ++ if (up->dma) { ++ if (!up->dma->tx_dma) ++ up->dma->tx_dma = serial8250_tx_dma; ++ if (!up->dma->rx_dma) ++ up->dma->rx_dma = serial8250_rx_dma; ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_set_defaults); ++ ++#ifdef CONFIG_SERIAL_8250_CONSOLE ++ ++static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out(port, UART_TX, ch); ++} ++ ++/* ++ * Restore serial console when h/w power-off detected ++ */ ++static void 
serial8250_console_restore(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct ktermios termios; ++ unsigned int baud, quot, frac = 0; ++ ++ termios.c_cflag = port->cons->cflag; ++ termios.c_ispeed = port->cons->ispeed; ++ termios.c_ospeed = port->cons->ospeed; ++ if (port->state->port.tty && termios.c_cflag == 0) { ++ termios.c_cflag = port->state->port.tty->termios.c_cflag; ++ termios.c_ispeed = port->state->port.tty->termios.c_ispeed; ++ termios.c_ospeed = port->state->port.tty->termios.c_ospeed; ++ } ++ ++ baud = serial8250_get_baud_rate(port, &termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); ++} ++ ++/* ++ * Print a string to the serial port using the device FIFO ++ * ++ * It sends fifosize bytes and then waits for the fifo ++ * to get empty. ++ */ ++static void serial8250_console_fifo_write(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ int i; ++ const char *end = s + count; ++ unsigned int fifosize = up->tx_loadsz; ++ bool cr_sent = false; ++ ++ while (s != end) { ++ wait_for_lsr(up, UART_LSR_THRE); ++ ++ for (i = 0; i < fifosize && s != end; ++i) { ++ if (*s == '\n' && !cr_sent) { ++ serial_out(up, UART_TX, '\r'); ++ cr_sent = true; ++ } else { ++ serial_out(up, UART_TX, *s++); ++ cr_sent = false; ++ } ++ } ++ } ++} ++ ++/* ++ * Print a string to the serial port trying not to disturb ++ * any possible real use of the port... ++ * ++ * The console_lock must be held when we get here. ++ * ++ * Doing runtime PM is really a bad idea for the kernel console. ++ * Thus, we assume the function is called when device is powered up. ++ */ ++void serial8250_console_write(struct uart_8250_port *up, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_em485 *em485 = up->em485; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier, use_fifo; ++ int locked = 1; ++ ++ touch_nmi_watchdog(); ++ ++ if (oops_in_progress) ++ locked = spin_trylock_irqsave(&port->lock, flags); ++ else ++ spin_lock_irqsave(&port->lock, flags); ++ ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ /* check scratch reg to see if port powered off during system sleep */ ++ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { ++ serial8250_console_restore(up); ++ up->canary = 0; ++ } ++ ++ if (em485) { ++ if (em485->tx_stopped) ++ up->rs485_start_tx(up); ++ mdelay(port->rs485.delay_rts_before_send); ++ } ++ ++ use_fifo = (up->capabilities & UART_CAP_FIFO) && ++ /* ++ * BCM283x requires to check the fifo ++ * after each byte. ++ */ ++ !(up->capabilities & UART_CAP_MINI) && ++ /* ++ * tx_loadsz contains the transmit fifo size ++ */ ++ up->tx_loadsz > 1 && ++ (up->fcr & UART_FCR_ENABLE_FIFO) && ++ port->state && ++ test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && ++ /* ++ * After we put a data in the fifo, the controller will send ++ * it regardless of the CTS state. Therefore, only use fifo ++ * if we don't use control flow. 
++ */ ++ !(up->port.flags & UPF_CONS_FLOW); ++ ++ if (likely(use_fifo)) ++ serial8250_console_fifo_write(up, s, count); ++ else ++ uart_console_write(port, s, count, serial8250_console_putchar); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ ++ if (em485) { ++ mdelay(port->rs485.delay_rts_after_send); ++ if (em485->tx_stopped) ++ up->rs485_stop_tx(up); ++ } ++ ++ serial_port_out(port, UART_IER, ier); ++ ++ /* ++ * The receive handling will happen properly because the ++ * receive ready bit will still be set; it is not cleared ++ * on read. However, modem control will not, we must ++ * call it if we have saved something in the saved flags ++ * while processing with interrupts off. ++ */ ++ if (up->msr_saved_flags) ++ serial8250_modem_status(up); ++ ++ if (locked) ++ spin_unlock_irqrestore(&port->lock, flags); ++} ++ ++static unsigned int probe_baud(struct uart_port *port) ++{ ++ unsigned char lcr, dll, dlm; ++ unsigned int quot; ++ ++ lcr = serial_port_in(port, UART_LCR); ++ serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB); ++ dll = serial_port_in(port, UART_DLL); ++ dlm = serial_port_in(port, UART_DLM); ++ serial_port_out(port, UART_LCR, lcr); ++ ++ quot = (dlm << 8) | dll; ++ return (port->uartclk / 16) / quot; ++} ++ ++int serial8250_console_setup(struct uart_port *port, char *options, bool probe) ++{ ++ int baud = 9600; ++ int bits = 8; ++ int parity = 'n'; ++ int flow = 'n'; ++ int ret; ++ ++ if (!port->iobase && !port->membase) ++ return -ENODEV; ++ ++ if (options) ++ uart_parse_options(options, &baud, &parity, &bits, &flow); ++ else if (probe) ++ baud = probe_baud(port); ++ ++ ret = uart_set_options(port, port->cons, baud, parity, bits, flow); ++ if (ret) ++ return ret; ++ ++ if (port->dev) ++ pm_runtime_get_sync(port->dev); ++ ++ return 0; ++} ++ ++int serial8250_console_exit(struct uart_port *port) ++{ ++ if (port->dev) ++ pm_runtime_put_sync(port->dev); ++ ++ return 0; ++} ++ ++#endif /* CONFIG_SERIAL_8250_CONSOLE */ ++ ++MODULE_LICENSE("GPL"); +diff -rupN linux.orig/drivers/tty/serial/8250/Kconfig linux/drivers/tty/serial/8250/Kconfig +--- linux.orig/drivers/tty/serial/8250/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/Kconfig 2022-12-04 10:40:26.708034065 -0500 @@ -9,6 +9,7 @@ config SERIAL_8250 depends on !S390 select SERIAL_CORE @@ -4064,11 +26933,10 @@ index d0b49e15fbf5e..02c308467339c 100644 help This selects whether you want to include the driver for the standard serial ports. The standard answer is Y. 
People who might say N -diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c -index 15f0e4d88c5a0..ffdb001e3d109 100644 ---- a/drivers/tty/serial/amba-pl011.c -+++ b/drivers/tty/serial/amba-pl011.c -@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +diff -rupN linux.orig/drivers/tty/serial/amba-pl011.c linux/drivers/tty/serial/amba-pl011.c +--- linux.orig/drivers/tty/serial/amba-pl011.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/amba-pl011.c 2022-12-04 10:40:26.708034065 -0500 +@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; @@ -4097,7 +26965,7 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 /* * First save the CR then disable the interrupts -@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, pl011_write(old_cr, uap, REG_CR); if (locked) @@ -4107,11 +26975,10 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 clk_disable(uap->clk); } -diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c -index 0aa666e247d57..d7130d1ae64c0 100644 ---- a/drivers/tty/serial/omap-serial.c -+++ b/drivers/tty/serial/omap-serial.c -@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console *co, const char *s, +diff -rupN linux.orig/drivers/tty/serial/omap-serial.c linux/drivers/tty/serial/omap-serial.c +--- linux.orig/drivers/tty/serial/omap-serial.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/omap-serial.c 2022-12-04 10:40:26.708034065 -0500 +@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console unsigned int ier; int locked = 1; @@ -4128,7 +26995,7 @@ index 0aa666e247d57..d7130d1ae64c0 100644 /* * First save the IER then disable the interrupts -@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console *co, const char *s, +@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console check_modem_status(up); if (locked) @@ -4138,11 +27005,10 @@ index 0aa666e247d57..d7130d1ae64c0 100644 } static int __init -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index d2b2720db6ca7..18e623325887f 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_mask) +diff -rupN linux.orig/drivers/tty/sysrq.c linux/drivers/tty/sysrq.c +--- linux.orig/drivers/tty/sysrq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/sysrq.c 2022-12-04 10:40:26.708034065 -0500 +@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_ rcu_sysrq_start(); rcu_read_lock(); @@ -4150,7 +27016,7 @@ index d2b2720db6ca7..18e623325887f 100644 /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. 
We do not -@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_mask) +@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_ pr_cont("\n"); console_loglevel = orig_log_level; } @@ -4158,10 +27024,9 @@ index d2b2720db6ca7..18e623325887f 100644 rcu_read_unlock(); rcu_sysrq_end(); -diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h -index 4e0e50e7ac153..173e979b84a93 100644 ---- a/drivers/vdpa/vdpa_user/iova_domain.h -+++ b/drivers/vdpa/vdpa_user/iova_domain.h +diff -rupN linux.orig/drivers/vdpa/vdpa_user/iova_domain.h linux/drivers/vdpa/vdpa_user/iova_domain.h +--- linux.orig/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-04 10:40:26.708034065 -0500 @@ -14,7 +14,6 @@ #include #include @@ -4170,10 +27035,9 @@ index 4e0e50e7ac153..173e979b84a93 100644 #define IOVA_START_PFN 1 -diff --git a/fs/dcache.c b/fs/dcache.c -index bb0c4d0038dbd..2ee8636016ee9 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c +diff -rupN linux.orig/fs/dcache.c linux/fs/dcache.c +--- linux.orig/fs/dcache.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/fs/dcache.c 2022-12-04 10:40:26.708034065 -0500 @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) @@ -4191,7 +27055,7 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) -@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, +@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct in wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); @@ -4201,10 +27065,9 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 wake_up_all(d_wait); } -diff --git a/include/linux/console.h b/include/linux/console.h -index 8c1686e2c2337..8a813cbaf9285 100644 ---- a/include/linux/console.h -+++ b/include/linux/console.h +diff -rupN linux.orig/include/linux/console.h linux/include/linux/console.h +--- linux.orig/include/linux/console.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/console.h 2022-12-04 10:40:26.712034055 -0500 @@ -16,6 +16,7 @@ #include @@ -4269,10 +27132,9 @@ index 8c1686e2c2337..8a813cbaf9285 100644 CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; -diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h -index 84a466b176cf4..df6d17bc30aa3 100644 ---- a/include/linux/entry-common.h -+++ b/include/linux/entry-common.h +diff -rupN linux.orig/include/linux/entry-common.h linux/include/linux/entry-common.h +--- linux.orig/include/linux/entry-common.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/entry-common.h 2022-12-04 10:40:26.712034055 -0500 @@ -57,9 +57,15 @@ # define ARCH_EXIT_TO_USER_MODE_WORK (0) #endif @@ -4290,11 +27152,10 @@ index 84a466b176cf4..df6d17bc30aa3 100644 ARCH_EXIT_TO_USER_MODE_WORK) /** -diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index a92bce40b04b3..bf82980f569df 100644 ---- a/include/linux/interrupt.h -+++ b/include/linux/interrupt.h -@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); +diff -rupN linux.orig/include/linux/interrupt.h linux/include/linux/interrupt.h +--- linux.orig/include/linux/interrupt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/interrupt.h 2022-12-04 10:40:26.712034055 -0500 +@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsig extern void raise_softirq_irqoff(unsigned int nr); extern void 
raise_softirq(unsigned int nr); @@ -4330,11 +27191,10 @@ index a92bce40b04b3..bf82980f569df 100644 DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) -diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h -index 1cd4e36890fbf..844a8e30e6de5 100644 ---- a/include/linux/irqdesc.h -+++ b/include/linux/irqdesc.h -@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq); +diff -rupN linux.orig/include/linux/irqdesc.h linux/include/linux/irqdesc.h +--- linux.orig/include/linux/irqdesc.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/irqdesc.h 2022-12-04 10:40:26.712034055 -0500 +@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); @@ -4342,10 +27202,9 @@ index 1cd4e36890fbf..844a8e30e6de5 100644 int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif -diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 1f1099dac3f05..1023f349af716 100644 ---- a/include/linux/lockdep.h -+++ b/include/linux/lockdep.h +diff -rupN linux.orig/include/linux/lockdep.h linux/include/linux/lockdep.h +--- linux.orig/include/linux/lockdep.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/lockdep.h 2022-12-04 10:40:26.712034055 -0500 @@ -435,7 +435,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; @@ -4354,11 +27213,10 @@ index 1f1099dac3f05..1023f349af716 100644 /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. -diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h -index 15ae78cd28536..b8728d11c9490 100644 ---- a/include/linux/mmdebug.h -+++ b/include/linux/mmdebug.h -@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); +diff -rupN linux.orig/include/linux/mmdebug.h linux/include/linux/mmdebug.h +--- linux.orig/include/linux/mmdebug.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/mmdebug.h 2022-12-04 10:40:26.712034055 -0500 +@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif @@ -4371,10 +27229,9 @@ index 15ae78cd28536..b8728d11c9490 100644 #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h -index 05d6f3facd5a5..5e6b840f5a9ac 100644 ---- a/include/linux/netdevice.h -+++ b/include/linux/netdevice.h +diff -rupN linux.orig/include/linux/netdevice.h linux/include/linux/netdevice.h +--- linux.orig/include/linux/netdevice.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/netdevice.h 2022-12-04 10:40:26.712034055 -0500 @@ -3156,7 +3156,11 @@ struct softnet_data { int defer_count; int defer_ipi_scheduled; @@ -4387,10 +27244,9 @@ index 05d6f3facd5a5..5e6b840f5a9ac 100644 }; static inline void input_queue_head_incr(struct softnet_data *sd) -diff --git a/include/linux/preempt.h b/include/linux/preempt.h -index b4381f255a5ca..12f59cdaaedda 100644 ---- a/include/linux/preempt.h -+++ b/include/linux/preempt.h +diff -rupN linux.orig/include/linux/preempt.h linux/include/linux/preempt.h +--- linux.orig/include/linux/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/preempt.h 2022-12-04 10:40:26.712034055 -0500 @@ -196,6 +196,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) @@ -4537,10 +27393,9 @@ index b4381f255a5ca..12f59cdaaedda 100644 +} + #endif /* __LINUX_PREEMPT_H */ -diff --git a/include/linux/printk.h b/include/linux/printk.h -index cf7d666ab1f8e..f88ec15f83dcc 100644 ---- a/include/linux/printk.h -+++ b/include/linux/printk.h +diff -rupN linux.orig/include/linux/printk.h linux/include/linux/printk.h +--- linux.orig/include/linux/printk.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/printk.h 2022-12-04 10:40:26.712034055 -0500 @@ -169,7 +169,11 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit @@ -4553,7 +27408,7 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 /* * Please don't use printk_ratelimit(), because it shares ratelimiting state -@@ -221,11 +225,23 @@ static inline void printk_deferred_exit(void) +@@ -221,11 +225,23 @@ static inline void printk_deferred_exit( { } @@ -4577,10 +27432,9 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 static inline int printk_ratelimit(void) { return 0; -diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h -index 8f416c5e929ea..c0ef596f340b5 100644 ---- a/include/linux/rwlock.h -+++ b/include/linux/rwlock.h +diff -rupN linux.orig/include/linux/rwlock.h linux/include/linux/rwlock.h +--- linux.orig/include/linux/rwlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/rwlock.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H @@ -4590,11 +27444,10 @@ index 8f416c5e929ea..c0ef596f340b5 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 8d82d6d326701..e1623b3001c5b 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) +diff -rupN linux.orig/include/linux/sched.h linux/include/linux/sched.h +--- linux.orig/include/linux/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/sched.h 2022-12-04 10:40:26.712034055 -0500 +@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched( return 
unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } @@ -4638,10 +27491,9 @@ index 8d82d6d326701..e1623b3001c5b 100644 /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return -diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h -index 16e3d75a324c7..ee1f719a21678 100644 ---- a/include/linux/serial_8250.h -+++ b/include/linux/serial_8250.h +diff -rupN linux.orig/include/linux/serial_8250.h linux/include/linux/serial_8250.h +--- linux.orig/include/linux/serial_8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/serial_8250.h 2022-12-04 10:40:26.712034055 -0500 @@ -7,6 +7,7 @@ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H @@ -4659,7 +27511,7 @@ index 16e3d75a324c7..ee1f719a21678 100644 struct uart_8250_dma *dma; const struct uart_8250_ops *ops; -@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); +@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_82 void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); @@ -4668,28 +27520,9 @@ index 16e3d75a324c7..ee1f719a21678 100644 int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h -index 5c0c5174155d0..1341f7d62da44 100644 ---- a/include/linux/spinlock.h -+++ b/include/linux/spinlock.h -@@ -1,6 +1,7 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - #ifndef __LINUX_SPINLOCK_H - #define __LINUX_SPINLOCK_H -+#define __LINUX_INSIDE_SPINLOCK_H - - /* - * include/linux/spinlock.h - generic spinlock/rwlock declarations -@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, - - void free_bucket_spinlocks(spinlock_t *locks); - -+#undef __LINUX_INSIDE_SPINLOCK_H - #endif /* __LINUX_SPINLOCK_H */ -diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h -index 51fa0dab68c4d..89eb6f4c659c7 100644 ---- a/include/linux/spinlock_api_smp.h -+++ b/include/linux/spinlock_api_smp.h +diff -rupN linux.orig/include/linux/spinlock_api_smp.h linux/include/linux/spinlock_api_smp.h +--- linux.orig/include/linux/spinlock_api_smp.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_smp.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H @@ -4699,10 +27532,9 @@ index 51fa0dab68c4d..89eb6f4c659c7 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h -index b8ba00ccccdeb..819aeba1c87e6 100644 ---- a/include/linux/spinlock_api_up.h -+++ b/include/linux/spinlock_api_up.h +diff -rupN linux.orig/include/linux/spinlock_api_up.h linux/include/linux/spinlock_api_up.h +--- linux.orig/include/linux/spinlock_api_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_up.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H @@ -4712,10 +27544,26 @@ index b8ba00ccccdeb..819aeba1c87e6 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h -index 835aedaf68acd..61c49b16f69ab 100644 ---- a/include/linux/spinlock_rt.h -+++ b/include/linux/spinlock_rt.h +diff -rupN linux.orig/include/linux/spinlock.h 
linux/include/linux/spinlock.h +--- linux.orig/include/linux/spinlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock.h 2022-12-04 10:40:26.712034055 -0500 +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + #ifndef __LINUX_SPINLOCK_H + #define __LINUX_SPINLOCK_H ++#define __LINUX_INSIDE_SPINLOCK_H + + /* + * include/linux/spinlock.h - generic spinlock/rwlock declarations +@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t + + void free_bucket_spinlocks(spinlock_t *locks); + ++#undef __LINUX_INSIDE_SPINLOCK_H + #endif /* __LINUX_SPINLOCK_H */ +diff -rupN linux.orig/include/linux/spinlock_rt.h linux/include/linux/spinlock_rt.h +--- linux.orig/include/linux/spinlock_rt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_rt.h 2022-12-04 10:40:26.712034055 -0500 @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H @@ -4725,10 +27573,9 @@ index 835aedaf68acd..61c49b16f69ab 100644 #error Do not include directly. Use spinlock.h #endif -diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h -index 16521074b6f7c..c87204247592f 100644 ---- a/include/linux/spinlock_up.h -+++ b/include/linux/spinlock_up.h +diff -rupN linux.orig/include/linux/spinlock_up.h linux/include/linux/spinlock_up.h +--- linux.orig/include/linux/spinlock_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_up.h 2022-12-04 10:40:26.716034044 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H @@ -4738,11 +27585,10 @@ index 16521074b6f7c..c87204247592f 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h -index 9f392ec76f2bb..779e0e96b9cb0 100644 ---- a/include/linux/thread_info.h -+++ b/include/linux/thread_info.h -@@ -177,7 +177,17 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti +diff -rupN linux.orig/include/linux/thread_info.h linux/include/linux/thread_info.h +--- linux.orig/include/linux/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/thread_info.h 2022-12-04 10:40:26.716034044 -0500 +@@ -177,7 +177,17 @@ static __always_inline unsigned long rea clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ @@ -4761,10 +27607,9 @@ index 9f392ec76f2bb..779e0e96b9cb0 100644 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, -diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 20749bd9db718..224bf60d6563c 100644 ---- a/include/linux/trace_events.h -+++ b/include/linux/trace_events.h +diff -rupN linux.orig/include/linux/trace_events.h linux/include/linux/trace_events.h +--- linux.orig/include/linux/trace_events.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/trace_events.h 2022-12-04 10:40:26.716034044 -0500 @@ -70,6 +70,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; @@ -4773,7 +27618,7 @@ index 20749bd9db718..224bf60d6563c 100644 }; #define TRACE_EVENT_TYPE_MAX \ -@@ -159,9 +160,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, +@@ -159,9 +160,10 @@ static inline void tracing_generic_entry unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; @@ -4799,10 +27644,9 @@ index 20749bd9db718..224bf60d6563c 100644 TRACE_FLAG_NMI = 0x40, TRACE_FLAG_BH_OFF = 0x80, }; -diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h 
-index 6ad4e9032d538..ffe48e69b3f3a 100644 ---- a/include/linux/u64_stats_sync.h -+++ b/include/linux/u64_stats_sync.h +diff -rupN linux.orig/include/linux/u64_stats_sync.h linux/include/linux/u64_stats_sync.h +--- linux.orig/include/linux/u64_stats_sync.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/u64_stats_sync.h 2022-12-04 10:40:26.716034044 -0500 @@ -8,7 +8,7 @@ * * Key points : @@ -4843,7 +27687,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 seqcount_t seq; #endif }; -@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_sta local64_inc(&p->v); } @@ -4867,7 +27711,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 typedef struct { u64 v; -@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_sta { p->v++; } @@ -4944,25 +27788,50 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 -#else - return 0; -#endif -+} -+ + } + +-static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_disable(); +-#endif +- return __u64_stats_fetch_begin(syncp); + return read_seqcount_retry(&syncp->seq, start); -+} + } +#endif /* !64 bit */ -+ + +-static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +- return read_seqcount_retry(&syncp->seq, start); +-#else +- return false; +-#endif + __u64_stats_update_begin(syncp); -+} -+ + } + +-static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_enable(); +-#endif +- return __u64_stats_fetch_retry(syncp, start); + __u64_stats_update_end(syncp); -+} -+ + } + +-/* +- * In case irq handlers can update u64 counters, readers can use following helpers +- * - SMP 32bit arches use seqcount protection, irq safe. +- * - UP 32bit must disable irqs. +- * - 64bit have no problem atomically reading u64 values, irq safe. 
+- */ +-static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); @@ -4976,54 +27845,23 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); - } - - static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) ++} ++ ++static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_disable(); --#endif - return __u64_stats_fetch_begin(syncp); - } - --static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, -- unsigned int start) --{ --#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -- return read_seqcount_retry(&syncp->seq, start); --#else -- return false; --#endif --} -- - static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) - { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_enable(); --#endif -- return __u64_stats_fetch_retry(syncp, start); --} -- --/* -- * In case irq handlers can update u64 counters, readers can use following helpers -- * - SMP 32bit arches use seqcount protection, irq safe. -- * - UP 32bit must disable irqs. -- * - 64bit have no problem atomically reading u64 values, irq safe. -- */ --static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) --{ -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif -- return __u64_stats_fetch_begin(syncp); --} -- + return __u64_stats_fetch_begin(syncp); + } + -static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, - unsigned int start) --{ ++static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, ++ unsigned int start) + { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) @@ -5032,10 +27870,9 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 return __u64_stats_fetch_retry(syncp, start); } -diff --git a/init/Kconfig b/init/Kconfig -index 532362fcfe31f..08ec5f25e6642 100644 ---- a/init/Kconfig -+++ b/init/Kconfig +diff -rupN linux.orig/init/Kconfig linux/init/Kconfig +--- linux.orig/init/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/init/Kconfig 2022-12-04 10:40:26.716034044 -0500 @@ -1574,6 +1574,10 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. 
@@ -5047,27 +27884,10 @@ index 532362fcfe31f..08ec5f25e6642 100644 config BUG bool "BUG() support" if EXPERT default y -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a8214..260c08efeb486 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -1,5 +1,11 @@ - # SPDX-License-Identifier: GPL-2.0-only - -+config HAVE_PREEMPT_LAZY -+ bool -+ -+config PREEMPT_LAZY -+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT -+ - config PREEMPT_NONE_BUILD - bool - -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index 22e7a805c6723..b492e482b63a9 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -2107,11 +2107,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, +diff -rupN linux.orig/kernel/bpf/syscall.c linux/kernel/bpf/syscall.c +--- linux.orig/kernel/bpf/syscall.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/bpf/syscall.c 2022-12-04 10:40:26.716034044 -0500 +@@ -2118,11 +2118,11 @@ static void bpf_prog_get_stats(const str st = per_cpu_ptr(prog->stats, cpu); do { @@ -5081,11 +27901,5333 @@ index 22e7a805c6723..b492e482b63a9 100644 nsecs += tnsecs; cnt += tcnt; misses += tmisses; -diff --git a/kernel/entry/common.c b/kernel/entry/common.c -index 063068a9ea9b3..26b772720b227 100644 ---- a/kernel/entry/common.c -+++ b/kernel/entry/common.c -@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, +diff -rupN linux.orig/kernel/bpf/syscall.c.orig linux/kernel/bpf/syscall.c.orig +--- linux.orig/kernel/bpf/syscall.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/kernel/bpf/syscall.c.orig 2022-12-04 10:40:18.684054629 -0500 +@@ -0,0 +1,5319 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ++#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) ++#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) ++#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ ++ IS_FD_HASH(map)) ++ ++#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) ++ ++DEFINE_PER_CPU(int, bpf_prog_active); ++static DEFINE_IDR(prog_idr); ++static DEFINE_SPINLOCK(prog_idr_lock); ++static DEFINE_IDR(map_idr); ++static DEFINE_SPINLOCK(map_idr_lock); ++static DEFINE_IDR(link_idr); ++static DEFINE_SPINLOCK(link_idr_lock); ++ ++int sysctl_unprivileged_bpf_disabled __read_mostly = ++ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; ++ ++static const struct bpf_map_ops * const bpf_map_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) \ ++ [_id] = &_ops, ++#define BPF_LINK_TYPE(_id, _name) ++#include ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++/* ++ * If we're handed a bigger struct than we know of, ensure all the unknown bits ++ * are 0 - i.e. new user-space does not rely on any kernel feature extensions ++ * we don't know about yet. 
bpf_map_poll(struct file *filp, struct poll_table_struct *pts) ++{ ++ struct bpf_map *map = filp->private_data; ++ ++ if (map->ops->map_poll) ++ return map->ops->map_poll(map, filp, pts); ++ ++ return EPOLLERR; ++} ++ ++const struct file_operations bpf_map_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_map_show_fdinfo, ++#endif ++ .release = bpf_map_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++ .mmap = bpf_map_mmap, ++ .poll = bpf_map_poll, ++}; ++ ++int bpf_map_new_fd(struct bpf_map *map, int flags) ++{ ++ int ret; ++ ++ ret = security_bpf_map(map, OPEN_FMODE(flags)); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-map", &bpf_map_fops, map, ++ flags | O_CLOEXEC); ++} ++ ++int bpf_get_file_flag(int flags) ++{ ++ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) ++ return -EINVAL; ++ if (flags & BPF_F_RDONLY) ++ return O_RDONLY; ++ if (flags & BPF_F_WRONLY) ++ return O_WRONLY; ++ return O_RDWR; ++} ++ ++/* helper macro to check that unused fields 'union bpf_attr' are zero */ ++#define CHECK_ATTR(CMD) \ ++ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ ++ sizeof(attr->CMD##_LAST_FIELD), 0, \ ++ sizeof(*attr) - \ ++ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ ++ sizeof(attr->CMD##_LAST_FIELD)) != NULL ++ ++/* dst and src must have at least "size" number of bytes. ++ * Return strlen on success and < 0 on error. ++ */ ++int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) ++{ ++ const char *end = src + size; ++ const char *orig_src = src; ++ ++ memset(dst, 0, size); ++ /* Copy all isalnum(), '_' and '.' chars. */ ++ while (src < end && *src) { ++ if (!isalnum(*src) && ++ *src != '_' && *src != '.') ++ return -EINVAL; ++ *dst++ = *src++; ++ } ++ ++ /* No '\0' found in "size" number of bytes */ ++ if (src == end) ++ return -EINVAL; ++ ++ return src - orig_src; ++} ++ ++int map_check_no_btf(const struct bpf_map *map, ++ const struct btf *btf, ++ const struct btf_type *key_type, ++ const struct btf_type *value_type) ++{ ++ return -ENOTSUPP; ++} ++ ++static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) ++{ ++ const u32 a = *(const u32 *)_a; ++ const u32 b = *(const u32 *)_b; ++ ++ if (a < b) ++ return -1; ++ else if (a > b) ++ return 1; ++ return 0; ++} ++ ++static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) ++{ ++ struct bpf_map *map = (struct bpf_map *)priv; ++ u32 *off_base = map->off_arr->field_off; ++ u32 *a = _a, *b = _b; ++ u8 *sz_a, *sz_b; ++ ++ sz_a = map->off_arr->field_sz + (a - off_base); ++ sz_b = map->off_arr->field_sz + (b - off_base); ++ ++ swap(*a, *b); ++ swap(*sz_a, *sz_b); ++} ++ ++static int bpf_map_alloc_off_arr(struct bpf_map *map) ++{ ++ bool has_spin_lock = map_value_has_spin_lock(map); ++ bool has_timer = map_value_has_timer(map); ++ bool has_kptrs = map_value_has_kptrs(map); ++ struct bpf_map_off_arr *off_arr; ++ u32 i; ++ ++ if (!has_spin_lock && !has_timer && !has_kptrs) { ++ map->off_arr = NULL; ++ return 0; ++ } ++ ++ off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); ++ if (!off_arr) ++ return -ENOMEM; ++ map->off_arr = off_arr; ++ ++ off_arr->cnt = 0; ++ if (has_spin_lock) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->spin_lock_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); ++ off_arr->cnt++; ++ } ++ if (has_timer) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->timer_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_timer); ++ off_arr->cnt++; ++ } ++ if (has_kptrs) { ++ struct bpf_map_value_off 
*tab = map->kptr_off_tab; ++ u32 *off = &off_arr->field_off[off_arr->cnt]; ++ u8 *sz = &off_arr->field_sz[off_arr->cnt]; ++ ++ for (i = 0; i < tab->nr_off; i++) { ++ *off++ = tab->off[i].offset; ++ *sz++ = sizeof(u64); ++ } ++ off_arr->cnt += tab->nr_off; ++ } ++ ++ if (off_arr->cnt == 1) ++ return 0; ++ sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), ++ map_off_arr_cmp, map_off_arr_swap, map); ++ return 0; ++} ++ ++static int map_check_btf(struct bpf_map *map, const struct btf *btf, ++ u32 btf_key_id, u32 btf_value_id) ++{ ++ const struct btf_type *key_type, *value_type; ++ u32 key_size, value_size; ++ int ret = 0; ++ ++ /* Some maps allow key to be unspecified. */ ++ if (btf_key_id) { ++ key_type = btf_type_id_size(btf, &btf_key_id, &key_size); ++ if (!key_type || key_size != map->key_size) ++ return -EINVAL; ++ } else { ++ key_type = btf_type_by_id(btf, 0); ++ if (!map->ops->map_check_btf) ++ return -EINVAL; ++ } ++ ++ value_type = btf_type_id_size(btf, &btf_value_id, &value_size); ++ if (!value_type || value_size != map->value_size) ++ return -EINVAL; ++ ++ map->spin_lock_off = btf_find_spin_lock(btf, value_type); ++ ++ if (map_value_has_spin_lock(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY && ++ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && ++ map->map_type != BPF_MAP_TYPE_SK_STORAGE && ++ map->map_type != BPF_MAP_TYPE_INODE_STORAGE && ++ map->map_type != BPF_MAP_TYPE_TASK_STORAGE) ++ return -ENOTSUPP; ++ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > ++ map->value_size) { ++ WARN_ONCE(1, ++ "verifier bug spin_lock_off %d value_size %d\n", ++ map->spin_lock_off, map->value_size); ++ return -EFAULT; ++ } ++ } ++ ++ map->timer_off = btf_find_timer(btf, value_type); ++ if (map_value_has_timer(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) ++ return -EOPNOTSUPP; ++ } ++ ++ map->kptr_off_tab = btf_parse_kptrs(btf, value_type); ++ if (map_value_has_kptrs(map)) { ++ if (!bpf_capable()) { ++ ret = -EPERM; ++ goto free_map_tab; ++ } ++ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { ++ ret = -EACCES; ++ goto free_map_tab; ++ } ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) { ++ ret = -EOPNOTSUPP; ++ goto free_map_tab; ++ } ++ } ++ ++ if (map->ops->map_check_btf) { ++ ret = map->ops->map_check_btf(map, btf, key_type, value_type); ++ if (ret < 0) ++ goto free_map_tab; ++ } ++ ++ return ret; ++free_map_tab: ++ bpf_map_free_kptr_off_tab(map); ++ return ret; ++} ++ ++#define BPF_MAP_CREATE_LAST_FIELD map_extra ++/* called via syscall */ ++static int map_create(union bpf_attr *attr) ++{ ++ int numa_node = bpf_map_attr_numa_node(attr); ++ struct bpf_map *map; ++ int f_flags; ++ int err; ++ ++ err = CHECK_ATTR(BPF_MAP_CREATE); ++ if (err) ++ return -EINVAL; ++ ++ if (attr->btf_vmlinux_value_type_id) { ++ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || ++ attr->btf_key_type_id || attr->btf_value_type_id) ++ return -EINVAL; ++ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { ++ return -EINVAL; ++ } ++ ++ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && ++ attr->map_extra != 0) ++ return -EINVAL; ++ ++ f_flags = bpf_get_file_flag(attr->map_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ if 
(numa_node != NUMA_NO_NODE && ++ ((unsigned int)numa_node >= nr_node_ids || ++ !node_online(numa_node))) ++ return -EINVAL; ++ ++ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ ++ map = find_and_alloc_map(attr); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ err = bpf_obj_name_cpy(map->name, attr->map_name, ++ sizeof(attr->map_name)); ++ if (err < 0) ++ goto free_map; ++ ++ atomic64_set(&map->refcnt, 1); ++ atomic64_set(&map->usercnt, 1); ++ mutex_init(&map->freeze_mutex); ++ spin_lock_init(&map->owner.lock); ++ ++ map->spin_lock_off = -EINVAL; ++ map->timer_off = -EINVAL; ++ if (attr->btf_key_type_id || attr->btf_value_type_id || ++ /* Even the map's value is a kernel's struct, ++ * the bpf_prog.o must have BTF to begin with ++ * to figure out the corresponding kernel's ++ * counter part. Thus, attr->btf_fd has ++ * to be valid also. ++ */ ++ attr->btf_vmlinux_value_type_id) { ++ struct btf *btf; ++ ++ btf = btf_get_by_fd(attr->btf_fd); ++ if (IS_ERR(btf)) { ++ err = PTR_ERR(btf); ++ goto free_map; ++ } ++ if (btf_is_kernel(btf)) { ++ btf_put(btf); ++ err = -EACCES; ++ goto free_map; ++ } ++ map->btf = btf; ++ ++ if (attr->btf_value_type_id) { ++ err = map_check_btf(map, btf, attr->btf_key_type_id, ++ attr->btf_value_type_id); ++ if (err) ++ goto free_map; ++ } ++ ++ map->btf_key_type_id = attr->btf_key_type_id; ++ map->btf_value_type_id = attr->btf_value_type_id; ++ map->btf_vmlinux_value_type_id = ++ attr->btf_vmlinux_value_type_id; ++ } ++ ++ err = bpf_map_alloc_off_arr(map); ++ if (err) ++ goto free_map; ++ ++ err = security_bpf_map_alloc(map); ++ if (err) ++ goto free_map_off_arr; ++ ++ err = bpf_map_alloc_id(map); ++ if (err) ++ goto free_map_sec; ++ ++ bpf_map_save_memcg(map); ++ ++ err = bpf_map_new_fd(map, f_flags); ++ if (err < 0) { ++ /* failed to allocate fd. ++ * bpf_map_put_with_uref() is needed because the above ++ * bpf_map_alloc_id() has published the map ++ * to the userspace and the userspace may ++ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. ++ */ ++ bpf_map_put_with_uref(map); ++ return err; ++ } ++ ++ return err; ++ ++free_map_sec: ++ security_bpf_map_free(map); ++free_map_off_arr: ++ kfree(map->off_arr); ++free_map: ++ btf_put(map->btf); ++ map->ops->map_free(map); ++ return err; ++} ++ ++/* if error is returned, fd is released. 
++ * On success caller should complete fd access with matching fdput() ++ */ ++struct bpf_map *__bpf_map_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_map_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_map_inc(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc); ++ ++void bpf_map_inc_with_uref(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++ atomic64_inc(&map->usercnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); ++ ++struct bpf_map *bpf_map_get(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc(map); ++ fdput(f); ++ ++ return map; ++} ++EXPORT_SYMBOL(bpf_map_get); ++ ++struct bpf_map *bpf_map_get_with_uref(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc_with_uref(map); ++ fdput(f); ++ ++ return map; ++} ++ ++/* map_idr_lock should have been held */ ++static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ if (uref) ++ atomic64_inc(&map->usercnt); ++ ++ return map; ++} ++ ++struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) ++{ ++ spin_lock_bh(&map_idr_lock); ++ map = __bpf_map_inc_not_zero(map, false); ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); ++ ++int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) ++{ ++ return -ENOTSUPP; ++} ++ ++static void *__bpf_copy_key(void __user *ukey, u64 key_size) ++{ ++ if (key_size) ++ return vmemdup_user(ukey, key_size); ++ ++ if (ukey) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) ++{ ++ if (key_size) ++ return kvmemdup_bpfptr(ukey, key_size); ++ ++ if (!bpfptr_is_null(ukey)) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags ++ ++static int map_lookup_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ if (copy_from_user(value, uvalue, value_size)) ++ err = -EFAULT; ++ else ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ goto free_value; ++ } ++ ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ if (err) ++ goto free_value; ++ ++ 
err = -EFAULT; ++ if (copy_to_user(uvalue, value, value_size) != 0) ++ goto free_value; ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++ ++#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags ++ ++static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); ++ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = ___bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(value, uvalue, value_size) != 0) ++ goto free_value; ++ ++ err = bpf_map_update_value(map, f, key, value, attr->flags); ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_DELETE_ELEM_LAST_FIELD key ++ ++static int map_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ void *key; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ goto out; ++ } else if (IS_FD_PROG_ARRAY(map) || ++ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { ++ /* These maps require sleepable context */ ++ err = map->ops->map_delete_elem(map, key); ++ goto out; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++out: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key ++ ++static int map_get_next_key(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *unext_key = u64_to_user_ptr(attr->next_key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *next_key; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (ukey) { ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto 
err_put; ++ } ++ } else { ++ key = NULL; ++ } ++ ++ err = -ENOMEM; ++ next_key = kvmalloc(map->key_size, GFP_USER); ++ if (!next_key) ++ goto free_key; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_get_next_key(map, key, next_key); ++ goto out; ++ } ++ ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, key, next_key); ++ rcu_read_unlock(); ++out: ++ if (err) ++ goto free_next_key; ++ ++ err = -EFAULT; ++ if (copy_to_user(unext_key, next_key, map->key_size) != 0) ++ goto free_next_key; ++ ++ err = 0; ++ ++free_next_key: ++ kvfree(next_key); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++int generic_map_delete_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 cp, max_count; ++ int err = 0; ++ void *key; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size)) ++ break; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ break; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ if (err) ++ break; ++ cond_resched(); ++ } ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(key); ++ ++ maybe_wait_bpf_programs(map); ++ return err; ++} ++ ++int generic_map_update_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *values = u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 value_size, cp, max_count; ++ int ufd = attr->batch.map_fd; ++ void *key, *value; ++ struct fd f; ++ int err = 0; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) { ++ kvfree(key); ++ return -ENOMEM; ++ } ++ ++ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size) || ++ copy_from_user(value, values + cp * value_size, value_size)) ++ break; ++ ++ err = bpf_map_update_value(map, f, key, value, ++ attr->batch.elem_flags); ++ ++ if (err) ++ break; ++ cond_resched(); ++ } ++ ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(value); ++ kvfree(key); ++ fdput(f); ++ return err; ++} ++ ++#define MAP_LOOKUP_RETRIES 3 ++ ++int generic_map_lookup_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); ++ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); ++ void __user *values 
= u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ void *buf, *buf_prevkey, *prev_key, *key, *value; ++ int err, retry = MAP_LOOKUP_RETRIES; ++ u32 value_size, cp, max_count; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) ++ return -EINVAL; ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ if (put_user(0, &uattr->batch.count)) ++ return -EFAULT; ++ ++ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!buf_prevkey) ++ return -ENOMEM; ++ ++ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); ++ if (!buf) { ++ kvfree(buf_prevkey); ++ return -ENOMEM; ++ } ++ ++ err = -EFAULT; ++ prev_key = NULL; ++ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) ++ goto free_buf; ++ key = buf; ++ value = key + map->key_size; ++ if (ubatch) ++ prev_key = buf_prevkey; ++ ++ for (cp = 0; cp < max_count;) { ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, prev_key, key); ++ rcu_read_unlock(); ++ if (err) ++ break; ++ err = bpf_map_copy_value(map, key, value, ++ attr->batch.elem_flags); ++ ++ if (err == -ENOENT) { ++ if (retry) { ++ retry--; ++ continue; ++ } ++ err = -EINTR; ++ break; ++ } ++ ++ if (err) ++ goto free_buf; ++ ++ if (copy_to_user(keys + cp * map->key_size, key, ++ map->key_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ if (copy_to_user(values + cp * value_size, value, value_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ ++ if (!prev_key) ++ prev_key = buf_prevkey; ++ ++ swap(prev_key, key); ++ retry = MAP_LOOKUP_RETRIES; ++ cp++; ++ cond_resched(); ++ } ++ ++ if (err == -EFAULT) ++ goto free_buf; ++ ++ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || ++ (cp && copy_to_user(uobatch, prev_key, map->key_size)))) ++ err = -EFAULT; ++ ++free_buf: ++ kvfree(buf_prevkey); ++ kvfree(buf); ++ return err; ++} ++ ++#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags ++ ++static int map_lookup_and_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || ++ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (attr->flags && ++ (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -ENOTSUPP; ++ if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK) { ++ err = map->ops->map_pop_elem(map, value); ++ } else if (map->map_type == BPF_MAP_TYPE_HASH || ++ 
map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ if (!bpf_map_is_dev_bound(map)) { ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ } ++ } ++ ++ if (err) ++ goto free_value; ++ ++ if (copy_to_user(uvalue, value, value_size) != 0) { ++ err = -EFAULT; ++ goto free_value; ++ } ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_FREEZE_LAST_FIELD map_fd ++ ++static int map_freeze(const union bpf_attr *attr) ++{ ++ int err = 0, ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_FREEZE)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || ++ map_value_has_timer(map) || map_value_has_kptrs(map)) { ++ fdput(f); ++ return -ENOTSUPP; ++ } ++ ++ mutex_lock(&map->freeze_mutex); ++ if (bpf_map_write_active(map)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (READ_ONCE(map->frozen)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (!bpf_capable()) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ WRITE_ONCE(map->frozen, true); ++err_put: ++ mutex_unlock(&map->freeze_mutex); ++ fdput(f); ++ return err; ++} ++ ++static const struct bpf_prog_ops * const bpf_prog_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ ++ [_id] = & _name ## _prog_ops, ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) ++#include <linux/bpf_types.h> ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) ++{ ++ const struct bpf_prog_ops *ops; ++ ++ if (type >= ARRAY_SIZE(bpf_prog_types)) ++ return -EINVAL; ++ type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); ++ ops = bpf_prog_types[type]; ++ if (!ops) ++ return -EINVAL; ++ ++ if (!bpf_prog_is_dev_bound(prog->aux)) ++ prog->aux->ops = ops; ++ else ++ prog->aux->ops = &bpf_offload_prog_ops; ++ prog->type = type; ++ return 0; ++} ++ ++enum bpf_audit { ++ BPF_AUDIT_LOAD, ++ BPF_AUDIT_UNLOAD, ++ BPF_AUDIT_MAX, ++}; ++ ++static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { ++ [BPF_AUDIT_LOAD] = "LOAD", ++ [BPF_AUDIT_UNLOAD] = "UNLOAD", ++}; ++ ++static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) ++{ ++ struct audit_context *ctx = NULL; ++ struct audit_buffer *ab; ++ ++ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) ++ return; ++ if (audit_enabled == AUDIT_OFF) ++ return; ++ if (op == BPF_AUDIT_LOAD) ++ ctx = audit_context(); ++ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); ++ if (unlikely(!ab)) ++ return; ++ audit_log_format(ab, "prog-id=%u op=%s", ++ prog->aux->id, bpf_audit_str[op]); ++ audit_log_end(ab); ++} ++ ++static int bpf_prog_alloc_id(struct bpf_prog *prog) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&prog_idr_lock); ++ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); ++ if (id > 0) ++ prog->aux->id = id; ++ spin_unlock_bh(&prog_idr_lock); ++ idr_preload_end(); ++ ++ /* id is in [1, INT_MAX) */ ++ if (WARN_ON_ONCE(!id)) ++ return -ENOSPC; ++ ++ return id > 0 ?
0 : id; ++} ++ ++void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ unsigned long flags; ++ ++ /* cBPF to eBPF migrations are currently not in the idr store. ++ * Offloaded programs are removed from the store when their device ++ * disappears - even if someone grabs an fd to them they are unusable, ++ * simply waiting for refcnt to drop to be freed. ++ */ ++ if (!prog->aux->id) ++ return; ++ ++ if (do_idr_lock) ++ spin_lock_irqsave(&prog_idr_lock, flags); ++ else ++ __acquire(&prog_idr_lock); ++ ++ idr_remove(&prog_idr, prog->aux->id); ++ prog->aux->id = 0; ++ ++ if (do_idr_lock) ++ spin_unlock_irqrestore(&prog_idr_lock, flags); ++ else ++ __release(&prog_idr_lock); ++} ++ ++static void __bpf_prog_put_rcu(struct rcu_head *rcu) ++{ ++ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); ++ ++ kvfree(aux->func_info); ++ kfree(aux->func_info_aux); ++ free_uid(aux->user); ++ security_bpf_prog_free(aux); ++ bpf_prog_free(aux->prog); ++} ++ ++static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) ++{ ++ bpf_prog_kallsyms_del_all(prog); ++ btf_put(prog->aux->btf); ++ kvfree(prog->aux->jited_linfo); ++ kvfree(prog->aux->linfo); ++ kfree(prog->aux->kfunc_tab); ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ ++ if (deferred) { ++ if (prog->aux->sleepable) ++ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); ++ else ++ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); ++ } else { ++ __bpf_prog_put_rcu(&prog->aux->rcu); ++ } ++} ++ ++static void bpf_prog_put_deferred(struct work_struct *work) ++{ ++ struct bpf_prog_aux *aux; ++ struct bpf_prog *prog; ++ ++ aux = container_of(work, struct bpf_prog_aux, work); ++ prog = aux->prog; ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); ++ __bpf_prog_put_noref(prog, true); ++} ++ ++static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ struct bpf_prog_aux *aux = prog->aux; ++ ++ if (atomic64_dec_and_test(&aux->refcnt)) { ++ /* bpf_prog_free_id() must be called first */ ++ bpf_prog_free_id(prog, do_idr_lock); ++ ++ if (in_irq() || irqs_disabled()) { ++ INIT_WORK(&aux->work, bpf_prog_put_deferred); ++ schedule_work(&aux->work); ++ } else { ++ bpf_prog_put_deferred(&aux->work); ++ } ++ } ++} ++ ++void bpf_prog_put(struct bpf_prog *prog) ++{ ++ __bpf_prog_put(prog, true); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_put); ++ ++static int bpf_prog_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_prog *prog = filp->private_data; ++ ++ bpf_prog_put(prog); ++ return 0; ++} ++ ++struct bpf_prog_kstats { ++ u64 nsecs; ++ u64 cnt; ++ u64 misses; ++}; ++ ++void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) ++{ ++ struct bpf_prog_stats *stats; ++ unsigned int flags; ++ ++ stats = this_cpu_ptr(prog->stats); ++ flags = u64_stats_update_begin_irqsave(&stats->syncp); ++ u64_stats_inc(&stats->misses); ++ u64_stats_update_end_irqrestore(&stats->syncp, flags); ++} ++ ++static void bpf_prog_get_stats(const struct bpf_prog *prog, ++ struct bpf_prog_kstats *stats) ++{ ++ u64 nsecs = 0, cnt = 0, misses = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ const struct bpf_prog_stats *st; ++ unsigned int start; ++ u64 tnsecs, tcnt, tmisses; ++ ++ st = per_cpu_ptr(prog->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&st->syncp); ++ tnsecs = u64_stats_read(&st->nsecs); ++ tcnt = u64_stats_read(&st->cnt); ++ tmisses = u64_stats_read(&st->misses); ++ } while (u64_stats_fetch_retry_irq(&st->syncp, start)); 
++ nsecs += tnsecs; ++ cnt += tcnt; ++ misses += tmisses; ++ } ++ stats->nsecs = nsecs; ++ stats->cnt = cnt; ++ stats->misses = misses; ++} ++ ++#ifdef CONFIG_PROC_FS ++static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_prog *prog = filp->private_data; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ struct bpf_prog_kstats stats; ++ ++ bpf_prog_get_stats(prog, &stats); ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "prog_type:\t%u\n" ++ "prog_jited:\t%u\n" ++ "prog_tag:\t%s\n" ++ "memlock:\t%llu\n" ++ "prog_id:\t%u\n" ++ "run_time_ns:\t%llu\n" ++ "run_cnt:\t%llu\n" ++ "recursion_misses:\t%llu\n" ++ "verified_insns:\t%u\n", ++ prog->type, ++ prog->jited, ++ prog_tag, ++ prog->pages * 1ULL << PAGE_SHIFT, ++ prog->aux->id, ++ stats.nsecs, ++ stats.cnt, ++ stats.misses, ++ prog->aux->verified_insns); ++} ++#endif ++ ++const struct file_operations bpf_prog_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_prog_show_fdinfo, ++#endif ++ .release = bpf_prog_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++int bpf_prog_new_fd(struct bpf_prog *prog) ++{ ++ int ret; ++ ++ ret = security_bpf_prog(prog); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, ++ O_RDWR | O_CLOEXEC); ++} ++ ++static struct bpf_prog *____bpf_prog_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_prog_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_prog_add(struct bpf_prog *prog, int i) ++{ ++ atomic64_add(i, &prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_add); ++ ++void bpf_prog_sub(struct bpf_prog *prog, int i) ++{ ++ /* Only to be used for undoing previous bpf_prog_add() in some ++ * error path. We still know that another entity in our call ++ * path holds a reference to the program, thus atomic_sub() can ++ * be safely used in such cases! 
++ */ ++ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_sub); ++ ++void bpf_prog_inc(struct bpf_prog *prog) ++{ ++ atomic64_inc(&prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc); ++ ++/* prog_idr_lock should have been held */ ++struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); ++ ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ ++ return prog; ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); ++ ++bool bpf_prog_get_ok(struct bpf_prog *prog, ++ enum bpf_prog_type *attach_type, bool attach_drv) ++{ ++ /* not an attachment, just a refcount inc, always allow */ ++ if (!attach_type) ++ return true; ++ ++ if (prog->type != *attach_type) ++ return false; ++ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) ++ return false; ++ ++ return true; ++} ++ ++static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, ++ bool attach_drv) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_prog *prog; ++ ++ prog = ____bpf_prog_get(f); ++ if (IS_ERR(prog)) ++ return prog; ++ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { ++ prog = ERR_PTR(-EINVAL); ++ goto out; ++ } ++ ++ bpf_prog_inc(prog); ++out: ++ fdput(f); ++ return prog; ++} ++ ++struct bpf_prog *bpf_prog_get(u32 ufd) ++{ ++ return __bpf_prog_get(ufd, NULL, false); ++} ++ ++struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, ++ bool attach_drv) ++{ ++ return __bpf_prog_get(ufd, &type, attach_drv); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); ++ ++/* Initially all BPF programs could be loaded w/o specifying ++ * expected_attach_type. Later for some of them specifying expected_attach_type ++ * at load time became required so that program could be validated properly. ++ * Programs of types that are allowed to be loaded both w/ and w/o (for ++ * backward compatibility) expected_attach_type, should have the default attach ++ * type assigned to expected_attach_type for the latter case, so that it can be ++ * validated later at attach time. ++ * ++ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if ++ * prog type requires it but has some attach types that have to be backward ++ * compatible. ++ */ ++static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) ++{ ++ switch (attr->prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't ++ * exist so checking for non-zero is the way to go here. 
++ */ ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_CGROUP_INET_SOCK_CREATE; ++ break; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_SK_REUSEPORT_SELECT; ++ break; ++ } ++} ++ ++static int ++bpf_prog_load_check_attach(enum bpf_prog_type prog_type, ++ enum bpf_attach_type expected_attach_type, ++ struct btf *attach_btf, u32 btf_id, ++ struct bpf_prog *dst_prog) ++{ ++ if (btf_id) { ++ if (btf_id > BTF_MAX_TYPE) ++ return -EINVAL; ++ ++ if (!attach_btf && !dst_prog) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: ++ case BPF_PROG_TYPE_EXT: ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (attach_btf && (!btf_id || dst_prog)) ++ return -EINVAL; ++ ++ if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && ++ prog_type != BPF_PROG_TYPE_EXT) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_CGROUP_GETSOCKOPT: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ if (expected_attach_type == BPF_SK_LOOKUP) ++ return 0; ++ return -EINVAL; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ switch (expected_attach_type) { ++ case BPF_SK_REUSEPORT_SELECT: ++ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SYSCALL: ++ case BPF_PROG_TYPE_EXT: ++ if (expected_attach_type) ++ return -EINVAL; ++ fallthrough; ++ default: ++ return 0; ++ } ++} ++ ++static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_SCHED_CLS: ++ case BPF_PROG_TYPE_SCHED_ACT: ++ case BPF_PROG_TYPE_XDP: ++ case BPF_PROG_TYPE_LWT_IN: ++ case BPF_PROG_TYPE_LWT_OUT: ++ case BPF_PROG_TYPE_LWT_XMIT: ++ case BPF_PROG_TYPE_LWT_SEG6LOCAL: ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ /* always unpriv */ ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ ++ default: ++ return false; ++ } ++} ++ ++static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_KPROBE: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size ++ ++static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type type = attr->prog_type; ++ struct bpf_prog *prog, *dst_prog = NULL; ++ struct btf *attach_btf = NULL; ++ int err; ++ char license[128]; ++ bool is_gpl; ++ ++ if (CHECK_ATTR(BPF_PROG_LOAD)) ++ return -EINVAL; ++ ++ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | ++ BPF_F_ANY_ALIGNMENT | ++ BPF_F_TEST_STATE_FREQ | ++ BPF_F_SLEEPABLE | ++ BPF_F_TEST_RND_HI32 | ++ BPF_F_XDP_HAS_FRAGS)) ++ return -EINVAL; ++ ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && ++ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && ++ !bpf_capable()) ++ return -EPERM; ++ ++ /* copy eBPF program license from user space */ ++ if (strncpy_from_bpfptr(license, ++ make_bpfptr(attr->license, uattr.is_kernel), ++ sizeof(license) - 1) < 0) ++ return -EFAULT; ++ license[sizeof(license) - 1] = 0; ++ ++ /* eBPF programs must be GPL compatible to use GPL-ed functions */ ++ is_gpl = license_is_gpl_compatible(license); ++ ++ if (attr->insn_cnt == 0 || ++ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) ++ return -E2BIG; ++ if (type != BPF_PROG_TYPE_SOCKET_FILTER && ++ type != BPF_PROG_TYPE_CGROUP_SKB && ++ !bpf_capable()) ++ return -EPERM; ++ ++ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ if (is_perfmon_prog_type(type) && !perfmon_capable()) ++ return -EPERM; ++ ++ /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog ++ * or btf, we need to check which one it is ++ */ ++ if (attr->attach_prog_fd) { ++ dst_prog = bpf_prog_get(attr->attach_prog_fd); ++ if (IS_ERR(dst_prog)) { ++ dst_prog = NULL; ++ attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); ++ if (IS_ERR(attach_btf)) ++ return -EINVAL; ++ if (!btf_is_kernel(attach_btf)) { ++ /* attaching through specifying bpf_prog's BTF ++ * objects directly might be supported eventually ++ */ ++ btf_put(attach_btf); ++ return -ENOTSUPP; ++ } ++ } ++ } else if (attr->attach_btf_id) { ++ /* fall back to vmlinux BTF, if BTF type ID is specified */ ++ attach_btf = bpf_get_btf_vmlinux(); ++ if (IS_ERR(attach_btf)) ++ return PTR_ERR(attach_btf); ++ if (!attach_btf) ++ return -EINVAL; ++ btf_get(attach_btf); ++ } ++ ++ bpf_prog_load_fixup_attach_type(attr); ++ if (bpf_prog_load_check_attach(type, attr->expected_attach_type, ++ attach_btf, attr->attach_btf_id, ++ dst_prog)) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -EINVAL; ++ } ++ ++ /* plain bpf_prog allocation */ ++ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); ++ if (!prog) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -ENOMEM; ++ } ++ ++ prog->expected_attach_type = attr->expected_attach_type; ++ prog->aux->attach_btf = attach_btf; ++ 
prog->aux->attach_btf_id = attr->attach_btf_id; ++ prog->aux->dst_prog = dst_prog; ++ prog->aux->offload_requested = !!attr->prog_ifindex; ++ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; ++ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; ++ ++ err = security_bpf_prog_alloc(prog->aux); ++ if (err) ++ goto free_prog; ++ ++ prog->aux->user = get_current_user(); ++ prog->len = attr->insn_cnt; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(prog->insns, ++ make_bpfptr(attr->insns, uattr.is_kernel), ++ bpf_prog_insn_size(prog)) != 0) ++ goto free_prog_sec; ++ ++ prog->orig_prog = NULL; ++ prog->jited = 0; ++ ++ atomic64_set(&prog->aux->refcnt, 1); ++ prog->gpl_compatible = is_gpl ? 1 : 0; ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_init(prog, attr); ++ if (err) ++ goto free_prog_sec; ++ } ++ ++ /* find program type: socket_filter vs tracing_filter */ ++ err = find_prog_type(type, prog); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ prog->aux->load_time = ktime_get_boottime_ns(); ++ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, ++ sizeof(attr->prog_name)); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ /* run eBPF verifier */ ++ err = bpf_check(&prog, attr, uattr); ++ if (err < 0) ++ goto free_used_maps; ++ ++ prog = bpf_prog_select_runtime(prog, &err); ++ if (err < 0) ++ goto free_used_maps; ++ ++ err = bpf_prog_alloc_id(prog); ++ if (err) ++ goto free_used_maps; ++ ++ /* Upon success of bpf_prog_alloc_id(), the BPF prog is ++ * effectively publicly exposed. However, retrieving via ++ * bpf_prog_get_fd_by_id() will take another reference, ++ * therefore it cannot be gone underneath us. ++ * ++ * Only for the time /after/ successful bpf_prog_new_fd() ++ * and before returning to userspace, we might just hold ++ * one reference and any parallel close on that fd could ++ * rip everything out. Hence, below notifications must ++ * happen before bpf_prog_new_fd(). ++ * ++ * Also, any failure handling from this point onwards must ++ * be using bpf_prog_put() given the program is exposed. ++ */ ++ bpf_prog_kallsyms_add(prog); ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_LOAD); ++ ++ err = bpf_prog_new_fd(prog); ++ if (err < 0) ++ bpf_prog_put(prog); ++ return err; ++ ++free_used_maps: ++ /* In case we have subprogs, we need to wait for a grace ++ * period before we can tear down JIT memory since symbols ++ * are already exposed under kallsyms. 
++ */ ++ __bpf_prog_put_noref(prog, prog->aux->func_cnt); ++ return err; ++free_prog_sec: ++ free_uid(prog->aux->user); ++ security_bpf_prog_free(prog->aux); ++free_prog: ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ bpf_prog_free(prog); ++ return err; ++} ++ ++#define BPF_OBJ_LAST_FIELD file_flags ++ ++static int bpf_obj_pin(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) ++ return -EINVAL; ++ ++ return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); ++} ++ ++static int bpf_obj_get(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || ++ attr->file_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), ++ attr->file_flags); ++} ++ ++void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, ++ const struct bpf_link_ops *ops, struct bpf_prog *prog) ++{ ++ atomic64_set(&link->refcnt, 1); ++ link->type = type; ++ link->id = 0; ++ link->ops = ops; ++ link->prog = prog; ++} ++ ++static void bpf_link_free_id(int id) ++{ ++ if (!id) ++ return; ++ ++ spin_lock_bh(&link_idr_lock); ++ idr_remove(&link_idr, id); ++ spin_unlock_bh(&link_idr_lock); ++} ++ ++/* Clean up bpf_link and corresponding anon_inode file and FD. After ++ * anon_inode is created, bpf_link can't be just kfree()'d due to deferred ++ * anon_inode's release() call. This helper marks bpf_link as ++ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt ++ * is not decremented, it's the responsibility of a calling code that failed ++ * to complete bpf_link initialization. ++ */ ++void bpf_link_cleanup(struct bpf_link_primer *primer) ++{ ++ primer->link->prog = NULL; ++ bpf_link_free_id(primer->id); ++ fput(primer->file); ++ put_unused_fd(primer->fd); ++} ++ ++void bpf_link_inc(struct bpf_link *link) ++{ ++ atomic64_inc(&link->refcnt); ++} ++ ++/* bpf_link_free is guaranteed to be called from process context */ ++static void bpf_link_free(struct bpf_link *link) ++{ ++ bpf_link_free_id(link->id); ++ if (link->prog) { ++ /* detach BPF program, clean up used resources */ ++ link->ops->release(link); ++ bpf_prog_put(link->prog); ++ } ++ /* free bpf_link and its containing memory */ ++ link->ops->dealloc(link); ++} ++ ++static void bpf_link_put_deferred(struct work_struct *work) ++{ ++ struct bpf_link *link = container_of(work, struct bpf_link, work); ++ ++ bpf_link_free(link); ++} ++ ++/* bpf_link_put can be called from atomic context, but ensures that resources ++ * are freed from process context ++ */ ++void bpf_link_put(struct bpf_link *link) ++{ ++ if (!atomic64_dec_and_test(&link->refcnt)) ++ return; ++ ++ if (in_atomic()) { ++ INIT_WORK(&link->work, bpf_link_put_deferred); ++ schedule_work(&link->work); ++ } else { ++ bpf_link_free(link); ++ } ++} ++EXPORT_SYMBOL(bpf_link_put); ++ ++static int bpf_link_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_link *link = filp->private_data; ++ ++ bpf_link_put(link); ++ return 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, ++static const char *bpf_link_type_strs[] = { ++ [BPF_LINK_TYPE_UNSPEC] = "", ++#include <linux/bpf_types.h> ++}; ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++ ++static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_link *link = filp->private_data; ++ const struct bpf_prog *prog =
link->prog; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "link_type:\t%s\n" ++ "link_id:\t%u\n" ++ "prog_tag:\t%s\n" ++ "prog_id:\t%u\n", ++ bpf_link_type_strs[link->type], ++ link->id, ++ prog_tag, ++ prog->aux->id); ++ if (link->ops->show_fdinfo) ++ link->ops->show_fdinfo(link, m); ++} ++#endif ++ ++static const struct file_operations bpf_link_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_link_show_fdinfo, ++#endif ++ .release = bpf_link_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++static int bpf_link_alloc_id(struct bpf_link *link) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&link_idr_lock); ++ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); ++ spin_unlock_bh(&link_idr_lock); ++ idr_preload_end(); ++ ++ return id; ++} ++ ++/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, ++ * reserving unused FD and allocating ID from link_idr. This is to be paired ++ * with bpf_link_settle() to install FD and ID and expose bpf_link to ++ * user-space, if bpf_link is successfully attached. If not, bpf_link and ++ * pre-allocated resources are to be freed with bpf_cleanup() call. All the ++ * transient state is passed around in struct bpf_link_primer. ++ * This is preferred way to create and initialize bpf_link, especially when ++ * there are complicated and expensive operations in between creating bpf_link ++ * itself and attaching it to BPF hook. By using bpf_link_prime() and ++ * bpf_link_settle() kernel code using bpf_link doesn't have to perform ++ * expensive (and potentially failing) roll back operations in a rare case ++ * that file, FD, or ID can't be allocated. ++ */ ++int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) ++{ ++ struct file *file; ++ int fd, id; ++ ++ fd = get_unused_fd_flags(O_CLOEXEC); ++ if (fd < 0) ++ return fd; ++ ++ ++ id = bpf_link_alloc_id(link); ++ if (id < 0) { ++ put_unused_fd(fd); ++ return id; ++ } ++ ++ file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); ++ if (IS_ERR(file)) { ++ bpf_link_free_id(id); ++ put_unused_fd(fd); ++ return PTR_ERR(file); ++ } ++ ++ primer->link = link; ++ primer->file = file; ++ primer->fd = fd; ++ primer->id = id; ++ return 0; ++} ++ ++int bpf_link_settle(struct bpf_link_primer *primer) ++{ ++ /* make bpf_link fetchable by ID */ ++ spin_lock_bh(&link_idr_lock); ++ primer->link->id = primer->id; ++ spin_unlock_bh(&link_idr_lock); ++ /* make bpf_link fetchable by FD */ ++ fd_install(primer->fd, primer->file); ++ /* pass through installed FD */ ++ return primer->fd; ++} ++ ++int bpf_link_new_fd(struct bpf_link *link) ++{ ++ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); ++} ++ ++struct bpf_link *bpf_link_get_from_fd(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_link *link; ++ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_link_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ link = f.file->private_data; ++ bpf_link_inc(link); ++ fdput(f); ++ ++ return link; ++} ++EXPORT_SYMBOL(bpf_link_get_from_fd); ++ ++static void bpf_tracing_link_release(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, ++ tr_link->trampoline)); ++ ++ bpf_trampoline_put(tr_link->trampoline); ++ ++ /* tgt_prog is NULL if target is a kernel function */ 
++ if (tr_link->tgt_prog) ++ bpf_prog_put(tr_link->tgt_prog); ++} ++ ++static void bpf_tracing_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ kfree(tr_link); ++} ++ ++static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ seq_printf(seq, ++ "attach_type:\t%d\n", ++ tr_link->attach_type); ++} ++ ++static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ info->tracing.attach_type = tr_link->attach_type; ++ bpf_trampoline_unpack_key(tr_link->trampoline->key, ++ &info->tracing.target_obj_id, ++ &info->tracing.target_btf_id); ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_tracing_link_lops = { ++ .release = bpf_tracing_link_release, ++ .dealloc = bpf_tracing_link_dealloc, ++ .show_fdinfo = bpf_tracing_link_show_fdinfo, ++ .fill_link_info = bpf_tracing_link_fill_link_info, ++}; ++ ++static int bpf_tracing_prog_attach(struct bpf_prog *prog, ++ int tgt_prog_fd, ++ u32 btf_id, ++ u64 bpf_cookie) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_prog *tgt_prog = NULL; ++ struct bpf_trampoline *tr = NULL; ++ struct bpf_tracing_link *link; ++ u64 key = 0; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ if (prog->expected_attach_type != BPF_TRACE_FENTRY && ++ prog->expected_attach_type != BPF_TRACE_FEXIT && ++ prog->expected_attach_type != BPF_MODIFY_RETURN) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_EXT: ++ if (prog->expected_attach_type != 0) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_LSM: ++ if (prog->expected_attach_type != BPF_LSM_MAC) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ default: ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (!!tgt_prog_fd != !!btf_id) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (tgt_prog_fd) { ++ /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ ++ if (prog->type != BPF_PROG_TYPE_EXT) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ tgt_prog = bpf_prog_get(tgt_prog_fd); ++ if (IS_ERR(tgt_prog)) { ++ err = PTR_ERR(tgt_prog); ++ tgt_prog = NULL; ++ goto out_put_prog; ++ } ++ ++ key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_prog; ++ } ++ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, ++ &bpf_tracing_link_lops, prog); ++ link->attach_type = prog->expected_attach_type; ++ link->link.cookie = bpf_cookie; ++ ++ mutex_lock(&prog->aux->dst_mutex); ++ ++ /* There are a few possible cases here: ++ * ++ * - if prog->aux->dst_trampoline is set, the program was just loaded ++ * and not yet attached to anything, so we can use the values stored ++ * in prog->aux ++ * ++ * - if prog->aux->dst_trampoline is NULL, the program has already been ++ * attached to a target and its initial target was cleared (below) ++ * ++ * - if tgt_prog != NULL, the caller specified tgt_prog_fd + ++ * target_btf_id using the link_create API. 
++ * ++ * - if tgt_prog == NULL when this function was called using the old ++ * raw_tracepoint_open API, and we need a target from prog->aux ++ * ++ * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program ++ * was detached and is going for re-attachment. ++ */ ++ if (!prog->aux->dst_trampoline && !tgt_prog) { ++ /* ++ * Allow re-attach for TRACING and LSM programs. If it's ++ * currently linked, bpf_trampoline_link_prog will fail. ++ * EXT programs need to specify tgt_prog_fd, so they ++ * re-attach in separate code path. ++ */ ++ if (prog->type != BPF_PROG_TYPE_TRACING && ++ prog->type != BPF_PROG_TYPE_LSM) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ btf_id = prog->aux->attach_btf_id; ++ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); ++ } ++ ++ if (!prog->aux->dst_trampoline || ++ (key && key != prog->aux->dst_trampoline->key)) { ++ /* If there is no saved target, or the specified target is ++ * different from the destination specified at load time, we ++ * need a new trampoline and a check for compatibility ++ */ ++ struct bpf_attach_target_info tgt_info = {}; ++ ++ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, ++ &tgt_info); ++ if (err) ++ goto out_unlock; ++ ++ tr = bpf_trampoline_get(key, &tgt_info); ++ if (!tr) { ++ err = -ENOMEM; ++ goto out_unlock; ++ } ++ } else { ++ /* The caller didn't specify a target, or the target was the ++ * same as the destination supplied during program load. This ++ * means we can reuse the trampoline and reference from program ++ * load time, and there is no need to allocate a new one. This ++ * can only happen once for any program, as the saved values in ++ * prog->aux are cleared below. ++ */ ++ tr = prog->aux->dst_trampoline; ++ tgt_prog = prog->aux->dst_prog; ++ } ++ ++ err = bpf_link_prime(&link->link.link, &link_primer); ++ if (err) ++ goto out_unlock; ++ ++ err = bpf_trampoline_link_prog(&link->link, tr); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ link = NULL; ++ goto out_unlock; ++ } ++ ++ link->tgt_prog = tgt_prog; ++ link->trampoline = tr; ++ ++ /* Always clear the trampoline and target prog from prog->aux to make ++ * sure the original attach destination is not kept alive after a ++ * program is (re-)attached to another target. 
++ */ ++ if (prog->aux->dst_prog && ++ (tgt_prog_fd || tr != prog->aux->dst_trampoline)) ++ /* got extra prog ref from syscall, or attaching to different prog */ ++ bpf_prog_put(prog->aux->dst_prog); ++ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) ++ /* we allocated a new trampoline, so free the old one */ ++ bpf_trampoline_put(prog->aux->dst_trampoline); ++ ++ prog->aux->dst_prog = NULL; ++ prog->aux->dst_trampoline = NULL; ++ mutex_unlock(&prog->aux->dst_mutex); ++ ++ return bpf_link_settle(&link_primer); ++out_unlock: ++ if (tr && tr != prog->aux->dst_trampoline) ++ bpf_trampoline_put(tr); ++ mutex_unlock(&prog->aux->dst_mutex); ++ kfree(link); ++out_put_prog: ++ if (tgt_prog_fd && tgt_prog) ++ bpf_prog_put(tgt_prog); ++ return err; ++} ++ ++struct bpf_raw_tp_link { ++ struct bpf_link link; ++ struct bpf_raw_event_map *btp; ++}; ++ ++static void bpf_raw_tp_link_release(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); ++ bpf_put_raw_tracepoint(raw_tp->btp); ++} ++ ++static void bpf_raw_tp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ kfree(raw_tp); ++} ++ ++static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ seq_printf(seq, ++ "tp_name:\t%s\n", ++ raw_tp_link->btp->tp->name); ++} ++ ++static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); ++ const char *tp_name = raw_tp_link->btp->tp->name; ++ u32 ulen = info->raw_tracepoint.tp_name_len; ++ size_t tp_len = strlen(tp_name); ++ ++ if (!ulen ^ !ubuf) ++ return -EINVAL; ++ ++ info->raw_tracepoint.tp_name_len = tp_len + 1; ++ ++ if (!ubuf) ++ return 0; ++ ++ if (ulen >= tp_len + 1) { ++ if (copy_to_user(ubuf, tp_name, tp_len + 1)) ++ return -EFAULT; ++ } else { ++ char zero = '\0'; ++ ++ if (copy_to_user(ubuf, tp_name, ulen - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + ulen - 1)) ++ return -EFAULT; ++ return -ENOSPC; ++ } ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_raw_tp_link_lops = { ++ .release = bpf_raw_tp_link_release, ++ .dealloc = bpf_raw_tp_link_dealloc, ++ .show_fdinfo = bpf_raw_tp_link_show_fdinfo, ++ .fill_link_info = bpf_raw_tp_link_fill_link_info, ++}; ++ ++#ifdef CONFIG_PERF_EVENTS ++struct bpf_perf_link { ++ struct bpf_link link; ++ struct file *perf_file; ++}; ++ ++static void bpf_perf_link_release(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ struct perf_event *event = perf_link->perf_file->private_data; ++ ++ perf_event_free_bpf_prog(event); ++ fput(perf_link->perf_file); ++} ++ ++static void bpf_perf_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ ++ kfree(perf_link); ++} ++ ++static const struct bpf_link_ops bpf_perf_link_lops = { ++ .release = bpf_perf_link_release, ++ .dealloc = bpf_perf_link_dealloc, ++}; ++ ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_perf_link 
*link; ++ struct perf_event *event; ++ struct file *perf_file; ++ int err; ++ ++ if (attr->link_create.flags) ++ return -EINVAL; ++ ++ perf_file = perf_event_get(attr->link_create.target_fd); ++ if (IS_ERR(perf_file)) ++ return PTR_ERR(perf_file); ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_file; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); ++ link->perf_file = perf_file; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_file; ++ } ++ ++ event = perf_file->private_data; ++ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_file; ++ } ++ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ ++ bpf_prog_inc(prog); ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_file: ++ fput(perf_file); ++ return err; ++} ++#else ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ return -EOPNOTSUPP; ++} ++#endif /* CONFIG_PERF_EVENTS */ ++ ++static int bpf_raw_tp_link_attach(struct bpf_prog *prog, ++ const char __user *user_tp_name) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_raw_tp_link *link; ++ struct bpf_raw_event_map *btp; ++ const char *tp_name; ++ char buf[128]; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_EXT: ++ case BPF_PROG_TYPE_LSM: ++ if (user_tp_name) ++ /* The attach point for this category of programs ++ * should be specified via btf_id during program load. ++ */ ++ return -EINVAL; ++ if (prog->type == BPF_PROG_TYPE_TRACING && ++ prog->expected_attach_type == BPF_TRACE_RAW_TP) { ++ tp_name = prog->aux->attach_func_name; ++ break; ++ } ++ return bpf_tracing_prog_attach(prog, 0, 0, 0); ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) ++ return -EFAULT; ++ buf[sizeof(buf) - 1] = 0; ++ tp_name = buf; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ btp = bpf_get_raw_tracepoint(tp_name); ++ if (!btp) ++ return -ENOENT; ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_btp; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, ++ &bpf_raw_tp_link_lops, prog); ++ link->btp = btp; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_btp; ++ } ++ ++ err = bpf_probe_register(link->btp, prog); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_btp; ++ } ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_btp: ++ bpf_put_raw_tracepoint(btp); ++ return err; ++} ++ ++#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd ++ ++static int bpf_raw_tracepoint_open(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ return fd; ++} ++ ++static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ++ enum bpf_attach_type attach_type) ++{ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case 
BPF_PROG_TYPE_SK_LOOKUP: ++ return attach_type == prog->expected_attach_type ? 0 : -EINVAL; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ if (!capable(CAP_NET_ADMIN)) ++ /* cg-skb progs can be loaded by unpriv user. ++ * check permissions at attach time. ++ */ ++ return -EPERM; ++ return prog->enforce_expected_attach_type && ++ prog->expected_attach_type != attach_type ? ++ -EINVAL : 0; ++ default: ++ return 0; ++ } ++} ++ ++static enum bpf_prog_type ++attach_type_to_prog_type(enum bpf_attach_type attach_type) ++{ ++ switch (attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return BPF_PROG_TYPE_CGROUP_SKB; ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return BPF_PROG_TYPE_CGROUP_SOCK; ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; ++ case BPF_CGROUP_SOCK_OPS: ++ return BPF_PROG_TYPE_SOCK_OPS; ++ case BPF_CGROUP_DEVICE: ++ return BPF_PROG_TYPE_CGROUP_DEVICE; ++ case BPF_SK_MSG_VERDICT: ++ return BPF_PROG_TYPE_SK_MSG; ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return BPF_PROG_TYPE_SK_SKB; ++ case BPF_LIRC_MODE2: ++ return BPF_PROG_TYPE_LIRC_MODE2; ++ case BPF_FLOW_DISSECTOR: ++ return BPF_PROG_TYPE_FLOW_DISSECTOR; ++ case BPF_CGROUP_SYSCTL: ++ return BPF_PROG_TYPE_CGROUP_SYSCTL; ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ return BPF_PROG_TYPE_CGROUP_SOCKOPT; ++ case BPF_TRACE_ITER: ++ case BPF_TRACE_RAW_TP: ++ case BPF_TRACE_FENTRY: ++ case BPF_TRACE_FEXIT: ++ case BPF_MODIFY_RETURN: ++ return BPF_PROG_TYPE_TRACING; ++ case BPF_LSM_MAC: ++ return BPF_PROG_TYPE_LSM; ++ case BPF_SK_LOOKUP: ++ return BPF_PROG_TYPE_SK_LOOKUP; ++ case BPF_XDP: ++ return BPF_PROG_TYPE_XDP; ++ case BPF_LSM_CGROUP: ++ return BPF_PROG_TYPE_LSM; ++ default: ++ return BPF_PROG_TYPE_UNSPEC; ++ } ++} ++ ++#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd ++ ++#define BPF_F_ATTACH_MASK \ ++ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) ++ ++static int bpf_prog_attach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_PROG_ATTACH)) ++ return -EINVAL; ++ ++ if (attr->attach_flags & ~BPF_F_ATTACH_MASK) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ ret = sock_map_get_from_fd(attr, prog); ++ break; ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ ret = lirc_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ ret = netns_bpf_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case 
BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ if (ptype == BPF_PROG_TYPE_LSM && ++ prog->expected_attach_type != BPF_LSM_CGROUP) ++ return -EINVAL; ++ ++ ret = cgroup_bpf_prog_attach(attr, ptype, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_PROG_DETACH_LAST_FIELD attach_type ++ ++static int bpf_prog_detach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ ++ if (CHECK_ATTR(BPF_PROG_DETACH)) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_SK_SKB: ++ return sock_map_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ return lirc_prog_detach(attr); ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ return netns_bpf_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ return cgroup_bpf_prog_detach(attr, ptype); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags ++ ++static int bpf_prog_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ if (CHECK_ATTR(BPF_PROG_QUERY)) ++ return -EINVAL; ++ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) ++ return -EINVAL; ++ ++ switch (attr->query.attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ case BPF_CGROUP_SOCK_OPS: ++ case BPF_CGROUP_DEVICE: ++ case BPF_CGROUP_SYSCTL: ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_LSM_CGROUP: ++ return cgroup_bpf_prog_query(attr, uattr); ++ case BPF_LIRC_MODE2: ++ return lirc_prog_query(attr, uattr); ++ case BPF_FLOW_DISSECTOR: ++ case BPF_SK_LOOKUP: ++ return netns_bpf_prog_query(attr, uattr); ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_MSG_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return sock_map_bpf_prog_query(attr, uattr); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size ++ ++static int bpf_prog_test_run(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog *prog; ++ int ret = -ENOTSUPP; ++ ++ if (CHECK_ATTR(BPF_PROG_TEST_RUN)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_in && !attr->test.ctx_in) || ++ (!attr->test.ctx_size_in && attr->test.ctx_in)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_out && !attr->test.ctx_out) || ++ (!attr->test.ctx_size_out && attr->test.ctx_out)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->test.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (prog->aux->ops->test_run) ++ 
ret = prog->aux->ops->test_run(prog, attr, uattr); ++ ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id ++ ++static int bpf_obj_get_next_id(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ struct idr *idr, ++ spinlock_t *lock) ++{ ++ u32 next_id = attr->start_id; ++ int err = 0; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ next_id++; ++ spin_lock_bh(lock); ++ if (!idr_get_next(idr, &next_id)) ++ err = -ENOENT; ++ spin_unlock_bh(lock); ++ ++ if (!err) ++ err = put_user(next_id, &uattr->next_id); ++ ++ return err; ++} ++ ++struct bpf_map *bpf_map_get_curr_or_next(u32 *id) ++{ ++ struct bpf_map *map; ++ ++ spin_lock_bh(&map_idr_lock); ++again: ++ map = idr_get_next(&map_idr, id); ++ if (map) { ++ map = __bpf_map_inc_not_zero(map, false); ++ if (IS_ERR(map)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++ ++struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) ++{ ++ struct bpf_prog *prog; ++ ++ spin_lock_bh(&prog_idr_lock); ++again: ++ prog = idr_get_next(&prog_idr, id); ++ if (prog) { ++ prog = bpf_prog_inc_not_zero(prog); ++ if (IS_ERR(prog)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&prog_idr_lock); ++ ++ return prog; ++} ++ ++#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id ++ ++struct bpf_prog *bpf_prog_by_id(u32 id) ++{ ++ struct bpf_prog *prog; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&prog_idr_lock); ++ prog = idr_find(&prog_idr, id); ++ if (prog) ++ prog = bpf_prog_inc_not_zero(prog); ++ else ++ prog = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&prog_idr_lock); ++ return prog; ++} ++ ++static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ u32 id = attr->prog_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ prog = bpf_prog_by_id(id); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_prog_new_fd(prog); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ ++ return fd; ++} ++ ++#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags ++ ++static int bpf_map_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_map *map; ++ u32 id = attr->map_id; ++ int f_flags; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || ++ attr->open_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ f_flags = bpf_get_file_flag(attr->open_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ spin_lock_bh(&map_idr_lock); ++ map = idr_find(&map_idr, id); ++ if (map) ++ map = __bpf_map_inc_not_zero(map, true); ++ else ++ map = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&map_idr_lock); ++ ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ fd = bpf_map_new_fd(map, f_flags); ++ if (fd < 0) ++ bpf_map_put_with_uref(map); ++ ++ return fd; ++} ++ ++static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, ++ unsigned long addr, u32 *off, ++ u32 *type) ++{ ++ const struct bpf_map *map; ++ int i; ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { ++ map = prog->aux->used_maps[i]; ++ if (map == (void *)addr) { ++ *type = BPF_PSEUDO_MAP_FD; ++ goto out; ++ } ++ if (!map->ops->map_direct_value_meta) ++ continue; ++ if (!map->ops->map_direct_value_meta(map, addr, off)) { ++ *type = BPF_PSEUDO_MAP_VALUE; ++ goto out; ++ } ++ } ++ map = NULL; ++ 
++out: ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return map; ++} ++ ++static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, ++ const struct cred *f_cred) ++{ ++ const struct bpf_map *map; ++ struct bpf_insn *insns; ++ u32 off, type; ++ u64 imm; ++ u8 code; ++ int i; ++ ++ insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), ++ GFP_USER); ++ if (!insns) ++ return insns; ++ ++ for (i = 0; i < prog->len; i++) { ++ code = insns[i].code; ++ ++ if (code == (BPF_JMP | BPF_TAIL_CALL)) { ++ insns[i].code = BPF_JMP | BPF_CALL; ++ insns[i].imm = BPF_FUNC_tail_call; ++ /* fall-through */ ++ } ++ if (code == (BPF_JMP | BPF_CALL) || ++ code == (BPF_JMP | BPF_CALL_ARGS)) { ++ if (code == (BPF_JMP | BPF_CALL_ARGS)) ++ insns[i].code = BPF_JMP | BPF_CALL; ++ if (!bpf_dump_raw_ok(f_cred)) ++ insns[i].imm = 0; ++ continue; ++ } ++ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { ++ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; ++ continue; ++ } ++ ++ if (code != (BPF_LD | BPF_IMM | BPF_DW)) ++ continue; ++ ++ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; ++ map = bpf_map_from_imm(prog, imm, &off, &type); ++ if (map) { ++ insns[i].src_reg = type; ++ insns[i].imm = map->id; ++ insns[i + 1].imm = off; ++ continue; ++ } ++ } ++ ++ return insns; ++} ++ ++static int set_info_rec_size(struct bpf_prog_info *info) ++{ ++ /* ++ * Ensure info.*_rec_size is the same as kernel expected size ++ * ++ * or ++ * ++ * Only allow zero *_rec_size if both _rec_size and _cnt are ++ * zero. In this case, the kernel will set the expected ++ * _rec_size back to the info. ++ */ ++ ++ if ((info->nr_func_info || info->func_info_rec_size) && ++ info->func_info_rec_size != sizeof(struct bpf_func_info)) ++ return -EINVAL; ++ ++ if ((info->nr_line_info || info->line_info_rec_size) && ++ info->line_info_rec_size != sizeof(struct bpf_line_info)) ++ return -EINVAL; ++ ++ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && ++ info->jited_line_info_rec_size != sizeof(__u64)) ++ return -EINVAL; ++ ++ info->func_info_rec_size = sizeof(struct bpf_func_info); ++ info->line_info_rec_size = sizeof(struct bpf_line_info); ++ info->jited_line_info_rec_size = sizeof(__u64); ++ ++ return 0; ++} ++ ++static int bpf_prog_get_info_by_fd(struct file *file, ++ struct bpf_prog *prog, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct btf *attach_btf = bpf_prog_get_target_btf(prog); ++ struct bpf_prog_info info; ++ u32 info_len = attr->info.info_len; ++ struct bpf_prog_kstats stats; ++ char __user *uinsns; ++ u32 ulen; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = prog->type; ++ info.id = prog->aux->id; ++ info.load_time = prog->aux->load_time; ++ info.created_by_uid = from_kuid_munged(current_user_ns(), ++ prog->aux->user->uid); ++ info.gpl_compatible = prog->gpl_compatible; ++ ++ memcpy(info.tag, prog->tag, sizeof(prog->tag)); ++ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ ulen = info.nr_map_ids; ++ info.nr_map_ids = prog->aux->used_map_cnt; ++ ulen = min_t(u32, info.nr_map_ids, ulen); ++ if (ulen) { ++ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); ++ u32 i; 
++ ++ for (i = 0; i < ulen; i++) ++ if (put_user(prog->aux->used_maps[i]->id, ++ &user_map_ids[i])) { ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return -EFAULT; ++ } ++ } ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ ++ err = set_info_rec_size(&info); ++ if (err) ++ return err; ++ ++ bpf_prog_get_stats(prog, &stats); ++ info.run_time_ns = stats.nsecs; ++ info.run_cnt = stats.cnt; ++ info.recursion_misses = stats.misses; ++ ++ info.verified_insns = prog->aux->verified_insns; ++ ++ if (!bpf_capable()) { ++ info.jited_prog_len = 0; ++ info.xlated_prog_len = 0; ++ info.nr_jited_ksyms = 0; ++ info.nr_jited_func_lens = 0; ++ info.nr_func_info = 0; ++ info.nr_line_info = 0; ++ info.nr_jited_line_info = 0; ++ goto done; ++ } ++ ++ ulen = info.xlated_prog_len; ++ info.xlated_prog_len = bpf_prog_insn_size(prog); ++ if (info.xlated_prog_len && ulen) { ++ struct bpf_insn *insns_sanitized; ++ bool fault; ++ ++ if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { ++ info.xlated_prog_insns = 0; ++ goto done; ++ } ++ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); ++ if (!insns_sanitized) ++ return -ENOMEM; ++ uinsns = u64_to_user_ptr(info.xlated_prog_insns); ++ ulen = min_t(u32, info.xlated_prog_len, ulen); ++ fault = copy_to_user(uinsns, insns_sanitized, ulen); ++ kfree(insns_sanitized); ++ if (fault) ++ return -EFAULT; ++ } ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_info_fill(&info, prog); ++ if (err) ++ return err; ++ goto done; ++ } ++ ++ /* NOTE: the following code is supposed to be skipped for offload. ++ * bpf_prog_offload_info_fill() is the place to fill similar fields ++ * for offload. ++ */ ++ ulen = info.jited_prog_len; ++ if (prog->aux->func_cnt) { ++ u32 i; ++ ++ info.jited_prog_len = 0; ++ for (i = 0; i < prog->aux->func_cnt; i++) ++ info.jited_prog_len += prog->aux->func[i]->jited_len; ++ } else { ++ info.jited_prog_len = prog->jited_len; ++ } ++ ++ if (info.jited_prog_len && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ uinsns = u64_to_user_ptr(info.jited_prog_insns); ++ ulen = min_t(u32, info.jited_prog_len, ulen); ++ ++ /* for multi-function programs, copy the JITed ++ * instructions for all the functions ++ */ ++ if (prog->aux->func_cnt) { ++ u32 len, free, i; ++ u8 *img; ++ ++ free = ulen; ++ for (i = 0; i < prog->aux->func_cnt; i++) { ++ len = prog->aux->func[i]->jited_len; ++ len = min_t(u32, len, free); ++ img = (u8 *) prog->aux->func[i]->bpf_func; ++ if (copy_to_user(uinsns, img, len)) ++ return -EFAULT; ++ uinsns += len; ++ free -= len; ++ if (!free) ++ break; ++ } ++ } else { ++ if (copy_to_user(uinsns, prog->bpf_func, ulen)) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_prog_insns = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_ksyms; ++ info.nr_jited_ksyms = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long ksym_addr; ++ u64 __user *user_ksyms; ++ u32 i; ++ ++ /* copy the address of the kernel symbol ++ * corresponding to each function ++ */ ++ ulen = min_t(u32, info.nr_jited_ksyms, ulen); ++ user_ksyms = u64_to_user_ptr(info.jited_ksyms); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ ksym_addr = (unsigned long) ++ prog->aux->func[i]->bpf_func; ++ if (put_user((u64) ksym_addr, ++ &user_ksyms[i])) ++ return -EFAULT; ++ } ++ } else { ++ ksym_addr = (unsigned long) prog->bpf_func; ++ if (put_user((u64) ksym_addr, &user_ksyms[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_ksyms = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_func_lens; ++ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ u32 __user *user_lens; ++ u32 func_len, i; ++ ++ /* copy the JITed image lengths for each function */ ++ ulen = min_t(u32, info.nr_jited_func_lens, ulen); ++ user_lens = u64_to_user_ptr(info.jited_func_lens); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ func_len = ++ prog->aux->func[i]->jited_len; ++ if (put_user(func_len, &user_lens[i])) ++ return -EFAULT; ++ } ++ } else { ++ func_len = prog->jited_len; ++ if (put_user(func_len, &user_lens[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_func_lens = 0; ++ } ++ } ++ ++ if (prog->aux->btf) ++ info.btf_id = btf_obj_id(prog->aux->btf); ++ info.attach_btf_id = prog->aux->attach_btf_id; ++ if (attach_btf) ++ info.attach_btf_obj_id = btf_obj_id(attach_btf); ++ ++ ulen = info.nr_func_info; ++ info.nr_func_info = prog->aux->func_info_cnt; ++ if (info.nr_func_info && ulen) { ++ char __user *user_finfo; ++ ++ user_finfo = u64_to_user_ptr(info.func_info); ++ ulen = min_t(u32, info.nr_func_info, ulen); ++ if (copy_to_user(user_finfo, prog->aux->func_info, ++ info.func_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_line_info; ++ info.nr_line_info = prog->aux->nr_linfo; ++ if (info.nr_line_info && ulen) { ++ __u8 __user *user_linfo; ++ ++ user_linfo = u64_to_user_ptr(info.line_info); ++ ulen = min_t(u32, info.nr_line_info, ulen); ++ if (copy_to_user(user_linfo, prog->aux->linfo, ++ info.line_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_jited_line_info; ++ if (prog->aux->jited_linfo) ++ info.nr_jited_line_info = prog->aux->nr_linfo; ++ else ++ info.nr_jited_line_info = 0; ++ if (info.nr_jited_line_info && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long line_addr; ++ __u64 __user *user_linfo; ++ u32 i; ++ ++ user_linfo = u64_to_user_ptr(info.jited_line_info); ++ ulen = min_t(u32, info.nr_jited_line_info, ulen); ++ for (i = 0; i < ulen; i++) { ++ line_addr = (unsigned long)prog->aux->jited_linfo[i]; ++ if (put_user((__u64)line_addr, &user_linfo[i])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_line_info = 0; ++ } ++ } ++ ++ ulen = info.nr_prog_tags; ++ info.nr_prog_tags = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; ++ u32 i; ++ ++ user_prog_tags = u64_to_user_ptr(info.prog_tags); ++ ulen = min_t(u32, info.nr_prog_tags, ulen); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ if (copy_to_user(user_prog_tags[i], ++ prog->aux->func[i]->tag, ++ BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } else { ++ if (copy_to_user(user_prog_tags[0], ++ prog->tag, BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } ++ ++done: ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_map_get_info_by_fd(struct file *file, ++ struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_map_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ info.type = map->map_type; ++ info.id = map->id; ++ info.key_size = map->key_size; ++ info.value_size = map->value_size; ++ info.max_entries = map->max_entries; ++ info.map_flags = map->map_flags; ++ info.map_extra = map->map_extra; ++ memcpy(info.name, map->name, sizeof(map->name)); ++ ++ if (map->btf) { ++ info.btf_id = btf_obj_id(map->btf); ++ info.btf_key_type_id = map->btf_key_type_id; ++ info.btf_value_type_id = map->btf_value_type_id; ++ } ++ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_info_fill(&info, map); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_btf_get_info_by_fd(struct file *file, ++ struct btf *btf, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); ++ if (err) ++ return err; ++ ++ return btf_get_info_by_fd(btf, attr, uattr); ++} ++ ++static int bpf_link_get_info_by_fd(struct file *file, ++ struct bpf_link *link, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_link_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = link->type; ++ info.id = link->id; ++ info.prog_id = link->prog->aux->id; ++ ++ if (link->ops->fill_link_info) { ++ err = link->ops->fill_link_info(link, &info); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++ ++#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info ++ ++static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ int ufd = attr->info.bpf_fd; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ if 
(!f.file) ++ return -EBADFD; ++ ++ if (f.file->f_op == &bpf_prog_fops) ++ err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &bpf_map_fops) ++ err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &btf_fops) ++ err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); ++ else if (f.file->f_op == &bpf_link_fops) ++ err = bpf_link_get_info_by_fd(f.file, f.file->private_data, ++ attr, uattr); ++ else ++ err = -EINVAL; ++ ++ fdput(f); ++ return err; ++} ++ ++#define BPF_BTF_LOAD_LAST_FIELD btf_log_level ++ ++static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) ++{ ++ if (CHECK_ATTR(BPF_BTF_LOAD)) ++ return -EINVAL; ++ ++ if (!bpf_capable()) ++ return -EPERM; ++ ++ return btf_new_fd(attr, uattr); ++} ++ ++#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id ++ ++static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ return btf_get_fd_by_id(attr->btf_id); ++} ++ ++static int bpf_task_fd_query_copy(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ u32 prog_id, u32 fd_type, ++ const char *buf, u64 probe_offset, ++ u64 probe_addr) ++{ ++ char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); ++ u32 len = buf ? strlen(buf) : 0, input_len; ++ int err = 0; ++ ++ if (put_user(len, &uattr->task_fd_query.buf_len)) ++ return -EFAULT; ++ input_len = attr->task_fd_query.buf_len; ++ if (input_len && ubuf) { ++ if (!len) { ++ /* nothing to copy, just make ubuf NULL terminated */ ++ char zero = '\0'; ++ ++ if (put_user(zero, ubuf)) ++ return -EFAULT; ++ } else if (input_len >= len + 1) { ++ /* ubuf can hold the string with NULL terminator */ ++ if (copy_to_user(ubuf, buf, len + 1)) ++ return -EFAULT; ++ } else { ++ /* ubuf cannot hold the string with NULL terminator, ++ * do a partial copy with NULL terminator. 
++ */ ++ char zero = '\0'; ++ ++ err = -ENOSPC; ++ if (copy_to_user(ubuf, buf, input_len - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + input_len - 1)) ++ return -EFAULT; ++ } ++ } ++ ++ if (put_user(prog_id, &uattr->task_fd_query.prog_id) || ++ put_user(fd_type, &uattr->task_fd_query.fd_type) || ++ put_user(probe_offset, &uattr->task_fd_query.probe_offset) || ++ put_user(probe_addr, &uattr->task_fd_query.probe_addr)) ++ return -EFAULT; ++ ++ return err; ++} ++ ++#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr ++ ++static int bpf_task_fd_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ pid_t pid = attr->task_fd_query.pid; ++ u32 fd = attr->task_fd_query.fd; ++ const struct perf_event *event; ++ struct task_struct *task; ++ struct file *file; ++ int err; ++ ++ if (CHECK_ATTR(BPF_TASK_FD_QUERY)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (attr->task_fd_query.flags != 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ task = get_pid_task(find_vpid(pid), PIDTYPE_PID); ++ rcu_read_unlock(); ++ if (!task) ++ return -ENOENT; ++ ++ err = 0; ++ file = fget_task(task, fd); ++ put_task_struct(task); ++ if (!file) ++ return -EBADF; ++ ++ if (file->f_op == &bpf_link_fops) { ++ struct bpf_link *link = file->private_data; ++ ++ if (link->ops == &bpf_raw_tp_link_lops) { ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ struct bpf_raw_event_map *btp = raw_tp->btp; ++ ++ err = bpf_task_fd_query_copy(attr, uattr, ++ raw_tp->link.prog->aux->id, ++ BPF_FD_TYPE_RAW_TRACEPOINT, ++ btp->tp->name, 0, 0); ++ goto put_file; ++ } ++ goto out_not_supp; ++ } ++ ++ event = perf_get_event(file); ++ if (!IS_ERR(event)) { ++ u64 probe_offset, probe_addr; ++ u32 prog_id, fd_type; ++ const char *buf; ++ ++ err = bpf_get_perf_event_info(event, &prog_id, &fd_type, ++ &buf, &probe_offset, ++ &probe_addr); ++ if (!err) ++ err = bpf_task_fd_query_copy(attr, uattr, prog_id, ++ fd_type, buf, ++ probe_offset, ++ probe_addr); ++ goto put_file; ++ } ++ ++out_not_supp: ++ err = -ENOTSUPP; ++put_file: ++ fput(file); ++ return err; ++} ++ ++#define BPF_MAP_BATCH_LAST_FIELD batch.flags ++ ++#define BPF_DO_BATCH(fn) \ ++ do { \ ++ if (!fn) { \ ++ err = -ENOTSUPP; \ ++ goto err_put; \ ++ } \ ++ err = fn(map, attr, uattr); \ ++ } while (0) ++ ++static int bpf_map_do_batch(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ int cmd) ++{ ++ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || ++ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; ++ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; ++ struct bpf_map *map; ++ int err, ufd; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_BATCH)) ++ return -EINVAL; ++ ++ ufd = attr->batch.map_fd; ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (has_write) ++ bpf_map_write_active_inc(map); ++ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (cmd == BPF_MAP_LOOKUP_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_batch); ++ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); ++ else if (cmd == BPF_MAP_UPDATE_BATCH) ++ BPF_DO_BATCH(map->ops->map_update_batch); ++ else ++ BPF_DO_BATCH(map->ops->map_delete_batch); ++err_put: ++ if (has_write) ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define 
BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies ++static int link_create(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_CREATE)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->link_create.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ ret = bpf_prog_attach_check_attach_type(prog, ++ attr->link_create.attach_type); ++ if (ret) ++ goto out; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_EXT: ++ break; ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT && ++ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ default: ++ ptype = attach_type_to_prog_type(attr->link_create.attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ } ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ ret = cgroup_bpf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_EXT: ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_TRACING: ++ if (attr->link_create.attach_type != prog->expected_attach_type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) ++ ret = bpf_raw_tp_link_attach(prog, NULL); ++ else if (prog->expected_attach_type == BPF_TRACE_ITER) ++ ret = bpf_iter_link_attach(attr, uattr, prog); ++ else if (prog->expected_attach_type == BPF_LSM_CGROUP) ++ ret = cgroup_bpf_link_attach(attr, prog); ++ else ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ ret = netns_bpf_link_create(attr, prog); ++ break; ++#ifdef CONFIG_NET ++ case BPF_PROG_TYPE_XDP: ++ ret = bpf_xdp_link_attach(attr, prog); ++ break; ++#endif ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ ret = bpf_perf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type == BPF_PERF_EVENT) ++ ret = bpf_perf_link_attach(attr, prog); ++ else ++ ret = bpf_kprobe_multi_link_attach(attr, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++out: ++ if (ret < 0) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd ++ ++static int link_update(union bpf_attr *attr) ++{ ++ struct bpf_prog *old_prog = NULL, *new_prog; ++ struct bpf_link *link; ++ u32 flags; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_UPDATE)) ++ return -EINVAL; ++ ++ flags = attr->link_update.flags; ++ if (flags & ~BPF_F_REPLACE) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_update.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ new_prog = bpf_prog_get(attr->link_update.new_prog_fd); ++ if (IS_ERR(new_prog)) { ++ ret = PTR_ERR(new_prog); ++ goto out_put_link; ++ } ++ ++ if (flags & BPF_F_REPLACE) { 
++ old_prog = bpf_prog_get(attr->link_update.old_prog_fd); ++ if (IS_ERR(old_prog)) { ++ ret = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto out_put_progs; ++ } ++ } else if (attr->link_update.old_prog_fd) { ++ ret = -EINVAL; ++ goto out_put_progs; ++ } ++ ++ if (link->ops->update_prog) ++ ret = link->ops->update_prog(link, new_prog, old_prog); ++ else ++ ret = -EINVAL; ++ ++out_put_progs: ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ if (ret) ++ bpf_prog_put(new_prog); ++out_put_link: ++ bpf_link_put(link); ++ return ret; ++} ++ ++#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd ++ ++static int link_detach(union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_DETACH)) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_detach.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ if (link->ops->detach) ++ ret = link->ops->detach(link); ++ else ++ ret = -EOPNOTSUPP; ++ ++ bpf_link_put(link); ++ return ret; ++} ++ ++static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) ++{ ++ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); ++} ++ ++struct bpf_link *bpf_link_by_id(u32 id) ++{ ++ struct bpf_link *link; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&link_idr_lock); ++ /* before link is "settled", ID is 0, pretend it doesn't exist yet */ ++ link = idr_find(&link_idr, id); ++ if (link) { ++ if (link->id) ++ link = bpf_link_inc_not_zero(link); ++ else ++ link = ERR_PTR(-EAGAIN); ++ } else { ++ link = ERR_PTR(-ENOENT); ++ } ++ spin_unlock_bh(&link_idr_lock); ++ return link; ++} ++ ++struct bpf_link *bpf_link_get_curr_or_next(u32 *id) ++{ ++ struct bpf_link *link; ++ ++ spin_lock_bh(&link_idr_lock); ++again: ++ link = idr_get_next(&link_idr, id); ++ if (link) { ++ link = bpf_link_inc_not_zero(link); ++ if (IS_ERR(link)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&link_idr_lock); ++ ++ return link; ++} ++ ++#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id ++ ++static int bpf_link_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ u32 id = attr->link_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ link = bpf_link_by_id(id); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ fd = bpf_link_new_fd(link); ++ if (fd < 0) ++ bpf_link_put(link); ++ ++ return fd; ++} ++ ++DEFINE_MUTEX(bpf_stats_enabled_mutex); ++ ++static int bpf_stats_release(struct inode *inode, struct file *file) ++{ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ static_key_slow_dec(&bpf_stats_enabled_key.key); ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return 0; ++} ++ ++static const struct file_operations bpf_stats_fops = { ++ .release = bpf_stats_release, ++}; ++ ++static int bpf_enable_runtime_stats(void) ++{ ++ int fd; ++ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ ++ /* Set a very high limit to avoid overflow */ ++ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return -EBUSY; ++ } ++ ++ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); ++ if (fd >= 0) ++ static_key_slow_inc(&bpf_stats_enabled_key.key); ++ ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return fd; ++} ++ ++#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type ++ ++static int bpf_enable_stats(union bpf_attr *attr) ++{ ++ ++ if (CHECK_ATTR(BPF_ENABLE_STATS)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ 
++ switch (attr->enable_stats.type) { ++ case BPF_STATS_RUN_TIME: ++ return bpf_enable_runtime_stats(); ++ default: ++ break; ++ } ++ return -EINVAL; ++} ++ ++#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags ++ ++static int bpf_iter_create(union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ int err; ++ ++ if (CHECK_ATTR(BPF_ITER_CREATE)) ++ return -EINVAL; ++ ++ if (attr->iter_create.flags) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->iter_create.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ err = bpf_iter_new_fd(link); ++ bpf_link_put(link); ++ ++ return err; ++} ++ ++#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags ++ ++static int bpf_prog_bind_map(union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ struct bpf_map *map; ++ struct bpf_map **used_maps_old, **used_maps_new; ++ int i, ret = 0; ++ ++ if (CHECK_ATTR(BPF_PROG_BIND_MAP)) ++ return -EINVAL; ++ ++ if (attr->prog_bind_map.flags) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->prog_bind_map.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ map = bpf_map_get(attr->prog_bind_map.map_fd); ++ if (IS_ERR(map)) { ++ ret = PTR_ERR(map); ++ goto out_prog_put; ++ } ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ ++ used_maps_old = prog->aux->used_maps; ++ ++ for (i = 0; i < prog->aux->used_map_cnt; i++) ++ if (used_maps_old[i] == map) { ++ bpf_map_put(map); ++ goto out_unlock; ++ } ++ ++ used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, ++ sizeof(used_maps_new[0]), ++ GFP_KERNEL); ++ if (!used_maps_new) { ++ ret = -ENOMEM; ++ goto out_unlock; ++ } ++ ++ memcpy(used_maps_new, used_maps_old, ++ sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); ++ used_maps_new[prog->aux->used_map_cnt] = map; ++ ++ prog->aux->used_map_cnt++; ++ prog->aux->used_maps = used_maps_new; ++ ++ kfree(used_maps_old); ++ ++out_unlock: ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ ++ if (ret) ++ bpf_map_put(map); ++out_prog_put: ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) ++{ ++ union bpf_attr attr; ++ bool capable; ++ int err; ++ ++ capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; ++ ++ /* Intent here is for unprivileged_bpf_disabled to block key object ++ * creation commands for unprivileged users; other actions depend ++ * of fd availability and access to bpffs, so are dependent on ++ * object creation success. Capabilities are later verified for ++ * operations such as load and map create, so even with unprivileged ++ * BPF disabled, capability checks are still carried out for these ++ * and other operations. 
++ */ ++ if (!capable && ++ (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) ++ return -EPERM; ++ ++ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); ++ if (err) ++ return err; ++ size = min_t(u32, size, sizeof(attr)); ++ ++ /* copy attributes from user space, may be less than sizeof(bpf_attr) */ ++ memset(&attr, 0, sizeof(attr)); ++ if (copy_from_bpfptr(&attr, uattr, size) != 0) ++ return -EFAULT; ++ ++ err = security_bpf(cmd, &attr, size); ++ if (err < 0) ++ return err; ++ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ err = map_create(&attr); ++ break; ++ case BPF_MAP_LOOKUP_ELEM: ++ err = map_lookup_elem(&attr); ++ break; ++ case BPF_MAP_UPDATE_ELEM: ++ err = map_update_elem(&attr, uattr); ++ break; ++ case BPF_MAP_DELETE_ELEM: ++ err = map_delete_elem(&attr); ++ break; ++ case BPF_MAP_GET_NEXT_KEY: ++ err = map_get_next_key(&attr); ++ break; ++ case BPF_MAP_FREEZE: ++ err = map_freeze(&attr); ++ break; ++ case BPF_PROG_LOAD: ++ err = bpf_prog_load(&attr, uattr); ++ break; ++ case BPF_OBJ_PIN: ++ err = bpf_obj_pin(&attr); ++ break; ++ case BPF_OBJ_GET: ++ err = bpf_obj_get(&attr); ++ break; ++ case BPF_PROG_ATTACH: ++ err = bpf_prog_attach(&attr); ++ break; ++ case BPF_PROG_DETACH: ++ err = bpf_prog_detach(&attr); ++ break; ++ case BPF_PROG_QUERY: ++ err = bpf_prog_query(&attr, uattr.user); ++ break; ++ case BPF_PROG_TEST_RUN: ++ err = bpf_prog_test_run(&attr, uattr.user); ++ break; ++ case BPF_PROG_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &prog_idr, &prog_idr_lock); ++ break; ++ case BPF_MAP_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &map_idr, &map_idr_lock); ++ break; ++ case BPF_BTF_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &btf_idr, &btf_idr_lock); ++ break; ++ case BPF_PROG_GET_FD_BY_ID: ++ err = bpf_prog_get_fd_by_id(&attr); ++ break; ++ case BPF_MAP_GET_FD_BY_ID: ++ err = bpf_map_get_fd_by_id(&attr); ++ break; ++ case BPF_OBJ_GET_INFO_BY_FD: ++ err = bpf_obj_get_info_by_fd(&attr, uattr.user); ++ break; ++ case BPF_RAW_TRACEPOINT_OPEN: ++ err = bpf_raw_tracepoint_open(&attr); ++ break; ++ case BPF_BTF_LOAD: ++ err = bpf_btf_load(&attr, uattr); ++ break; ++ case BPF_BTF_GET_FD_BY_ID: ++ err = bpf_btf_get_fd_by_id(&attr); ++ break; ++ case BPF_TASK_FD_QUERY: ++ err = bpf_task_fd_query(&attr, uattr.user); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_ELEM: ++ err = map_lookup_and_delete_elem(&attr); ++ break; ++ case BPF_MAP_LOOKUP_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, ++ BPF_MAP_LOOKUP_AND_DELETE_BATCH); ++ break; ++ case BPF_MAP_UPDATE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); ++ break; ++ case BPF_MAP_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); ++ break; ++ case BPF_LINK_CREATE: ++ err = link_create(&attr, uattr); ++ break; ++ case BPF_LINK_UPDATE: ++ err = link_update(&attr); ++ break; ++ case BPF_LINK_GET_FD_BY_ID: ++ err = bpf_link_get_fd_by_id(&attr); ++ break; ++ case BPF_LINK_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &link_idr, &link_idr_lock); ++ break; ++ case BPF_ENABLE_STATS: ++ err = bpf_enable_stats(&attr); ++ break; ++ case BPF_ITER_CREATE: ++ err = bpf_iter_create(&attr); ++ break; ++ case BPF_LINK_DETACH: ++ err = link_detach(&attr); ++ break; ++ case BPF_PROG_BIND_MAP: ++ err = bpf_prog_bind_map(&attr); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++ return 
err; ++} ++ ++SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) ++{ ++ return __sys_bpf(cmd, USER_BPFPTR(uattr), size); ++} ++ ++static bool syscall_prog_is_valid_access(int off, int size, ++ enum bpf_access_type type, ++ const struct bpf_prog *prog, ++ struct bpf_insn_access_aux *info) ++{ ++ if (off < 0 || off >= U16_MAX) ++ return false; ++ if (off % size != 0) ++ return false; ++ return true; ++} ++ ++BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) ++{ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ case BPF_MAP_UPDATE_ELEM: ++ case BPF_MAP_FREEZE: ++ case BPF_PROG_LOAD: ++ case BPF_BTF_LOAD: ++ case BPF_LINK_CREATE: ++ case BPF_RAW_TRACEPOINT_OPEN: ++ break; ++ default: ++ return -EINVAL; ++ } ++ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); ++} ++ ++ ++/* To shut up -Wmissing-prototypes. ++ * This function is used by the kernel light skeleton ++ * to load bpf programs when modules are loaded or during kernel boot. ++ * See tools/lib/bpf/skel_internal.h ++ */ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); ++ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) ++{ ++ struct bpf_prog * __maybe_unused prog; ++ struct bpf_tramp_run_ctx __maybe_unused run_ctx; ++ ++ switch (cmd) { ++#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ ++ case BPF_PROG_TEST_RUN: ++ if (attr->test.data_in || attr->test.data_out || ++ attr->test.ctx_out || attr->test.duration || ++ attr->test.repeat || attr->test.flags) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || ++ attr->test.ctx_size_in > U16_MAX) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ run_ctx.bpf_cookie = 0; ++ run_ctx.saved_run_ctx = NULL; ++ if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { ++ /* recursion detected */ ++ bpf_prog_put(prog); ++ return -EBUSY; ++ } ++ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); ++ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); ++ bpf_prog_put(prog); ++ return 0; ++#endif ++ default: ++ return ____bpf_sys_bpf(cmd, attr, size); ++ } ++} ++EXPORT_SYMBOL(kern_sys_bpf); ++ ++static const struct bpf_func_proto bpf_sys_bpf_proto = { ++ .func = bpf_sys_bpf, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_ANYTHING, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, ++ .arg3_type = ARG_CONST_SIZE, ++}; ++ ++const struct bpf_func_proto * __weak ++tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++{ ++ return bpf_base_func_proto(func_id); ++} ++ ++BPF_CALL_1(bpf_sys_close, u32, fd) ++{ ++ /* When bpf program calls this helper there should not be ++ * an fdget() without matching completed fdput(). 
++ * This helper is allowed in the following callchain only: ++ * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close ++ */ ++ return close_fd(fd); ++} ++ ++static const struct bpf_func_proto bpf_sys_close_proto = { ++ .func = bpf_sys_close, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_ANYTHING, ++}; ++ ++BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) ++{ ++ if (flags) ++ return -EINVAL; ++ ++ if (name_sz <= 1 || name[name_sz - 1]) ++ return -EINVAL; ++ ++ if (!bpf_dump_raw_ok(current_cred())) ++ return -EPERM; ++ ++ *res = kallsyms_lookup_name(name); ++ return *res ? 0 : -ENOENT; ++} ++ ++static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { ++ .func = bpf_kallsyms_lookup_name, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_CONST_SIZE_OR_ZERO, ++ .arg3_type = ARG_ANYTHING, ++ .arg4_type = ARG_PTR_TO_LONG, ++}; ++ ++static const struct bpf_func_proto * ++syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++{ ++ switch (func_id) { ++ case BPF_FUNC_sys_bpf: ++ return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; ++ case BPF_FUNC_btf_find_by_name_kind: ++ return &bpf_btf_find_by_name_kind_proto; ++ case BPF_FUNC_sys_close: ++ return &bpf_sys_close_proto; ++ case BPF_FUNC_kallsyms_lookup_name: ++ return &bpf_kallsyms_lookup_name_proto; ++ default: ++ return tracing_prog_func_proto(func_id, prog); ++ } ++} ++ ++const struct bpf_verifier_ops bpf_syscall_verifier_ops = { ++ .get_func_proto = syscall_prog_func_proto, ++ .is_valid_access = syscall_prog_is_valid_access, ++}; ++ ++const struct bpf_prog_ops bpf_syscall_prog_ops = { ++ .test_run = bpf_prog_test_run_syscall, ++}; ++ ++#ifdef CONFIG_SYSCTL ++static int bpf_stats_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct static_key *key = (struct static_key *)table->data; ++ static int saved_val; ++ int val, ret; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .mode = table->mode, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ val = saved_val; ++ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); ++ if (write && !ret && val != saved_val) { ++ if (val) ++ static_key_slow_inc(key); ++ else ++ static_key_slow_dec(key); ++ saved_val = val; ++ } ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return ret; ++} ++ ++void __weak unpriv_ebpf_notify(int new_state) ++{ ++} ++ ++static int bpf_unpriv_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret, unpriv_enable = *(int *)table->data; ++ bool locked_state = unpriv_enable == 1; ++ struct ctl_table tmp = *table; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ tmp.data = &unpriv_enable; ++ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); ++ if (write && !ret) { ++ if (locked_state && unpriv_enable != 1) ++ return -EPERM; ++ *(int *)table->data = unpriv_enable; ++ } ++ ++ unpriv_ebpf_notify(unpriv_enable); ++ ++ return ret; ++} ++ ++static struct ctl_table bpf_syscall_table[] = { ++ { ++ .procname = "unprivileged_bpf_disabled", ++ .data = &sysctl_unprivileged_bpf_disabled, ++ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), ++ .mode = 0644, ++ .proc_handler = bpf_unpriv_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_TWO, ++ }, ++ { ++ .procname = 
"bpf_stats_enabled", ++ .data = &bpf_stats_enabled_key.key, ++ .maxlen = sizeof(bpf_stats_enabled_key), ++ .mode = 0644, ++ .proc_handler = bpf_stats_handler, ++ }, ++ { } ++}; ++ ++static int __init bpf_syscall_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", bpf_syscall_table); ++ return 0; ++} ++late_initcall(bpf_syscall_sysctl_init); ++#endif /* CONFIG_SYSCTL */ +diff -rupN linux.orig/kernel/entry/common.c linux/kernel/entry/common.c +--- linux.orig/kernel/entry/common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/entry/common.c 2022-12-04 10:40:26.716034044 -0500 +@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_l local_irq_enable_exit_to_user(ti_work); @@ -5094,7 +33236,7 @@ index 063068a9ea9b3..26b772720b227 100644 schedule(); if (ti_work & _TIF_UPROBE) -@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void) +@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); @@ -5103,11 +33245,10 @@ index 063068a9ea9b3..26b772720b227 100644 preempt_schedule_irq(); } } -diff --git a/kernel/hung_task.c b/kernel/hung_task.c -index bb2354f73dedc..19c9de825d248 100644 ---- a/kernel/hung_task.c -+++ b/kernel/hung_task.c -@@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +diff -rupN linux.orig/kernel/hung_task.c linux/kernel/hung_task.c +--- linux.orig/kernel/hung_task.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/hung_task.c 2022-12-04 10:40:26.716034044 -0500 +@@ -127,6 +127,8 @@ static void check_hung_task(struct task_ * complain: */ if (sysctl_hung_task_warnings) { @@ -5116,7 +33257,7 @@ index bb2354f73dedc..19c9de825d248 100644 if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", -@@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +@@ -142,6 +144,8 @@ static void check_hung_task(struct task_ if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; @@ -5125,7 +33266,7 @@ index bb2354f73dedc..19c9de825d248 100644 } touch_nmi_watchdog(); -@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) +@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_t } unlock: rcu_read_unlock(); @@ -5144,11 +33285,10 @@ index bb2354f73dedc..19c9de825d248 100644 } if (hung_task_call_panic) -diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c -index 5db0230aa6b52..476a3fecb8c53 100644 ---- a/kernel/irq/irqdesc.c -+++ b/kernel/irq/irqdesc.c -@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +diff -rupN linux.orig/kernel/irq/irqdesc.c linux/kernel/irq/irqdesc.c +--- linux.orig/kernel/irq/irqdesc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/irq/irqdesc.c 2022-12-04 10:40:26.716034044 -0500 +@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); @@ -5179,10 +33319,24 @@ index 5db0230aa6b52..476a3fecb8c53 100644 /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. 
-diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c -index b1292a57c2a53..a6514db7ef58e 100644 ---- a/kernel/ksysfs.c -+++ b/kernel/ksysfs.c +diff -rupN linux.orig/kernel/Kconfig.preempt linux/kernel/Kconfig.preempt +--- linux.orig/kernel/Kconfig.preempt 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/Kconfig.preempt 2022-12-04 10:40:26.716034044 -0500 +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + config PREEMPT_NONE_BUILD + bool + +diff -rupN linux.orig/kernel/ksysfs.c linux/kernel/ksysfs.c +--- linux.orig/kernel/ksysfs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/ksysfs.c 2022-12-04 10:40:26.716034044 -0500 @@ -137,6 +137,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ @@ -5199,20 +33353,19 @@ index b1292a57c2a53..a6514db7ef58e 100644 /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -@@ -227,6 +236,9 @@ static struct attribute * kernel_attrs[] = { - #ifndef CONFIG_TINY_RCU +@@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] &rcu_expedited_attr.attr, &rcu_normal_attr.attr, -+#endif + #endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, - #endif ++#endif NULL }; -diff --git a/kernel/panic.c b/kernel/panic.c -index c6eb8f8db0c05..c4e8896e3caba 100644 ---- a/kernel/panic.c -+++ b/kernel/panic.c + +diff -rupN linux.orig/kernel/panic.c linux/kernel/panic.c +--- linux.orig/kernel/panic.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/panic.c 2022-12-04 10:40:26.716034044 -0500 @@ -257,7 +257,6 @@ void panic(const char *fmt, ...) panic_smp_self_stop(); @@ -5249,7 +33402,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 crash_smp_send_stop(); } -@@ -604,6 +610,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -604,6 +610,8 @@ void __warn(const char *file, int line, { disable_trace_on_warning(); @@ -5258,7 +33411,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, -@@ -633,6 +641,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -633,6 +641,8 @@ void __warn(const char *file, int line, /* Just a warning, don't kill lockdep. 
*/ add_taint(taint, LOCKDEP_STILL_OK); @@ -5267,10 +33420,9 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 } #ifndef __WARN_FLAGS -diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h -index d947ca6c84f99..e7d8578860adf 100644 ---- a/kernel/printk/internal.h -+++ b/kernel/printk/internal.h +diff -rupN linux.orig/kernel/printk/internal.h linux/kernel/printk/internal.h +--- linux.orig/kernel/printk/internal.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/internal.h 2022-12-04 10:40:26.716034044 -0500 @@ -20,6 +20,8 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -5280,10 +33432,9 @@ index d947ca6c84f99..e7d8578860adf 100644 __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, -diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c -index a1a81fd9889bb..f1f9ce9b23f60 100644 ---- a/kernel/printk/printk.c -+++ b/kernel/printk/printk.c +diff -rupN linux.orig/kernel/printk/printk.c linux/kernel/printk/printk.c +--- linux.orig/kernel/printk/printk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk.c 2022-12-04 10:40:26.720034034 -0500 @@ -44,6 +44,7 @@ #include #include @@ -5292,11 +33443,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #include #include #include -@@ -223,6 +224,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - /* Number of registered extended console drivers. */ +@@ -224,6 +225,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl static int nr_ext_console_drivers; -+/* + /* + * Used to synchronize printing kthreads against direct printing via + * console_trylock/console_unlock. + * @@ -5326,9 +33476,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 +/* Block console kthreads to avoid processing new messages. */ +bool block_console_kthreads; + - /* ++/* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. + */ @@ -271,14 +302,49 @@ static bool panic_in_progress(void) } @@ -5342,15 +33493,15 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + * Tracks whether kthread printers are all blocked. A value of true implies + * that the console is locked via console_lock() or the console is suspended. + * Writing to this variable requires holding @console_sem. - */ --static int console_locked, console_suspended; ++ */ +static bool console_kthreads_blocked; + +/* + * Block all kthread printers from a schedulable context. + * + * Requires holding @console_sem. -+ */ + */ +-static int console_locked, console_suspended; +static void console_kthreads_block(void) +{ + struct console *con; @@ -5386,7 +33537,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Array of consoles built from command line options (console=) -@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; +@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORM /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); @@ -5462,7 +33613,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. 
*/ /* the next printk record to read by syslog(READ) or /proc/kmsg */ -@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable_and_check(void) +@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable return 1; } @@ -5470,7 +33621,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /** * console_trylock_spinning - try to get console_lock by busy waiting * -@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void) +@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void return 1; } @@ -5478,7 +33629,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Call the specified console driver, asking it to write out the specified -@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void) +@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void * dropped, a dropped message will be written out first. */ static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5513,7 +33664,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } /* -@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ @@ -5538,7 +33689,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. -@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility if (console_trylock_spinning()) console_unlock(); preempt_enable(); @@ -5546,7 +33697,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } wake_up_klogd(); -@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const char *fmt, ...) 
+@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const c } EXPORT_SYMBOL(_printk); @@ -5627,7 +33778,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 -@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *c #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 @@ -5636,7 +33787,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static u64 syslog_seq; static size_t record_print_text(const struct printk_record *r, -@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, +@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char * static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5651,7 +33802,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #endif /* CONFIG_PRINTK */ -@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned int cpu) +@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned i /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); @@ -5708,7 +33859,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } EXPORT_SYMBOL(is_console_locked); -@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_panic(void) +@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_pani return atomic_read(&panic_cpu) != raw_smp_processor_id(); } @@ -5729,7 +33880,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return false; /* -@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(struct console *con) +@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(str * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ if (!cpu_online(raw_smp_processor_id()) && @@ -5907,7 +34058,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 suppress_panic_printk = 1; pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } -@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(str /* Skip record that has level above the console loglevel. */ if (suppress_message_printing(r.info->level)) { @@ -5916,7 +34067,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 goto skip; } -@@ -2715,31 +3072,65 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2715,32 +3072,66 @@ static bool console_emit_next_record(str len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } @@ -5969,7 +34120,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return true; } -+/* + /* + * Print a record for a given console, but allow another printk() caller to + * take over the console_lock and continue printing. + * @@ -5997,10 +34148,11 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + return __console_emit_next_record(con, text, ext_text, dropped_text, false, handover); +} + - /* ++/* * Print out all remaining records to all consoles. * -@@ -2758,8 +3149,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ + * @do_cond_resched is set by the caller. It can be true only in schedulable +@@ -2758,8 +3149,8 @@ skip: * were flushed to all usable consoles. 
A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this @@ -6011,7 +34163,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * * Requires the console_lock. */ -@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_co *handover = false; do { @@ -6045,7 +34197,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } if (*handover) return false; -@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_co return any_usable; } @@ -6141,7 +34293,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * If someone else is holding the console lock, trylock will fail * and may_schedule may be set. Ignore and proceed to unlock so -@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flush_mode mode) +@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flu seq = prb_first_valid_seq(prb); for_each_console(c) @@ -6150,7 +34302,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } console_unlock(); } -@@ -3189,16 +3652,27 @@ void register_console(struct console *newcon) +@@ -3189,16 +3652,27 @@ void register_console(struct console *ne if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; @@ -6189,7 +34341,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 struct console *con; int res; -@@ -3265,9 +3740,26 @@ int unregister_console(struct console *console) +@@ -3265,9 +3740,26 @@ int unregister_console(struct console *c console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; @@ -6237,7 +34389,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. 
*/ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) -@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *c for_each_console(c) { if (con && con != c) continue; @@ -6246,7 +34398,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 continue; printk_seq = c->seq; if (printk_seq < seq) -@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) +@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset } EXPORT_SYMBOL(pr_flush); @@ -6464,7 +34616,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static DEFINE_PER_CPU(int, printk_pending); -@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) +@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(stru { int pending = this_cpu_xchg(printk_pending, 0); @@ -6513,10 +34665,9 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } void printk_trigger_flush(void) -diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c -index ef0f9a2044da1..caac4de1ea59a 100644 ---- a/kernel/printk/printk_safe.c -+++ b/kernel/printk/printk_safe.c +diff -rupN linux.orig/kernel/printk/printk_safe.c linux/kernel/printk/printk_safe.c +--- linux.orig/kernel/printk/printk_safe.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk_safe.c 2022-12-04 10:40:26.720034034 -0500 @@ -8,7 +8,9 @@ #include #include @@ -6527,7 +34678,7 @@ index ef0f9a2044da1..caac4de1ea59a 100644 #include "internal.h" -@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, va_list args) +@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); @@ -6561,11 +34712,10 @@ index ef0f9a2044da1..caac4de1ea59a 100644 + timeout_ms -= 1; + } +} -diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c -index d8e1b270a065f..257cb6f5ea622 100644 ---- a/kernel/rcu/rcutorture.c -+++ b/kernel/rcu/rcutorture.c -@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsigned int cpu) +diff -rupN linux.orig/kernel/rcu/rcutorture.c linux/kernel/rcu/rcutorture.c +--- linux.orig/kernel/rcu/rcutorture.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/rcutorture.c 2022-12-04 10:40:26.720034034 -0500 +@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsig WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -6578,11 +34728,10 @@ index d8e1b270a065f..257cb6f5ea622 100644 } /* Don't allow time recalculation while creating a new task. */ -diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h -index c3fbbcc09327f..195cad14742dd 100644 ---- a/kernel/rcu/tree_stall.h -+++ b/kernel/rcu/tree_stall.h -@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned long gps) +diff -rupN linux.orig/kernel/rcu/tree_stall.h linux/kernel/rcu/tree_stall.h +--- linux.orig/kernel/rcu/tree_stall.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/tree_stall.h 2022-12-04 10:40:26.720034034 -0500 +@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned lon * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. 
*/ @@ -6590,7 +34739,7 @@ index c3fbbcc09327f..195cad14742dd 100644 trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); -@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned long gps) +@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned lon */ set_tsk_need_resched(current); set_preempt_need_resched(); @@ -6598,10 +34747,9 @@ index c3fbbcc09327f..195cad14742dd 100644 } static void check_cpu_stall(struct rcu_data *rdp) -diff --git a/kernel/reboot.c b/kernel/reboot.c -index 3c35445bf5ad3..80564ffafabff 100644 ---- a/kernel/reboot.c -+++ b/kernel/reboot.c +diff -rupN linux.orig/kernel/reboot.c linux/kernel/reboot.c +--- linux.orig/kernel/reboot.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/reboot.c 2022-12-04 10:40:26.720034034 -0500 @@ -82,6 +82,7 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); @@ -6610,7 +34758,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 usermodehelper_disable(); device_shutdown(); } -@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum system_states state) +@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; @@ -6630,7 +34778,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } return ret; -@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force) +@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force ret = run_cmd(poweroff_cmd); if (ret && force) { @@ -6638,7 +34786,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* -@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force) +@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force */ emergency_sync(); kernel_power_off(); @@ -6655,7 +34803,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off -@@ -916,6 +924,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) +@@ -916,6 +924,8 @@ static void hw_failure_emergency_powerof */ pr_emerg("Hardware protection shutdown failed. 
Trying emergency restart\n"); emergency_restart(); @@ -6664,7 +34812,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, -@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char * { static atomic_t allow_proceed = ATOMIC_INIT(1); @@ -6679,7 +34827,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * Queue a backup emergency shutdown in the event of -@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char * */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); @@ -6688,10 +34836,9 @@ index 3c35445bf5ad3..80564ffafabff 100644 } EXPORT_SYMBOL_GPL(hw_protection_shutdown); -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index ee28253c9ac0c..2ce515d3e6f8d 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c +diff -rupN linux.orig/kernel/sched/core.c linux/kernel/sched/core.c +--- linux.orig/kernel/sched/core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/core.c 2022-12-04 10:40:26.720034034 -0500 @@ -1046,6 +1046,46 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } @@ -6755,7 +34902,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); -@@ -3251,6 +3293,70 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3251,6 +3293,70 @@ out: } #endif /* CONFIG_NUMA_BALANCING */ @@ -6826,7 +34973,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * wait_task_inactive - wait for a thread to unschedule. * -@@ -3269,7 +3375,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3269,7 +3375,7 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { @@ -6835,7 +34982,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 struct rq_flags rf; unsigned long ncsw; struct rq *rq; -@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct * is actually now running somewhere else! */ while (task_running(rq, p)) { @@ -6844,7 +34991,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 return 0; cpu_relax(); } -@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); @@ -6859,7 +35006,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 task_rq_unlock(rq, p, &rf); /* -@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct * running right now), it's preempted, and we should * yield - it could be a while. 
*/ @@ -6868,7 +35015,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); -@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags p->on_cpu = 0; #endif init_task_preempt_count(p); @@ -6878,7 +35025,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) +@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(u next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); @@ -6886,7 +35033,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_schedule_common(void) +@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_sche } while (need_resched()); } @@ -6917,7 +35064,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption -@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) +@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrac */ if (likely(!preemptible())) return; @@ -6926,7 +35073,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); -@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) +@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrac if (likely(!preemptible())) return; @@ -6936,7 +35083,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 do { /* * Because the function tracer can trace preempt_count_sub() -@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -6947,11 +35094,10 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * The idle tasks have their own, simple scheduling class: */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 914096c5b1ae1..3cb55e6ede337 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +diff -rupN linux.orig/kernel/sched/fair.c linux/kernel/sched/fair.c +--- linux.orig/kernel/sched/fair.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/fair.c 2022-12-04 10:40:26.720034034 -0500 +@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { @@ -6960,7 +35106,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. -@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq return; if (delta > ideal_runtime) @@ -6969,7 +35115,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static void -@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc * validating it and just reschedule. 
*/ if (queued) { @@ -6978,7 +35124,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } /* -@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(str * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) @@ -6987,7 +35133,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static __always_inline -@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq if (delta < 0) { if (task_current(rq, p)) @@ -6996,7 +35142,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } hrtick_start(rq, delta); -@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct return; preempt: @@ -7005,7 +35151,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved -@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_struct *p) +@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_s * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); @@ -7014,7 +35160,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } se->vruntime -= cfs_rq->min_vruntime; -@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct */ if (task_current(rq, p)) { if (p->prio > oldprio) @@ -7023,10 +35169,9 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } else check_preempt_curr(rq, p, 0); } -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..e13090e33f3c4 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h +diff -rupN linux.orig/kernel/sched/features.h linux/kernel/sched/features.h +--- linux.orig/kernel/sched/features.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/features.h 2022-12-04 10:40:26.720034034 -0500 @@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true) #ifdef CONFIG_PREEMPT_RT @@ -7037,11 +35182,10 @@ index ee7f23c76bd33..e13090e33f3c4 100644 #else /* -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e26688d387aeb..5b889de29e3c9 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_struct *p, int prio); +diff -rupN linux.orig/kernel/sched/sched.h linux/kernel/sched/sched.h +--- linux.orig/kernel/sched/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/sched.h 2022-12-04 10:40:26.724034024 -0500 +@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_st extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -7057,11 +35201,10 @@ index e26688d387aeb..5b889de29e3c9 100644 extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -diff --git a/kernel/signal.c b/kernel/signal.c -index 6f86fda5e432a..139b965e4fafc 100644 ---- a/kernel/signal.c -+++ b/kernel/signal.c -@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, +diff -rupN linux.orig/kernel/signal.c linux/kernel/signal.c +--- linux.orig/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/signal.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, in /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -7079,10 +35222,9 @@ index 6f86fda5e432a..139b965e4fafc 100644 freezable_schedule(); cgroup_leave_frozen(true); -diff --git a/kernel/softirq.c b/kernel/softirq.c -index c8a6913c067d9..ab1fe34326bab 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c +diff -rupN linux.orig/kernel/softirq.c linux/kernel/softirq.c +--- linux.orig/kernel/softirq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/softirq.c 2022-12-04 10:40:26.724034024 -0500 @@ -637,6 +637,24 @@ static inline void tick_irq_exit(void) #endif } @@ -7124,7 +35266,7 @@ index c8a6913c067d9..ab1fe34326bab 100644 tick_irq_exit(); } -@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq_threads = { +@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq .thread_comm = "ksoftirqd/%u", }; @@ -7196,11 +35338,10 @@ index c8a6913c067d9..ab1fe34326bab 100644 return 0; } early_initcall(spawn_ksoftirqd); -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 23af5eca11b14..b0b4e44dd0968 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) +diff -rupN linux.orig/kernel/time/hrtimer.c linux/kernel/time/hrtimer.c +--- linux.orig/kernel/time/hrtimer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/hrtimer.c 2022-12-04 10:40:26.724034024 -0500 +@@ -1805,7 +1805,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; @@ -7218,11 +35359,10 @@ index 23af5eca11b14..b0b4e44dd0968 100644 } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); -diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c -index b0e3c9205946f..133e4160ed54b 100644 ---- a/kernel/time/tick-sched.c -+++ b/kernel/time/tick-sched.c -@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +diff -rupN linux.orig/kernel/time/tick-sched.c linux/kernel/time/tick-sched.c +--- linux.orig/kernel/time/tick-sched.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/tick-sched.c 2022-12-04 10:40:26.724034024 -0500 +@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tic static inline bool local_timer_softirq_pending(void) { @@ -7231,10 +35371,9 @@ index b0e3c9205946f..133e4160ed54b 100644 } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index 717fcb9fb14aa..e6219da89933d 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c +diff -rupN linux.orig/kernel/time/timer.c linux/kernel/time/timer.c +--- linux.orig/kernel/time/timer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/timer.c 2022-12-04 10:40:26.724034024 -0500 @@ -1822,7 +1822,7 @@ static void run_local_timers(void) if (time_before(jiffies, base->next_expiry)) return; @@ -7244,11 +35383,10 @@ index 717fcb9fb14aa..e6219da89933d 100644 } /* -diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index cc65887b31bd9..1d01756752676 100644 ---- a/kernel/trace/trace.c -+++ b/kernel/trace/trace.c -@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +diff -rupN linux.orig/kernel/trace/trace.c linux/kernel/trace/trace.c +--- linux.orig/kernel/trace/trace.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(un if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; @@ -7270,7 +35408,7 @@ index cc65887b31bd9..1d01756752676 100644 (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } -@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct trace_array *tr) +@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct static void print_lat_help_header(struct seq_file *m) { @@ -7297,7 +35435,7 @@ index cc65887b31bd9..1d01756752676 100644 } static void print_event_info(struct array_buffer *buf, struct seq_file *m) -@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file +@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(s print_event_info(buf, m); @@ -7322,11 +35460,10 @@ index cc65887b31bd9..1d01756752676 100644 } void -diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c -index 0356cae0cf74e..585380a3db753 100644 ---- a/kernel/trace/trace_events.c -+++ b/kernel/trace/trace_events.c -@@ -193,6 +193,7 @@ static int trace_define_common_fields(void) +diff -rupN linux.orig/kernel/trace/trace_events.c linux/kernel/trace/trace_events.c +--- linux.orig/kernel/trace/trace_events.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_events.c 2022-12-04 10:40:26.724034024 -0500 +@@ -193,6 +193,7 @@ static int trace_define_common_fields(vo /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); @@ -7334,11 +35471,10 @@ index 0356cae0cf74e..585380a3db753 100644 return ret; } -diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c -index 67f47ea27921d..de58eaaf1ac7a 100644 ---- a/kernel/trace/trace_output.c -+++ b/kernel/trace/trace_output.c -@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +diff -rupN linux.orig/kernel/trace/trace_output.c linux/kernel/trace/trace_output.c +--- linux.orig/kernel/trace/trace_output.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_output.c 2022-12-04 10:40:26.724034024 -0500 +@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq { char hardsoft_irq; char need_resched; @@ -7346,7 +35482,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 char irqs_off; int hardirq; int softirq; -@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) { @@ -7374,7 +35510,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : -@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq softirq ? 's' : '.' 
; @@ -7397,11 +35533,10 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else -diff --git a/kernel/watchdog.c b/kernel/watchdog.c -index 8e61f21e7e33e..41596c415111b 100644 ---- a/kernel/watchdog.c -+++ b/kernel/watchdog.c -@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +diff -rupN linux.orig/kernel/watchdog.c linux/kernel/watchdog.c +--- linux.orig/kernel/watchdog.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog.c 2022-12-04 10:40:26.724034024 -0500 +@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_tim /* Start period for the next softlockup warning. */ update_report_ts(); @@ -7410,7 +35545,7 @@ index 8e61f21e7e33e..41596c415111b 100644 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); -@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_tim add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); @@ -7419,11 +35554,10 @@ index 8e61f21e7e33e..41596c415111b 100644 } return HRTIMER_RESTART; -diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c -index 247bf0b1582ca..701f35f0e2d44 100644 ---- a/kernel/watchdog_hld.c -+++ b/kernel/watchdog_hld.c -@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +diff -rupN linux.orig/kernel/watchdog_hld.c linux/kernel/watchdog_hld.c +--- linux.orig/kernel/watchdog_hld.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog_hld.c 2022-12-04 10:40:26.724034024 -0500 +@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(s if (__this_cpu_read(hard_watchdog_warn) == true) return; @@ -7432,7 +35566,7 @@ index 247bf0b1582ca..701f35f0e2d44 100644 pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); -@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(s if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -7441,10 +35575,28 @@ index 247bf0b1582ca..701f35f0e2d44 100644 __this_cpu_write(hard_watchdog_warn, true); return; } -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index cb131fad117cc..c65e69bf4eebb 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug +diff -rupN linux.orig/lib/flex_proportions.c linux/lib/flex_proportions.c +--- linux.orig/lib/flex_proportions.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/flex_proportions.c 2022-12-04 10:40:26.728034014 -0500 +@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_globa + */ + if (events <= 1) + return false; ++ preempt_disable_nested(); + write_seqcount_begin(&p->sequence); + if (periods < 64) + events -= events >> periods; +@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_globa + percpu_counter_add(&p->events, -events); + p->period += periods; + write_seqcount_end(&p->sequence); ++ preempt_enable_nested(); + + return true; + } +diff -rupN linux.orig/lib/Kconfig.debug linux/lib/Kconfig.debug +--- linux.orig/lib/Kconfig.debug 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/Kconfig.debug 2022-12-04 10:40:26.724034024 -0500 @@ -811,6 +811,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. 
@@ -7455,31 +35607,10 @@ index cb131fad117cc..c65e69bf4eebb 100644 config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL -diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c -index 05cccbcf1661a..83332fefa6f42 100644 ---- a/lib/flex_proportions.c -+++ b/lib/flex_proportions.c -@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - */ - if (events <= 1) - return false; -+ preempt_disable_nested(); - write_seqcount_begin(&p->sequence); - if (periods < 64) - events -= events >> periods; -@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - percpu_counter_add(&p->events, -events); - p->period += periods; - write_seqcount_end(&p->sequence); -+ preempt_enable_nested(); - - return true; - } -diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 3c1853a9d1c09..ffaba68e6a290 100644 ---- a/lib/vsprintf.c -+++ b/lib/vsprintf.c -@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_enable(char *str) +diff -rupN linux.orig/lib/vsprintf.c linux/lib/vsprintf.c +--- linux.orig/lib/vsprintf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/vsprintf.c 2022-12-04 10:40:26.728034014 -0500 +@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_e } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); @@ -7543,34 +35674,14 @@ index 3c1853a9d1c09..ffaba68e6a290 100644 #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); -diff --git a/localversion-rt b/localversion-rt -new file mode 100644 -index 0000000000000..08b3e75841adc ---- /dev/null -+++ b/localversion-rt +diff -rupN linux.orig/localversion-rt linux/localversion-rt +--- linux.orig/localversion-rt 1969-12-31 19:00:00.000000000 -0500 ++++ linux/localversion-rt 2022-12-04 10:40:26.728034014 -0500 @@ -0,0 +1 @@ +-rt14 -diff --git a/mm/Kconfig b/mm/Kconfig -index 0331f1461f81c..3897e924e40f2 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -579,6 +579,12 @@ config COMPACTION - it and then we would be really interested to hear about that at - linux-mm@kvack.org. - -+config COMPACT_UNEVICTABLE_DEFAULT -+ int -+ depends on COMPACTION -+ default 0 if PREEMPT_RT -+ default 1 -+ - # - # support for free page reporting - config PAGE_REPORTING -diff --git a/mm/compaction.c b/mm/compaction.c -index 640fa76228dd9..10561cb1aaad9 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c +diff -rupN linux.orig/mm/compaction.c linux/mm/compaction.c +--- linux.orig/mm/compaction.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/compaction.c 2022-12-04 10:40:26.728034014 -0500 @@ -1727,11 +1727,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. @@ -7584,10 +35695,25 @@ index 640fa76228dd9..10561cb1aaad9 100644 static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index b69979c9ced5c..d35b6fa560f0a 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c +diff -rupN linux.orig/mm/Kconfig linux/mm/Kconfig +--- linux.orig/mm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/Kconfig 2022-12-04 10:40:26.728034014 -0500 +@@ -579,6 +579,12 @@ config COMPACTION + it and then we would be really interested to hear about that at + linux-mm@kvack.org. 
+ ++config COMPACT_UNEVICTABLE_DEFAULT ++ int ++ depends on COMPACTION ++ default 0 if PREEMPT_RT ++ default 1 ++ + # + # support for free page reporting + config PAGE_REPORTING +diff -rupN linux.orig/mm/memcontrol.c linux/mm/memcontrol.c +--- linux.orig/mm/memcontrol.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/memcontrol.c 2022-12-04 10:40:26.728034014 -0500 @@ -597,25 +597,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) @@ -7618,7 +35744,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) -@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lru * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); @@ -7627,7 +35753,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: -@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lru WARN_ON_ONCE(!in_task()); break; default: @@ -7636,10 +35762,9 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } } -diff --git a/mm/slub.c b/mm/slub.c -index 4b98dff9be8e3..59173fa5901a0 100644 ---- a/mm/slub.c -+++ b/mm/slub.c +diff -rupN linux.orig/mm/slub.c linux/mm/slub.c +--- linux.orig/mm/slub.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/slub.c 2022-12-04 10:40:26.728034014 -0500 @@ -50,7 +50,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) @@ -7705,7 +35830,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif #ifdef CONFIG_SLUB_DEBUG -@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache * /* * Per slab locking using the pagelock */ @@ -7714,7 +35839,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -455,7 +463,7 @@ static __always_inline void __slab_lock(struct slab *slab) +@@ -455,7 +463,7 @@ static __always_inline void __slab_lock( bit_spin_lock(PG_locked, &page->flags); } @@ -7723,7 +35848,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -463,31 +471,19 @@ static __always_inline void __slab_unlock(struct slab *slab) +@@ -463,31 +471,19 @@ static __always_inline void __slab_unloc __bit_spin_unlock(PG_locked, &page->flags); } @@ -7760,7 +35885,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) -@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab +@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab } else #endif { @@ -7782,7 +35907,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } cpu_relax(); -@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(s unsigned long flags; local_irq_save(flags); @@ -7802,7 +35927,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_irq_restore(flags); } -@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(s #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; @@ -7842,7 +35967,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) -@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, +@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_chec } static noinline int alloc_debug_processing(struct kmem_cache *s, @@ -7862,7 +35987,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; -@@ -1390,63 +1356,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, +@@ -1390,63 +1356,6 @@ static inline int free_consistency_check return 1; } @@ -7948,7 +36073,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, -@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) @@ -7963,7 +36088,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 account_slab(slab, oo_order(oo), s, flags); -@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct set_freepointer(s, p, NULL); } @@ -7979,11 +36104,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 return slab; } -@@ -2107,6 +2011,75 @@ static inline void remove_partial(struct kmem_cache_node *n, - n->nr_partial--; +@@ -2108,6 +2012,75 @@ static inline void remove_partial(struct } -+/* + /* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move @@ -8052,10 +36176,11 @@ index 4b98dff9be8e3..59173fa5901a0 100644 + return object; +} + - /* ++/* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. 
-@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + * +@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kme if (!pfmemalloc_match(slab, gfpflags)) continue; @@ -8069,7 +36194,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 t = acquire_slab(s, n, slab, object == NULL); if (!t) break; -@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs { return atomic_long_read(&n->total_objects); } @@ -8179,7 +36304,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) -@@ -3041,36 +3124,52 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3041,36 +3124,52 @@ new_objects: return NULL; } @@ -8245,7 +36370,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 retry_load_slab: -@@ -3094,11 +3193,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3094,11 +3193,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; @@ -8257,7 +36382,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } /* -@@ -3202,14 +3296,8 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l +@@ -3202,14 +3296,8 @@ redo: object = c->freelist; slab = c->slab; @@ -8274,7 +36399,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { -@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, +@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cach if (kfence_free(head)) return; @@ -8287,7 +36412,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 do { if (unlikely(n)) { -@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free void *tail_obj = tail ? 
: head; struct kmem_cache_cpu *c; unsigned long tid; @@ -8295,7 +36420,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 redo: /* -@@ -3482,9 +3572,13 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3482,9 +3572,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); @@ -8312,7 +36437,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 set_freepointer(s, tail_obj, freelist); -@@ -3496,16 +3590,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3496,16 +3590,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } @@ -8331,7 +36456,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { -@@ -3520,11 +3606,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3520,11 +3606,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); @@ -8345,7 +36470,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, -@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc( slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); @@ -8353,7 +36478,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); -@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc( n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; @@ -8361,7 +36486,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); -@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, +@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kme { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); @@ -8390,7 +36515,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif } -@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; @@ -8398,7 +36523,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } -@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) @@ -8407,7 +36532,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slabs_node(s, node)) ret = 1; -@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_ca { void *p; void *addr = slab_address(slab); @@ -8421,7 +36546,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); -@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_ca if (!check_object(s, slab, p, val)) break; } @@ -8430,7 +36555,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 } static int validate_slab_node(struct kmem_cache *s, -@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kmem_cache *s, +@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kme { int ret = -EINVAL; @@ -8439,11 +36564,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 ret = validate_slab_cache(s); if (ret >= 0) ret = length; -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 90af9a8572f5a..7a2d73f152304 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +diff -rupN linux.orig/mm/vmstat.c linux/mm/vmstat.c +--- linux.orig/mm/vmstat.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/vmstat.c 2022-12-04 10:40:26.728034014 -0500 +@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone * * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ @@ -8453,7 +36577,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone * } __this_cpu_write(*p, x); @@ -8463,7 +36587,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_zone_page_state); -@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist } /* See __mod_node_page_state */ @@ -8473,7 +36597,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist } __this_cpu_write(*p, x); @@ -8483,7 +36607,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_node_page_state); -@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8493,7 +36617,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, __this_cpu_write(*p, -overstep); } @@ -8503,7 +36627,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8513,7 +36637,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data __this_cpu_write(*p, -overstep); } @@ -8523,7 +36647,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) -@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8533,7 +36657,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = 
__this_cpu_read(pcp->stat_threshold); -@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, __this_cpu_write(*p, overstep); } @@ -8543,7 +36667,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8553,7 +36677,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data __this_cpu_write(*p, overstep); } @@ -8563,11 +36687,10 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) -diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c -index 035812b0461cc..ecdb47712d956 100644 ---- a/net/8021q/vlan_dev.c -+++ b/net/8021q/vlan_dev.c -@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/net/8021q/vlan_dev.c linux/net/8021q/vlan_dev.c +--- linux.orig/net/8021q/vlan_dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/8021q/vlan_dev.c 2022-12-04 10:40:26.728034014 -0500 +@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { @@ -8583,11 +36706,10 @@ index 035812b0461cc..ecdb47712d956 100644 stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; -diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c -index db4f2641d1cd1..7e2a9fb5786c9 100644 ---- a/net/bridge/br_multicast.c -+++ b/net/bridge/br_multicast.c -@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br, +diff -rupN linux.orig/net/bridge/br_multicast.c linux/net/bridge/br_multicast.c +--- linux.orig/net/bridge/br_multicast.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_multicast.c 2022-12-04 10:40:26.728034014 -0500 +@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct unsigned int start; do { @@ -8599,11 +36721,10 @@ index db4f2641d1cd1..7e2a9fb5786c9 100644 mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); -diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c -index 6e53dc9914094..f2fc284abab38 100644 ---- a/net/bridge/br_vlan.c -+++ b/net/bridge/br_vlan.c -@@ -1378,12 +1378,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, +diff -rupN linux.orig/net/bridge/br_vlan.c linux/net/bridge/br_vlan.c +--- linux.orig/net/bridge/br_vlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_vlan.c 2022-12-04 10:40:26.728034014 -0500 +@@ -1389,12 +1389,12 @@ void br_vlan_get_stats(const struct net_ cpu_stats = per_cpu_ptr(v->stats, i); do { @@ -8618,11 +36739,2324 @@ index 6e53dc9914094..f2fc284abab38 100644 u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); -diff --git a/net/core/dev.c b/net/core/dev.c -index 56c8b0921c9fd..d96506980d2f2 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *data) +diff -rupN linux.orig/net/bridge/br_vlan.c.orig linux/net/bridge/br_vlan.c.orig +--- 
linux.orig/net/bridge/br_vlan.c.orig	1969-12-31 19:00:00.000000000 -0500
++++ linux/net/bridge/br_vlan.c.orig	2022-12-04 10:40:18.724054527 -0500
+@@ -0,0 +1,2310 @@
++// SPDX-License-Identifier: GPL-2.0-only
++#include <linux/kernel.h>
++#include <linux/netdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/slab.h>
++#include <net/switchdev.h>
++
++#include "br_private.h"
++#include "br_private_tunnel.h"
++
++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid);
++
++static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
++			      const void *ptr)
++{
++	const struct net_bridge_vlan *vle = ptr;
++	u16 vid = *(u16 *)arg->key;
++
++	return vle->vid != vid;
++}
++
++static const struct rhashtable_params br_vlan_rht_params = {
++	.head_offset = offsetof(struct net_bridge_vlan, vnode),
++	.key_offset = offsetof(struct net_bridge_vlan, vid),
++	.key_len = sizeof(u16),
++	.nelem_hint = 3,
++	.max_size = VLAN_N_VID,
++	.obj_cmpfn = br_vlan_cmp,
++	.automatic_shrinking = true,
++};
++
++static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
++{
++	return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
++}
++
++static void __vlan_add_pvid(struct net_bridge_vlan_group *vg,
++			    const struct net_bridge_vlan *v)
++{
++	if (vg->pvid == v->vid)
++		return;
++
++	smp_wmb();
++	br_vlan_set_pvid_state(vg, v->state);
++	vg->pvid = v->vid;
++}
++
++static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
++{
++	if (vg->pvid != vid)
++		return;
++
++	smp_wmb();
++	vg->pvid = 0;
++}
++
++/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v.
++ * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and
++ * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v.
++ */
++static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags,
++				bool commit)
++{
++	struct net_bridge_vlan_group *vg;
++	bool change;
++
++	if (br_vlan_is_master(v))
++		vg = br_vlan_group(v->br);
++	else
++		vg = nbp_vlan_group(v->port);
++
++	/* check if anything would be changed on commit */
++	change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) ||
++		 ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED);
++
++	if (!commit)
++		goto out;
++
++	if (flags & BRIDGE_VLAN_INFO_PVID)
++		__vlan_add_pvid(vg, v);
++	else
++		__vlan_delete_pvid(vg, v->vid);
++
++	if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
++		v->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
++	else
++		v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED;
++
++out:
++	return change;
++}
++
++static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags)
++{
++	return __vlan_flags_update(v, flags, false);
++}
++
++static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags)
++{
++	__vlan_flags_update(v, flags, true);
++}
++
++static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
++			  struct net_bridge_vlan *v, u16 flags,
++			  struct netlink_ext_ack *extack)
++{
++	int err;
++
++	/* Try switchdev op first. In case it is not supported, fallback to
++	 * 8021q add.
++ */ ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); ++ if (err == -EOPNOTSUPP) ++ return vlan_vid_add(dev, br->vlan_proto, v->vid); ++ v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; ++ return err; ++} ++ ++static void __vlan_add_list(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct list_head *headp, *hpos; ++ struct net_bridge_vlan *vent; ++ ++ if (br_vlan_is_master(v)) ++ vg = br_vlan_group(v->br); ++ else ++ vg = nbp_vlan_group(v->port); ++ ++ headp = &vg->vlan_list; ++ list_for_each_prev(hpos, headp) { ++ vent = list_entry(hpos, struct net_bridge_vlan, vlist); ++ if (v->vid >= vent->vid) ++ break; ++ } ++ list_add_rcu(&v->vlist, hpos); ++} ++ ++static void __vlan_del_list(struct net_bridge_vlan *v) ++{ ++ list_del_rcu(&v->vlist); ++} ++ ++static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, ++ const struct net_bridge_vlan *v) ++{ ++ int err; ++ ++ /* Try switchdev op first. In case it is not supported, fallback to ++ * 8021q del. ++ */ ++ err = br_switchdev_port_vlan_del(dev, v->vid); ++ if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) ++ vlan_vid_del(dev, br->vlan_proto, v->vid); ++ return err == -EOPNOTSUPP ? 0 : err; ++} ++ ++/* Returns a master vlan, if it didn't exist it gets created. In all cases ++ * a reference is taken to the master vlan before returning. ++ */ ++static struct net_bridge_vlan * ++br_vlan_get_master(struct net_bridge *br, u16 vid, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *masterv; ++ ++ vg = br_vlan_group(br); ++ masterv = br_vlan_find(vg, vid); ++ if (!masterv) { ++ bool changed; ++ ++ /* missing global ctx, create it now */ ++ if (br_vlan_add(br, vid, 0, &changed, extack)) ++ return NULL; ++ masterv = br_vlan_find(vg, vid); ++ if (WARN_ON(!masterv)) ++ return NULL; ++ refcount_set(&masterv->refcnt, 1); ++ return masterv; ++ } ++ refcount_inc(&masterv->refcnt); ++ ++ return masterv; ++} ++ ++static void br_master_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(!br_vlan_is_master(v)); ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_put_master(struct net_bridge_vlan *masterv) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ if (!br_vlan_is_master(masterv)) ++ return; ++ ++ vg = br_vlan_group(masterv->br); ++ if (refcount_dec_and_test(&masterv->refcnt)) { ++ rhashtable_remove_fast(&vg->vlan_hash, ++ &masterv->vnode, br_vlan_rht_params); ++ __vlan_del_list(masterv); ++ br_multicast_toggle_one_vlan(masterv, false); ++ br_multicast_ctx_deinit(&masterv->br_mcast_ctx); ++ call_rcu(&masterv->rcu, br_master_vlan_rcu_free); ++ } ++} ++ ++static void nbp_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(br_vlan_is_master(v)); ++ /* if we had per-port stats configured then free them here */ ++ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_init_state(struct net_bridge_vlan *v) ++{ ++ struct net_bridge *br; ++ ++ if (br_vlan_is_master(v)) ++ br = v->br; ++ else ++ br = v->port->br; ++ ++ if (br_opt_get(br, BROPT_MST_ENABLED)) { ++ br_mst_vlan_init_state(v); ++ return; ++ } ++ ++ v->state = BR_STATE_FORWARDING; ++ v->msti = 0; ++} ++ ++/* This is the shared VLAN add function which works for both ports and bridge ++ * devices. 
There are four possible calls to this function in terms of the ++ * vlan entry type: ++ * 1. vlan is being added on a port (no master flags, global entry exists) ++ * 2. vlan is being added on a bridge (both master and brentry flags) ++ * 3. vlan is being added on a port, but a global entry didn't exist which ++ * is being created right now (master flag set, brentry flag unset), the ++ * global entry is used for global per-vlan features, but not for filtering ++ * 4. same as 3 but with both master and brentry flags set so the entry ++ * will be used for filtering in both the port and the bridge ++ */ ++static int __vlan_add(struct net_bridge_vlan *v, u16 flags, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *masterv = NULL; ++ struct net_bridge_port *p = NULL; ++ struct net_bridge_vlan_group *vg; ++ struct net_device *dev; ++ struct net_bridge *br; ++ int err; ++ ++ if (br_vlan_is_master(v)) { ++ br = v->br; ++ dev = br->dev; ++ vg = br_vlan_group(br); ++ } else { ++ p = v->port; ++ br = p->br; ++ dev = p->dev; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (p) { ++ /* Add VLAN to the device filter if it is supported. ++ * This ensures tagged traffic enters the bridge when ++ * promiscuous mode is disabled by br_manage_promisc(). ++ */ ++ err = __vlan_vid_add(dev, br, v, flags, extack); ++ if (err) ++ goto out; ++ ++ /* need to work on the master vlan too */ ++ if (flags & BRIDGE_VLAN_INFO_MASTER) { ++ bool changed; ++ ++ err = br_vlan_add(br, v->vid, ++ flags | BRIDGE_VLAN_INFO_BRENTRY, ++ &changed, extack); ++ if (err) ++ goto out_filt; ++ ++ if (changed) ++ br_vlan_notify(br, NULL, v->vid, 0, ++ RTM_NEWVLAN); ++ } ++ ++ masterv = br_vlan_get_master(br, v->vid, extack); ++ if (!masterv) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->brvlan = masterv; ++ if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { ++ v->stats = ++ netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!v->stats) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; ++ } else { ++ v->stats = masterv->stats; ++ } ++ br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); ++ } else { ++ if (br_vlan_should_use(v)) { ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, ++ false, extack); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ } ++ br_multicast_ctx_init(br, v, &v->br_mcast_ctx); ++ v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; ++ } ++ ++ /* Add the dev mac and count the vlan only if it's usable */ ++ if (br_vlan_should_use(v)) { ++ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); ++ if (err) { ++ br_err(br, "failed insert local address into bridge forwarding table\n"); ++ goto out_filt; ++ } ++ vg->num_vlans++; ++ } ++ ++ /* set the state before publishing */ ++ br_vlan_init_state(v); ++ ++ err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ if (err) ++ goto out_fdb_insert; ++ ++ __vlan_add_list(v); ++ __vlan_flags_commit(v, flags); ++ br_multicast_toggle_one_vlan(v, true); ++ ++ if (p) ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++out: ++ return err; ++ ++out_fdb_insert: ++ if (br_vlan_should_use(v)) { ++ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); ++ vg->num_vlans--; ++ } ++ ++out_filt: ++ if (p) { ++ __vlan_vid_del(dev, br, v); ++ if (masterv) { ++ if (v->stats && masterv->stats != v->stats) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ ++ br_vlan_put_master(masterv); ++ v->brvlan = NULL; ++ } ++ } else { ++ br_switchdev_port_vlan_del(dev, v->vid); ++ } ++ ++ goto out; ++} ++ ++static int 
__vlan_del(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan *masterv = v; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0; ++ ++ if (br_vlan_is_master(v)) { ++ vg = br_vlan_group(v->br); ++ } else { ++ p = v->port; ++ vg = nbp_vlan_group(v->port); ++ masterv = v->brvlan; ++ } ++ ++ __vlan_delete_pvid(vg, v->vid); ++ if (p) { ++ err = __vlan_vid_del(p->dev, p->br, v); ++ if (err) ++ goto out; ++ } else { ++ err = br_switchdev_port_vlan_del(v->br->dev, v->vid); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ err = 0; ++ } ++ ++ if (br_vlan_should_use(v)) { ++ v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans--; ++ } ++ ++ if (masterv != v) { ++ vlan_tunnel_info_del(vg, v); ++ rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ __vlan_del_list(v); ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++ br_multicast_toggle_one_vlan(v, false); ++ br_multicast_port_ctx_deinit(&v->port_mcast_ctx); ++ call_rcu(&v->rcu, nbp_vlan_rcu_free); ++ } ++ ++ br_vlan_put_master(masterv); ++out: ++ return err; ++} ++ ++static void __vlan_group_free(struct net_bridge_vlan_group *vg) ++{ ++ WARN_ON(!list_empty(&vg->vlan_list)); ++ rhashtable_destroy(&vg->vlan_hash); ++ vlan_tunnel_deinit(vg); ++ kfree(vg); ++} ++ ++static void __vlan_flush(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg) ++{ ++ struct net_bridge_vlan *vlan, *tmp; ++ u16 v_start = 0, v_end = 0; ++ int err; ++ ++ __vlan_delete_pvid(vg, vg->pvid); ++ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { ++ /* take care of disjoint ranges */ ++ if (!v_start) { ++ v_start = vlan->vid; ++ } else if (vlan->vid - v_end != 1) { ++ /* found range end, notify and start next one */ ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++ v_start = vlan->vid; ++ } ++ v_end = vlan->vid; ++ ++ err = __vlan_del(vlan); ++ if (err) { ++ br_err(br, ++ "port %u(%s) failed to delete vlan %d: %pe\n", ++ (unsigned int) p->port_no, p->dev->name, ++ vlan->vid, ERR_PTR(err)); ++ } ++ } ++ ++ /* notify about the last/whole vlan range */ ++ if (v_start) ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++} ++ ++struct sk_buff *br_handle_vlan(struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ goto out; ++ ++ /* At this point, we know that the frame was filtered and contains ++ * a valid vlan id. If the vlan id has untagged flag set, ++ * send untagged; otherwise, send tagged. ++ */ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ /* Vlan entry must be configured at this point. The ++ * only exception is the bridge is set in promisc mode and the ++ * packet is destined for the bridge device. In this case ++ * pass the packet as is. 
++ */ ++ if (!v || !br_vlan_should_use(v)) { ++ if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { ++ goto out; ++ } else { ++ kfree_skb(skb); ++ return NULL; ++ } ++ } ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->tx_bytes, skb->len); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ /* If the skb will be sent using forwarding offload, the assumption is ++ * that the switchdev will inject the packet into hardware together ++ * with the bridge VLAN, so that it can be forwarded according to that ++ * VLAN. The switchdev should deal with popping the VLAN header in ++ * hardware on each egress port as appropriate. So only strip the VLAN ++ * header if forwarding offload is not being used. ++ */ ++ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && ++ !br_switchdev_frame_uses_tx_fwd_offload(skb)) ++ __vlan_hwaccel_clear_tag(skb); ++ ++ if (p && (p->flags & BR_VLAN_TUNNEL) && ++ br_handle_egress_vlan_tunnel(skb, v)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++out: ++ return skb; ++} ++ ++/* Called under RCU */ ++static bool __allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb, u16 *vid, ++ u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ bool tagged; ++ ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = true; ++ /* If vlan tx offload is disabled on bridge device and frame was ++ * sent from vlan device on the bridge device, it does not have ++ * HW accelerated vlan tag. ++ */ ++ if (unlikely(!skb_vlan_tag_present(skb) && ++ skb->protocol == br->vlan_proto)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ return false; ++ } ++ ++ if (!br_vlan_get_tag(skb, vid)) { ++ /* Tagged frame */ ++ if (skb->vlan_proto != br->vlan_proto) { ++ /* Protocol-mismatch, empty out vlan_tci for new tag */ ++ skb_push(skb, ETH_HLEN); ++ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (unlikely(!skb)) ++ return false; ++ ++ skb_pull(skb, ETH_HLEN); ++ skb_reset_mac_len(skb); ++ *vid = 0; ++ tagged = false; ++ } else { ++ tagged = true; ++ } ++ } else { ++ /* Untagged frame */ ++ tagged = false; ++ } ++ ++ if (!*vid) { ++ u16 pvid = br_get_pvid(vg); ++ ++ /* Frame had a tag with VID 0 or did not have a tag. ++ * See if pvid is set on this port. That tells us which ++ * vlan untagged or priority-tagged traffic belongs to. ++ */ ++ if (!pvid) ++ goto drop; ++ ++ /* PVID is set on this port. Any untagged or priority-tagged ++ * ingress frame is considered to belong to this vlan. ++ */ ++ *vid = pvid; ++ if (likely(!tagged)) ++ /* Untagged Frame. */ ++ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); ++ else ++ /* Priority-tagged Frame. ++ * At this point, we know that skb->vlan_tci VID ++ * field was 0. ++ * We update only VID field and preserve PCP field. 
++ */ ++ skb->vlan_tci |= pvid; ++ ++ /* if snooping and stats are disabled we can avoid the lookup */ ++ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && ++ !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_pvid_state(vg); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ return true; ++ } ++ } ++ v = br_vlan_find(vg, *vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto drop; ++ ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_state(v); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->rx_bytes, skb->len); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ *vlan = v; ++ ++ return true; ++ ++drop: ++ kfree_skb(skb); ++ return false; ++} ++ ++bool br_allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, struct sk_buff *skb, ++ u16 *vid, u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ /* If VLAN filtering is disabled on the bridge, all packets are ++ * permitted. ++ */ ++ *vlan = NULL; ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = false; ++ return true; ++ } ++ ++ return __allowed_ingress(br, vg, skb, vid, state, vlan); ++} ++ ++/* Called under RCU. */ ++bool br_allowed_egress(struct net_bridge_vlan_group *vg, ++ const struct sk_buff *skb) ++{ ++ const struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ return true; ++ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ if (v && br_vlan_should_use(v) && ++ br_vlan_state_allowed(br_vlan_get_state(v), false)) ++ return true; ++ ++ return false; ++} ++ ++/* Called under RCU */ ++bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge *br = p->br; ++ struct net_bridge_vlan *v; ++ ++ /* If filtering was disabled at input, let it pass. */ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return true; ++ ++ vg = nbp_vlan_group_rcu(p); ++ if (!vg || !vg->num_vlans) ++ return false; ++ ++ if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) ++ *vid = 0; ++ ++ if (!*vid) { ++ *vid = br_get_pvid(vg); ++ if (!*vid || ++ !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) ++ return false; ++ ++ return true; ++ } ++ ++ v = br_vlan_find(vg, *vid); ++ if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) ++ return true; ++ ++ return false; ++} ++ ++static int br_vlan_add_existing(struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct net_bridge_vlan *vlan, ++ u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ bool becomes_brentry = false; ++ int err; ++ ++ if (!br_vlan_is_brentry(vlan)) { ++ /* Trying to change flags of non-existent bridge vlan */ ++ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) ++ return -EINVAL; ++ ++ becomes_brentry = true; ++ } ++ ++ /* Master VLANs that aren't brentries weren't notified before, ++ * time to notify them now. 
++ */ ++ if (becomes_brentry || would_change) { ++ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, ++ would_change, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ if (becomes_brentry) { ++ /* It was only kept for port vlans, now make it real */ ++ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); ++ if (err) { ++ br_err(br, "failed to insert local address into bridge forwarding table\n"); ++ goto err_fdb_insert; ++ } ++ ++ refcount_inc(&vlan->refcnt); ++ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans++; ++ *changed = true; ++ br_multicast_toggle_one_vlan(vlan, true); ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ if (would_change) ++ *changed = true; ++ ++ return 0; ++ ++err_fdb_insert: ++ br_switchdev_port_vlan_del(br->dev, vlan->vid); ++ return err; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vg = br_vlan_group(br); ++ vlan = br_vlan_find(vg, vid); ++ if (vlan) ++ return br_vlan_add_existing(br, vg, vlan, flags, changed, ++ extack); ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!vlan->stats) { ++ kfree(vlan); ++ return -ENOMEM; ++ } ++ vlan->vid = vid; ++ vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; ++ vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; ++ vlan->br = br; ++ if (flags & BRIDGE_VLAN_INFO_BRENTRY) ++ refcount_set(&vlan->refcnt, 1); ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) { ++ free_percpu(vlan->stats); ++ kfree(vlan); ++ } else { ++ *changed = true; ++ } ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ */ ++int br_vlan_delete(struct net_bridge *br, u16 vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_is_brentry(v)) ++ return -ENOENT; ++ ++ br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); ++ br_fdb_delete_by_port(br, NULL, vid, 0); ++ ++ vlan_tunnel_info_del(vg, v); ++ ++ return __vlan_del(v); ++} ++ ++void br_vlan_flush(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ __vlan_flush(br, NULL, vg); ++ RCU_INIT_POINTER(br->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ if (!vg) ++ return NULL; ++ ++ return br_vlan_lookup(&vg->vlan_hash, vid); ++} ++ ++/* Must be protected by RTNL. */ ++static void recalculate_group_addr(struct net_bridge *br) ++{ ++ if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) ++ return; ++ ++ spin_lock_bh(&br->lock); ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) { ++ /* Bridge Group Address */ ++ br->group_addr[5] = 0x00; ++ } else { /* vlan_enabled && ETH_P_8021AD */ ++ /* Provider Bridge Group Address */ ++ br->group_addr[5] = 0x08; ++ } ++ spin_unlock_bh(&br->lock); ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_recalculate_fwd_mask(struct net_bridge *br) ++{ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) ++ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; ++ else /* vlan_enabled && ETH_P_8021AD */ ++ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ++ ~(1u << br->group_addr[5]); ++} ++ ++int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = val, ++ }; ++ int err; ++ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) ++ return 0; ++ ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) { ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); ++ return err; ++ } ++ ++ br_manage_promisc(br); ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { ++ br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); ++ br_multicast_toggle_vlan_snooping(br, false, NULL); ++ } ++ ++ return 0; ++} ++ ++bool br_vlan_enabled(const struct net_device *dev) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ return br_opt_get(br, BROPT_VLAN_ENABLED); ++} ++EXPORT_SYMBOL_GPL(br_vlan_enabled); ++ ++int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ *p_proto = ntohs(br->vlan_proto); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_proto); ++ ++int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_protocol = ntohs(proto), ++ }; ++ int err = 0; ++ struct net_bridge_port *p; ++ struct net_bridge_vlan *vlan; ++ struct net_bridge_vlan_group *vg; ++ __be16 oldproto = br->vlan_proto; ++ ++ if (br->vlan_proto == proto) ++ return 0; ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ ++ /* Add VLANs for the new proto to the device filter. */ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ err = vlan_vid_add(p->dev, proto, vlan->vid); ++ if (err) ++ goto err_filt; ++ } ++ } ++ ++ br->vlan_proto = proto; ++ ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ ++ /* Delete VLANs for the old proto from the device filter. 
*/ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, oldproto, vlan->vid); ++ } ++ } ++ ++ return 0; ++ ++err_filt: ++ attr.u.vlan_protocol = ntohs(oldproto); ++ switchdev_port_attr_set(br->dev, &attr, NULL); ++ ++ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ } ++ ++ return err; ++} ++ ++int br_vlan_set_proto(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ if (!eth_type_vlan(htons(val))) ++ return -EPROTONOSUPPORT; ++ ++ return __br_vlan_set_proto(br, htons(val), extack); ++} ++ ++int br_vlan_set_stats(struct net_bridge *br, unsigned long val) ++{ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) ++{ ++ struct net_bridge_port *p; ++ ++ /* allow to change the option if there are no port vlans configured */ ++ list_for_each_entry(p, &br->port_list, list) { ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ ++ if (vg->num_vlans) ++ return -EBUSY; ++ } ++ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ if (vid != vg->pvid) ++ return false; ++ ++ v = br_vlan_lookup(&vg->vlan_hash, vid); ++ if (v && br_vlan_should_use(v) && ++ (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return true; ++ ++ return false; ++} ++ ++static void br_vlan_disable_default_pvid(struct net_bridge *br) ++{ ++ struct net_bridge_port *p; ++ u16 pvid = br->default_pvid; ++ ++ /* Disable default_pvid on all ports where it is still ++ * configured. ++ */ ++ if (vlan_default_pvid(br_vlan_group(br), pvid)) { ++ if (!br_vlan_delete(br, pvid)) ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ if (vlan_default_pvid(nbp_vlan_group(p), pvid) && ++ !nbp_vlan_delete(p, pvid)) ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ br->default_pvid = 0; ++} ++ ++int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_bridge_vlan *pvent; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ unsigned long *changed; ++ bool vlchange; ++ u16 old_pvid; ++ int err = 0; ++ ++ if (!pvid) { ++ br_vlan_disable_default_pvid(br); ++ return 0; ++ } ++ ++ changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); ++ if (!changed) ++ return -ENOMEM; ++ ++ old_pvid = br->default_pvid; ++ ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. 
++ */ ++ vg = br_vlan_group(br); ++ pvent = br_vlan_find(vg, pvid); ++ if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && ++ (!pvent || !br_vlan_should_use(pvent))) { ++ err = br_vlan_add(br, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, extack); ++ if (err) ++ goto out; ++ ++ if (br_vlan_delete(br, old_pvid)) ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); ++ __set_bit(0, changed); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. ++ */ ++ vg = nbp_vlan_group(p); ++ if ((old_pvid && ++ !vlan_default_pvid(vg, old_pvid)) || ++ br_vlan_find(vg, pvid)) ++ continue; ++ ++ err = nbp_vlan_add(p, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, extack); ++ if (err) ++ goto err_port; ++ if (nbp_vlan_delete(p, old_pvid)) ++ br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); ++ __set_bit(p->port_no, changed); ++ } ++ ++ br->default_pvid = pvid; ++ ++out: ++ bitmap_free(changed); ++ return err; ++ ++err_port: ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ if (!test_bit(p->port_no, changed)) ++ continue; ++ ++ if (old_pvid) { ++ nbp_vlan_add(p, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, NULL); ++ br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); ++ } ++ nbp_vlan_delete(p, pvid); ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ if (test_bit(0, changed)) { ++ if (old_pvid) { ++ br_vlan_add(br, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, NULL); ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); ++ } ++ br_vlan_delete(br, pvid); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ goto out; ++} ++ ++int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ u16 pvid = val; ++ int err = 0; ++ ++ if (val >= VLAN_VID_MASK) ++ return -EINVAL; ++ ++ if (pvid == br->default_pvid) ++ goto out; ++ ++ /* Only allow default pvid change when filtering is disabled */ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ pr_info_once("Please disable vlan filtering to change default_pvid\n"); ++ err = -EPERM; ++ goto out; ++ } ++ err = __br_vlan_set_default_pvid(br, pvid, extack); ++out: ++ return err; ++} ++ ++int br_vlan_init(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(*vg), GFP_KERNEL); ++ if (!vg) ++ goto out; ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ br->vlan_proto = htons(ETH_P_8021Q); ++ br->default_pvid = 1; ++ rcu_assign_pointer(br->vlgrp, vg); ++ ++out: ++ return ret; ++ ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++ kfree(vg); ++ ++ goto out; ++} ++ ++int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = p->br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), ++ }; ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); 
++ if (!vg) ++ goto out; ++ ++ ret = switchdev_port_attr_set(p->dev, &attr, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ goto err_vlan_enabled; ++ ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ rcu_assign_pointer(p->vlgrp, vg); ++ if (p->br->default_pvid) { ++ bool changed; ++ ++ ret = nbp_vlan_add(p, p->br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &changed, extack); ++ if (ret) ++ goto err_vlan_add; ++ br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); ++ } ++out: ++ return ret; ++ ++err_vlan_add: ++ RCU_INIT_POINTER(p->vlgrp, NULL); ++ synchronize_rcu(); ++ vlan_tunnel_deinit(vg); ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++err_vlan_enabled: ++ kfree(vg); ++ ++ goto out; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, ++ bool *changed, struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vlan = br_vlan_find(nbp_vlan_group(port), vid); ++ if (vlan) { ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ ++ if (would_change) { ++ /* Pass the flags to the hardware bridge */ ++ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, ++ true, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ return ret; ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ *changed = would_change; ++ ++ return 0; ++ } ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->vid = vid; ++ vlan->port = port; ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) ++ kfree(vlan); ++ else ++ *changed = true; ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. 
++ */ ++int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ v = br_vlan_find(nbp_vlan_group(port), vid); ++ if (!v) ++ return -ENOENT; ++ br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); ++ br_fdb_delete_by_port(port->br, port, vid, 0); ++ ++ return __vlan_del(v); ++} ++ ++void nbp_vlan_flush(struct net_bridge_port *port) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = nbp_vlan_group(port); ++ __vlan_flush(port->br, port, vg); ++ RCU_INIT_POINTER(port->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++void br_vlan_get_stats(const struct net_bridge_vlan *v, ++ struct pcpu_sw_netstats *stats) ++{ ++ int i; ++ ++ memset(stats, 0, sizeof(*stats)); ++ for_each_possible_cpu(i) { ++ u64 rxpackets, rxbytes, txpackets, txbytes; ++ struct pcpu_sw_netstats *cpu_stats; ++ unsigned int start; ++ ++ cpu_stats = per_cpu_ptr(v->stats, i); ++ do { ++ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); ++ rxpackets = u64_stats_read(&cpu_stats->rx_packets); ++ rxbytes = u64_stats_read(&cpu_stats->rx_bytes); ++ txbytes = u64_stats_read(&cpu_stats->tx_bytes); ++ txpackets = u64_stats_read(&cpu_stats->tx_packets); ++ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); ++ ++ u64_stats_add(&stats->rx_packets, rxpackets); ++ u64_stats_add(&stats->rx_bytes, rxbytes); ++ u64_stats_add(&stats->tx_bytes, txbytes); ++ u64_stats_add(&stats->tx_packets, txpackets); ++ } ++} ++ ++int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid); ++ ++int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); ++ ++void br_vlan_fill_forward_path_pvid(struct net_bridge *br, ++ struct net_device_path_ctx *ctx, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ int idx = ctx->num_vlans - 1; ++ u16 vid; ++ ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return; ++ ++ vg = br_vlan_group(br); ++ ++ if (idx >= 0 && ++ ctx->vlan[idx].proto == br->vlan_proto) { ++ vid = ctx->vlan[idx].id; ++ } else { ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; ++ vid = br_get_pvid(vg); ++ } ++ ++ path->bridge.vlan_id = vid; ++ path->bridge.vlan_proto = br->vlan_proto; ++} ++ ++int br_vlan_fill_forward_path_mode(struct net_bridge *br, ++ struct net_bridge_port *dst, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return 0; ++ ++ vg = nbp_vlan_group_rcu(dst); ++ v = br_vlan_find(vg, path->bridge.vlan_id); ++ if (!v || !br_vlan_should_use(v)) ++ return -EINVAL; ++ ++ if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return 0; ++ ++ if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) ++ path->bridge.vlan_mode = 
DEV_PATH_BR_VLAN_KEEP; ++ else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; ++ else ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; ++ ++ return 0; ++} ++ ++int br_vlan_get_info(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info); ++ ++int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); ++ ++static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) ++{ ++ return is_vlan_dev(dev) && ++ !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); ++} ++ ++static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, ++ __always_unused struct netdev_nested_priv *priv) ++{ ++ return br_vlan_is_bind_vlan_dev(dev); ++} ++ ++static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) ++{ ++ int found; ++ ++ rcu_read_lock(); ++ found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, ++ NULL); ++ rcu_read_unlock(); ++ ++ return !!found; ++} ++ ++struct br_vlan_bind_walk_data { ++ u16 vid; ++ struct net_device *result; ++}; ++ ++static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_bind_walk_data *data = priv->data; ++ int found = 0; ++ ++ if (br_vlan_is_bind_vlan_dev(dev) && ++ vlan_dev_priv(dev)->vlan_id == data->vid) { ++ data->result = dev; ++ found = 1; ++ } ++ ++ return found; ++} ++ ++static struct net_device * ++br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) ++{ ++ struct br_vlan_bind_walk_data data = { ++ .vid = vid, ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, ++ &priv); ++ rcu_read_unlock(); ++ ++ return data.result; ++} ++ ++static bool br_vlan_is_dev_up(const struct net_device *dev) ++{ ++ return !!(dev->flags & IFF_UP) && netif_oper_up(dev); ++} ++ ++static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, ++ struct net_device *vlan_dev) ++{ ++ u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ bool has_carrier = false; ++ ++ if (!netif_carrier_ok(br->dev)) { ++ netif_carrier_off(vlan_dev); ++ return; ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { ++ has_carrier = 
true; ++ break; ++ } ++ } ++ ++ if (has_carrier) ++ netif_carrier_on(vlan_dev); ++ else ++ netif_carrier_off(vlan_dev); ++} ++ ++static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) ++{ ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ struct net_bridge_vlan *vlan; ++ struct net_device *vlan_dev; ++ ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, ++ vlan->vid); ++ if (vlan_dev) { ++ if (br_vlan_is_dev_up(p->dev)) { ++ if (netif_carrier_ok(p->br->dev)) ++ netif_carrier_on(vlan_dev); ++ } else { ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++ } ++ } ++ } ++} ++ ++static void br_vlan_upper_change(struct net_device *dev, ++ struct net_device *upper_dev, ++ bool linking) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ if (!br_vlan_is_bind_vlan_dev(upper_dev)) ++ return; ++ ++ if (linking) { ++ br_vlan_set_vlan_dev_state(br, upper_dev); ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); ++ } else { ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, ++ br_vlan_has_upper_bind_vlan_dev(dev)); ++ } ++} ++ ++struct br_vlan_link_state_walk_data { ++ struct net_bridge *br; ++}; ++ ++static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_link_state_walk_data *data = priv->data; ++ ++ if (br_vlan_is_bind_vlan_dev(vlan_dev)) ++ br_vlan_set_vlan_dev_state(data->br, vlan_dev); ++ ++ return 0; ++} ++ ++static void br_vlan_link_state_change(struct net_device *dev, ++ struct net_bridge *br) ++{ ++ struct br_vlan_link_state_walk_data data = { ++ .br = br ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, ++ &priv); ++ rcu_read_unlock(); ++} ++ ++/* Must be protected by RTNL. */ ++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) ++{ ++ struct net_device *vlan_dev; ++ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); ++ if (vlan_dev) ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++} ++ ++/* Must be protected by RTNL. */ ++int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) ++{ ++ struct netdev_notifier_changeupper_info *info; ++ struct net_bridge *br = netdev_priv(dev); ++ int vlcmd = 0, ret = 0; ++ bool changed = false; ++ ++ switch (event) { ++ case NETDEV_REGISTER: ++ ret = br_vlan_add(br, br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); ++ vlcmd = RTM_NEWVLAN; ++ break; ++ case NETDEV_UNREGISTER: ++ changed = !br_vlan_delete(br, br->default_pvid); ++ vlcmd = RTM_DELVLAN; ++ break; ++ case NETDEV_CHANGEUPPER: ++ info = ptr; ++ br_vlan_upper_change(dev, info->upper_dev, info->linking); ++ break; ++ ++ case NETDEV_CHANGE: ++ case NETDEV_UP: ++ if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) ++ break; ++ br_vlan_link_state_change(dev, br); ++ break; ++ } ++ if (changed) ++ br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) ++{ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ switch (event) { ++ case NETDEV_CHANGE: ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ br_vlan_set_all_vlan_dev_state(p); ++ break; ++ } ++} ++ ++static bool br_vlan_stats_fill(struct sk_buff *skb, ++ const struct net_bridge_vlan *v) ++{ ++ struct pcpu_sw_netstats stats; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); ++ if (!nest) ++ return false; ++ ++ br_vlan_get_stats(v, &stats); ++ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, ++ u64_stats_read(&stats.rx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, ++ u64_stats_read(&stats.rx_packets), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, ++ u64_stats_read(&stats.tx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, ++ u64_stats_read(&stats.tx_packets), ++ BRIDGE_VLANDB_STATS_PAD)) ++ goto out_err; ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++/* v_opts is used to dump the options which must be equal in the whole range */ ++static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, ++ const struct net_bridge_vlan *v_opts, ++ u16 flags, ++ bool dump_stats) ++{ ++ struct bridge_vlan_info info; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); ++ if (!nest) ++ return false; ++ ++ memset(&info, 0, sizeof(info)); ++ info.vid = vid; ++ if (flags & BRIDGE_VLAN_INFO_UNTAGGED) ++ info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; ++ if (flags & BRIDGE_VLAN_INFO_PVID) ++ info.flags |= BRIDGE_VLAN_INFO_PVID; ++ ++ if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) ++ goto out_err; ++ ++ if (vid_range && vid < vid_range && ++ !(flags & BRIDGE_VLAN_INFO_PVID) && ++ nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) ++ goto out_err; ++ ++ if (v_opts) { ++ if (!br_vlan_opts_fill(skb, v_opts)) ++ goto out_err; ++ ++ if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) ++ goto out_err; ++ } ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++static size_t rtnl_vlan_nlmsg_size(void) ++{ ++ return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) ++ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ ++ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ ++ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ ++ + br_vlan_opts_nl_size(); /* bridge vlan options */ ++} ++ ++void br_vlan_notify(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ u16 vid, u16 vid_range, ++ int cmd) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v = NULL; ++ struct br_vlan_msg *bvm; ++ struct nlmsghdr *nlh; ++ struct sk_buff *skb; ++ int err = -ENOBUFS; ++ struct net *net; ++ u16 flags = 0; ++ int ifindex; ++ ++ /* right now notifications are done only with rtnl held */ ++ ASSERT_RTNL(); ++ ++ if (p) { ++ ifindex = p->dev->ifindex; ++ vg = nbp_vlan_group(p); ++ net = dev_net(p->dev); ++ } else { ++ ifindex = br->dev->ifindex; ++ vg = br_vlan_group(br); ++ net = dev_net(br->dev); ++ } ++ ++ skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); ++ if (!skb) ++ goto out_err; ++ ++ err = -EMSGSIZE; ++ nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); ++ if (!nlh) ++ goto out_err; ++ bvm = nlmsg_data(nlh); ++ 
memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = AF_BRIDGE; ++ bvm->ifindex = ifindex; ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ /* need to find the vlan due to flags/options */ ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto out_kfree; ++ ++ flags = v->flags; ++ if (br_get_pvid(vg) == v->vid) ++ flags |= BRIDGE_VLAN_INFO_PVID; ++ break; ++ case RTM_DELVLAN: ++ break; ++ default: ++ goto out_kfree; ++ } ++ ++ if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false)) ++ goto out_err; ++ ++ nlmsg_end(skb, nlh); ++ rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); ++ return; ++ ++out_err: ++ rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); ++out_kfree: ++ kfree_skb(skb); ++} ++ ++/* check if v_curr can enter a range ending in range_end */ ++bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, ++ const struct net_bridge_vlan *range_end) ++{ ++ return v_curr->vid - range_end->vid == 1 && ++ range_end->flags == v_curr->flags && ++ br_vlan_opts_eq_range(v_curr, range_end); ++} ++ ++static int br_vlan_dump_dev(const struct net_device *dev, ++ struct sk_buff *skb, ++ struct netlink_callback *cb, ++ u32 dump_flags) ++{ ++ struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; ++ bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); ++ bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); ++ struct net_bridge_vlan_group *vg; ++ int idx = 0, s_idx = cb->args[1]; ++ struct nlmsghdr *nlh = NULL; ++ struct net_bridge_port *p; ++ struct br_vlan_msg *bvm; ++ struct net_bridge *br; ++ int err = 0; ++ u16 pvid; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) ++ return -EINVAL; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group_rcu(br); ++ p = NULL; ++ } else { ++ /* global options are dumped only for bridge devices */ ++ if (dump_global) ++ return 0; ++ ++ p = br_port_get_rcu(dev); ++ if (WARN_ON(!p)) ++ return -EINVAL; ++ vg = nbp_vlan_group_rcu(p); ++ br = p->br; ++ } ++ ++ if (!vg) ++ return 0; ++ ++ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); ++ if (!nlh) ++ return -EMSGSIZE; ++ bvm = nlmsg_data(nlh); ++ memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = PF_BRIDGE; ++ bvm->ifindex = dev->ifindex; ++ pvid = br_get_pvid(vg); ++ ++ /* idx must stay at range's beginning until it is filled in */ ++ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { ++ if (!dump_global && !br_vlan_should_use(v)) ++ continue; ++ if (idx < s_idx) { ++ idx++; ++ continue; ++ } ++ ++ if (!range_start) { ++ range_start = v; ++ range_end = v; ++ continue; ++ } ++ ++ if (dump_global) { ++ if (br_vlan_global_opts_can_enter_range(v, range_end)) ++ goto update_end; ++ if (!br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, ++ range_start)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } else if (dump_stats || v->vid == pvid || ++ !br_vlan_can_enter_range(v, range_end)) { ++ u16 vlan_flags = br_vlan_flags(range_start, pvid); ++ ++ if (!br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ vlan_flags, dump_stats)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } ++update_end: ++ range_end = v; ++ } ++ ++ /* err will be 0 and range_start will be set in 3 cases here: ++ * - first vlan 
(range_start == range_end) ++ * - last vlan (range_start == range_end, not in range) ++ * - last vlan range (range_start != range_end, in range) ++ */ ++ if (!err && range_start) { ++ if (dump_global && ++ !br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, range_start)) ++ err = -EMSGSIZE; ++ else if (!dump_global && ++ !br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ br_vlan_flags(range_start, pvid), ++ dump_stats)) ++ err = -EMSGSIZE; ++ } ++ ++ cb->args[1] = err ? idx : 0; ++ ++ nlmsg_end(skb, nlh); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { ++ [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, ++}; ++ ++static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; ++ int idx = 0, err = 0, s_idx = cb->args[0]; ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ u32 dump_flags = 0; ++ ++ err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, ++ br_vlan_db_dump_pol, cb->extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(cb->nlh); ++ if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) ++ dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); ++ ++ rcu_read_lock(); ++ if (bvm->ifindex) { ++ dev = dev_get_by_index_rcu(net, bvm->ifindex); ++ if (!dev) { ++ err = -ENODEV; ++ goto out_err; ++ } ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ /* if the dump completed without an error we return 0 here */ ++ if (err != -EMSGSIZE) ++ goto out_err; ++ } else { ++ for_each_netdev_rcu(net, dev) { ++ if (idx < s_idx) ++ goto skip; ++ ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ if (err == -EMSGSIZE) ++ break; ++skip: ++ idx++; ++ } ++ } ++ cb->args[0] = idx; ++ rcu_read_unlock(); ++ ++ return skb->len; ++ ++out_err: ++ rcu_read_unlock(); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { ++ [BRIDGE_VLANDB_ENTRY_INFO] = ++ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), ++ [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, ++ [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, ++ [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, ++ [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, ++}; ++ ++static int br_vlan_rtm_process_one(struct net_device *dev, ++ const struct nlattr *attr, ++ int cmd, struct netlink_ext_ack *extack) ++{ ++ struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; ++ struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; ++ bool changed = false, skip_processing = false; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0, cmdmap = 0; ++ struct net_bridge *br; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group(br); ++ } else { ++ p = br_port_get_rtnl(dev); ++ if (WARN_ON(!p)) ++ return -ENODEV; ++ br = p->br; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (WARN_ON(!vg)) ++ return -ENODEV; ++ ++ err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, ++ br_vlan_db_policy, extack); ++ if (err) ++ return err; ++ ++ if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { ++ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); ++ return -EINVAL; ++ } ++ memset(&vrange_end, 0, sizeof(vrange_end)); ++ ++ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); ++ if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | ++ BRIDGE_VLAN_INFO_RANGE_END)) { ++ NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed 
when using RTM vlan calls"); ++ return -EINVAL; ++ } ++ if (!br_vlan_valid_id(vinfo->vid, extack)) ++ return -EINVAL; ++ ++ if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { ++ vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); ++ /* validate user-provided flags without RANGE_BEGIN */ ++ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; ++ vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; ++ ++ /* vinfo_last is the range start, vinfo the range end */ ++ vinfo_last = vinfo; ++ vinfo = &vrange_end; ++ ++ if (!br_vlan_valid_id(vinfo->vid, extack) || ++ !br_vlan_valid_range(vinfo, vinfo_last, extack)) ++ return -EINVAL; ++ } ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ cmdmap = RTM_SETLINK; ++ skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); ++ break; ++ case RTM_DELVLAN: ++ cmdmap = RTM_DELLINK; ++ break; ++ } ++ ++ if (!skip_processing) { ++ struct bridge_vlan_info *tmp_last = vinfo_last; ++ ++ /* br_process_vlan_info may overwrite vinfo_last */ ++ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, ++ &changed, extack); ++ ++ /* notify first if anything changed */ ++ if (changed) ++ br_ifinfo_notify(cmdmap, br, p); ++ ++ if (err) ++ return err; ++ } ++ ++ /* deal with options */ ++ if (cmd == RTM_NEWVLAN) { ++ struct net_bridge_vlan *range_start, *range_end; ++ ++ if (vinfo_last) { ++ range_start = br_vlan_find(vg, vinfo_last->vid); ++ range_end = br_vlan_find(vg, vinfo->vid); ++ } else { ++ range_start = br_vlan_find(vg, vinfo->vid); ++ range_end = range_start; ++ } ++ ++ err = br_vlan_process_options(br, p, range_start, range_end, ++ tb, extack); ++ } ++ ++ return err; ++} ++ ++static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, ++ struct netlink_ext_ack *extack) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ struct nlattr *attr; ++ int err, vlans = 0; ++ int rem; ++ ++ /* this should validate the header and check for remaining bytes */ ++ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, ++ extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(nlh); ++ dev = __dev_get_by_index(net, bvm->ifindex); ++ if (!dev) ++ return -ENODEV; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { ++ NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); ++ return -EINVAL; ++ } ++ ++ nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { ++ switch (nla_type(attr)) { ++ case BRIDGE_VLANDB_ENTRY: ++ err = br_vlan_rtm_process_one(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ case BRIDGE_VLANDB_GLOBAL_OPTIONS: ++ err = br_vlan_rtm_process_global_options(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ default: ++ continue; ++ } ++ ++ vlans++; ++ if (err) ++ break; ++ } ++ if (!vlans) { ++ NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++void br_vlan_rtnl_init(void) ++{ ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, ++ br_vlan_rtm_dump, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, ++ br_vlan_rtm_process, NULL, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, ++ br_vlan_rtm_process, NULL, 0); ++} ++ ++void br_vlan_rtnl_uninit(void) ++{ ++ rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); ++} +diff -rupN linux.orig/net/core/dev.c linux/net/core/dev.c +--- linux.orig/net/core/dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/net/core/dev.c 2022-12-04 10:40:26.732034003 -0500 +@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *da #endif /* CONFIG_RPS */ @@ -8638,7 +39072,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 -@@ -6661,6 +6652,30 @@ static void skb_defer_free_flush(struct softnet_data *sd) +@@ -6665,6 +6656,30 @@ static void skb_defer_free_flush(struct } } @@ -8669,7 +39103,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); -@@ -10492,12 +10507,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, +@@ -10496,12 +10511,12 @@ void dev_fetch_sw_netstats(struct rtnl_l stats = per_cpu_ptr(netstats, cpu); do { @@ -8684,7 +39118,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; -@@ -11412,7 +11427,11 @@ static int __init net_dev_init(void) +@@ -11416,7 +11431,11 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif @@ -8696,11 +39130,11469 @@ index 56c8b0921c9fd..d96506980d2f2 100644 spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); -diff --git a/net/core/devlink.c b/net/core/devlink.c -index b50bcc18b8d9e..cfa6a099457ae 100644 ---- a/net/core/devlink.c -+++ b/net/core/devlink.c -@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, +diff -rupN linux.orig/net/core/dev.c.orig linux/net/core/dev.c.orig +--- linux.orig/net/core/dev.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/dev.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,11455 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Mark Evans, ++ * ++ * Additional Authors: ++ * Florian la Roche ++ * Alan Cox ++ * David Hinds ++ * Alexey Kuznetsov ++ * Adam Sulmicki ++ * Pekka Riikonen ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set ++ * to 2 if register_netdev gets called ++ * before net_dev_init & also removed a ++ * few lines of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant ++ * stunts to keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into ++ * drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before ++ * calling netif_rx. Saves a function ++ * call a packet. ++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close ++ * changes. ++ * Rudi Cilibrasi : Pass the right thing to ++ * set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to ++ * make it work out on a Sparc. 
++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait ++ * indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "net-sysfs.h" ++ ++ ++static DEFINE_SPINLOCK(ptype_lock); ++struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; ++struct list_head ptype_all __read_mostly; /* Taps */ ++ ++static int netif_rx_internal(struct sk_buff *skb); ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info); ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack); ++static struct napi_struct *napi_by_id(unsigned int napi_id); ++ ++/* ++ * The @dev_base_head list is protected by @dev_base_lock and the rtnl ++ * semaphore. ++ * ++ * Pure readers hold dev_base_lock for reading, or rcu_read_lock() ++ * ++ * Writers must hold the rtnl semaphore while they loop through the ++ * dev_base_head list, and hold dev_base_lock for writing when they do the ++ * actual updates. This allows pure readers to access the list even ++ * while a writer is preparing to update it. ++ * ++ * To put it another way, dev_base_lock is held for writing only to ++ * protect against pure readers; the rtnl semaphore provides the ++ * protection against other writers. ++ * ++ * See, for example usages, register_netdevice() and ++ * unregister_netdevice(), which must be called with the rtnl ++ * semaphore held. 
++ */ ++DEFINE_RWLOCK(dev_base_lock); ++EXPORT_SYMBOL(dev_base_lock); ++ ++static DEFINE_MUTEX(ifalias_mutex); ++ ++/* protects napi_hash addition/deletion and napi_gen_id */ ++static DEFINE_SPINLOCK(napi_hash_lock); ++ ++static unsigned int napi_gen_id = NR_CPUS; ++static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); ++ ++static DECLARE_RWSEM(devnet_rename_sem); ++ ++static inline void dev_base_seq_inc(struct net *net) ++{ ++ while (++net->dev_base_seq == 0) ++ ; ++} ++ ++static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) ++{ ++ unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); ++ ++ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; ++} ++ ++static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) ++{ ++ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; ++} ++ ++static inline void rps_lock_irqsave(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_save(*flags); ++} ++ ++static inline void rps_lock_irq_disable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); ++} ++ ++static inline void rps_unlock_irq_restore(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_restore(*flags); ++} ++ ++static inline void rps_unlock_irq_enable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); ++} ++ ++static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, ++ const char *name) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); ++ if (!name_node) ++ return NULL; ++ INIT_HLIST_NODE(&name_node->hlist); ++ name_node->dev = dev; ++ name_node->name = name; ++ return name_node; ++} ++ ++static struct netdev_name_node * ++netdev_name_node_head_alloc(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = netdev_name_node_alloc(dev, dev->name); ++ if (!name_node) ++ return NULL; ++ INIT_LIST_HEAD(&name_node->list); ++ return name_node; ++} ++ ++static void netdev_name_node_free(struct netdev_name_node *name_node) ++{ ++ kfree(name_node); ++} ++ ++static void netdev_name_node_add(struct net *net, ++ struct netdev_name_node *name_node) ++{ ++ hlist_add_head_rcu(&name_node->hlist, ++ dev_name_hash(net, name_node->name)); ++} ++ ++static void netdev_name_node_del(struct netdev_name_node *name_node) ++{ ++ hlist_del_rcu(&name_node->hlist); ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return NULL; ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry_rcu(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return 
NULL; ++} ++ ++bool netdev_name_in_use(struct net *net, const char *name) ++{ ++ return netdev_name_node_lookup(net, name); ++} ++EXPORT_SYMBOL(netdev_name_in_use); ++ ++int netdev_name_node_alt_create(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (name_node) ++ return -EEXIST; ++ name_node = netdev_name_node_alloc(dev, name); ++ if (!name_node) ++ return -ENOMEM; ++ netdev_name_node_add(net, name_node); ++ /* The node that holds dev->name acts as a head of per-device list. */ ++ list_add_tail(&name_node->list, &dev->name_node->list); ++ ++ return 0; ++} ++ ++static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) ++{ ++ list_del(&name_node->list); ++ netdev_name_node_del(name_node); ++ kfree(name_node->name); ++ netdev_name_node_free(name_node); ++} ++ ++int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (!name_node) ++ return -ENOENT; ++ /* lookup might have found our primary name or a name belonging ++ * to another device. ++ */ ++ if (name_node == dev->name_node || name_node->dev != dev) ++ return -EINVAL; ++ ++ __netdev_name_node_alt_destroy(name_node); ++ ++ return 0; ++} ++ ++static void netdev_name_node_alt_flush(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node, *tmp; ++ ++ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) ++ __netdev_name_node_alt_destroy(name_node); ++} ++ ++/* Device list insertion */ ++static void list_netdevice(struct net_device *dev) ++{ ++ struct net *net = dev_net(dev); ++ ++ ASSERT_RTNL(); ++ ++ write_lock(&dev_base_lock); ++ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); ++ netdev_name_node_add(net, dev->name_node); ++ hlist_add_head_rcu(&dev->index_hlist, ++ dev_index_hash(net, dev->ifindex)); ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(net); ++} ++ ++/* Device list removal ++ * caller must respect a RCU grace period before freeing/reusing dev ++ */ ++static void unlist_netdevice(struct net_device *dev, bool lock) ++{ ++ ASSERT_RTNL(); ++ ++ /* Unlink dev from the device chain */ ++ if (lock) ++ write_lock(&dev_base_lock); ++ list_del_rcu(&dev->dev_list); ++ netdev_name_node_del(dev->name_node); ++ hlist_del_rcu(&dev->index_hlist); ++ if (lock) ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(dev_net(dev)); ++} ++ ++/* ++ * Our notifier list ++ */ ++ ++static RAW_NOTIFIER_HEAD(netdev_chain); ++ ++/* ++ * Device drivers call our routines to queue packets here. We empty the ++ * queue in the local softnet handler. 
++ */ ++ ++DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); ++EXPORT_PER_CPU_SYMBOL(softnet_data); ++ ++#ifdef CONFIG_LOCKDEP ++/* ++ * register_netdevice() inits txq->_xmit_lock and sets lockdep class ++ * according to dev->type ++ */ ++static const unsigned short netdev_lock_type[] = { ++ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ++ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ++ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ++ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ++ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ++ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ++ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ++ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ++ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ++ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ++ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ++ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ++ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ++ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ++ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; ++ ++static const char *const netdev_lock_name[] = { ++ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", ++ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", ++ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", ++ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", ++ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", ++ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", ++ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", ++ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", ++ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", ++ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", ++ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", ++ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", ++ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", ++ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", ++ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; ++ ++static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++ ++static inline unsigned short netdev_lock_pos(unsigned short dev_type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) ++ if (netdev_lock_type[i] == dev_type) ++ return i; ++ /* the last key is used by default */ ++ return ARRAY_SIZE(netdev_lock_type) - 1; ++} ++ ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev_type); ++ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], ++ netdev_lock_name[i]); ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev->type); ++ lockdep_set_class_and_name(&dev->addr_list_lock, ++ &netdev_addr_lock_key[i], ++ netdev_lock_name[i]); ++} ++#else ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++} ++#endif ++ ++/******************************************************************************* ++ * ++ * Protocol management and registration routines ++ * ++ 
*******************************************************************************/ ++ ++ ++/* ++ * Add a protocol ID to the list. Now that the input handler is ++ * smarter we can dispense with all the messy stuff that used to be ++ * here. ++ * ++ * BEWARE!!! Protocol handlers, mangling input packets, ++ * MUST BE last in hash buckets and checking protocol handlers ++ * MUST start from promiscuous ptype_all chain in net_bh. ++ * It is true now, do not change it. ++ * Explanation follows: if protocol handler, mangling packet, will ++ * be the first on list, it is not able to sense, that packet ++ * is cloned and should be copied-on-write, so that it will ++ * change it and subsequent readers will get broken packet. ++ * --ANK (980803) ++ */ ++ ++static inline struct list_head *ptype_head(const struct packet_type *pt) ++{ ++ if (pt->type == htons(ETH_P_ALL)) ++ return pt->dev ? &pt->dev->ptype_all : &ptype_all; ++ else ++ return pt->dev ? &pt->dev->ptype_specific : ++ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; ++} ++ ++/** ++ * dev_add_pack - add packet handler ++ * @pt: packet type declaration ++ * ++ * Add a protocol handler to the networking stack. The passed &packet_type ++ * is linked into kernel lists and may not be freed until it has been ++ * removed from the kernel lists. ++ * ++ * This call does not sleep therefore it can not ++ * guarantee all CPU's that are in middle of receiving packets ++ * will see the new packet type (until the next received packet). ++ */ ++ ++void dev_add_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ ++ spin_lock(&ptype_lock); ++ list_add_rcu(&pt->list, head); ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(dev_add_pack); ++ ++/** ++ * __dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * The packet type might still be in use by receivers ++ * and must not be freed until after all the CPU's have gone ++ * through a quiescent state. ++ */ ++void __dev_remove_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ struct packet_type *pt1; ++ ++ spin_lock(&ptype_lock); ++ ++ list_for_each_entry(pt1, head, list) { ++ if (pt == pt1) { ++ list_del_rcu(&pt->list); ++ goto out; ++ } ++ } ++ ++ pr_warn("dev_remove_pack: %p not found\n", pt); ++out: ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(__dev_remove_pack); ++ ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * This call sleeps to guarantee that no CPU is looking at the packet ++ * type after return. 
++ */ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ __dev_remove_pack(pt); ++ ++ synchronize_net(); ++} ++EXPORT_SYMBOL(dev_remove_pack); ++ ++ ++/******************************************************************************* ++ * ++ * Device Interface Subroutines ++ * ++ *******************************************************************************/ ++ ++/** ++ * dev_get_iflink - get 'iflink' value of a interface ++ * @dev: targeted interface ++ * ++ * Indicates the ifindex the interface is linked to. ++ * Physical interfaces have the same 'ifindex' and 'iflink' values. ++ */ ++ ++int dev_get_iflink(const struct net_device *dev) ++{ ++ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) ++ return dev->netdev_ops->ndo_get_iflink(dev); ++ ++ return dev->ifindex; ++} ++EXPORT_SYMBOL(dev_get_iflink); ++ ++/** ++ * dev_fill_metadata_dst - Retrieve tunnel egress information. ++ * @dev: targeted interface ++ * @skb: The packet. ++ * ++ * For better visibility of tunnel traffic OVS needs to retrieve ++ * egress tunnel information for a packet. Following API allows ++ * user to get this info. ++ */ ++int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) ++{ ++ struct ip_tunnel_info *info; ++ ++ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) ++ return -EINVAL; ++ ++ info = skb_tunnel_info_unclone(skb); ++ if (!info) ++ return -ENOMEM; ++ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) ++ return -EINVAL; ++ ++ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); ++} ++EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); ++ ++static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) ++{ ++ int k = stack->num_paths++; ++ ++ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) ++ return NULL; ++ ++ return &stack->path[k]; ++} ++ ++int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, ++ struct net_device_path_stack *stack) ++{ ++ const struct net_device *last_dev; ++ struct net_device_path_ctx ctx = { ++ .dev = dev, ++ }; ++ struct net_device_path *path; ++ int ret = 0; ++ ++ memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); ++ stack->num_paths = 0; ++ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { ++ last_dev = ctx.dev; ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ ++ memset(path, 0, sizeof(struct net_device_path)); ++ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); ++ if (ret < 0) ++ return -1; ++ ++ if (WARN_ON_ONCE(last_dev == ctx.dev)) ++ return -1; ++ } ++ ++ if (!ctx.dev) ++ return ret; ++ ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ path->type = DEV_PATH_ETHERNET; ++ path->dev = ctx.dev; ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(dev_fill_forward_path); ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++struct net_device *__dev_get_by_name(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup(net, name); ++ return node_name ? 
node_name->dev : NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_name); ++ ++/** ++ * dev_get_by_name_rcu - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. ++ * If the name is found a pointer to the device is returned. ++ * If the name is not found then %NULL is returned. ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup_rcu(net, name); ++ return node_name ? node_name->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_name_rcu); ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. ++ */ ++ ++struct net_device *dev_get_by_name(struct net *net, const char *name) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_name_rcu(net, name); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_name); ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_index); ++ ++/** ++ * dev_get_by_index_rcu - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry_rcu(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_get_by_index_rcu); ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. 
++ */ ++ ++struct net_device *dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_index_rcu(net, ifindex); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_index); ++ ++/** ++ * dev_get_by_napi_id - find a device by napi_id ++ * @napi_id: ID of the NAPI struct ++ * ++ * Search for an interface by NAPI ID. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not had ++ * its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_napi_id(unsigned int napi_id) ++{ ++ struct napi_struct *napi; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ if (napi_id < MIN_NAPI_ID) ++ return NULL; ++ ++ napi = napi_by_id(napi_id); ++ ++ return napi ? napi->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_napi_id); ++ ++/** ++ * netdev_get_name - get a netdevice name, knowing its ifindex. ++ * @net: network namespace ++ * @name: a pointer to the buffer where the name will be stored. ++ * @ifindex: the ifindex of the interface to get the name from. ++ */ ++int netdev_get_name(struct net *net, char *name, int ifindex) ++{ ++ struct net_device *dev; ++ int ret; ++ ++ down_read(&devnet_rename_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_index_rcu(net, ifindex); ++ if (!dev) { ++ ret = -ENODEV; ++ goto out; ++ } ++ ++ strcpy(name, dev->name); ++ ++ ret = 0; ++out: ++ rcu_read_unlock(); ++ up_read(&devnet_rename_sem); ++ return ret; ++} ++ ++/** ++ * dev_getbyhwaddr_rcu - find a device by its hardware address ++ * @net: the applicable net namespace ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. ++ * The caller must hold RCU or RTNL. ++ * The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ */ ++ ++struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, ++ const char *ha) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type && ++ !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_getbyhwaddr_rcu); ++ ++struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) ++{ ++ struct net_device *dev, *ret = NULL; ++ ++ rcu_read_lock(); ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type) { ++ dev_hold(dev); ++ ret = dev; ++ break; ++ } ++ rcu_read_unlock(); ++ return ret; ++} ++EXPORT_SYMBOL(dev_getfirstbyhwtype); ++ ++/** ++ * __dev_get_by_flags - find any device with given flags ++ * @net: the applicable net namespace ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. Must be called inside ++ * rtnl_lock(), and result refcount is unchanged. 
++ */ ++ ++struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, ++ unsigned short mask) ++{ ++ struct net_device *dev, *ret; ++ ++ ASSERT_RTNL(); ++ ++ ret = NULL; ++ for_each_netdev(net, dev) { ++ if (((dev->flags ^ if_flags) & mask) == 0) { ++ ret = dev; ++ break; ++ } ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__dev_get_by_flags); ++ ++/** ++ * dev_valid_name - check if name is okay for network device ++ * @name: name string ++ * ++ * Network device names need to be valid file names to ++ * allow sysfs to work. We also disallow any kind of ++ * whitespace. ++ */ ++bool dev_valid_name(const char *name) ++{ ++ if (*name == '\0') ++ return false; ++ if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) ++ return false; ++ if (!strcmp(name, ".") || !strcmp(name, "..")) ++ return false; ++ ++ while (*name) { ++ if (*name == '/' || *name == ':' || isspace(*name)) ++ return false; ++ name++; ++ } ++ return true; ++} ++EXPORT_SYMBOL(dev_valid_name); ++ ++/** ++ * __dev_alloc_name - allocate a name for a device ++ * @net: network namespace to allocate the device name in ++ * @name: name format string ++ * @buf: scratch buffer and result name string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++static int __dev_alloc_name(struct net *net, const char *name, char *buf) ++{ ++ int i = 0; ++ const char *p; ++ const int max_netdevices = 8*PAGE_SIZE; ++ unsigned long *inuse; ++ struct net_device *d; ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ p = strchr(name, '%'); ++ if (p) { ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters. ++ */ ++ if (p[1] != 'd' || strchr(p + 2, '%')) ++ return -EINVAL; ++ ++ /* Use one page as a bit array of possible slots */ ++ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); ++ if (!inuse) ++ return -ENOMEM; ++ ++ for_each_netdev(net, d) { ++ struct netdev_name_node *name_node; ++ list_for_each_entry(name_node, &d->name_node->list, list) { ++ if (!sscanf(name_node->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, name_node->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ if (!sscanf(d->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, d->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ ++ i = find_first_zero_bit(inuse, max_netdevices); ++ free_page((unsigned long) inuse); ++ } ++ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!netdev_name_in_use(net, buf)) ++ return i; ++ ++ /* It is possible to run out of possible slots ++ * when the name is long and there isn't enough space left ++ * for the digits, or if all bits are used. 
++ */ ++ return -ENFILE; ++} ++ ++static int dev_alloc_name_ns(struct net *net, ++ struct net_device *dev, ++ const char *name) ++{ ++ char buf[IFNAMSIZ]; ++ int ret; ++ ++ BUG_ON(!net); ++ ret = __dev_alloc_name(net, name, buf); ++ if (ret >= 0) ++ strlcpy(dev->name, buf, IFNAMSIZ); ++ return ret; ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ return dev_alloc_name_ns(dev_net(dev), dev, name); ++} ++EXPORT_SYMBOL(dev_alloc_name); ++ ++static int dev_get_valid_name(struct net *net, struct net_device *dev, ++ const char *name) ++{ ++ BUG_ON(!net); ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ if (strchr(name, '%')) ++ return dev_alloc_name_ns(net, dev, name); ++ else if (netdev_name_in_use(net, name)) ++ return -EEXIST; ++ else if (dev->name != name) ++ strlcpy(dev->name, name, IFNAMSIZ); ++ ++ return 0; ++} ++ ++/** ++ * dev_change_name - change name of a device ++ * @dev: device ++ * @newname: name (or format string) must be at least IFNAMSIZ ++ * ++ * Change name of a device, can pass format strings "eth%d". ++ * for wildcarding. ++ */ ++int dev_change_name(struct net_device *dev, const char *newname) ++{ ++ unsigned char old_assign_type; ++ char oldname[IFNAMSIZ]; ++ int err = 0; ++ int ret; ++ struct net *net; ++ ++ ASSERT_RTNL(); ++ BUG_ON(!dev_net(dev)); ++ ++ net = dev_net(dev); ++ ++ /* Some auto-enslaved devices e.g. failover slaves are ++ * special, as userspace might rename the device after ++ * the interface had been brought up and running since ++ * the point kernel initiated auto-enslavement. Allow ++ * live name change even when these slave devices are ++ * up and running. ++ * ++ * Typically, users of these auto-enslaving devices ++ * don't actually care about slave name change, as ++ * they are supposed to operate on master interface ++ * directly. 
++ */ ++ if (dev->flags & IFF_UP && ++ likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) ++ return -EBUSY; ++ ++ down_write(&devnet_rename_sem); ++ ++ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { ++ up_write(&devnet_rename_sem); ++ return 0; ++ } ++ ++ memcpy(oldname, dev->name, IFNAMSIZ); ++ ++ err = dev_get_valid_name(net, dev, newname); ++ if (err < 0) { ++ up_write(&devnet_rename_sem); ++ return err; ++ } ++ ++ if (oldname[0] && !strchr(oldname, '%')) ++ netdev_info(dev, "renamed from %s\n", oldname); ++ ++ old_assign_type = dev->name_assign_type; ++ dev->name_assign_type = NET_NAME_RENAMED; ++ ++rollback: ++ ret = device_rename(&dev->dev, dev->name); ++ if (ret) { ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ up_write(&devnet_rename_sem); ++ return ret; ++ } ++ ++ up_write(&devnet_rename_sem); ++ ++ netdev_adjacent_rename_links(dev, oldname); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_del(dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ synchronize_rcu(); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_add(net, dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ++ ret = notifier_to_errno(ret); ++ ++ if (ret) { ++ /* err >= 0 after dev_alloc_name() or stores the first errno */ ++ if (err >= 0) { ++ err = ret; ++ down_write(&devnet_rename_sem); ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ memcpy(oldname, newname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ old_assign_type = NET_NAME_RENAMED; ++ goto rollback; ++ } else { ++ netdev_err(dev, "name change rollback failed: %d\n", ++ ret); ++ } ++ } ++ ++ return err; ++} ++ ++/** ++ * dev_set_alias - change ifalias of a device ++ * @dev: device ++ * @alias: name up to IFALIASZ ++ * @len: limit of bytes to copy from info ++ * ++ * Set ifalias for a device, ++ */ ++int dev_set_alias(struct net_device *dev, const char *alias, size_t len) ++{ ++ struct dev_ifalias *new_alias = NULL; ++ ++ if (len >= IFALIASZ) ++ return -EINVAL; ++ ++ if (len) { ++ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); ++ if (!new_alias) ++ return -ENOMEM; ++ ++ memcpy(new_alias->ifalias, alias, len); ++ new_alias->ifalias[len] = 0; ++ } ++ ++ mutex_lock(&ifalias_mutex); ++ new_alias = rcu_replace_pointer(dev->ifalias, new_alias, ++ mutex_is_locked(&ifalias_mutex)); ++ mutex_unlock(&ifalias_mutex); ++ ++ if (new_alias) ++ kfree_rcu(new_alias, rcuhead); ++ ++ return len; ++} ++EXPORT_SYMBOL(dev_set_alias); ++ ++/** ++ * dev_get_alias - get ifalias of a device ++ * @dev: device ++ * @name: buffer to store name of ifalias ++ * @len: size of buffer ++ * ++ * get ifalias for a device. Caller must make sure dev cannot go ++ * away, e.g. rcu read lock or own a reference count to device. ++ */ ++int dev_get_alias(const struct net_device *dev, char *name, size_t len) ++{ ++ const struct dev_ifalias *alias; ++ int ret = 0; ++ ++ rcu_read_lock(); ++ alias = rcu_dereference(dev->ifalias); ++ if (alias) ++ ret = snprintf(name, len, "%s", alias->ifalias); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/** ++ * netdev_features_change - device changes features ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed features. 
++ */ ++void netdev_features_change(struct net_device *dev) ++{ ++ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); ++} ++EXPORT_SYMBOL(netdev_features_change); ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ struct netdev_notifier_change_info change_info = { ++ .info.dev = dev, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, ++ &change_info.info); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); ++ } ++} ++EXPORT_SYMBOL(netdev_state_change); ++ ++/** ++ * __netdev_notify_peers - notify network peers about existence of @dev, ++ * to be called when rtnl lock is already held. ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void __netdev_notify_peers(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); ++ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); ++} ++EXPORT_SYMBOL(__netdev_notify_peers); ++ ++/** ++ * netdev_notify_peers - notify network peers about existence of @dev ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void netdev_notify_peers(struct net_device *dev) ++{ ++ rtnl_lock(); ++ __netdev_notify_peers(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(netdev_notify_peers); ++ ++static int napi_threaded_poll(void *data); ++ ++static int napi_kthread_create(struct napi_struct *n) ++{ ++ int err = 0; ++ ++ /* Create and wake up the kthread once to put it in ++ * TASK_INTERRUPTIBLE mode to avoid the blocked task ++ * warning and work with loadavg. ++ */ ++ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", ++ n->dev->name, n->napi_id); ++ if (IS_ERR(n->thread)) { ++ err = PTR_ERR(n->thread); ++ pr_err("kthread_run failed with err %d\n", err); ++ n->thread = NULL; ++ } ++ ++ return err; ++} ++ ++static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int ret; ++ ++ ASSERT_RTNL(); ++ dev_addr_check(dev); ++ ++ if (!netif_device_present(dev)) { ++ /* may be detached because parent is runtime-suspended */ ++ if (dev->dev.parent) ++ pm_runtime_resume(dev->dev.parent); ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ } ++ ++ /* Block netpoll from trying to do any rx path servicing. 
++ * If we don't do this there is a chance ndo_poll_controller ++ * or ndo_poll may be running while we open the device ++ */ ++ netpoll_poll_disable(dev); ++ ++ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ if (ops->ndo_validate_addr) ++ ret = ops->ndo_validate_addr(dev); ++ ++ if (!ret && ops->ndo_open) ++ ret = ops->ndo_open(dev); ++ ++ netpoll_poll_enable(dev); ++ ++ if (ret) ++ clear_bit(__LINK_STATE_START, &dev->state); ++ else { ++ dev->flags |= IFF_UP; ++ dev_set_rx_mode(dev); ++ dev_activate(dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ } ++ ++ return ret; ++} ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * @extack: netlink extended ack ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ if (dev->flags & IFF_UP) ++ return 0; ++ ++ ret = __dev_open(dev, extack); ++ if (ret < 0) ++ return ret; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ ++ return ret; ++} ++EXPORT_SYMBOL(dev_open); ++ ++static void __dev_close_many(struct list_head *head) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ might_sleep(); ++ ++ list_for_each_entry(dev, head, close_list) { ++ /* Temporarily disable netpoll until the interface is down */ ++ netpoll_poll_disable(dev); ++ ++ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, it ++ * can be even on different cpu. So just clear netif_running(). ++ * ++ * dev->stop() will invoke napi_disable() on all of it's ++ * napi_struct instances on this device. ++ */ ++ smp_mb__after_atomic(); /* Commit netif_running(). */ ++ } ++ ++ dev_deactivate_many(head); ++ ++ list_for_each_entry(dev, head, close_list) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ if (ops->ndo_stop) ++ ops->ndo_stop(dev); ++ ++ dev->flags &= ~IFF_UP; ++ netpoll_poll_enable(dev); ++ } ++} ++ ++static void __dev_close(struct net_device *dev) ++{ ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ __dev_close_many(&single); ++ list_del(&single); ++} ++ ++void dev_close_many(struct list_head *head, bool unlink) ++{ ++ struct net_device *dev, *tmp; ++ ++ /* Remove the devices that don't need to be closed */ ++ list_for_each_entry_safe(dev, tmp, head, close_list) ++ if (!(dev->flags & IFF_UP)) ++ list_del_init(&dev->close_list); ++ ++ __dev_close_many(head); ++ ++ list_for_each_entry_safe(dev, tmp, head, close_list) { ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ if (unlink) ++ list_del_init(&dev->close_list); ++ } ++} ++EXPORT_SYMBOL(dev_close_many); ++ ++/** ++ * dev_close - shutdown an interface. 
++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++void dev_close(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ dev_close_many(&single, true); ++ list_del(&single); ++ } ++} ++EXPORT_SYMBOL(dev_close); ++ ++ ++/** ++ * dev_disable_lro - disable Large Receive Offload on a device ++ * @dev: device ++ * ++ * Disable Large Receive Offload (LRO) on a net device. Must be ++ * called under RTNL. This is needed if received packets may be ++ * forwarded to another interface. ++ */ ++void dev_disable_lro(struct net_device *dev) ++{ ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ ++ dev->wanted_features &= ~NETIF_F_LRO; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_LRO)) ++ netdev_WARN(dev, "failed to disable LRO!\n"); ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) ++ dev_disable_lro(lower_dev); ++} ++EXPORT_SYMBOL(dev_disable_lro); ++ ++/** ++ * dev_disable_gro_hw - disable HW Generic Receive Offload on a device ++ * @dev: device ++ * ++ * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be ++ * called under RTNL. This is needed if Generic XDP is installed on ++ * the device. ++ */ ++static void dev_disable_gro_hw(struct net_device *dev) ++{ ++ dev->wanted_features &= ~NETIF_F_GRO_HW; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_GRO_HW)) ++ netdev_WARN(dev, "failed to disable GRO_HW!\n"); ++} ++ ++const char *netdev_cmd_to_name(enum netdev_cmd cmd) ++{ ++#define N(val) \ ++ case NETDEV_##val: \ ++ return "NETDEV_" __stringify(val); ++ switch (cmd) { ++ N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) ++ N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) ++ N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) ++ N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) ++ N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) ++ N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) ++ N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) ++ N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) ++ N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) ++ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) ++ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) ++ } ++#undef N ++ return "UNKNOWN_NETDEV_EVENT"; ++} ++EXPORT_SYMBOL_GPL(netdev_cmd_to_name); ++ ++static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, ++ struct net_device *dev) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ }; ++ ++ return nb->notifier_call(nb, val, &info); ++} ++ ++static int call_netdevice_register_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ int err; ++ ++ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ if (!(dev->flags & IFF_UP)) ++ return 0; ++ ++ call_netdevice_notifier(nb, NETDEV_UP, dev); ++ return 0; ++} ++ ++static void call_netdevice_unregister_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ call_netdevice_notifier(nb, NETDEV_GOING_DOWN, ++ dev); ++ call_netdevice_notifier(nb, NETDEV_DOWN, dev); ++ } ++ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 
++} ++ ++static int call_netdevice_register_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ int err; ++ ++ for_each_netdev(net, dev) { ++ err = call_netdevice_register_notifiers(nb, dev); ++ if (err) ++ goto rollback; ++ } ++ return 0; ++ ++rollback: ++ for_each_netdev_continue_reverse(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++ return err; ++} ++ ++static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++} ++ ++static int dev_boot_phase = 1; ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_register(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ if (dev_boot_phase) ++ goto unlock; ++ for_each_net(net) { ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err) ++ goto rollback; ++ } ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++ ++rollback: ++ for_each_net_continue_reverse(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++ raw_notifier_chain_unregister(&netdev_chain, nb); ++ goto unlock; ++} ++EXPORT_SYMBOL(register_netdevice_notifier); ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_unregister(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ ++ for_each_net(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier); ++ ++static int __register_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb, ++ bool ignore_call_fail) ++{ ++ int err; ++ ++ err = raw_notifier_chain_register(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ if (dev_boot_phase) ++ return 0; ++ ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err && !ignore_call_fail) ++ goto chain_unregister; ++ ++ return 0; ++ ++chain_unregister: ++ raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ return err; ++} ++ ++static int __unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ err = raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ ++ call_netdevice_unregister_net_notifiers(nb, net); ++ return 0; ++} ++ ++/** ++ * register_netdevice_notifier_net - register a per-netns network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(net, nb, false); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_net); ++ ++/** ++ * unregister_netdevice_notifier_net - unregister a per-netns ++ * network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __unregister_netdevice_notifier_net(net, nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_net); ++ ++int register_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(dev_net(dev), nb, false); ++ if (!err) { ++ nn->nb = nb; ++ list_add(&nn->list, &dev->net_notifier_list); ++ } ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_dev_net); ++ ++int unregister_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ list_del(&nn->list); ++ err = __unregister_netdevice_notifier_net(dev_net(dev), nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); ++ ++static void move_netdevice_notifiers_dev_net(struct net_device *dev, ++ struct net *net) ++{ ++ struct netdev_net_notifier *nn; ++ ++ list_for_each_entry(nn, &dev->net_notifier_list, list) { ++ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); ++ __register_netdevice_notifier_net(net, nn->nb, true); ++ } ++} ++ ++/** ++ * call_netdevice_notifiers_info - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @info: notifier information data ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* Run per-netns notifier block chain first, then run the global one. ++ * Hopefully, one day, the global one is going to be removed after ++ * all notifier block registrators get converted to be per-netns. ++ */ ++ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); ++ if (ret & NOTIFY_STOP_MASK) ++ return ret; ++ return raw_notifier_call_chain(&netdev_chain, val, info); ++} ++ ++/** ++ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks ++ * for and rollback on error ++ * @val_up: value passed unmodified to notifier function ++ * @val_down: value passed unmodified to the notifier function when ++ * recovering from an error on @val_up ++ * @info: notifier information data ++ * ++ * Call all per-netns network notifier blocks, but not notifier blocks on ++ * the global notifier chain. Parameters and return value are as for ++ * raw_notifier_call_chain_robust(). 
++ */ ++ ++static int ++call_netdevice_notifiers_info_robust(unsigned long val_up, ++ unsigned long val_down, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ ++ ASSERT_RTNL(); ++ ++ return raw_notifier_call_chain_robust(&net->netdev_chain, ++ val_up, val_down, info); ++} ++ ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ .extack = extack, ++ }; ++ ++ return call_netdevice_notifiers_info(val, &info); ++} ++ ++/** ++ * call_netdevice_notifiers - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ++{ ++ return call_netdevice_notifiers_extack(val, dev, NULL); ++} ++EXPORT_SYMBOL(call_netdevice_notifiers); ++ ++/** ++ * call_netdevice_notifiers_mtu - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * @arg: additional u32 argument passed to the notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++static int call_netdevice_notifiers_mtu(unsigned long val, ++ struct net_device *dev, u32 arg) ++{ ++ struct netdev_notifier_info_ext info = { ++ .info.dev = dev, ++ .ext.mtu = arg, ++ }; ++ ++ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); ++ ++ return call_netdevice_notifiers_info(val, &info.info); ++} ++ ++#ifdef CONFIG_NET_INGRESS ++static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); ++ ++void net_inc_ingress_queue(void) ++{ ++ static_branch_inc(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_ingress_queue); ++ ++void net_dec_ingress_queue(void) ++{ ++ static_branch_dec(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_ingress_queue); ++#endif ++ ++#ifdef CONFIG_NET_EGRESS ++static DEFINE_STATIC_KEY_FALSE(egress_needed_key); ++ ++void net_inc_egress_queue(void) ++{ ++ static_branch_inc(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_egress_queue); ++ ++void net_dec_egress_queue(void) ++{ ++ static_branch_dec(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_egress_queue); ++#endif ++ ++DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); ++EXPORT_SYMBOL(netstamp_needed_key); ++#ifdef CONFIG_JUMP_LABEL ++static atomic_t netstamp_needed_deferred; ++static atomic_t netstamp_wanted; ++static void netstamp_clear(struct work_struct *work) ++{ ++ int deferred = atomic_xchg(&netstamp_needed_deferred, 0); ++ int wanted; ++ ++ wanted = atomic_add_return(deferred, &netstamp_wanted); ++ if (wanted > 0) ++ static_branch_enable(&netstamp_needed_key); ++ else ++ static_branch_disable(&netstamp_needed_key); ++} ++static DECLARE_WORK(netstamp_work, netstamp_clear); ++#endif ++ ++void net_enable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 0) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) ++ return; ++ } ++ atomic_inc(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_inc(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_enable_timestamp); ++ 
++void net_disable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 1) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) ++ return; ++ } ++ atomic_dec(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_dec(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_disable_timestamp); ++ ++static inline void net_timestamp_set(struct sk_buff *skb) ++{ ++ skb->tstamp = 0; ++ skb->mono_delivery_time = 0; ++ if (static_branch_unlikely(&netstamp_needed_key)) ++ skb->tstamp = ktime_get_real(); ++} ++ ++#define net_timestamp_check(COND, SKB) \ ++ if (static_branch_unlikely(&netstamp_needed_key)) { \ ++ if ((COND) && !(SKB)->tstamp) \ ++ (SKB)->tstamp = ktime_get_real(); \ ++ } \ ++ ++bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) ++{ ++ return __is_skb_forwardable(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(is_skb_forwardable); ++ ++static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, ++ bool check_mtu) ++{ ++ int ret = ____dev_forward_skb(dev, skb, check_mtu); ++ ++ if (likely(!ret)) { ++ skb->protocol = eth_type_trans(skb, dev); ++ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); ++ } ++ ++ return ret; ++} ++ ++int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(__dev_forward_skb); ++ ++/** ++ * dev_forward_skb - loopback an skb to another netif ++ * ++ * @dev: destination network device ++ * @skb: buffer to forward ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped, but freed) ++ * ++ * dev_forward_skb can be used for injecting an skb from the ++ * start_xmit function of one device into the receive queue ++ * of another device. ++ * ++ * The receiving device may be in another namespace, so ++ * we have to clear all information in the skb that could ++ * impact namespace isolation. 
++ */ ++int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); ++} ++EXPORT_SYMBOL_GPL(dev_forward_skb); ++ ++int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); ++} ++ ++static inline int deliver_skb(struct sk_buff *skb, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ return -ENOMEM; ++ refcount_inc(&skb->users); ++ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++} ++ ++static inline void deliver_ptype_list_skb(struct sk_buff *skb, ++ struct packet_type **pt, ++ struct net_device *orig_dev, ++ __be16 type, ++ struct list_head *ptype_list) ++{ ++ struct packet_type *ptype, *pt_prev = *pt; ++ ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->type != type) ++ continue; ++ if (pt_prev) ++ deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ *pt = pt_prev; ++} ++ ++static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) ++{ ++ if (!ptype->af_packet_priv || !skb->sk) ++ return false; ++ ++ if (ptype->id_match) ++ return ptype->id_match(ptype, skb->sk); ++ else if ((struct sock *)ptype->af_packet_priv == skb->sk) ++ return true; ++ ++ return false; ++} ++ ++/** ++ * dev_nit_active - return true if any network interface taps are in use ++ * ++ * @dev: network device to check for the presence of taps ++ */ ++bool dev_nit_active(struct net_device *dev) ++{ ++ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); ++} ++EXPORT_SYMBOL_GPL(dev_nit_active); ++ ++/* ++ * Support routine. Sends outgoing frames to any network ++ * taps currently in use. ++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ struct sk_buff *skb2 = NULL; ++ struct packet_type *pt_prev = NULL; ++ struct list_head *ptype_list = &ptype_all; ++ ++ rcu_read_lock(); ++again: ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->ignore_outgoing) ++ continue; ++ ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if (skb_loop_sk(ptype, skb)) ++ continue; ++ ++ if (pt_prev) { ++ deliver_skb(skb2, pt_prev, skb->dev); ++ pt_prev = ptype; ++ continue; ++ } ++ ++ /* need to clone skb, done only once */ ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (!skb2) ++ goto out_unlock; ++ ++ net_timestamp_set(skb2); ++ ++ /* skb->nh should be correctly ++ * set by sender, so that the second statement is ++ * just protection against buggy protocols. 
++ */ ++ skb_reset_mac_header(skb2); ++ ++ if (skb_network_header(skb2) < skb2->data || ++ skb_network_header(skb2) > skb_tail_pointer(skb2)) { ++ net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ++ ntohs(skb2->protocol), ++ dev->name); ++ skb_reset_network_header(skb2); ++ } ++ ++ skb2->transport_header = skb2->network_header; ++ skb2->pkt_type = PACKET_OUTGOING; ++ pt_prev = ptype; ++ } ++ ++ if (ptype_list == &ptype_all) { ++ ptype_list = &dev->ptype_all; ++ goto again; ++ } ++out_unlock: ++ if (pt_prev) { ++ if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) ++ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); ++ else ++ kfree_skb(skb2); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); ++ ++/** ++ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change ++ * @dev: Network device ++ * @txq: number of queues available ++ * ++ * If real_num_tx_queues is changed the tc mappings may no longer be ++ * valid. To resolve this verify the tc mapping remains valid and if ++ * not NULL the mapping. With no priorities mapping to this ++ * offset/count pair it will no longer be used. In the worst case TC0 ++ * is invalid nothing can be done so disable priority mappings. If is ++ * expected that drivers will fix this mapping if they can before ++ * calling netif_set_real_num_tx_queues. ++ */ ++static void netif_setup_tc(struct net_device *dev, unsigned int txq) ++{ ++ int i; ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ ++ /* If TC0 is invalidated disable TC mapping */ ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); ++ dev->num_tc = 0; ++ return; ++ } ++ ++ /* Invalidated prio to tc mappings set to TC0 */ ++ for (i = 1; i < TC_BITMASK + 1; i++) { ++ int q = netdev_get_prio_tc_map(dev, i); ++ ++ tc = &dev->tc_to_txq[q]; ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", ++ i, q); ++ netdev_set_prio_tc_map(dev, i, 0); ++ } ++ } ++} ++ ++int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) ++{ ++ if (dev->num_tc) { ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ int i; ++ ++ /* walk through the TCs and see if it falls into any of them */ ++ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { ++ if ((txq - tc->offset) < tc->count) ++ return i; ++ } ++ ++ /* didn't find it, just return -1 to indicate no match */ ++ return -1; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_txq_to_tc); ++ ++#ifdef CONFIG_XPS ++static struct static_key xps_needed __read_mostly; ++static struct static_key xps_rxqs_needed __read_mostly; ++static DEFINE_MUTEX(xps_map_mutex); ++#define xmap_dereference(P) \ ++ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) ++ ++static bool remove_xps_queue(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *old_maps, int tci, u16 index) ++{ ++ struct xps_map *map = NULL; ++ int pos; ++ ++ if (dev_maps) ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ return false; ++ ++ for (pos = map->len; pos--;) { ++ if (map->queues[pos] != index) ++ continue; ++ ++ if (map->len > 1) { ++ map->queues[pos] = map->queues[--map->len]; ++ break; ++ } ++ ++ if (old_maps) ++ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool remove_xps_queue_cpu(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ int cpu, u16 offset, u16 count) ++{ ++ int num_tc = dev_maps->num_tc; ++ bool active = false; ++ int tci; ++ ++ for (tci = cpu * num_tc; num_tc--; tci++) { ++ int i, j; ++ ++ for (i = count, j = offset; i--; j++) { ++ if (!remove_xps_queue(dev_maps, NULL, tci, j)) ++ break; ++ } ++ ++ active |= i < 0; ++ } ++ ++ return active; ++} ++ ++static void reset_xps_maps(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ enum xps_map_type type) ++{ ++ static_key_slow_dec_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_dec_cpuslocked(&xps_rxqs_needed); ++ ++ RCU_INIT_POINTER(dev->xps_maps[type], NULL); ++ ++ kfree_rcu(dev_maps, rcu); ++} ++ ++static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, ++ u16 offset, u16 count) ++{ ++ struct xps_dev_maps *dev_maps; ++ bool active = false; ++ int i, j; ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (!dev_maps) ++ return; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) ++ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++ if (type == XPS_CPUS) { ++ for (i = offset + (count - 1); count--; i--) ++ netdev_queue_numa_node_write( ++ netdev_get_tx_queue(dev, i), NUMA_NO_NODE); ++ } ++} ++ ++static void netif_reset_xps_queues(struct net_device *dev, u16 offset, ++ u16 count) ++{ ++ if (!static_key_false(&xps_needed)) ++ return; ++ ++ cpus_read_lock(); ++ mutex_lock(&xps_map_mutex); ++ ++ if (static_key_false(&xps_rxqs_needed)) ++ clean_xps_maps(dev, XPS_RXQS, offset, count); ++ ++ clean_xps_maps(dev, XPS_CPUS, offset, count); ++ ++ mutex_unlock(&xps_map_mutex); ++ cpus_read_unlock(); ++} ++ ++static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) ++{ ++ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); ++} ++ ++static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, ++ u16 index, bool is_rxqs_map) ++{ ++ struct xps_map *new_map; ++ int alloc_len = 
XPS_MIN_MAP_ALLOC; ++ int i, pos; ++ ++ for (pos = 0; map && pos < map->len; pos++) { ++ if (map->queues[pos] != index) ++ continue; ++ return map; ++ } ++ ++ /* Need to add tx-queue to this CPU's/rx-queue's existing map */ ++ if (map) { ++ if (pos < map->alloc_len) ++ return map; ++ ++ alloc_len = map->alloc_len * 2; ++ } ++ ++ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's ++ * map ++ */ ++ if (is_rxqs_map) ++ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); ++ else ++ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, ++ cpu_to_node(attr_index)); ++ if (!new_map) ++ return NULL; ++ ++ for (i = 0; i < pos; i++) ++ new_map->queues[i] = map->queues[i]; ++ new_map->alloc_len = alloc_len; ++ new_map->len = pos; ++ ++ return new_map; ++} ++ ++/* Copy xps maps at a given index */ ++static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *new_dev_maps, int index, ++ int tc, bool skip_tc) ++{ ++ int i, tci = index * dev_maps->num_tc; ++ struct xps_map *map; ++ ++ /* copy maps belonging to foreign traffic classes */ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && skip_tc) ++ continue; ++ ++ /* fill in the new device map from the old device map */ ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++} ++ ++/* Must be called under cpus_read_lock */ ++int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, ++ u16 index, enum xps_map_type type) ++{ ++ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; ++ const unsigned long *online_mask = NULL; ++ bool active = false, copy = false; ++ int i, j, tci, numa_node_id = -2; ++ int maps_sz, num_tc = 1, tc = 0; ++ struct xps_map *map, *new_map; ++ unsigned int nr_ids; ++ ++ if (dev->num_tc) { ++ /* Do not allow XPS on subordinate device directly */ ++ num_tc = dev->num_tc; ++ if (num_tc < 0) ++ return -EINVAL; ++ ++ /* If queue belongs to subordinate dev use its map */ ++ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; ++ ++ tc = netdev_txq_to_tc(dev, index); ++ if (tc < 0) ++ return -EINVAL; ++ } ++ ++ mutex_lock(&xps_map_mutex); ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (type == XPS_RXQS) { ++ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); ++ nr_ids = dev->num_rx_queues; ++ } else { ++ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); ++ if (num_possible_cpus() > 1) ++ online_mask = cpumask_bits(cpu_online_mask); ++ nr_ids = nr_cpu_ids; ++ } ++ ++ if (maps_sz < L1_CACHE_BYTES) ++ maps_sz = L1_CACHE_BYTES; ++ ++ /* The old dev_maps could be larger or smaller than the one we're ++ * setting up now, as dev->num_tc or nr_ids could have been updated in ++ * between. We could try to be smart, but let's be safe instead and only ++ * copy foreign traffic classes if the two map sizes match. ++ */ ++ if (dev_maps && ++ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) ++ copy = true; ++ ++ /* allocate memory for queue storage */ ++ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), ++ j < nr_ids;) { ++ if (!new_dev_maps) { ++ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); ++ if (!new_dev_maps) { ++ mutex_unlock(&xps_map_mutex); ++ return -ENOMEM; ++ } ++ ++ new_dev_maps->nr_ids = nr_ids; ++ new_dev_maps->num_tc = num_tc; ++ } ++ ++ tci = j * num_tc + tc; ++ map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; ++ ++ map = expand_xps_map(map, j, index, type == XPS_RXQS); ++ if (!map) ++ goto error; ++ ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++ ++ if (!new_dev_maps) ++ goto out_no_new_maps; ++ ++ if (!dev_maps) { ++ /* Increment static keys at most once per type */ ++ static_key_slow_inc_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_inc_cpuslocked(&xps_rxqs_needed); ++ } ++ ++ for (j = 0; j < nr_ids; j++) { ++ bool skip_tc = false; ++ ++ tci = j * num_tc + tc; ++ if (netif_attr_test_mask(j, mask, nr_ids) && ++ netif_attr_test_online(j, online_mask, nr_ids)) { ++ /* add tx-queue to CPU/rx-queue maps */ ++ int pos = 0; ++ ++ skip_tc = true; ++ ++ map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ while ((pos < map->len) && (map->queues[pos] != index)) ++ pos++; ++ ++ if (pos == map->len) ++ map->queues[map->len++] = index; ++#ifdef CONFIG_NUMA ++ if (type == XPS_CPUS) { ++ if (numa_node_id == -2) ++ numa_node_id = cpu_to_node(j); ++ else if (numa_node_id != cpu_to_node(j)) ++ numa_node_id = -1; ++ } ++#endif ++ } ++ ++ if (copy) ++ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, ++ skip_tc); ++ } ++ ++ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); ++ ++ /* Cleanup old maps */ ++ if (!dev_maps) ++ goto out_no_old_maps; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ continue; ++ ++ if (copy) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ if (map == new_map) ++ continue; ++ } ++ ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ } ++ } ++ ++ old_dev_maps = dev_maps; ++ ++out_no_old_maps: ++ dev_maps = new_dev_maps; ++ active = true; ++ ++out_no_new_maps: ++ if (type == XPS_CPUS) ++ /* update Tx queue numa node */ ++ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), ++ (numa_node_id >= 0) ? ++ numa_node_id : NUMA_NO_NODE); ++ ++ if (!dev_maps) ++ goto out_no_maps; ++ ++ /* removes tx-queue from unused CPUs/rx-queues */ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ tci = j * dev_maps->num_tc; ++ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && ++ netif_attr_test_mask(j, mask, dev_maps->nr_ids) && ++ netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) ++ continue; ++ ++ active |= remove_xps_queue(dev_maps, ++ copy ? old_dev_maps : NULL, ++ tci, index); ++ } ++ } ++ ++ if (old_dev_maps) ++ kfree_rcu(old_dev_maps, rcu); ++ ++ /* free map if not active */ ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++out_no_maps: ++ mutex_unlock(&xps_map_mutex); ++ ++ return 0; ++error: ++ /* remove any maps that we added */ ++ for (j = 0; j < nr_ids; j++) { ++ for (i = num_tc, tci = j * num_tc; i--; tci++) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ map = copy ? 
++ xmap_dereference(dev_maps->attr_map[tci]) : ++ NULL; ++ if (new_map && new_map != map) ++ kfree(new_map); ++ } ++ } ++ ++ mutex_unlock(&xps_map_mutex); ++ ++ kfree(new_dev_maps); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL_GPL(__netif_set_xps_queue); ++ ++int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, ++ u16 index) ++{ ++ int ret; ++ ++ cpus_read_lock(); ++ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); ++ cpus_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_set_xps_queue); ++ ++#endif ++static void netdev_unbind_all_sb_channels(struct net_device *dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++ /* Unbind any subordinate channels */ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev) ++ netdev_unbind_sb_channel(dev, txq->sb_dev); ++ } ++} ++ ++void netdev_reset_tc(struct net_device *dev) ++{ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ /* Reset TC configuration of device */ ++ dev->num_tc = 0; ++ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); ++ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); ++} ++EXPORT_SYMBOL(netdev_reset_tc); ++ ++int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) ++{ ++ if (tc >= dev->num_tc) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues(dev, offset, count); ++#endif ++ dev->tc_to_txq[tc].count = count; ++ dev->tc_to_txq[tc].offset = offset; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_tc_queue); ++ ++int netdev_set_num_tc(struct net_device *dev, u8 num_tc) ++{ ++ if (num_tc > TC_MAX_QUEUE) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ dev->num_tc = num_tc; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_num_tc); ++ ++void netdev_unbind_sb_channel(struct net_device *dev, ++ struct net_device *sb_dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(sb_dev, 0); ++#endif ++ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); ++ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); ++ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev == sb_dev) ++ txq->sb_dev = NULL; ++ } ++} ++EXPORT_SYMBOL(netdev_unbind_sb_channel); ++ ++int netdev_bind_sb_channel_queue(struct net_device *dev, ++ struct net_device *sb_dev, ++ u8 tc, u16 count, u16 offset) ++{ ++ /* Make certain the sb_dev and dev are already configured */ ++ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) ++ return -EINVAL; ++ ++ /* We cannot hand out queues we don't have */ ++ if ((offset + count) > dev->real_num_tx_queues) ++ return -EINVAL; ++ ++ /* Record the mapping */ ++ sb_dev->tc_to_txq[tc].count = count; ++ sb_dev->tc_to_txq[tc].offset = offset; ++ ++ /* Provide a way for Tx queue to find the tc_to_txq map or ++ * XPS map for itself. ++ */ ++ while (count--) ++ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_bind_sb_channel_queue); ++ ++int netdev_set_sb_channel(struct net_device *dev, u16 channel) ++{ ++ /* Do not use a multiqueue device to represent a subordinate channel */ ++ if (netif_is_multiqueue(dev)) ++ return -ENODEV; ++ ++ /* We allow channels 1 - 32767 to be used for subordinate channels. ++ * Channel 0 is meant to be "native" mode and used only to represent ++ * the main root device. 
We allow writing 0 to reset the device back ++ * to normal mode after being used as a subordinate channel. ++ */ ++ if (channel > S16_MAX) ++ return -EINVAL; ++ ++ dev->num_tc = -channel; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_sb_channel); ++ ++/* ++ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues ++ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. ++ */ ++int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) ++{ ++ bool disabling; ++ int rc; ++ ++ disabling = txq < dev->real_num_tx_queues; ++ ++ if (txq < 1 || txq > dev->num_tx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED || ++ dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ ++ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, ++ txq); ++ if (rc) ++ return rc; ++ ++ if (dev->num_tc) ++ netif_setup_tc(dev, txq); ++ ++ dev_qdisc_change_real_num_tx(dev, txq); ++ ++ dev->real_num_tx_queues = txq; ++ ++ if (disabling) { ++ synchronize_net(); ++ qdisc_reset_all_tx_gt(dev, txq); ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, txq); ++#endif ++ } ++ } else { ++ dev->real_num_tx_queues = txq; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_tx_queues); ++ ++#ifdef CONFIG_SYSFS ++/** ++ * netif_set_real_num_rx_queues - set actual number of RX queues used ++ * @dev: Network device ++ * @rxq: Actual number of RX queues ++ * ++ * This must be called either with the rtnl_lock held or before ++ * registration of the net device. Returns 0 on success, or a ++ * negative error code. If called before registration, it always ++ * succeeds. ++ */ ++int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) ++{ ++ int rc; ++ ++ if (rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED) { ++ ASSERT_RTNL(); ++ ++ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, ++ rxq); ++ if (rc) ++ return rc; ++ } ++ ++ dev->real_num_rx_queues = rxq; ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_rx_queues); ++#endif ++ ++/** ++ * netif_set_real_num_queues - set actual number of RX and TX queues used ++ * @dev: Network device ++ * @txq: Actual number of TX queues ++ * @rxq: Actual number of RX queues ++ * ++ * Set the real number of both TX and RX queues. ++ * Does nothing if the number of queues is already correct. ++ */ ++int netif_set_real_num_queues(struct net_device *dev, ++ unsigned int txq, unsigned int rxq) ++{ ++ unsigned int old_rxq = dev->real_num_rx_queues; ++ int err; ++ ++ if (txq < 1 || txq > dev->num_tx_queues || ++ rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ /* Start from increases, so the error path only does decreases - ++ * decreases can't fail. 
++ */ ++ if (rxq > dev->real_num_rx_queues) { ++ err = netif_set_real_num_rx_queues(dev, rxq); ++ if (err) ++ return err; ++ } ++ if (txq > dev->real_num_tx_queues) { ++ err = netif_set_real_num_tx_queues(dev, txq); ++ if (err) ++ goto undo_rx; ++ } ++ if (rxq < dev->real_num_rx_queues) ++ WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); ++ if (txq < dev->real_num_tx_queues) ++ WARN_ON(netif_set_real_num_tx_queues(dev, txq)); ++ ++ return 0; ++undo_rx: ++ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); ++ return err; ++} ++EXPORT_SYMBOL(netif_set_real_num_queues); ++ ++/** ++ * netif_set_tso_max_size() - set the max size of TSO frames supported ++ * @dev: netdev to update ++ * @size: max skb->len of a TSO frame ++ * ++ * Set the limit on the size of TSO super-frames the device can handle. ++ * Unless explicitly set the stack will assume the value of ++ * %GSO_LEGACY_MAX_SIZE. ++ */ ++void netif_set_tso_max_size(struct net_device *dev, unsigned int size) ++{ ++ dev->tso_max_size = min(GSO_MAX_SIZE, size); ++ if (size < READ_ONCE(dev->gso_max_size)) ++ netif_set_gso_max_size(dev, size); ++} ++EXPORT_SYMBOL(netif_set_tso_max_size); ++ ++/** ++ * netif_set_tso_max_segs() - set the max number of segs supported for TSO ++ * @dev: netdev to update ++ * @segs: max number of TCP segments ++ * ++ * Set the limit on the number of TCP segments the device can generate from ++ * a single TSO super-frame. ++ * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. ++ */ ++void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) ++{ ++ dev->tso_max_segs = segs; ++ if (segs < READ_ONCE(dev->gso_max_segs)) ++ netif_set_gso_max_segs(dev, segs); ++} ++EXPORT_SYMBOL(netif_set_tso_max_segs); ++ ++/** ++ * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper ++ * @to: netdev to update ++ * @from: netdev from which to copy the limits ++ */ ++void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) ++{ ++ netif_set_tso_max_size(to, from->tso_max_size); ++ netif_set_tso_max_segs(to, from->tso_max_segs); ++} ++EXPORT_SYMBOL(netif_inherit_tso_max); ++ ++/** ++ * netif_get_num_default_rss_queues - default number of RSS queues ++ * ++ * Default value is the number of physical cores if there are only 1 or 2, or ++ * divided by 2 if there are more. ++ */ ++int netif_get_num_default_rss_queues(void) ++{ ++ cpumask_var_t cpus; ++ int cpu, count = 0; ++ ++ if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) ++ return 1; ++ ++ cpumask_copy(cpus, cpu_online_mask); ++ for_each_cpu(cpu, cpus) { ++ ++count; ++ cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); ++ } ++ free_cpumask_var(cpus); ++ ++ return count > 2 ? 
DIV_ROUND_UP(count, 2) : count; ++} ++EXPORT_SYMBOL(netif_get_num_default_rss_queues); ++ ++static void __netif_reschedule(struct Qdisc *q) ++{ ++ struct softnet_data *sd; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ sd = this_cpu_ptr(&softnet_data); ++ q->next_sched = NULL; ++ *sd->output_queue_tailp = q; ++ sd->output_queue_tailp = &q->next_sched; ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++ ++void __netif_schedule(struct Qdisc *q) ++{ ++ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) ++ __netif_reschedule(q); ++} ++EXPORT_SYMBOL(__netif_schedule); ++ ++struct dev_kfree_skb_cb { ++ enum skb_free_reason reason; ++}; ++ ++static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) ++{ ++ return (struct dev_kfree_skb_cb *)skb->cb; ++} ++ ++void netif_schedule_queue(struct netdev_queue *txq) ++{ ++ rcu_read_lock(); ++ if (!netif_xmit_stopped(txq)) { ++ struct Qdisc *q = rcu_dereference(txq->qdisc); ++ ++ __netif_schedule(q); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(netif_schedule_queue); ++ ++void netif_tx_wake_queue(struct netdev_queue *dev_queue) ++{ ++ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { ++ struct Qdisc *q; ++ ++ rcu_read_lock(); ++ q = rcu_dereference(dev_queue->qdisc); ++ __netif_schedule(q); ++ rcu_read_unlock(); ++ } ++} ++EXPORT_SYMBOL(netif_tx_wake_queue); ++ ++void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ unsigned long flags; ++ ++ if (unlikely(!skb)) ++ return; ++ ++ if (likely(refcount_read(&skb->users) == 1)) { ++ smp_rmb(); ++ refcount_set(&skb->users, 0); ++ } else if (likely(!refcount_dec_and_test(&skb->users))) { ++ return; ++ } ++ get_kfree_skb_cb(skb)->reason = reason; ++ local_irq_save(flags); ++ skb->next = __this_cpu_read(softnet_data.completion_queue); ++ __this_cpu_write(softnet_data.completion_queue, skb); ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_irq); ++ ++void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ if (in_hardirq() || irqs_disabled()) ++ __dev_kfree_skb_irq(skb, reason); ++ else ++ dev_kfree_skb(skb); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_any); ++ ++ ++/** ++ * netif_device_detach - mark device as removed ++ * @dev: network device ++ * ++ * Mark device as removed from system and therefore no longer available. ++ */ ++void netif_device_detach(struct net_device *dev) ++{ ++ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_stop_all_queues(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_detach); ++ ++/** ++ * netif_device_attach - mark device as attached ++ * @dev: network device ++ * ++ * Mark device as attached from system and restart if needed. ++ */ ++void netif_device_attach(struct net_device *dev) ++{ ++ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_wake_all_queues(dev); ++ __netdev_watchdog_up(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_attach); ++ ++/* ++ * Returns a Tx hash based on the given packet descriptor a Tx queues' number ++ * to be used as a distribution range. 
++ */ ++static u16 skb_tx_hash(const struct net_device *dev, ++ const struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++ u32 hash; ++ u16 qoffset = 0; ++ u16 qcount = dev->real_num_tx_queues; ++ ++ if (dev->num_tc) { ++ u8 tc = netdev_get_prio_tc_map(dev, skb->priority); ++ ++ qoffset = sb_dev->tc_to_txq[tc].offset; ++ qcount = sb_dev->tc_to_txq[tc].count; ++ if (unlikely(!qcount)) { ++ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", ++ sb_dev->name, qoffset, tc); ++ qoffset = 0; ++ qcount = dev->real_num_tx_queues; ++ } ++ } ++ ++ if (skb_rx_queue_recorded(skb)) { ++ hash = skb_get_rx_queue(skb); ++ if (hash >= qoffset) ++ hash -= qoffset; ++ while (unlikely(hash >= qcount)) ++ hash -= qcount; ++ return hash + qoffset; ++ } ++ ++ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; ++} ++ ++static void skb_warn_bad_offload(const struct sk_buff *skb) ++{ ++ static const netdev_features_t null_features; ++ struct net_device *dev = skb->dev; ++ const char *name = ""; ++ ++ if (!net_ratelimit()) ++ return; ++ ++ if (dev) { ++ if (dev->dev.parent) ++ name = dev_driver_string(dev->dev.parent); ++ else ++ name = netdev_name(dev); ++ } ++ skb_dump(KERN_WARNING, skb, false); ++ WARN(1, "%s: caps=(%pNF, %pNF)\n", ++ name, dev ? &dev->features : &null_features, ++ skb->sk ? &skb->sk->sk_route_caps : &null_features); ++} ++ ++/* ++ * Invalidate hardware checksum when packet is to be mangled, and ++ * complete checksum manually on outgoing path. ++ */ ++int skb_checksum_help(struct sk_buff *skb) ++{ ++ __wsum csum; ++ int ret = 0, offset; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) ++ goto out_set_summed; ++ ++ if (unlikely(skb_is_gso(skb))) { ++ skb_warn_bad_offload(skb); ++ return -EINVAL; ++ } ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. ++ */ ++ if (skb_has_shared_frag(skb)) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ ++ offset = skb_checksum_start_offset(skb); ++ ret = -EINVAL; ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ csum = skb_checksum(skb, offset, skb->len - offset, 0); ++ ++ offset += skb->csum_offset; ++ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); ++ if (ret) ++ goto out; ++ ++ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; ++out_set_summed: ++ skb->ip_summed = CHECKSUM_NONE; ++out: ++ return ret; ++} ++EXPORT_SYMBOL(skb_checksum_help); ++ ++int skb_crc32c_csum_help(struct sk_buff *skb) ++{ ++ __le32 crc32c_csum; ++ int ret = 0, offset, start; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ goto out; ++ ++ if (unlikely(skb_is_gso(skb))) ++ goto out; ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. 
++ */ ++ if (unlikely(skb_has_shared_frag(skb))) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ start = skb_checksum_start_offset(skb); ++ offset = start + offsetof(struct sctphdr, checksum); ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ ret = skb_ensure_writable(skb, offset + sizeof(__le32)); ++ if (ret) ++ goto out; ++ ++ crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, ++ skb->len - start, ~(__u32)0, ++ crc32c_csum_stub)); ++ *(__le32 *)(skb->data + offset) = crc32c_csum; ++ skb->ip_summed = CHECKSUM_NONE; ++ skb->csum_not_inet = 0; ++out: ++ return ret; ++} ++ ++__be16 skb_network_protocol(struct sk_buff *skb, int *depth) ++{ ++ __be16 type = skb->protocol; ++ ++ /* Tunnel gso handlers can set protocol to ethernet. */ ++ if (type == htons(ETH_P_TEB)) { ++ struct ethhdr *eth; ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) ++ return 0; ++ ++ eth = (struct ethhdr *)skb->data; ++ type = eth->h_proto; ++ } ++ ++ return __vlan_get_protocol(skb, type, depth); ++} ++ ++/* openvswitch calls this on rx path, so we need a different check. ++ */ ++static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) ++{ ++ if (tx_path) ++ return skb->ip_summed != CHECKSUM_PARTIAL && ++ skb->ip_summed != CHECKSUM_UNNECESSARY; ++ ++ return skb->ip_summed == CHECKSUM_NONE; ++} ++ ++/** ++ * __skb_gso_segment - Perform segmentation on skb. ++ * @skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * @tx_path: whether it is called in TX path ++ * ++ * This function segments the given skb and returns a list of segments. ++ * ++ * It may return NULL if the skb requires no segmentation. This is ++ * only possible when GSO is used for verifying header integrity. ++ * ++ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. ++ */ ++struct sk_buff *__skb_gso_segment(struct sk_buff *skb, ++ netdev_features_t features, bool tx_path) ++{ ++ struct sk_buff *segs; ++ ++ if (unlikely(skb_needs_check(skb, tx_path))) { ++ int err; ++ ++ /* We're going to init ->check field in TCP or UDP header */ ++ err = skb_cow_head(skb, 0); ++ if (err < 0) ++ return ERR_PTR(err); ++ } ++ ++ /* Only report GSO partial support if it will enable us to ++ * support segmentation on this frame without needing additional ++ * work. ++ */ ++ if (features & NETIF_F_GSO_PARTIAL) { ++ netdev_features_t partial_features = NETIF_F_GSO_ROBUST; ++ struct net_device *dev = skb->dev; ++ ++ partial_features |= dev->features & dev->gso_partial_features; ++ if (!skb_gso_ok(skb, features | partial_features)) ++ features &= ~NETIF_F_GSO_PARTIAL; ++ } ++ ++ BUILD_BUG_ON(SKB_GSO_CB_OFFSET + ++ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); ++ ++ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); ++ SKB_GSO_CB(skb)->encap_level = 0; ++ ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ segs = skb_mac_gso_segment(skb, features); ++ ++ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) ++ skb_warn_bad_offload(skb); ++ ++ return segs; ++} ++EXPORT_SYMBOL(__skb_gso_segment); ++ ++/* Take action when hardware reception checksum errors are detected. 
*/ ++#ifdef CONFIG_BUG ++static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ netdev_err(dev, "hw csum failure\n"); ++ skb_dump(KERN_ERR, skb, true); ++ dump_stack(); ++} ++ ++void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); ++} ++EXPORT_SYMBOL(netdev_rx_csum_fault); ++#endif ++ ++/* XXX: check that highmem exists at all on the given machine. */ ++static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) ++{ ++#ifdef CONFIG_HIGHMEM ++ int i; ++ ++ if (!(dev->features & NETIF_F_HIGHDMA)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ if (PageHighMem(skb_frag_page(frag))) ++ return 1; ++ } ++ } ++#endif ++ return 0; ++} ++ ++/* If MPLS offload request, verify we are testing hardware MPLS features ++ * instead of standard features for the netdev. ++ */ ++#if IS_ENABLED(CONFIG_NET_MPLS_GSO) ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ if (eth_p_mpls(type)) ++ features &= skb->dev->mpls_features; ++ ++ return features; ++} ++#else ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ return features; ++} ++#endif ++ ++static netdev_features_t harmonize_features(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ __be16 type; ++ ++ type = skb_network_protocol(skb, NULL); ++ features = net_mpls_features(skb, features, type); ++ ++ if (skb->ip_summed != CHECKSUM_NONE && ++ !can_checksum_protocol(features, type)) { ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ } ++ if (illegal_highdma(skb->dev, skb)) ++ features &= ~NETIF_F_SG; ++ ++ return features; ++} ++ ++netdev_features_t passthru_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return features; ++} ++EXPORT_SYMBOL(passthru_features_check); ++ ++static netdev_features_t dflt_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return vlan_features_check(skb, features); ++} ++ ++static netdev_features_t gso_features_check(const struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ u16 gso_segs = skb_shinfo(skb)->gso_segs; ++ ++ if (gso_segs > READ_ONCE(dev->gso_max_segs)) ++ return features & ~NETIF_F_GSO_MASK; ++ ++ if (!skb_shinfo(skb)->gso_type) { ++ skb_warn_bad_offload(skb); ++ return features & ~NETIF_F_GSO_MASK; ++ } ++ ++ /* Support for GSO partial features requires software ++ * intervention before we can actually process the packets ++ * so we need to strip support for any partial features now ++ * and we can pull them back in after we have partially ++ * segmented the frame. ++ */ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) ++ features &= ~dev->gso_partial_features; ++ ++ /* Make sure to clear the IPv4 ID mangling feature if the ++ * IPv4 header has the potential to be fragmented. ++ */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { ++ struct iphdr *iph = skb->encapsulation ? 
++ inner_ip_hdr(skb) : ip_hdr(skb); ++ ++ if (!(iph->frag_off & htons(IP_DF))) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ } ++ ++ return features; ++} ++ ++netdev_features_t netif_skb_features(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ netdev_features_t features = dev->features; ++ ++ if (skb_is_gso(skb)) ++ features = gso_features_check(skb, dev, features); ++ ++ /* If encapsulation offload request, verify we are testing ++ * hardware encapsulation features instead of standard ++ * features for the netdev ++ */ ++ if (skb->encapsulation) ++ features &= dev->hw_enc_features; ++ ++ if (skb_vlan_tagged(skb)) ++ features = netdev_intersect_features(features, ++ dev->vlan_features | ++ NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_STAG_TX); ++ ++ if (dev->netdev_ops->ndo_features_check) ++ features &= dev->netdev_ops->ndo_features_check(skb, dev, ++ features); ++ else ++ features &= dflt_features_check(skb, dev, features); ++ ++ return harmonize_features(skb, features); ++} ++EXPORT_SYMBOL(netif_skb_features); ++ ++static int xmit_one(struct sk_buff *skb, struct net_device *dev, ++ struct netdev_queue *txq, bool more) ++{ ++ unsigned int len; ++ int rc; ++ ++ if (dev_nit_active(dev)) ++ dev_queue_xmit_nit(skb, dev); ++ ++ len = skb->len; ++ trace_net_dev_start_xmit(skb, dev); ++ rc = netdev_start_xmit(skb, dev, txq, more); ++ trace_net_dev_xmit(skb, rc, dev, len); ++ ++ return rc; ++} ++ ++struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, ++ struct netdev_queue *txq, int *ret) ++{ ++ struct sk_buff *skb = first; ++ int rc = NETDEV_TX_OK; ++ ++ while (skb) { ++ struct sk_buff *next = skb->next; ++ ++ skb_mark_not_on_list(skb); ++ rc = xmit_one(skb, dev, txq, next != NULL); ++ if (unlikely(!dev_xmit_complete(rc))) { ++ skb->next = next; ++ goto out; ++ } ++ ++ skb = next; ++ if (netif_tx_queue_stopped(txq) && skb) { ++ rc = NETDEV_TX_BUSY; ++ break; ++ } ++ } ++ ++out: ++ *ret = rc; ++ return skb; ++} ++ ++static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (skb_vlan_tag_present(skb) && ++ !vlan_hw_offload_capable(features, skb->vlan_proto)) ++ skb = __vlan_hwaccel_push_inside(skb); ++ return skb; ++} ++ ++int skb_csum_hwoffload_help(struct sk_buff *skb, ++ const netdev_features_t features) ++{ ++ if (unlikely(skb_csum_is_sctp(skb))) ++ return !!(features & NETIF_F_SCTP_CRC) ? 
0 : ++ skb_crc32c_csum_help(skb); ++ ++ if (features & NETIF_F_HW_CSUM) ++ return 0; ++ ++ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { ++ switch (skb->csum_offset) { ++ case offsetof(struct tcphdr, check): ++ case offsetof(struct udphdr, check): ++ return 0; ++ } ++ } ++ ++ return skb_checksum_help(skb); ++} ++EXPORT_SYMBOL(skb_csum_hwoffload_help); ++ ++static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ netdev_features_t features; ++ ++ features = netif_skb_features(skb); ++ skb = validate_xmit_vlan(skb, features); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ skb = sk_validate_xmit_skb(skb, dev); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ if (netif_needs_gso(skb, features)) { ++ struct sk_buff *segs; ++ ++ segs = skb_gso_segment(skb, features); ++ if (IS_ERR(segs)) { ++ goto out_kfree_skb; ++ } else if (segs) { ++ consume_skb(skb); ++ skb = segs; ++ } ++ } else { ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto out_kfree_skb; ++ ++ /* If packet is not checksummed and device does not ++ * support checksumming for this protocol, complete ++ * checksumming here. ++ */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb->encapsulation) ++ skb_set_inner_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ else ++ skb_set_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ if (skb_csum_hwoffload_help(skb, features)) ++ goto out_kfree_skb; ++ } ++ } ++ ++ skb = validate_xmit_xfrm(skb, features, again); ++ ++ return skb; ++ ++out_kfree_skb: ++ kfree_skb(skb); ++out_null: ++ dev_core_stats_tx_dropped_inc(dev); ++ return NULL; ++} ++ ++struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ struct sk_buff *next, *head = NULL, *tail; ++ ++ for (; skb != NULL; skb = next) { ++ next = skb->next; ++ skb_mark_not_on_list(skb); ++ ++ /* in case skb wont be segmented, point to itself */ ++ skb->prev = skb; ++ ++ skb = validate_xmit_skb(skb, dev, again); ++ if (!skb) ++ continue; ++ ++ if (!head) ++ head = skb; ++ else ++ tail->next = skb; ++ /* If skb was segmented, skb->prev points to ++ * the last segment. If not, it still contains skb. 
++ */ ++ tail = skb->prev; ++ } ++ return head; ++} ++EXPORT_SYMBOL_GPL(validate_xmit_skb_list); ++ ++static void qdisc_pkt_len_init(struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ ++ /* To get more precise estimation of bytes sent on wire, ++ * we add to pkt_len the headers size of all segments ++ */ ++ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { ++ unsigned int hdr_len; ++ u16 gso_segs = shinfo->gso_segs; ++ ++ /* mac layer + network layer */ ++ hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ /* + transport layer */ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ const struct tcphdr *th; ++ struct tcphdr _tcphdr; ++ ++ th = skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_tcphdr), &_tcphdr); ++ if (likely(th)) ++ hdr_len += __tcp_hdrlen(th); ++ } else { ++ struct udphdr _udphdr; ++ ++ if (skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_udphdr), &_udphdr)) ++ hdr_len += sizeof(struct udphdr); ++ } ++ ++ if (shinfo->gso_type & SKB_GSO_DODGY) ++ gso_segs = DIV_ROUND_UP(skb->len - hdr_len, ++ shinfo->gso_size); ++ ++ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; ++ } ++} ++ ++static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, ++ struct sk_buff **to_free, ++ struct netdev_queue *txq) ++{ ++ int rc; ++ ++ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; ++ if (rc == NET_XMIT_SUCCESS) ++ trace_qdisc_enqueue(q, txq, skb); ++ return rc; ++} ++ ++static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, ++ struct net_device *dev, ++ struct netdev_queue *txq) ++{ ++ spinlock_t *root_lock = qdisc_lock(q); ++ struct sk_buff *to_free = NULL; ++ bool contended; ++ int rc; ++ ++ qdisc_calculate_pkt_len(skb, q); ++ ++ if (q->flags & TCQ_F_NOLOCK) { ++ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && ++ qdisc_run_begin(q)) { ++ /* Retest nolock_qdisc_is_empty() within the protection ++ * of q->seqlock to protect from racing with requeuing. ++ */ ++ if (unlikely(!nolock_qdisc_is_empty(q))) { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ ++ goto no_lock_out; ++ } ++ ++ qdisc_bstats_cpu_update(q, skb); ++ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && ++ !nolock_qdisc_is_empty(q)) ++ __qdisc_run(q); ++ ++ qdisc_run_end(q); ++ return NET_XMIT_SUCCESS; ++ } ++ ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ qdisc_run(q); ++ ++no_lock_out: ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, ++ SKB_DROP_REASON_QDISC_DROP); ++ return rc; ++ } ++ ++ /* ++ * Heuristic to force contended enqueues to serialize on a ++ * separate lock before trying to get qdisc main lock. ++ * This permits qdisc->running owner to get the lock more ++ * often and dequeue packets faster. ++ * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit ++ * and then other tasks will only enqueue packets. The packets will be ++ * sent after the qdisc owner is scheduled again. To prevent this ++ * scenario the task always serialize on the lock. 
++ */ ++ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); ++ if (unlikely(contended)) ++ spin_lock(&q->busylock); ++ ++ spin_lock(root_lock); ++ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { ++ __qdisc_drop(skb, &to_free); ++ rc = NET_XMIT_DROP; ++ } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && ++ qdisc_run_begin(q)) { ++ /* ++ * This is a work-conserving queue; there are no old skbs ++ * waiting to be sent out; and the qdisc is not running - ++ * xmit the skb directly. ++ */ ++ ++ qdisc_bstats_update(q, skb); ++ ++ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ } ++ ++ qdisc_run_end(q); ++ rc = NET_XMIT_SUCCESS; ++ } else { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ if (qdisc_run_begin(q)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ } ++ } ++ spin_unlock(root_lock); ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); ++ if (unlikely(contended)) ++ spin_unlock(&q->busylock); ++ return rc; ++} ++ ++#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) ++static void skb_update_prio(struct sk_buff *skb) ++{ ++ const struct netprio_map *map; ++ const struct sock *sk; ++ unsigned int prioidx; ++ ++ if (skb->priority) ++ return; ++ map = rcu_dereference_bh(skb->dev->priomap); ++ if (!map) ++ return; ++ sk = skb_to_full_sk(skb); ++ if (!sk) ++ return; ++ ++ prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); ++ ++ if (prioidx < map->priomap_len) ++ skb->priority = map->priomap[prioidx]; ++} ++#else ++#define skb_update_prio(skb) ++#endif ++ ++/** ++ * dev_loopback_xmit - loop back @skb ++ * @net: network namespace this loopback is happening in ++ * @sk: sk needed to be a netfilter okfn ++ * @skb: buffer to transmit ++ */ ++int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) ++{ ++ skb_reset_mac_header(skb); ++ __skb_pull(skb, skb_network_offset(skb)); ++ skb->pkt_type = PACKET_LOOPBACK; ++ if (skb->ip_summed == CHECKSUM_NONE) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); ++ skb_dst_force(skb); ++ netif_rx(skb); ++ return 0; ++} ++EXPORT_SYMBOL(dev_loopback_xmit); ++ ++#ifdef CONFIG_NET_EGRESS ++static struct sk_buff * ++sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); ++ struct tcf_result cl_res; ++ ++ if (!miniq) ++ return skb; ++ ++ /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ *ret = NET_XMIT_DROP; ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ *ret = NET_XMIT_SUCCESS; ++ consume_skb(skb); ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* No need to push/pop skb's mac_header here on egress! 
*/ ++ skb_do_redirect(skb); ++ *ret = NET_XMIT_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ ++ return skb; ++} ++ ++static struct netdev_queue * ++netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) ++{ ++ int qm = skb_get_queue_mapping(skb); ++ ++ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); ++} ++ ++static bool netdev_xmit_txqueue_skipped(void) ++{ ++ return __this_cpu_read(softnet_data.xmit.skip_txqueue); ++} ++ ++void netdev_xmit_skip_txqueue(bool skip) ++{ ++ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); ++} ++EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); ++#endif /* CONFIG_NET_EGRESS */ ++ ++#ifdef CONFIG_XPS ++static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, ++ struct xps_dev_maps *dev_maps, unsigned int tci) ++{ ++ int tc = netdev_get_prio_tc_map(dev, skb->priority); ++ struct xps_map *map; ++ int queue_index = -1; ++ ++ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) ++ return queue_index; ++ ++ tci *= dev_maps->num_tc; ++ tci += tc; ++ ++ map = rcu_dereference(dev_maps->attr_map[tci]); ++ if (map) { ++ if (map->len == 1) ++ queue_index = map->queues[0]; ++ else ++ queue_index = map->queues[reciprocal_scale( ++ skb_get_hash(skb), map->len)]; ++ if (unlikely(queue_index >= dev->real_num_tx_queues)) ++ queue_index = -1; ++ } ++ return queue_index; ++} ++#endif ++ ++static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++#ifdef CONFIG_XPS ++ struct xps_dev_maps *dev_maps; ++ struct sock *sk = skb->sk; ++ int queue_index = -1; ++ ++ if (!static_key_false(&xps_needed)) ++ return -1; ++ ++ rcu_read_lock(); ++ if (!static_key_false(&xps_rxqs_needed)) ++ goto get_cpus_map; ++ ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); ++ if (dev_maps) { ++ int tci = sk_rx_queue_get(sk); ++ ++ if (tci >= 0) ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ ++get_cpus_map: ++ if (queue_index < 0) { ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); ++ if (dev_maps) { ++ unsigned int tci = skb->sender_cpu - 1; ++ ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return queue_index; ++#else ++ return -1; ++#endif ++} ++ ++u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return 0; ++} ++EXPORT_SYMBOL(dev_pick_tx_zero); ++ ++u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; ++} ++EXPORT_SYMBOL(dev_pick_tx_cpu_id); ++ ++u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ struct sock *sk = skb->sk; ++ int queue_index = sk_tx_queue_get(sk); ++ ++ sb_dev = sb_dev ? 
: dev; ++ ++ if (queue_index < 0 || skb->ooo_okay || ++ queue_index >= dev->real_num_tx_queues) { ++ int new_index = get_xps_queue(dev, sb_dev, skb); ++ ++ if (new_index < 0) ++ new_index = skb_tx_hash(dev, sb_dev, skb); ++ ++ if (queue_index != new_index && sk && ++ sk_fullsock(sk) && ++ rcu_access_pointer(sk->sk_dst_cache)) ++ sk_tx_queue_set(sk, new_index); ++ ++ queue_index = new_index; ++ } ++ ++ return queue_index; ++} ++EXPORT_SYMBOL(netdev_pick_tx); ++ ++struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, ++ struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ int queue_index = 0; ++ ++#ifdef CONFIG_XPS ++ u32 sender_cpu = skb->sender_cpu - 1; ++ ++ if (sender_cpu >= (u32)NR_CPUS) ++ skb->sender_cpu = raw_smp_processor_id() + 1; ++#endif ++ ++ if (dev->real_num_tx_queues != 1) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_select_queue) ++ queue_index = ops->ndo_select_queue(dev, skb, sb_dev); ++ else ++ queue_index = netdev_pick_tx(dev, skb, sb_dev); ++ ++ queue_index = netdev_cap_txqueue(dev, queue_index); ++ } ++ ++ skb_set_queue_mapping(skb, queue_index); ++ return netdev_get_tx_queue(dev, queue_index); ++} ++ ++/** ++ * __dev_queue_xmit() - transmit a buffer ++ * @skb: buffer to transmit ++ * @sb_dev: suboordinate device used for L2 forwarding offload ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling ++ * this function. The function can be called from an interrupt. ++ * ++ * When calling this method, interrupts MUST be enabled. This is because ++ * the BH enable code must have IRQs enabled so that it will not deadlock. ++ * ++ * Regardless of the return value, the skb is consumed, so it is currently ++ * difficult to retry a send to this method. (You can bump the ref count ++ * before sending to hold a reference for retry if you are careful.) ++ * ++ * Return: ++ * * 0 - buffer successfully transmitted ++ * * positive qdisc return code - NET_XMIT_DROP etc. ++ * * negative errno - other errors ++ */ ++int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq = NULL; ++ struct Qdisc *q; ++ int rc = -ENOMEM; ++ bool again = false; ++ ++ skb_reset_mac_header(skb); ++ skb_assert_len(skb); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) ++ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); ++ ++ /* Disable soft irqs for various locks below. Also ++ * stops preemption for RCU. ++ */ ++ rcu_read_lock_bh(); ++ ++ skb_update_prio(skb); ++ ++ qdisc_pkt_len_init(skb); ++#ifdef CONFIG_NET_CLS_ACT ++ skb->tc_at_ingress = 0; ++#endif ++#ifdef CONFIG_NET_EGRESS ++ if (static_branch_unlikely(&egress_needed_key)) { ++ if (nf_hook_egress_active()) { ++ skb = nf_hook_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ } ++ ++ netdev_xmit_skip_txqueue(false); ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ nf_skip_egress(skb, false); ++ ++ if (netdev_xmit_txqueue_skipped()) ++ txq = netdev_tx_queue_mapping(dev, skb); ++ } ++#endif ++ /* If device/qdisc don't need skb->dst, release it right now while ++ * its hot in this cpu cache. 
++ */ ++ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) ++ skb_dst_drop(skb); ++ else ++ skb_dst_force(skb); ++ ++ if (!txq) ++ txq = netdev_core_pick_tx(dev, skb, sb_dev); ++ ++ q = rcu_dereference_bh(txq->qdisc); ++ ++ trace_net_dev_queue(skb); ++ if (q->enqueue) { ++ rc = __dev_xmit_skb(skb, q, dev, txq); ++ goto out; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ * loopback, all the sorts of tunnels... ++ ++ * Really, it is unlikely that netif_tx_lock protection is necessary ++ * here. (f.e. loopback and IP tunnels are clean ignoring statistics ++ * counters.) ++ * However, it is possible, that they rely on protection ++ * made by us here. ++ ++ * Check this and shot the lock. It is not prone from deadlocks. ++ *Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags & IFF_UP) { ++ int cpu = smp_processor_id(); /* ok because BHs are off */ ++ ++ /* Other cpus might concurrently change txq->xmit_lock_owner ++ * to -1 or to their cpu id, but not to our id. ++ */ ++ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { ++ if (dev_xmit_recursion()) ++ goto recursion_alert; ++ ++ skb = validate_xmit_skb(skb, dev, &again); ++ if (!skb) ++ goto out; ++ ++ HARD_TX_LOCK(dev, txq, cpu); ++ ++ if (!netif_xmit_stopped(txq)) { ++ dev_xmit_recursion_inc(); ++ skb = dev_hard_start_xmit(skb, dev, txq, &rc); ++ dev_xmit_recursion_dec(); ++ if (dev_xmit_complete(rc)) { ++ HARD_TX_UNLOCK(dev, txq); ++ goto out; ++ } ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ net_crit_ratelimited("Virtual device %s asks to queue packet!\n", ++ dev->name); ++ } else { ++ /* Recursion is detected! It is possible, ++ * unfortunately ++ */ ++recursion_alert: ++ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", ++ dev->name); ++ } ++ } ++ ++ rc = -ENETDOWN; ++ rcu_read_unlock_bh(); ++ ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return rc; ++out: ++ rcu_read_unlock_bh(); ++ return rc; ++} ++EXPORT_SYMBOL(__dev_queue_xmit); ++ ++int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) ++{ ++ struct net_device *dev = skb->dev; ++ struct sk_buff *orig_skb = skb; ++ struct netdev_queue *txq; ++ int ret = NETDEV_TX_BUSY; ++ bool again = false; ++ ++ if (unlikely(!netif_running(dev) || ++ !netif_carrier_ok(dev))) ++ goto drop; ++ ++ skb = validate_xmit_skb_list(skb, dev, &again); ++ if (skb != orig_skb) ++ goto drop; ++ ++ skb_set_queue_mapping(skb, queue_id); ++ txq = skb_get_tx_queue(dev, skb); ++ ++ local_bh_disable(); ++ ++ dev_xmit_recursion_inc(); ++ HARD_TX_LOCK(dev, txq, smp_processor_id()); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) ++ ret = netdev_start_xmit(skb, dev, txq, false); ++ HARD_TX_UNLOCK(dev, txq); ++ dev_xmit_recursion_dec(); ++ ++ local_bh_enable(); ++ return ret; ++drop: ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return NET_XMIT_DROP; ++} ++EXPORT_SYMBOL(__dev_direct_xmit); ++ ++/************************************************************************* ++ * Receiver routines ++ *************************************************************************/ ++ ++int netdev_max_backlog __read_mostly = 1000; ++EXPORT_SYMBOL(netdev_max_backlog); ++ ++int netdev_tstamp_prequeue __read_mostly = 1; ++unsigned int sysctl_skb_defer_max __read_mostly = 64; ++int netdev_budget __read_mostly = 300; ++/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ ++unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; ++int weight_p __read_mostly = 64; /* old backlog weight */ ++int dev_weight_rx_bias 
__read_mostly = 1; /* bias for backlog weight */ ++int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ ++int dev_rx_weight __read_mostly = 64; ++int dev_tx_weight __read_mostly = 64; ++ ++/* Called with irq disabled */ ++static inline void ____napi_schedule(struct softnet_data *sd, ++ struct napi_struct *napi) ++{ ++ struct task_struct *thread; ++ ++ lockdep_assert_irqs_disabled(); ++ ++ if (test_bit(NAPI_STATE_THREADED, &napi->state)) { ++ /* Paired with smp_mb__before_atomic() in ++ * napi_enable()/dev_set_threaded(). ++ * Use READ_ONCE() to guarantee a complete ++ * read on napi->thread. Only call ++ * wake_up_process() when it's not NULL. ++ */ ++ thread = READ_ONCE(napi->thread); ++ if (thread) { ++ /* Avoid doing set_bit() if the thread is in ++ * INTERRUPTIBLE state, cause napi_thread_wait() ++ * makes sure to proceed with napi polling ++ * if the thread is explicitly woken from here. ++ */ ++ if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) ++ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); ++ wake_up_process(thread); ++ return; ++ } ++ } ++ ++ list_add_tail(&napi->poll_list, &sd->poll_list); ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++} ++ ++#ifdef CONFIG_RPS ++ ++/* One global table that all flow-based protocols share. */ ++struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; ++EXPORT_SYMBOL(rps_sock_flow_table); ++u32 rps_cpu_mask __read_mostly; ++EXPORT_SYMBOL(rps_cpu_mask); ++ ++struct static_key_false rps_needed __read_mostly; ++EXPORT_SYMBOL(rps_needed); ++struct static_key_false rfs_needed __read_mostly; ++EXPORT_SYMBOL(rfs_needed); ++ ++static struct rps_dev_flow * ++set_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow *rflow, u16 next_cpu) ++{ ++ if (next_cpu < nr_cpu_ids) { ++#ifdef CONFIG_RFS_ACCEL ++ struct netdev_rx_queue *rxqueue; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *old_rflow; ++ u32 flow_id; ++ u16 rxq_index; ++ int rc; ++ ++ /* Should we steer this flow to a different hardware queue? */ ++ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || ++ !(dev->features & NETIF_F_NTUPLE)) ++ goto out; ++ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); ++ if (rxq_index == skb_get_rx_queue(skb)) ++ goto out; ++ ++ rxqueue = dev->_rx + rxq_index; ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (!flow_table) ++ goto out; ++ flow_id = skb_get_hash(skb) & flow_table->mask; ++ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, ++ rxq_index, flow_id); ++ if (rc < 0) ++ goto out; ++ old_rflow = rflow; ++ rflow = &flow_table->flows[flow_id]; ++ rflow->filter = rc; ++ if (old_rflow->filter == rflow->filter) ++ old_rflow->filter = RPS_NO_FILTER; ++ out: ++#endif ++ rflow->last_qtail = ++ per_cpu(softnet_data, next_cpu).input_queue_head; ++ } ++ ++ rflow->cpu = next_cpu; ++ return rflow; ++} ++ ++/* ++ * get_rps_cpu is called from netif_receive_skb and returns the target ++ * CPU from the RPS map of the receiving queue for a given skb. ++ * rcu_read_lock must be held on entry. 
++ */ ++static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow **rflowp) ++{ ++ const struct rps_sock_flow_table *sock_flow_table; ++ struct netdev_rx_queue *rxqueue = dev->_rx; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_map *map; ++ int cpu = -1; ++ u32 tcpu; ++ u32 hash; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ goto done; ++ } ++ rxqueue += index; ++ } ++ ++ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ ++ ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ map = rcu_dereference(rxqueue->rps_map); ++ if (!flow_table && !map) ++ goto done; ++ ++ skb_reset_network_header(skb); ++ hash = skb_get_hash(skb); ++ if (!hash) ++ goto done; ++ ++ sock_flow_table = rcu_dereference(rps_sock_flow_table); ++ if (flow_table && sock_flow_table) { ++ struct rps_dev_flow *rflow; ++ u32 next_cpu; ++ u32 ident; ++ ++ /* First check into global flow table if there is a match */ ++ ident = sock_flow_table->ents[hash & sock_flow_table->mask]; ++ if ((ident ^ hash) & ~rps_cpu_mask) ++ goto try_rps; ++ ++ next_cpu = ident & rps_cpu_mask; ++ ++ /* OK, now we know there is a match, ++ * we can look at the local (per receive queue) flow table ++ */ ++ rflow = &flow_table->flows[hash & flow_table->mask]; ++ tcpu = rflow->cpu; ++ ++ /* ++ * If the desired CPU (where last recvmsg was done) is ++ * different from current CPU (one in the rx-queue flow ++ * table entry), switch if one of the following holds: ++ * - Current CPU is unset (>= nr_cpu_ids). ++ * - Current CPU is offline. ++ * - The current CPU's queue tail has advanced beyond the ++ * last packet that was enqueued using this table entry. ++ * This guarantees that all previous packets for the flow ++ * have been dequeued, thus preserving in order delivery. ++ */ ++ if (unlikely(tcpu != next_cpu) && ++ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ++ ((int)(per_cpu(softnet_data, tcpu).input_queue_head - ++ rflow->last_qtail)) >= 0)) { ++ tcpu = next_cpu; ++ rflow = set_rps_cpu(dev, skb, rflow, next_cpu); ++ } ++ ++ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { ++ *rflowp = rflow; ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++try_rps: ++ ++ if (map) { ++ tcpu = map->cpus[reciprocal_scale(hash, map->len)]; ++ if (cpu_online(tcpu)) { ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++done: ++ return cpu; ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++ ++/** ++ * rps_may_expire_flow - check whether an RFS hardware filter may be removed ++ * @dev: Device on which the filter was set ++ * @rxq_index: RX queue index ++ * @flow_id: Flow ID passed to ndo_rx_flow_steer() ++ * @filter_id: Filter ID returned by ndo_rx_flow_steer() ++ * ++ * Drivers that implement ndo_rx_flow_steer() should periodically call ++ * this function for each installed filter and remove the filters for ++ * which it returns %true. 
++ */ ++bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id) ++{ ++ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *rflow; ++ bool expire = true; ++ unsigned int cpu; ++ ++ rcu_read_lock(); ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (flow_table && flow_id <= flow_table->mask) { ++ rflow = &flow_table->flows[flow_id]; ++ cpu = READ_ONCE(rflow->cpu); ++ if (rflow->filter == filter_id && cpu < nr_cpu_ids && ++ ((int)(per_cpu(softnet_data, cpu).input_queue_head - ++ rflow->last_qtail) < ++ (int)(10 * flow_table->mask))) ++ expire = false; ++ } ++ rcu_read_unlock(); ++ return expire; ++} ++EXPORT_SYMBOL(rps_may_expire_flow); ++ ++#endif /* CONFIG_RFS_ACCEL */ ++ ++/* Called from hardirq (IPI) context */ ++static void rps_trigger_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ ____napi_schedule(sd, &sd->backlog); ++ sd->received_rps++; ++} ++ ++#endif /* CONFIG_RPS */ ++ ++/* Called from hardirq (IPI) context */ ++static void trigger_rx_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ smp_store_release(&sd->defer_ipi_scheduled, 0); ++} ++ ++/* ++ * Check if this softnet_data structure is another cpu one ++ * If yes, queue it to our IPI list and return 1 ++ * If no, return 0 ++ */ ++static int napi_schedule_rps(struct softnet_data *sd) ++{ ++ struct softnet_data *mysd = this_cpu_ptr(&softnet_data); ++ ++#ifdef CONFIG_RPS ++ if (sd != mysd) { ++ sd->rps_ipi_next = mysd->rps_ipi_list; ++ mysd->rps_ipi_list = sd; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ return 1; ++ } ++#endif /* CONFIG_RPS */ ++ __napi_schedule_irqoff(&mysd->backlog); ++ return 0; ++} ++ ++#ifdef CONFIG_NET_FLOW_LIMIT ++int netdev_flow_limit_table_len __read_mostly = (1 << 12); ++#endif ++ ++static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) ++{ ++#ifdef CONFIG_NET_FLOW_LIMIT ++ struct sd_flow_limit *fl; ++ struct softnet_data *sd; ++ unsigned int old_flow, new_flow; ++ ++ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) ++ return false; ++ ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rcu_read_lock(); ++ fl = rcu_dereference(sd->flow_limit); ++ if (fl) { ++ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); ++ old_flow = fl->history[fl->history_head]; ++ fl->history[fl->history_head] = new_flow; ++ ++ fl->history_head++; ++ fl->history_head &= FLOW_LIMIT_HISTORY - 1; ++ ++ if (likely(fl->buckets[old_flow])) ++ fl->buckets[old_flow]--; ++ ++ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { ++ fl->count++; ++ rcu_read_unlock(); ++ return true; ++ } ++ } ++ rcu_read_unlock(); ++#endif ++ return false; ++} ++ ++/* ++ * enqueue_to_backlog is called to queue an skb to a per CPU backlog ++ * queue (may be a remote CPU queue). 
++ */ ++static int enqueue_to_backlog(struct sk_buff *skb, int cpu, ++ unsigned int *qtail) ++{ ++ enum skb_drop_reason reason; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int qlen; ++ ++ reason = SKB_DROP_REASON_NOT_SPECIFIED; ++ sd = &per_cpu(softnet_data, cpu); ++ ++ rps_lock_irqsave(sd, &flags); ++ if (!netif_running(skb->dev)) ++ goto drop; ++ qlen = skb_queue_len(&sd->input_pkt_queue); ++ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { ++ if (qlen) { ++enqueue: ++ __skb_queue_tail(&sd->input_pkt_queue, skb); ++ input_queue_tail_incr_save(sd, qtail); ++ rps_unlock_irq_restore(sd, &flags); ++ return NET_RX_SUCCESS; ++ } ++ ++ /* Schedule NAPI for backlog device ++ * We can use non atomic operation since we own the queue lock ++ */ ++ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) ++ napi_schedule_rps(sd); ++ goto enqueue; ++ } ++ reason = SKB_DROP_REASON_CPU_BACKLOG; ++ ++drop: ++ sd->dropped++; ++ rps_unlock_irq_restore(sd, &flags); ++ ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ kfree_skb_reason(skb, reason); ++ return NET_RX_DROP; ++} ++ ++static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_rx_queue *rxqueue; ++ ++ rxqueue = dev->_rx; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ ++ return rxqueue; /* Return first rxqueue */ ++ } ++ rxqueue += index; ++ } ++ return rxqueue; ++} ++ ++u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ void *orig_data, *orig_data_end, *hard_start; ++ struct netdev_rx_queue *rxqueue; ++ bool orig_bcast, orig_host; ++ u32 mac_len, frame_sz; ++ __be16 orig_eth_type; ++ struct ethhdr *eth; ++ u32 metalen, act; ++ int off; ++ ++ /* The XDP program wants to see the packet starting at the MAC ++ * header. 
++ */ ++ mac_len = skb->data - skb_mac_header(skb); ++ hard_start = skb->data - skb_headroom(skb); ++ ++ /* SKB "head" area always have tailroom for skb_shared_info */ ++ frame_sz = (void *)skb_end_pointer(skb) - hard_start; ++ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ rxqueue = netif_get_rxqueue(skb); ++ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); ++ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, ++ skb_headlen(skb) + mac_len, true); ++ ++ orig_data_end = xdp->data_end; ++ orig_data = xdp->data; ++ eth = (struct ethhdr *)xdp->data; ++ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); ++ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); ++ orig_eth_type = eth->h_proto; ++ ++ act = bpf_prog_run_xdp(xdp_prog, xdp); ++ ++ /* check if bpf_xdp_adjust_head was used */ ++ off = xdp->data - orig_data; ++ if (off) { ++ if (off > 0) ++ __skb_pull(skb, off); ++ else if (off < 0) ++ __skb_push(skb, -off); ++ ++ skb->mac_header += off; ++ skb_reset_network_header(skb); ++ } ++ ++ /* check if bpf_xdp_adjust_tail was used */ ++ off = xdp->data_end - orig_data_end; ++ if (off != 0) { ++ skb_set_tail_pointer(skb, xdp->data_end - xdp->data); ++ skb->len += off; /* positive on grow, negative on shrink */ ++ } ++ ++ /* check if XDP changed eth hdr such SKB needs update */ ++ eth = (struct ethhdr *)xdp->data; ++ if ((orig_eth_type != eth->h_proto) || ++ (orig_host != ether_addr_equal_64bits(eth->h_dest, ++ skb->dev->dev_addr)) || ++ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { ++ __skb_push(skb, ETH_HLEN); ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ } ++ ++ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull ++ * before calling us again on redirect path. We do not call do_redirect ++ * as we leave that up to the caller. ++ * ++ * Caller is responsible for managing lifetime of skb (i.e. calling ++ * kfree_skb in response to actions it cannot handle/XDP_DROP). ++ */ ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ __skb_push(skb, mac_len); ++ break; ++ case XDP_PASS: ++ metalen = xdp->data - xdp->data_meta; ++ if (metalen) ++ skb_metadata_set(skb, metalen); ++ break; ++ } ++ ++ return act; ++} ++ ++static u32 netif_receive_generic_xdp(struct sk_buff *skb, ++ struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ u32 act = XDP_DROP; ++ ++ /* Reinjected packets coming from act_mirred or similar should ++ * not get XDP generic processing. ++ */ ++ if (skb_is_redirected(skb)) ++ return XDP_PASS; ++ ++ /* XDP packets must be linear and must have sufficient headroom ++ * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also ++ * native XDP provides, thus we need to do it here as well. ++ */ ++ if (skb_cloned(skb) || skb_is_nonlinear(skb) || ++ skb_headroom(skb) < XDP_PACKET_HEADROOM) { ++ int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); ++ int troom = skb->tail + skb->data_len - skb->end; ++ ++ /* In case we have to go down the path and also linearize, ++ * then lets do the pskb_expand_head() work just once here. ++ */ ++ if (pskb_expand_head(skb, ++ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, ++ troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC)) ++ goto do_drop; ++ if (skb_linearize(skb)) ++ goto do_drop; ++ } ++ ++ act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ case XDP_PASS: ++ break; ++ default: ++ bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_DROP: ++ do_drop: ++ kfree_skb(skb); ++ break; ++ } ++ ++ return act; ++} ++ ++/* When doing generic XDP we have to bypass the qdisc layer and the ++ * network taps in order to match in-driver-XDP behavior. This also means ++ * that XDP packets are able to starve other packets going through a qdisc, ++ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX ++ * queues, so they do not have this starvation issue. ++ */ ++void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq; ++ bool free_skb = true; ++ int cpu, rc; ++ ++ txq = netdev_core_pick_tx(dev, skb, NULL); ++ cpu = smp_processor_id(); ++ HARD_TX_LOCK(dev, txq, cpu); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) { ++ rc = netdev_start_xmit(skb, dev, txq, 0); ++ if (dev_xmit_complete(rc)) ++ free_skb = false; ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ if (free_skb) { ++ trace_xdp_exception(dev, xdp_prog, XDP_TX); ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb(skb); ++ } ++} ++ ++static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); ++ ++int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) ++{ ++ if (xdp_prog) { ++ struct xdp_buff xdp; ++ u32 act; ++ int err; ++ ++ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); ++ if (act != XDP_PASS) { ++ switch (act) { ++ case XDP_REDIRECT: ++ err = xdp_do_generic_redirect(skb->dev, skb, ++ &xdp, xdp_prog); ++ if (err) ++ goto out_redir; ++ break; ++ case XDP_TX: ++ generic_xdp_tx(skb, xdp_prog); ++ break; ++ } ++ return XDP_DROP; ++ } ++ } ++ return XDP_PASS; ++out_redir: ++ kfree_skb_reason(skb, SKB_DROP_REASON_XDP); ++ return XDP_DROP; ++} ++EXPORT_SYMBOL_GPL(do_xdp_generic); ++ ++static int netif_rx_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_rx(skb); ++ ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu; ++ ++ rcu_read_lock(); ++ ++ cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ if (cpu < 0) ++ cpu = smp_processor_id(); ++ ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ ++ rcu_read_unlock(); ++ } else ++#endif ++ { ++ unsigned int qtail; ++ ++ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); ++ } ++ return ret; ++} ++ ++/** ++ * __netif_rx - Slightly optimized version of netif_rx ++ * @skb: buffer to post ++ * ++ * This behaves as netif_rx except that it does not disable bottom halves. ++ * As a result this function may only be invoked from the interrupt context ++ * (either hard or soft interrupt). 
++ */ ++int __netif_rx(struct sk_buff *skb) ++{ ++ int ret; ++ ++ lockdep_assert_once(hardirq_count() | softirq_count()); ++ ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ return ret; ++} ++EXPORT_SYMBOL(__netif_rx); ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process via the backlog NAPI device. It ++ * always succeeds. The buffer may be dropped during processing for ++ * congestion control or by the protocol layers. ++ * The network buffer is passed via the backlog NAPI device. Modern NIC ++ * driver should use NAPI and GRO. ++ * This function can used from interrupt and from process context. The ++ * caller from process context must not disable interrupts before invoking ++ * this function. ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ */ ++int netif_rx(struct sk_buff *skb) ++{ ++ bool need_bh_off = !(hardirq_count() | softirq_count()); ++ int ret; ++ ++ if (need_bh_off) ++ local_bh_disable(); ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ if (need_bh_off) ++ local_bh_enable(); ++ return ret; ++} ++EXPORT_SYMBOL(netif_rx); ++ ++static __latent_entropy void net_tx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ ++ if (sd->completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = sd->completion_queue; ++ sd->completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist) { ++ struct sk_buff *skb = clist; ++ ++ clist = clist->next; ++ ++ WARN_ON(refcount_read(&skb->users)); ++ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) ++ trace_consume_skb(skb); ++ else ++ trace_kfree_skb(skb, net_tx_action, ++ SKB_DROP_REASON_NOT_SPECIFIED); ++ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) ++ __kfree_skb(skb); ++ else ++ __kfree_skb_defer(skb); ++ } ++ } ++ ++ if (sd->output_queue) { ++ struct Qdisc *head; ++ ++ local_irq_disable(); ++ head = sd->output_queue; ++ sd->output_queue = NULL; ++ sd->output_queue_tailp = &sd->output_queue; ++ local_irq_enable(); ++ ++ rcu_read_lock(); ++ ++ while (head) { ++ struct Qdisc *q = head; ++ spinlock_t *root_lock = NULL; ++ ++ head = head->next_sched; ++ ++ /* We need to make sure head->next_sched is read ++ * before clearing __QDISC_STATE_SCHED ++ */ ++ smp_mb__before_atomic(); ++ ++ if (!(q->flags & TCQ_F_NOLOCK)) { ++ root_lock = qdisc_lock(q); ++ spin_lock(root_lock); ++ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, ++ &q->state))) { ++ /* There is a synchronize_net() between ++ * STATE_DEACTIVATED flag being set and ++ * qdisc_reset()/some_qdisc_is_busy() in ++ * dev_deactivate(), so we can safely bail out ++ * early here to avoid data race between ++ * qdisc_deactivate() and some_qdisc_is_busy() ++ * for lockless qdisc. 
++ */ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ continue; ++ } ++ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ qdisc_run(q); ++ if (root_lock) ++ spin_unlock(root_lock); ++ } ++ ++ rcu_read_unlock(); ++ } ++ ++ xfrm_dev_backlog(sd); ++} ++ ++#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) ++/* This hook is defined here for ATM LANE */ ++int (*br_fdb_test_addr_hook)(struct net_device *dev, ++ unsigned char *addr) __read_mostly; ++EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); ++#endif ++ ++static inline struct sk_buff * ++sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, ++ struct net_device *orig_dev, bool *another) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); ++ struct tcf_result cl_res; ++ ++ /* If there's at least one ingress present somewhere (so ++ * we get here via enabled static key), remaining devices ++ * that are not configured with an ingress qdisc will bail ++ * out here. ++ */ ++ if (!miniq) ++ return skb; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ skb->tc_at_ingress = 1; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); ++ *ret = NET_RX_DROP; ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ consume_skb(skb); ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* skb_mac_header check was done by cls/act_bpf, so ++ * we can safely push the L2 header back before ++ * redirecting to another netdev ++ */ ++ __skb_push(skb, skb->mac_len); ++ if (skb_do_redirect(skb) == -EAGAIN) { ++ __skb_pull(skb, skb->mac_len); ++ *another = true; ++ break; ++ } ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_CONSUMED: ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ return skb; ++} ++ ++/** ++ * netdev_is_rx_handler_busy - check if receive handler is registered ++ * @dev: device to check ++ * ++ * Check if a receive handler is already registered for a given device. ++ * Return true if there one. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++bool netdev_is_rx_handler_busy(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ return dev && rtnl_dereference(dev->rx_handler); ++} ++EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); ++ ++/** ++ * netdev_rx_handler_register - register receive handler ++ * @dev: device to register a handler for ++ * @rx_handler: receive handler to register ++ * @rx_handler_data: data pointer that is used by rx handler ++ * ++ * Register a receive handler for a device. This handler will then be ++ * called from __netif_receive_skb. A negative errno code is returned ++ * on a failure. ++ * ++ * The caller must hold the rtnl_mutex. ++ * ++ * For a general description of rx_handler, see enum rx_handler_result. 
++ */ ++int netdev_rx_handler_register(struct net_device *dev, ++ rx_handler_func_t *rx_handler, ++ void *rx_handler_data) ++{ ++ if (netdev_is_rx_handler_busy(dev)) ++ return -EBUSY; ++ ++ if (dev->priv_flags & IFF_NO_RX_HANDLER) ++ return -EINVAL; ++ ++ /* Note: rx_handler_data must be set before rx_handler */ ++ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); ++ rcu_assign_pointer(dev->rx_handler, rx_handler); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_register); ++ ++/** ++ * netdev_rx_handler_unregister - unregister receive handler ++ * @dev: device to unregister a handler from ++ * ++ * Unregister a receive handler from a device. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++void netdev_rx_handler_unregister(struct net_device *dev) ++{ ++ ++ ASSERT_RTNL(); ++ RCU_INIT_POINTER(dev->rx_handler, NULL); ++ /* a reader seeing a non NULL rx_handler in a rcu_read_lock() ++ * section has a guarantee to see a non NULL rx_handler_data ++ * as well. ++ */ ++ synchronize_net(); ++ RCU_INIT_POINTER(dev->rx_handler_data, NULL); ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); ++ ++/* ++ * Limit the use of PFMEMALLOC reserves to those protocols that implement ++ * the special handling of PFMEMALLOC skbs. ++ */ ++static bool skb_pfmemalloc_protocol(struct sk_buff *skb) ++{ ++ switch (skb->protocol) { ++ case htons(ETH_P_ARP): ++ case htons(ETH_P_IP): ++ case htons(ETH_P_IPV6): ++ case htons(ETH_P_8021Q): ++ case htons(ETH_P_8021AD): ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, ++ int *ret, struct net_device *orig_dev) ++{ ++ if (nf_hook_ingress_active(skb)) { ++ int ingress_retval; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ rcu_read_lock(); ++ ingress_retval = nf_hook_ingress(skb); ++ rcu_read_unlock(); ++ return ingress_retval; ++ } ++ return 0; ++} ++ ++static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, ++ struct packet_type **ppt_prev) ++{ ++ struct packet_type *ptype, *pt_prev; ++ rx_handler_func_t *rx_handler; ++ struct sk_buff *skb = *pskb; ++ struct net_device *orig_dev; ++ bool deliver_exact = false; ++ int ret = NET_RX_DROP; ++ __be16 type; ++ ++ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_receive_skb(skb); ++ ++ orig_dev = skb->dev; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ pt_prev = NULL; ++ ++another_round: ++ skb->skb_iif = skb->dev->ifindex; ++ ++ __this_cpu_inc(softnet_data.processed); ++ ++ if (static_branch_unlikely(&generic_xdp_needed_key)) { ++ int ret2; ++ ++ migrate_disable(); ++ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); ++ migrate_enable(); ++ ++ if (ret2 != XDP_PASS) { ++ ret = NET_RX_DROP; ++ goto out; ++ } ++ } ++ ++ if (eth_type_vlan(skb->protocol)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ } ++ ++ if (skb_skip_tc_classify(skb)) ++ goto skip_classify; ++ ++ if (pfmemalloc) ++ goto skip_taps; ++ ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++skip_taps: ++#ifdef CONFIG_NET_INGRESS ++ if 
(static_branch_unlikely(&ingress_needed_key)) { ++ bool another = false; ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, ++ &another); ++ if (another) ++ goto another_round; ++ if (!skb) ++ goto out; ++ ++ nf_skip_egress(skb, false); ++ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) ++ goto out; ++ } ++#endif ++ skb_reset_redirect(skb); ++skip_classify: ++ if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) ++ goto drop; ++ ++ if (skb_vlan_tag_present(skb)) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ if (vlan_do_receive(&skb)) ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ } ++ ++ rx_handler = rcu_dereference(skb->dev->rx_handler); ++ if (rx_handler) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ switch (rx_handler(&skb)) { ++ case RX_HANDLER_CONSUMED: ++ ret = NET_RX_SUCCESS; ++ goto out; ++ case RX_HANDLER_ANOTHER: ++ goto another_round; ++ case RX_HANDLER_EXACT: ++ deliver_exact = true; ++ break; ++ case RX_HANDLER_PASS: ++ break; ++ default: ++ BUG(); ++ } ++ } ++ ++ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { ++check_vlan_id: ++ if (skb_vlan_tag_get_id(skb)) { ++ /* Vlan id is non 0 and vlan_do_receive() above couldn't ++ * find vlan device. ++ */ ++ skb->pkt_type = PACKET_OTHERHOST; ++ } else if (eth_type_vlan(skb->protocol)) { ++ /* Outer header is 802.1P with vlan 0, inner header is ++ * 802.1Q or 802.1AD and vlan_do_receive() above could ++ * not find vlan dev for vlan id 0. ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ if (vlan_do_receive(&skb)) ++ /* After stripping off 802.1P header with vlan 0 ++ * vlan dev is found for inner header. ++ */ ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ else ++ /* We have stripped outer 802.1P vlan 0 header. ++ * But could not find vlan dev. ++ * check again for vlan id to set OTHERHOST. ++ */ ++ goto check_vlan_id; ++ } ++ /* Note: we might in the future use prio bits ++ * and set skb->priority like in vlan_do_receive() ++ * For the time being, just ignore Priority Code Point ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ type = skb->protocol; ++ ++ /* deliver only exact match when indicated */ ++ if (likely(!deliver_exact)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &ptype_base[ntohs(type) & ++ PTYPE_HASH_MASK]); ++ } ++ ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &orig_dev->ptype_specific); ++ ++ if (unlikely(skb->dev != orig_dev)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &skb->dev->ptype_specific); ++ } ++ ++ if (pt_prev) { ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ goto drop; ++ *ppt_prev = pt_prev; ++ } else { ++drop: ++ if (!deliver_exact) ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ else ++ dev_core_stats_rx_nohandler_inc(skb->dev); ++ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++out: ++ /* The invariant here is that if *ppt_prev is not NULL ++ * then skb should also be non-NULL. ++ * ++ * Apparently *ppt_prev assignment above holds this invariant due to ++ * skb dereferencing near it. 
++ */ ++ *pskb = skb; ++ return ret; ++} ++ ++static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) ++{ ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ int ret; ++ ++ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (pt_prev) ++ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, ++ skb->dev, pt_prev, orig_dev); ++ return ret; ++} ++ ++/** ++ * netif_receive_skb_core - special purpose version of netif_receive_skb ++ * @skb: buffer to process ++ * ++ * More direct receive version of netif_receive_skb(). It should ++ * only be used by callers that have a need to skip RPS and Generic XDP. ++ * Caller must also take care of handling if ``(page_is_)pfmemalloc``. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb_core(struct sk_buff *skb) ++{ ++ int ret; ++ ++ rcu_read_lock(); ++ ret = __netif_receive_skb_one_core(skb, false); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb_core); ++ ++static inline void __netif_receive_skb_list_ptype(struct list_head *head, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ struct sk_buff *skb, *next; ++ ++ if (!pt_prev) ++ return; ++ if (list_empty(head)) ++ return; ++ if (pt_prev->list_func != NULL) ++ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ++ ip_list_rcv, head, pt_prev, orig_dev); ++ else ++ list_for_each_entry_safe(skb, next, head, list) { ++ skb_list_del_init(skb); ++ pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++ } ++} ++ ++static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) ++{ ++ /* Fast-path assumptions: ++ * - There is no RX handler. ++ * - Only one packet_type matches. ++ * If either of these fails, we will end up doing some per-packet ++ * processing in-line, then handling the 'last ptype' for the whole ++ * sublist. This can't cause out-of-order delivery to any single ptype, ++ * because the 'last ptype' must be constant across the sublist, and all ++ * other ptypes are handled per-packet. 
++ */ ++ /* Current (common) ptype of sublist */ ++ struct packet_type *pt_curr = NULL; ++ /* Current (common) orig_dev of sublist */ ++ struct net_device *od_curr = NULL; ++ struct list_head sublist; ++ struct sk_buff *skb, *next; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ ++ skb_list_del_init(skb); ++ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (!pt_prev) ++ continue; ++ if (pt_curr != pt_prev || od_curr != orig_dev) { ++ /* dispatch old sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++ /* start new sublist */ ++ INIT_LIST_HEAD(&sublist); ++ pt_curr = pt_prev; ++ od_curr = orig_dev; ++ } ++ list_add_tail(&skb->list, &sublist); ++ } ++ ++ /* dispatch final sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++} ++ ++static int __netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { ++ unsigned int noreclaim_flag; ++ ++ /* ++ * PFMEMALLOC skbs are special, they should ++ * - be delivered to SOCK_MEMALLOC sockets only ++ * - stay away from userspace ++ * - have bounded memory usage ++ * ++ * Use PF_MEMALLOC as this saves us from propagating the allocation ++ * context down to all allocation sites. ++ */ ++ noreclaim_flag = memalloc_noreclaim_save(); ++ ret = __netif_receive_skb_one_core(skb, true); ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } else ++ ret = __netif_receive_skb_one_core(skb, false); ++ ++ return ret; ++} ++ ++static void __netif_receive_skb_list(struct list_head *head) ++{ ++ unsigned long noreclaim_flag = 0; ++ struct sk_buff *skb, *next; ++ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
*/ ++ ++ list_for_each_entry_safe(skb, next, head, list) { ++ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { ++ struct list_head sublist; ++ ++ /* Handle the previous sublist */ ++ list_cut_before(&sublist, head, &skb->list); ++ if (!list_empty(&sublist)) ++ __netif_receive_skb_list_core(&sublist, pfmemalloc); ++ pfmemalloc = !pfmemalloc; ++ /* See comments in __netif_receive_skb */ ++ if (pfmemalloc) ++ noreclaim_flag = memalloc_noreclaim_save(); ++ else ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } ++ } ++ /* Handle the remaining sublist */ ++ if (!list_empty(head)) ++ __netif_receive_skb_list_core(head, pfmemalloc); ++ /* Restore pflags */ ++ if (pfmemalloc) ++ memalloc_noreclaim_restore(noreclaim_flag); ++} ++ ++static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); ++ struct bpf_prog *new = xdp->prog; ++ int ret = 0; ++ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ rcu_assign_pointer(dev->xdp_prog, new); ++ if (old) ++ bpf_prog_put(old); ++ ++ if (old && !new) { ++ static_branch_dec(&generic_xdp_needed_key); ++ } else if (new && !old) { ++ static_branch_inc(&generic_xdp_needed_key); ++ dev_disable_lro(dev); ++ dev_disable_gro_hw(dev); ++ } ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int netif_receive_skb_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ if (skb_defer_rx_timestamp(skb)) ++ return NET_RX_SUCCESS; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ rcu_read_unlock(); ++ return ret; ++ } ++ } ++#endif ++ ret = __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ return ret; ++} ++ ++void netif_receive_skb_list_internal(struct list_head *head) ++{ ++ struct sk_buff *skb, *next; ++ struct list_head sublist; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ skb_list_del_init(skb); ++ if (!skb_defer_rx_timestamp(skb)) ++ list_add_tail(&skb->list, &sublist); ++ } ++ list_splice_init(&sublist, head); ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ /* Will be handled, remove from list */ ++ skb_list_del_init(skb); ++ enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ } ++ } ++ } ++#endif ++ __netif_receive_skb_list(head); ++ rcu_read_unlock(); ++} ++ ++/** ++ * netif_receive_skb - process receive buffer from network ++ * @skb: buffer to process ++ * ++ * netif_receive_skb() is the main receive data processing function. ++ * It always succeeds. The buffer may be dropped during processing ++ * for congestion control or by the protocol layers. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. 
++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ trace_netif_receive_skb_entry(skb); ++ ++ ret = netif_receive_skb_internal(skb); ++ trace_netif_receive_skb_exit(ret); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb); ++ ++/** ++ * netif_receive_skb_list - process many receive buffers from network ++ * @head: list of skbs to process. ++ * ++ * Since return value of netif_receive_skb() is normally ignored, and ++ * wouldn't be meaningful for a list, this function returns void. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ */ ++void netif_receive_skb_list(struct list_head *head) ++{ ++ struct sk_buff *skb; ++ ++ if (list_empty(head)) ++ return; ++ if (trace_netif_receive_skb_list_entry_enabled()) { ++ list_for_each_entry(skb, head, list) ++ trace_netif_receive_skb_list_entry(skb); ++ } ++ netif_receive_skb_list_internal(head); ++ trace_netif_receive_skb_list_exit(0); ++} ++EXPORT_SYMBOL(netif_receive_skb_list); ++ ++static DEFINE_PER_CPU(struct work_struct, flush_works); ++ ++/* Network device is going away, flush any packets still pending */ ++static void flush_backlog(struct work_struct *work) ++{ ++ struct sk_buff *skb, *tmp; ++ struct softnet_data *sd; ++ ++ local_bh_disable(); ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rps_lock_irq_disable(sd); ++ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->input_pkt_queue); ++ dev_kfree_skb_irq(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ rps_unlock_irq_enable(sd); ++ ++ skb_queue_walk_safe(&sd->process_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->process_queue); ++ kfree_skb(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ local_bh_enable(); ++} ++ ++static bool flush_required(int cpu) ++{ ++#if IS_ENABLED(CONFIG_RPS) ++ struct softnet_data *sd = &per_cpu(softnet_data, cpu); ++ bool do_flush; ++ ++ rps_lock_irq_disable(sd); ++ ++ /* as insertion into process_queue happens with the rps lock held, ++ * process_queue access may race only with dequeue ++ */ ++ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || ++ !skb_queue_empty_lockless(&sd->process_queue); ++ rps_unlock_irq_enable(sd); ++ ++ return do_flush; ++#endif ++ /* without RPS we can't safely check input_pkt_queue: during a ++ * concurrent remote skb_queue_splice() we can detect as empty both ++ * input_pkt_queue and process_queue even if the latter could end-up ++ * containing a lot of packets. 
++ */ ++ return true; ++} ++ ++static void flush_all_backlogs(void) ++{ ++ static cpumask_t flush_cpus; ++ unsigned int cpu; ++ ++ /* since we are under rtnl lock protection we can use static data ++ * for the cpumask and avoid allocating on stack the possibly ++ * large mask ++ */ ++ ASSERT_RTNL(); ++ ++ cpus_read_lock(); ++ ++ cpumask_clear(&flush_cpus); ++ for_each_online_cpu(cpu) { ++ if (flush_required(cpu)) { ++ queue_work_on(cpu, system_highpri_wq, ++ per_cpu_ptr(&flush_works, cpu)); ++ cpumask_set_cpu(cpu, &flush_cpus); ++ } ++ } ++ ++ /* we can have in flight packet[s] on the cpus we are not flushing, ++ * synchronize_net() in unregister_netdevice_many() will take care of ++ * them ++ */ ++ for_each_cpu(cpu, &flush_cpus) ++ flush_work(per_cpu_ptr(&flush_works, cpu)); ++ ++ cpus_read_unlock(); ++} ++ ++static void net_rps_send_ipi(struct softnet_data *remsd) ++{ ++#ifdef CONFIG_RPS ++ while (remsd) { ++ struct softnet_data *next = remsd->rps_ipi_next; ++ ++ if (cpu_online(remsd->cpu)) ++ smp_call_function_single_async(remsd->cpu, &remsd->csd); ++ remsd = next; ++ } ++#endif ++} ++ ++/* ++ * net_rps_action_and_irq_enable sends any pending IPI's for rps. ++ * Note: called with local irq disabled, but exits with local irq enabled. ++ */ ++static void net_rps_action_and_irq_enable(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ struct softnet_data *remsd = sd->rps_ipi_list; ++ ++ if (remsd) { ++ sd->rps_ipi_list = NULL; ++ ++ local_irq_enable(); ++ ++ /* Send pending IPI's to kick RPS processing on remote cpus. */ ++ net_rps_send_ipi(remsd); ++ } else ++#endif ++ local_irq_enable(); ++} ++ ++static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ return sd->rps_ipi_list != NULL; ++#else ++ return false; ++#endif ++} ++ ++static int process_backlog(struct napi_struct *napi, int quota) ++{ ++ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); ++ bool again = true; ++ int work = 0; ++ ++ /* Check if we have pending ipi, its better to send them now, ++ * not waiting net_rx_action() end. ++ */ ++ if (sd_has_rps_ipi_waiting(sd)) { ++ local_irq_disable(); ++ net_rps_action_and_irq_enable(sd); ++ } ++ ++ napi->weight = READ_ONCE(dev_rx_weight); ++ while (again) { ++ struct sk_buff *skb; ++ ++ while ((skb = __skb_dequeue(&sd->process_queue))) { ++ rcu_read_lock(); ++ __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ input_queue_head_incr(sd); ++ if (++work >= quota) ++ return work; ++ ++ } ++ ++ rps_lock_irq_disable(sd); ++ if (skb_queue_empty(&sd->input_pkt_queue)) { ++ /* ++ * Inline a custom version of __napi_complete(). ++ * only current cpu owns and manipulates this napi, ++ * and NAPI_STATE_SCHED is the only possible flag set ++ * on backlog. ++ * We can use a plain write instead of clear_bit(), ++ * and we dont need an smp_mb() memory barrier. ++ */ ++ napi->state = 0; ++ again = false; ++ } else { ++ skb_queue_splice_tail_init(&sd->input_pkt_queue, ++ &sd->process_queue); ++ } ++ rps_unlock_irq_enable(sd); ++ } ++ ++ return work; ++} ++ ++/** ++ * __napi_schedule - schedule for receive ++ * @n: entry to schedule ++ * ++ * The entry's receive function will be scheduled to run. ++ * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
++ */ ++void __napi_schedule(struct napi_struct *n) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__napi_schedule); ++ ++/** ++ * napi_schedule_prep - check if napi can be scheduled ++ * @n: napi context ++ * ++ * Test if NAPI routine is already running, and if not mark ++ * it as running. This is used as a condition variable to ++ * insure only one NAPI poll instance runs. We also make ++ * sure there is no pending NAPI disable. ++ */ ++bool napi_schedule_prep(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ if (unlikely(val & NAPIF_STATE_DISABLE)) ++ return false; ++ new = val | NAPIF_STATE_SCHED; ++ ++ /* Sets STATE_MISSED bit if STATE_SCHED was already set ++ * This was suggested by Alexander Duyck, as compiler ++ * emits better code than : ++ * if (val & NAPIF_STATE_SCHED) ++ * new |= NAPIF_STATE_MISSED; ++ */ ++ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * ++ NAPIF_STATE_MISSED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ return !(val & NAPIF_STATE_SCHED); ++} ++EXPORT_SYMBOL(napi_schedule_prep); ++ ++/** ++ * __napi_schedule_irqoff - schedule for receive ++ * @n: entry to schedule ++ * ++ * Variant of __napi_schedule() assuming hard irqs are masked. ++ * ++ * On PREEMPT_RT enabled kernels this maps to __napi_schedule() ++ * because the interrupt disabled assumption might not be true ++ * due to force-threaded interrupts and spinlock substitution. ++ */ ++void __napi_schedule_irqoff(struct napi_struct *n) ++{ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ else ++ __napi_schedule(n); ++} ++EXPORT_SYMBOL(__napi_schedule_irqoff); ++ ++bool napi_complete_done(struct napi_struct *n, int work_done) ++{ ++ unsigned long flags, val, new, timeout = 0; ++ bool ret = true; ++ ++ /* ++ * 1) Don't let napi dequeue from the cpu poll list ++ * just in case its running on a different cpu. ++ * 2) If we are busy polling, do nothing here, we have ++ * the guarantee we will be called later. ++ */ ++ if (unlikely(n->state & (NAPIF_STATE_NPSVC | ++ NAPIF_STATE_IN_BUSY_POLL))) ++ return false; ++ ++ if (work_done) { ++ if (n->gro_bitmask) ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); ++ } ++ if (n->defer_hard_irqs_count > 0) { ++ n->defer_hard_irqs_count--; ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ if (timeout) ++ ret = false; ++ } ++ if (n->gro_bitmask) { ++ /* When the NAPI instance uses a timeout and keeps postponing ++ * it, we need to bound somehow the time packets are kept in ++ * the GRO layer ++ */ ++ napi_gro_flush(n, !!timeout); ++ } ++ ++ gro_normal_list(n); ++ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ /* If n->poll_list is not empty, we need to mask irqs */ ++ local_irq_save(flags); ++ list_del_init(&n->poll_list); ++ local_irq_restore(flags); ++ } ++ ++ do { ++ val = READ_ONCE(n->state); ++ ++ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); ++ ++ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | ++ NAPIF_STATE_SCHED_THREADED | ++ NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ /* If STATE_MISSED was set, leave STATE_SCHED set, ++ * because we will call napi->poll() one more time. ++ * This C code was suggested by Alexander Duyck to help gcc. 
++ */ ++ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * ++ NAPIF_STATE_SCHED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ if (unlikely(val & NAPIF_STATE_MISSED)) { ++ __napi_schedule(n); ++ return false; ++ } ++ ++ if (timeout) ++ hrtimer_start(&n->timer, ns_to_ktime(timeout), ++ HRTIMER_MODE_REL_PINNED); ++ return ret; ++} ++EXPORT_SYMBOL(napi_complete_done); ++ ++/* must be called under rcu_read_lock(), as we dont take a reference */ ++static struct napi_struct *napi_by_id(unsigned int napi_id) ++{ ++ unsigned int hash = napi_id % HASH_SIZE(napi_hash); ++ struct napi_struct *napi; ++ ++ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) ++ if (napi->napi_id == napi_id) ++ return napi; ++ ++ return NULL; ++} ++ ++#if defined(CONFIG_NET_RX_BUSY_POLL) ++ ++static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) ++{ ++ if (!skip_schedule) { ++ gro_normal_list(napi); ++ __napi_schedule(napi); ++ return; ++ } ++ ++ if (napi->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(napi, HZ >= 1000); ++ } ++ ++ gro_normal_list(napi); ++ clear_bit(NAPI_STATE_SCHED, &napi->state); ++} ++ ++static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, ++ u16 budget) ++{ ++ bool skip_schedule = false; ++ unsigned long timeout; ++ int rc; ++ ++ /* Busy polling means there is a high chance device driver hard irq ++ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was ++ * set in napi_schedule_prep(). ++ * Since we are about to call napi->poll() once more, we can safely ++ * clear NAPI_STATE_MISSED. ++ * ++ * Note: x86 could use a single "lock and ..." instruction ++ * to perform these two clear_bit() ++ */ ++ clear_bit(NAPI_STATE_MISSED, &napi->state); ++ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); ++ ++ local_bh_disable(); ++ ++ if (prefer_busy_poll) { ++ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); ++ timeout = READ_ONCE(napi->dev->gro_flush_timeout); ++ if (napi->defer_hard_irqs_count && timeout) { ++ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); ++ skip_schedule = true; ++ } ++ } ++ ++ /* All we really want here is to re-enable device interrupts. ++ * Ideally, a new ndo_busy_poll_stop() could avoid another round. ++ */ ++ rc = napi->poll(napi, budget); ++ /* We can't gro_normal_list() here, because napi->poll() might have ++ * rearmed the napi (napi_complete_done()) in which case it could ++ * already be running on another CPU. ++ */ ++ trace_napi_poll(napi, rc, budget); ++ netpoll_poll_unlock(have_poll_lock); ++ if (rc == budget) ++ __busy_poll_stop(napi, skip_schedule); ++ local_bh_enable(); ++} ++ ++void napi_busy_loop(unsigned int napi_id, ++ bool (*loop_end)(void *, unsigned long), ++ void *loop_end_arg, bool prefer_busy_poll, u16 budget) ++{ ++ unsigned long start_time = loop_end ? busy_loop_current_time() : 0; ++ int (*napi_poll)(struct napi_struct *napi, int budget); ++ void *have_poll_lock = NULL; ++ struct napi_struct *napi; ++ ++restart: ++ napi_poll = NULL; ++ ++ rcu_read_lock(); ++ ++ napi = napi_by_id(napi_id); ++ if (!napi) ++ goto out; ++ ++ preempt_disable(); ++ for (;;) { ++ int work = 0; ++ ++ local_bh_disable(); ++ if (!napi_poll) { ++ unsigned long val = READ_ONCE(napi->state); ++ ++ /* If multiple threads are competing for this napi, ++ * we avoid dirtying napi->state as much as we can. 
++ */ ++ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | ++ NAPIF_STATE_IN_BUSY_POLL)) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ if (cmpxchg(&napi->state, val, ++ val | NAPIF_STATE_IN_BUSY_POLL | ++ NAPIF_STATE_SCHED) != val) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ have_poll_lock = netpoll_poll_lock(napi); ++ napi_poll = napi->poll; ++ } ++ work = napi_poll(napi, budget); ++ trace_napi_poll(napi, work, budget); ++ gro_normal_list(napi); ++count: ++ if (work > 0) ++ __NET_ADD_STATS(dev_net(napi->dev), ++ LINUX_MIB_BUSYPOLLRXPACKETS, work); ++ local_bh_enable(); ++ ++ if (!loop_end || loop_end(loop_end_arg, start_time)) ++ break; ++ ++ if (unlikely(need_resched())) { ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++ rcu_read_unlock(); ++ cond_resched(); ++ if (loop_end(loop_end_arg, start_time)) ++ return; ++ goto restart; ++ } ++ cpu_relax(); ++ } ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++out: ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(napi_busy_loop); ++ ++#endif /* CONFIG_NET_RX_BUSY_POLL */ ++ ++static void napi_hash_add(struct napi_struct *napi) ++{ ++ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) ++ return; ++ ++ spin_lock(&napi_hash_lock); ++ ++ /* 0..NR_CPUS range is reserved for sender_cpu use */ ++ do { ++ if (unlikely(++napi_gen_id < MIN_NAPI_ID)) ++ napi_gen_id = MIN_NAPI_ID; ++ } while (napi_by_id(napi_gen_id)); ++ napi->napi_id = napi_gen_id; ++ ++ hlist_add_head_rcu(&napi->napi_hash_node, ++ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++/* Warning : caller is responsible to make sure rcu grace period ++ * is respected before freeing memory containing @napi ++ */ ++static void napi_hash_del(struct napi_struct *napi) ++{ ++ spin_lock(&napi_hash_lock); ++ ++ hlist_del_init_rcu(&napi->napi_hash_node); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) ++{ ++ struct napi_struct *napi; ++ ++ napi = container_of(timer, struct napi_struct, timer); ++ ++ /* Note : we use a relaxed variant of napi_schedule_prep() not setting ++ * NAPI_STATE_MISSED, since we do not react to a device IRQ. ++ */ ++ if (!napi_disable_pending(napi) && ++ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { ++ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ __napi_schedule_irqoff(napi); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void init_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ INIT_LIST_HEAD(&napi->gro_hash[i].list); ++ napi->gro_hash[i].count = 0; ++ } ++ napi->gro_bitmask = 0; ++} ++ ++int dev_set_threaded(struct net_device *dev, bool threaded) ++{ ++ struct napi_struct *napi; ++ int err = 0; ++ ++ if (dev->threaded == threaded) ++ return 0; ++ ++ if (threaded) { ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (!napi->thread) { ++ err = napi_kthread_create(napi); ++ if (err) { ++ threaded = false; ++ break; ++ } ++ } ++ } ++ } ++ ++ dev->threaded = threaded; ++ ++ /* Make sure kthread is created before THREADED bit ++ * is set. ++ */ ++ smp_mb__before_atomic(); ++ ++ /* Setting/unsetting threaded mode on a napi might not immediately ++ * take effect, if the current napi instance is actively being ++ * polled. 
In this case, the switch between threaded mode and ++ * softirq mode will happen in the next round of napi_schedule(). ++ * This should not cause hiccups/stalls to the live traffic. ++ */ ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (threaded) ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ else ++ clear_bit(NAPI_STATE_THREADED, &napi->state); ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_set_threaded); ++ ++/* Double check that napi_get_frags() allocates skbs with ++ * skb->head being backed by slab, not a page fragment. ++ * This is to make sure bug fixed in 3226b158e67c ++ * ("net: avoid 32 x truesize under-estimation for tiny skbs") ++ * does not accidentally come back. ++ */ ++static void napi_get_frags_check(struct napi_struct *napi) ++{ ++ struct sk_buff *skb; ++ ++ local_bh_disable(); ++ skb = napi_get_frags(napi); ++ WARN_ON_ONCE(skb && skb->head_frag); ++ napi_free_frags(napi); ++ local_bh_enable(); ++} ++ ++void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), int weight) ++{ ++ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) ++ return; ++ ++ INIT_LIST_HEAD(&napi->poll_list); ++ INIT_HLIST_NODE(&napi->napi_hash_node); ++ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); ++ napi->timer.function = napi_watchdog; ++ init_gro_hash(napi); ++ napi->skb = NULL; ++ INIT_LIST_HEAD(&napi->rx_list); ++ napi->rx_count = 0; ++ napi->poll = poll; ++ if (weight > NAPI_POLL_WEIGHT) ++ netdev_err_once(dev, "%s() called with weight %d\n", __func__, ++ weight); ++ napi->weight = weight; ++ napi->dev = dev; ++#ifdef CONFIG_NETPOLL ++ napi->poll_owner = -1; ++#endif ++ set_bit(NAPI_STATE_SCHED, &napi->state); ++ set_bit(NAPI_STATE_NPSVC, &napi->state); ++ list_add_rcu(&napi->dev_list, &dev->napi_list); ++ napi_hash_add(napi); ++ napi_get_frags_check(napi); ++ /* Create kthread for this napi if dev->threaded is set. ++ * Clear dev->threaded if kthread creation failed so that ++ * threaded mode will not be enabled in napi_enable(). ++ */ ++ if (dev->threaded && napi_kthread_create(napi)) ++ dev->threaded = 0; ++} ++EXPORT_SYMBOL(netif_napi_add_weight); ++ ++void napi_disable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ might_sleep(); ++ set_bit(NAPI_STATE_DISABLE, &n->state); ++ ++ for ( ; ; ) { ++ val = READ_ONCE(n->state); ++ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { ++ usleep_range(20, 200); ++ continue; ++ } ++ ++ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; ++ new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ if (cmpxchg(&n->state, val, new) == val) ++ break; ++ } ++ ++ hrtimer_cancel(&n->timer); ++ ++ clear_bit(NAPI_STATE_DISABLE, &n->state); ++} ++EXPORT_SYMBOL(napi_disable); ++ ++/** ++ * napi_enable - enable NAPI scheduling ++ * @n: NAPI context ++ * ++ * Resume NAPI from being scheduled on this context. ++ * Must be paired with napi_disable. 
++ */ ++void napi_enable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); ++ ++ new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); ++ if (n->dev->threaded && n->thread) ++ new |= NAPIF_STATE_THREADED; ++ } while (cmpxchg(&n->state, val, new) != val); ++} ++EXPORT_SYMBOL(napi_enable); ++ ++static void flush_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ struct sk_buff *skb, *n; ++ ++ list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) ++ kfree_skb(skb); ++ napi->gro_hash[i].count = 0; ++ } ++} ++ ++/* Must be called in process context */ ++void __netif_napi_del(struct napi_struct *napi) ++{ ++ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) ++ return; ++ ++ napi_hash_del(napi); ++ list_del_rcu(&napi->dev_list); ++ napi_free_frags(napi); ++ ++ flush_gro_hash(napi); ++ napi->gro_bitmask = 0; ++ ++ if (napi->thread) { ++ kthread_stop(napi->thread); ++ napi->thread = NULL; ++ } ++} ++EXPORT_SYMBOL(__netif_napi_del); ++ ++static int __napi_poll(struct napi_struct *n, bool *repoll) ++{ ++ int work, weight; ++ ++ weight = n->weight; ++ ++ /* This NAPI_STATE_SCHED test is for avoiding a race ++ * with netpoll's poll_napi(). Only the entity which ++ * obtains the lock and sees NAPI_STATE_SCHED set will ++ * actually make the ->poll() call. Therefore we avoid ++ * accidentally calling ->poll() when NAPI is not scheduled. ++ */ ++ work = 0; ++ if (test_bit(NAPI_STATE_SCHED, &n->state)) { ++ work = n->poll(n, weight); ++ trace_napi_poll(n, work, weight); ++ } ++ ++ if (unlikely(work > weight)) ++ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", ++ n->poll, work, weight); ++ ++ if (likely(work < weight)) ++ return work; ++ ++ /* Drivers must not modify the NAPI state if they ++ * consume the entire weight. In such cases this code ++ * still "owns" the NAPI instance and therefore can ++ * move the instance around on the list at-will. ++ */ ++ if (unlikely(napi_disable_pending(n))) { ++ napi_complete(n); ++ return work; ++ } ++ ++ /* The NAPI context has more processing work, but busy-polling ++ * is preferred. Exit early. ++ */ ++ if (napi_prefer_busy_poll(n)) { ++ if (napi_complete_done(n, work)) { ++ /* If timeout is not set, we need to make sure ++ * that the NAPI is re-scheduled. ++ */ ++ napi_schedule(n); ++ } ++ return work; ++ } ++ ++ if (n->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(n, HZ >= 1000); ++ } ++ ++ gro_normal_list(n); ++ ++ /* Some drivers may have called napi_schedule ++ * prior to exhausting their budget. ++ */ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ pr_warn_once("%s: Budget exhausted after napi rescheduled\n", ++ n->dev ? 
n->dev->name : "backlog"); ++ return work; ++ } ++ ++ *repoll = true; ++ ++ return work; ++} ++ ++static int napi_poll(struct napi_struct *n, struct list_head *repoll) ++{ ++ bool do_repoll = false; ++ void *have; ++ int work; ++ ++ list_del_init(&n->poll_list); ++ ++ have = netpoll_poll_lock(n); ++ ++ work = __napi_poll(n, &do_repoll); ++ ++ if (do_repoll) ++ list_add_tail(&n->poll_list, repoll); ++ ++ netpoll_poll_unlock(have); ++ ++ return work; ++} ++ ++static int napi_thread_wait(struct napi_struct *napi) ++{ ++ bool woken = false; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!kthread_should_stop()) { ++ /* Testing SCHED_THREADED bit here to make sure the current ++ * kthread owns this napi and could poll on this napi. ++ * Testing SCHED bit is not enough because SCHED bit might be ++ * set by some other busy poll thread or by napi_disable(). ++ */ ++ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { ++ WARN_ON(!list_empty(&napi->poll_list)); ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ schedule(); ++ /* woken being true indicates this thread owns this napi. */ ++ woken = true; ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ return -1; ++} ++ ++static int napi_threaded_poll(void *data) ++{ ++ struct napi_struct *napi = data; ++ void *have; ++ ++ while (!napi_thread_wait(napi)) { ++ for (;;) { ++ bool repoll = false; ++ ++ local_bh_disable(); ++ ++ have = netpoll_poll_lock(napi); ++ __napi_poll(napi, &repoll); ++ netpoll_poll_unlock(have); ++ ++ local_bh_enable(); ++ ++ if (!repoll) ++ break; ++ ++ cond_resched(); ++ } ++ } ++ return 0; ++} ++ ++static void skb_defer_free_flush(struct softnet_data *sd) ++{ ++ struct sk_buff *skb, *next; ++ unsigned long flags; ++ ++ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ ++ if (!READ_ONCE(sd->defer_list)) ++ return; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ skb = sd->defer_list; ++ sd->defer_list = NULL; ++ sd->defer_count = 0; ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ while (skb != NULL) { ++ next = skb->next; ++ napi_consume_skb(skb, 1); ++ skb = next; ++ } ++} ++ ++static __latent_entropy void net_rx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ unsigned long time_limit = jiffies + ++ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); ++ int budget = READ_ONCE(netdev_budget); ++ LIST_HEAD(list); ++ LIST_HEAD(repoll); ++ ++ local_irq_disable(); ++ list_splice_init(&sd->poll_list, &list); ++ local_irq_enable(); ++ ++ for (;;) { ++ struct napi_struct *n; ++ ++ skb_defer_free_flush(sd); ++ ++ if (list_empty(&list)) { ++ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) ++ goto end; ++ break; ++ } ++ ++ n = list_first_entry(&list, struct napi_struct, poll_list); ++ budget -= napi_poll(n, &repoll); ++ ++ /* If softirq window is exhausted then punt. ++ * Allow this to run for 2 jiffies since which will allow ++ * an average latency of 1.5/HZ. 
++ */ ++ if (unlikely(budget <= 0 || ++ time_after_eq(jiffies, time_limit))) { ++ sd->time_squeeze++; ++ break; ++ } ++ } ++ ++ local_irq_disable(); ++ ++ list_splice_tail_init(&sd->poll_list, &list); ++ list_splice_tail(&repoll, &list); ++ list_splice(&list, &sd->poll_list); ++ if (!list_empty(&sd->poll_list)) ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ ++ net_rps_action_and_irq_enable(sd); ++end:; ++} ++ ++struct netdev_adjacent { ++ struct net_device *dev; ++ netdevice_tracker dev_tracker; ++ ++ /* upper master flag, there can only be one master device per list */ ++ bool master; ++ ++ /* lookup ignore flag */ ++ bool ignore; ++ ++ /* counter for the number of times this device was added to us */ ++ u16 ref_nr; ++ ++ /* private field for the users */ ++ void *private; ++ ++ struct list_head list; ++ struct rcu_head rcu; ++}; ++ ++static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, ++ struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ list_for_each_entry(adj, adj_list, list) { ++ if (adj->dev == adj_dev) ++ return adj; ++ } ++ return NULL; ++} ++ ++static int ____netdev_has_upper_dev(struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *dev = (struct net_device *)priv->data; ++ ++ return upper_dev == dev; ++} ++ ++/** ++ * netdev_has_upper_dev - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks only immediate upper device, ++ * not through a complete stack of devices. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev); ++ ++/** ++ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks the entire upper device chain. ++ * The caller must hold rcu lock. ++ */ ++ ++bool netdev_has_upper_dev_all_rcu(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); ++ ++/** ++ * netdev_has_any_upper_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to an upper device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_any_upper_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.upper); ++} ++EXPORT_SYMBOL(netdev_has_any_upper_dev); ++ ++/** ++ * netdev_master_upper_dev_get - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RTNL lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get); ++ ++static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master) && !upper->ignore) ++ return upper->dev; ++ return NULL; ++} ++ ++/** ++ * netdev_has_any_lower_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to a lower device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++static bool netdev_has_any_lower_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.lower); ++} ++ ++void *netdev_adjacent_get_private(struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = list_entry(adj_list, struct netdev_adjacent, list); ++ ++ return adj->private; ++} ++EXPORT_SYMBOL(netdev_adjacent_get_private); ++ ++/** ++ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next device from the dev's upper list, starting from iter ++ * position. The caller must hold RCU read lock. ++ */ ++struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); ++ ++static struct net_device *__netdev_next_upper_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ *ignore = upper->ignore; ++ ++ return upper->dev; ++} ++ ++static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++ ++static int __netdev_walk_all_upper_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = __netdev_next_upper_dev(now, &iter, &ignore); ++ if (!udev) ++ break; ++ if 
(ignore) ++ continue; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++int netdev_walk_all_upper_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = netdev_next_upper_dev_rcu(now, &iter); ++ if (!udev) ++ break; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); ++ ++static bool __netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++ ++/** ++ * netdev_lower_get_next_private - Get the next ->private from the ++ * lower neighbour list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold either hold the ++ * RTNL lock or its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. ++ */ ++void *netdev_lower_get_next_private(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private); ++ ++/** ++ * netdev_lower_get_next_private_rcu - Get the next ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_next_private_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); ++ ++/** ++ * netdev_lower_get_next - Get the next device from the lower neighbour ++ * list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RTNL lock or ++ * its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. 
++ */ ++void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_lower_get_next); ++ ++static struct net_device *netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++ ++static struct net_device *__netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ *ignore = lower->ignore; ++ ++ return lower->dev; ++} ++ ++int netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); ++ ++static int __netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = __netdev_next_lower_dev(now, &iter, &ignore); ++ if (!ldev) ++ break; ++ if (ignore) ++ continue; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_next_lower_dev_rcu); ++ ++static u8 __netdev_upper_depth(struct net_device *dev) ++{ ++ struct net_device *udev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.upper, ++ udev = __netdev_next_upper_dev(dev, &iter, &ignore); ++ udev; ++ 
udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < udev->upper_level) ++ max_depth = udev->upper_level; ++ } ++ ++ return max_depth; ++} ++ ++static u8 __netdev_lower_depth(struct net_device *dev) ++{ ++ struct net_device *ldev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.lower, ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ++ ldev; ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < ldev->lower_level) ++ max_depth = ldev->lower_level; ++ } ++ ++ return max_depth; ++} ++ ++static int __netdev_update_upper_level(struct net_device *dev, ++ struct netdev_nested_priv *__unused) ++{ ++ dev->upper_level = __netdev_upper_depth(dev) + 1; ++ return 0; ++} ++ ++#ifdef CONFIG_LOCKDEP ++static LIST_HEAD(net_unlink_list); ++ ++static void net_unlink_todo(struct net_device *dev) ++{ ++ if (list_empty(&dev->unlink_list)) ++ list_add_tail(&dev->unlink_list, &net_unlink_list); ++} ++#endif ++ ++static int __netdev_update_lower_level(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ dev->lower_level = __netdev_lower_depth(dev) + 1; ++ ++#ifdef CONFIG_LOCKDEP ++ if (!priv) ++ return 0; ++ ++ if (priv->flags & NESTED_SYNC_IMM) ++ dev->nested_level = dev->lower_level - 1; ++ if (priv->flags & NESTED_SYNC_TODO) ++ net_unlink_todo(dev); ++#endif ++ return 0; ++} ++ ++int netdev_walk_all_lower_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev_rcu(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); ++ ++/** ++ * netdev_lower_get_first_private_rcu - Get the first ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * ++ * Gets the first netdev_adjacent->private from the dev's lower neighbour ++ * list. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_first_private_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_first_or_null_rcu(&dev->adj_list.lower, ++ struct netdev_adjacent, list); ++ if (lower) ++ return lower->private; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); ++ ++/** ++ * netdev_master_upper_dev_get_rcu - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RCU read lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_first_or_null_rcu(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (upper && likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); ++ ++static int netdev_adjacent_sysfs_add(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", adj_dev->name); ++ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), ++ linkname); ++} ++static void netdev_adjacent_sysfs_del(struct net_device *dev, ++ char *name, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", name); ++ sysfs_remove_link(&(dev->dev.kobj), linkname); ++} ++ ++static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ return (dev_list == &dev->adj_list.upper || ++ dev_list == &dev->adj_list.lower) && ++ net_eq(dev_net(dev), dev_net(adj_dev)); ++} ++ ++static int __netdev_adjacent_dev_insert(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list, ++ void *private, bool master) ++{ ++ struct netdev_adjacent *adj; ++ int ret; ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (adj) { ++ adj->ref_nr += 1; ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", ++ dev->name, adj_dev->name, adj->ref_nr); ++ ++ return 0; ++ } ++ ++ adj = kmalloc(sizeof(*adj), GFP_KERNEL); ++ if (!adj) ++ return -ENOMEM; ++ ++ adj->dev = adj_dev; ++ adj->master = master; ++ adj->ref_nr = 1; ++ adj->private = private; ++ adj->ignore = false; ++ netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); ++ ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", ++ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ++ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); ++ if (ret) ++ goto free_adj; ++ } ++ ++ /* Ensure that master link is always the first item in list. 
*/ ++ if (master) { ++ ret = sysfs_create_link(&(dev->dev.kobj), ++ &(adj_dev->dev.kobj), "master"); ++ if (ret) ++ goto remove_symlinks; ++ ++ list_add_rcu(&adj->list, dev_list); ++ } else { ++ list_add_tail_rcu(&adj->list, dev_list); ++ } ++ ++ return 0; ++ ++remove_symlinks: ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++free_adj: ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree(adj); ++ ++ return ret; ++} ++ ++static void __netdev_adjacent_dev_remove(struct net_device *dev, ++ struct net_device *adj_dev, ++ u16 ref_nr, ++ struct list_head *dev_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", ++ dev->name, adj_dev->name, ref_nr); ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (!adj) { ++ pr_err("Adjacency does not exist for device %s from %s\n", ++ dev->name, adj_dev->name); ++ WARN_ON(1); ++ return; ++ } ++ ++ if (adj->ref_nr > ref_nr) { ++ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", ++ dev->name, adj_dev->name, ref_nr, ++ adj->ref_nr - ref_nr); ++ adj->ref_nr -= ref_nr; ++ return; ++ } ++ ++ if (adj->master) ++ sysfs_remove_link(&(dev->dev.kobj), "master"); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++ ++ list_del_rcu(&adj->list); ++ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", ++ adj_dev->name, dev->name, adj_dev->name); ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree_rcu(adj, rcu); ++} ++ ++static int __netdev_adjacent_dev_link_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct list_head *up_list, ++ struct list_head *down_list, ++ void *private, bool master) ++{ ++ int ret; ++ ++ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, ++ private, master); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, ++ private, false); ++ if (ret) { ++ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ u16 ref_nr, ++ struct list_head *up_list, ++ struct list_head *down_list) ++{ ++ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); ++ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); ++} ++ ++static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *private, bool master) ++{ ++ return __netdev_adjacent_dev_link_lists(dev, upper_dev, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower, ++ private, master); ++} ++ ++static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower); ++} ++ ++static int __netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, bool master, ++ void *upper_priv, void *upper_info, ++ struct netdev_nested_priv *priv, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ .extack = extack, ++ }, ++ .upper_dev = upper_dev, ++ .master = master, ++ .linking = true, ++ .upper_info = upper_info, ++ }; ++ struct net_device *master_dev; ++ int ret = 0; ++ ++ ASSERT_RTNL(); ++ ++ if (dev == upper_dev) ++ return -EBUSY; ++ ++ /* To prevent 
loops, check if dev is not upper device to upper_dev. */ ++ if (__netdev_has_upper_dev(upper_dev, dev)) ++ return -EBUSY; ++ ++ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) ++ return -EMLINK; ++ ++ if (!master) { ++ if (__netdev_has_upper_dev(dev, upper_dev)) ++ return -EEXIST; ++ } else { ++ master_dev = __netdev_master_upper_dev_get(dev); ++ if (master_dev) ++ return master_dev == upper_dev ? -EEXIST : -EBUSY; ++ } ++ ++ ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, ++ master); ++ if (ret) ++ return ret; ++ ++ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto rollback; ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++ ++ return 0; ++ ++rollback: ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ return ret; ++} ++ ++/** ++ * netdev_upper_dev_link - Add a link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. On a failure a negative errno code is returned. ++ * On success the reference counts are adjusted and the function ++ * returns zero. ++ */ ++int netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, false, ++ NULL, NULL, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_upper_dev_link); ++ ++/** ++ * netdev_master_upper_dev_link - Add a master link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @upper_priv: upper device private ++ * @upper_info: upper info to be passed down via notifier ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. In this case, only ++ * one master upper device can be linked, although other non-master devices ++ * might be linked as well. The caller must hold the RTNL lock. ++ * On a failure a negative errno code is returned. On success the reference ++ * counts are adjusted and the function returns zero. 
++ */ ++int netdev_master_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *upper_priv, void *upper_info, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, true, ++ upper_priv, upper_info, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_link); ++ ++static void __netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .upper_dev = upper_dev, ++ .linking = false, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; ++ ++ call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++} ++ ++/** ++ * netdev_upper_dev_unlink - Removes a link to upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * ++ * Removes a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. ++ */ ++void netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ __netdev_upper_dev_unlink(dev, upper_dev, &priv); ++} ++EXPORT_SYMBOL(netdev_upper_dev_unlink); ++ ++static void __netdev_adjacent_dev_set(struct net_device *upper_dev, ++ struct net_device *lower_dev, ++ bool val) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); ++ if (adj) ++ adj->ignore = val; ++ ++ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); ++ if (adj) ++ adj->ignore = val; ++} ++ ++static void netdev_adjacent_dev_disable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, true); ++} ++ ++static void netdev_adjacent_dev_enable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, false); ++} ++ ++int netdev_adjacent_change_prepare(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ int err; ++ ++ if (!new_dev) ++ return 0; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_disable(dev, old_dev); ++ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, ++ extack); ++ if (err) { ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_adjacent_change_prepare); ++ ++void netdev_adjacent_change_commit(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ if (!new_dev || !old_dev) ++ return; ++ ++ if (new_dev == old_dev) ++ return; ++ ++ 
netdev_adjacent_dev_enable(dev, old_dev); ++ __netdev_upper_dev_unlink(old_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_commit); ++ ++void netdev_adjacent_change_abort(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ ++ if (!new_dev) ++ return; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ ++ __netdev_upper_dev_unlink(new_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_abort); ++ ++/** ++ * netdev_bonding_info_change - Dispatch event about slave change ++ * @dev: device ++ * @bonding_info: info to dispatch ++ * ++ * Send NETDEV_BONDING_INFO to netdev notifiers with info. ++ * The caller must hold the RTNL lock. ++ */ ++void netdev_bonding_info_change(struct net_device *dev, ++ struct netdev_bonding_info *bonding_info) ++{ ++ struct netdev_notifier_bonding_info info = { ++ .info.dev = dev, ++ }; ++ ++ memcpy(&info.bonding_info, bonding_info, ++ sizeof(struct netdev_bonding_info)); ++ call_netdevice_notifiers_info(NETDEV_BONDING_INFO, ++ &info.info); ++} ++EXPORT_SYMBOL(netdev_bonding_info_change); ++ ++static int netdev_offload_xstats_enable_l3(struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ int err; ++ int rc; ++ ++ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), ++ GFP_KERNEL); ++ if (!dev->offload_xstats_l3) ++ return -ENOMEM; ++ ++ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, ++ NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ err = notifier_to_errno(rc); ++ if (err) ++ goto free_stats; ++ ++ return 0; ++ ++free_stats: ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++ return err; ++} ++ ++int netdev_offload_xstats_enable(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return netdev_offload_xstats_enable_l3(dev, extack); ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enable); ++ ++static void netdev_offload_xstats_disable_l3(struct net_device *dev) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++} ++ ++int netdev_offload_xstats_disable(struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ if (!netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ netdev_offload_xstats_disable_l3(dev); ++ return 0; ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_disable); ++ ++static void netdev_offload_xstats_disable_all(struct net_device *dev) ++{ ++ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); ++} ++ ++static struct rtnl_hw_stats64 * ++netdev_offload_xstats_get_ptr(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return dev->offload_xstats_l3; ++ } ++ ++ 
WARN_ON(1); ++ return NULL; ++} ++ ++bool netdev_offload_xstats_enabled(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ return netdev_offload_xstats_get_ptr(dev, type); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enabled); ++ ++struct netdev_notifier_offload_xstats_ru { ++ bool used; ++}; ++ ++struct netdev_notifier_offload_xstats_rd { ++ struct rtnl_hw_stats64 stats; ++ bool used; ++}; ++ ++static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, ++ const struct rtnl_hw_stats64 *src) ++{ ++ dest->rx_packets += src->rx_packets; ++ dest->tx_packets += src->tx_packets; ++ dest->rx_bytes += src->rx_bytes; ++ dest->tx_bytes += src->tx_bytes; ++ dest->rx_errors += src->rx_errors; ++ dest->tx_errors += src->tx_errors; ++ dest->rx_dropped += src->rx_dropped; ++ dest->tx_dropped += src->tx_dropped; ++ dest->multicast += src->multicast; ++} ++ ++static int netdev_offload_xstats_get_used(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_ru report_used = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_used = &report_used, ++ }; ++ int rc; ++ ++ WARN_ON(!netdev_offload_xstats_enabled(dev, type)); ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, ++ &info.info); ++ *p_used = report_used.used; ++ return notifier_to_errno(rc); ++} ++ ++static int netdev_offload_xstats_get_stats(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_rd report_delta = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_delta = &report_delta, ++ }; ++ struct rtnl_hw_stats64 *stats; ++ int rc; ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return -EINVAL; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, ++ &info.info); ++ ++ /* Cache whatever we got, even if there was an error, otherwise the ++ * successful stats retrievals would get lost. 
++ */ ++ netdev_hw_stats64_add(stats, &report_delta.stats); ++ ++ if (p_stats) ++ *p_stats = *stats; ++ *p_used = report_delta.used; ++ ++ return notifier_to_errno(rc); ++} ++ ++int netdev_offload_xstats_get(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (p_stats) ++ return netdev_offload_xstats_get_stats(dev, type, p_stats, ++ p_used, extack); ++ else ++ return netdev_offload_xstats_get_used(dev, type, p_used, ++ extack); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_get); ++ ++void ++netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, ++ const struct rtnl_hw_stats64 *stats) ++{ ++ report_delta->used = true; ++ netdev_hw_stats64_add(&report_delta->stats, stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_delta); ++ ++void ++netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) ++{ ++ report_used->used = true; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_used); ++ ++void netdev_offload_xstats_push_delta(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ const struct rtnl_hw_stats64 *p_stats) ++{ ++ struct rtnl_hw_stats64 *stats; ++ ++ ASSERT_RTNL(); ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return; ++ ++ netdev_hw_stats64_add(stats, p_stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_push_delta); ++ ++/** ++ * netdev_get_xmit_slave - Get the xmit slave of master device ++ * @dev: device ++ * @skb: The packet ++ * @all_slaves: assume all the slaves are active ++ * ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ * %NULL is returned if no slave is found. ++ */ ++ ++struct net_device *netdev_get_xmit_slave(struct net_device *dev, ++ struct sk_buff *skb, ++ bool all_slaves) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_xmit_slave) ++ return NULL; ++ return ops->ndo_get_xmit_slave(dev, skb, all_slaves); ++} ++EXPORT_SYMBOL(netdev_get_xmit_slave); ++ ++static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_sk_get_lower_dev) ++ return NULL; ++ return ops->ndo_sk_get_lower_dev(dev, sk); ++} ++ ++/** ++ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket ++ * @dev: device ++ * @sk: the socket ++ * ++ * %NULL is returned if no lower device is found. 
++ */ ++ ++struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ struct net_device *lower; ++ ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ while (lower) { ++ dev = lower; ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ } ++ ++ return dev; ++} ++EXPORT_SYMBOL(netdev_sk_get_lowest_dev); ++ ++static void netdev_adjacent_add_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.lower); ++ } ++} ++ ++static void netdev_adjacent_del_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.lower); ++ } ++} ++ ++void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ } ++} ++ ++void *netdev_lower_dev_get_private(struct net_device *dev, ++ struct net_device *lower_dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ if (!lower_dev) ++ return NULL; ++ lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); ++ if (!lower) ++ return NULL; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_dev_get_private); ++ ++ ++/** ++ * netdev_lower_state_changed - Dispatch event about lower device state change ++ * @lower_dev: device ++ * @lower_state_info: state to dispatch ++ * ++ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. ++ * The caller must hold the RTNL lock. 
++ */ ++void netdev_lower_state_changed(struct net_device *lower_dev, ++ void *lower_state_info) ++{ ++ struct netdev_notifier_changelowerstate_info changelowerstate_info = { ++ .info.dev = lower_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ changelowerstate_info.lower_state_info = lower_state_info; ++ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, ++ &changelowerstate_info.info); ++} ++EXPORT_SYMBOL(netdev_lower_state_changed); ++ ++static void dev_change_rx_flags(struct net_device *dev, int flags) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_rx_flags) ++ ops->ndo_change_rx_flags(dev, flags); ++} ++ ++static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags; ++ kuid_t uid; ++ kgid_t gid; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_PROMISC; ++ dev->promiscuity += inc; ++ if (dev->promiscuity == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch promisc and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_PROMISC; ++ else { ++ dev->promiscuity -= inc; ++ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags != old_flags) { ++ pr_info("device %s %s promiscuous mode\n", ++ dev->name, ++ dev->flags & IFF_PROMISC ? "entered" : "left"); ++ if (audit_enabled) { ++ current_uid_gid(&uid, &gid); ++ audit_log(audit_context(), GFP_ATOMIC, ++ AUDIT_ANOM_PROMISCUOUS, ++ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", ++ dev->name, (dev->flags & IFF_PROMISC), ++ (old_flags & IFF_PROMISC), ++ from_kuid(&init_user_ns, audit_get_loginuid(current)), ++ from_kuid(&init_user_ns, uid), ++ from_kgid(&init_user_ns, gid), ++ audit_get_sessionid(current)); ++ } ++ ++ dev_change_rx_flags(dev, IFF_PROMISC); ++ } ++ if (notify) ++ __dev_notify_flags(dev, old_flags, IFF_PROMISC); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promiscuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative inc ++ * value is used to drop promiscuity on the device. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++int dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned int old_flags = dev->flags; ++ int err; ++ ++ err = __dev_set_promiscuity(dev, inc, true); ++ if (err < 0) ++ return err; ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_promiscuity); ++ ++static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_ALLMULTI; ++ dev->allmulti += inc; ++ if (dev->allmulti == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch allmulti and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ else { ++ dev->allmulti -= inc; ++ netdev_warn(dev, "allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags ^ old_flags) { ++ dev_change_rx_flags(dev, IFF_ALLMULTI); ++ dev_set_rx_mode(dev); ++ if (notify) ++ __dev_notify_flags(dev, old_flags, ++ dev->gflags ^ old_gflags); ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove reception of all multicast frames to a device. While the ++ * count in the device remains above zero the interface remains listening ++ * to all interfaces. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++ ++int dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ return __dev_set_allmulti(dev, inc, true); ++} ++EXPORT_SYMBOL(dev_set_allmulti); ++ ++/* ++ * Upload unicast and multicast address lists to device and ++ * configure RX filtering. When the device doesn't support unicast ++ * filtering it is put in promiscuous mode while unicast addresses ++ * are present. ++ */ ++void __dev_set_rx_mode(struct net_device *dev) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* dev_open will call this function so the list will stay sane. */ ++ if (!(dev->flags&IFF_UP)) ++ return; ++ ++ if (!netif_device_present(dev)) ++ return; ++ ++ if (!(dev->priv_flags & IFF_UNICAST_FLT)) { ++ /* Unicast addresses changes may only happen under the rtnl, ++ * therefore calling __dev_set_promiscuity here is safe. ++ */ ++ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { ++ __dev_set_promiscuity(dev, 1, false); ++ dev->uc_promisc = true; ++ } else if (netdev_uc_empty(dev) && dev->uc_promisc) { ++ __dev_set_promiscuity(dev, -1, false); ++ dev->uc_promisc = false; ++ } ++ } ++ ++ if (ops->ndo_set_rx_mode) ++ ops->ndo_set_rx_mode(dev); ++} ++ ++void dev_set_rx_mode(struct net_device *dev) ++{ ++ netif_addr_lock_bh(dev); ++ __dev_set_rx_mode(dev); ++ netif_addr_unlock_bh(dev); ++} ++ ++/** ++ * dev_get_flags - get flags reported to userspace ++ * @dev: device ++ * ++ * Get the combination of flag bits exported through APIs to userspace. ++ */ ++unsigned int dev_get_flags(const struct net_device *dev) ++{ ++ unsigned int flags; ++ ++ flags = (dev->flags & ~(IFF_PROMISC | ++ IFF_ALLMULTI | ++ IFF_RUNNING | ++ IFF_LOWER_UP | ++ IFF_DORMANT)) | ++ (dev->gflags & (IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ if (netif_running(dev)) { ++ if (netif_oper_up(dev)) ++ flags |= IFF_RUNNING; ++ if (netif_carrier_ok(dev)) ++ flags |= IFF_LOWER_UP; ++ if (netif_dormant(dev)) ++ flags |= IFF_DORMANT; ++ } ++ ++ return flags; ++} ++EXPORT_SYMBOL(dev_get_flags); ++ ++int __dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ unsigned int old_flags = dev->flags; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* ++ * Set the flags on our device. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | ++ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | ++ IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ if ((old_flags ^ flags) & IFF_MULTICAST) ++ dev_change_rx_flags(dev, IFF_MULTICAST); ++ ++ dev_set_rx_mode(dev); ++ ++ /* ++ * Have we downed the interface. 
We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it. ++ */ ++ ++ ret = 0; ++ if ((old_flags ^ flags) & IFF_UP) { ++ if (old_flags & IFF_UP) ++ __dev_close(dev); ++ else ++ ret = __dev_open(dev, extack); ++ } ++ ++ if ((flags ^ dev->gflags) & IFF_PROMISC) { ++ int inc = (flags & IFF_PROMISC) ? 1 : -1; ++ unsigned int old_flags = dev->flags; ++ ++ dev->gflags ^= IFF_PROMISC; ++ ++ if (__dev_set_promiscuity(dev, inc, false) >= 0) ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ } ++ ++ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI ++ * is important. Some (broken) drivers set IFF_PROMISC, when ++ * IFF_ALLMULTI is requested not asking us and not reporting. ++ */ ++ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { ++ int inc = (flags & IFF_ALLMULTI) ? 1 : -1; ++ ++ dev->gflags ^= IFF_ALLMULTI; ++ __dev_set_allmulti(dev, inc, false); ++ } ++ ++ return ret; ++} ++ ++void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, ++ unsigned int gchanges) ++{ ++ unsigned int changes = dev->flags ^ old_flags; ++ ++ if (gchanges) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); ++ ++ if (changes & IFF_UP) { ++ if (dev->flags & IFF_UP) ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ else ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ } ++ ++ if (dev->flags & IFF_UP && ++ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { ++ struct netdev_notifier_change_info change_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .flags_changed = changes, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); ++ } ++} ++ ++/** ++ * dev_change_flags - change device settings ++ * @dev: device ++ * @flags: device state flags ++ * @extack: netlink extended ack ++ * ++ * Change settings on device based state flags. The flags are ++ * in the userspace exported format. ++ */ ++int dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ret = __dev_change_flags(dev, flags, extack); ++ if (ret < 0) ++ return ret; ++ ++ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); ++ __dev_notify_flags(dev, old_flags, changes); ++ return ret; ++} ++EXPORT_SYMBOL(dev_change_flags); ++ ++int __dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_mtu) ++ return ops->ndo_change_mtu(dev, new_mtu); ++ ++ /* Pairs with all the lockless reads of dev->mtu in the stack */ ++ WRITE_ONCE(dev->mtu, new_mtu); ++ return 0; ++} ++EXPORT_SYMBOL(__dev_set_mtu); ++ ++int dev_validate_mtu(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ /* MTU must be positive, and in range */ ++ if (new_mtu < 0 || new_mtu < dev->min_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu less than device minimum"); ++ return -EINVAL; ++ } ++ ++ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_mtu_ext - Change maximum transfer unit ++ * @dev: device ++ * @new_mtu: new transfer unit ++ * @extack: netlink extended ack ++ * ++ * Change the maximum transfer size of the network device. 
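Usage note for the MTU helpers above, as a hedged sketch only: changing an interface's MTU from kernel code under RTNL. The interface-name lookup and the use of init_net are assumptions for the example; in-tree callers normally already hold RTNL and have the net_device pointer at hand.

    #include <linux/netdevice.h>
    #include <linux/rtnetlink.h>
    #include <net/net_namespace.h>

    /* Illustrative only: look up a device by name and change its MTU.
     * dev_set_mtu() must be called with RTNL held.
     */
    static int example_set_mtu(const char *ifname, int new_mtu)
    {
            struct net_device *dev;
            int err = -ENODEV;

            rtnl_lock();
            dev = __dev_get_by_name(&init_net, ifname);
            if (dev)
                    err = dev_set_mtu(dev, new_mtu);
            rtnl_unlock();
            return err;
    }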
++ */ ++int dev_set_mtu_ext(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ int err, orig_mtu; ++ ++ if (new_mtu == dev->mtu) ++ return 0; ++ ++ err = dev_validate_mtu(dev, new_mtu, extack); ++ if (err) ++ return err; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ orig_mtu = dev->mtu; ++ err = __dev_set_mtu(dev, new_mtu); ++ ++ if (!err) { ++ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ orig_mtu); ++ err = notifier_to_errno(err); ++ if (err) { ++ /* setting mtu back and notifying everyone again, ++ * so that they have a chance to revert changes. ++ */ ++ __dev_set_mtu(dev, orig_mtu); ++ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ new_mtu); ++ } ++ } ++ return err; ++} ++ ++int dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct netlink_ext_ack extack; ++ int err; ++ ++ memset(&extack, 0, sizeof(extack)); ++ err = dev_set_mtu_ext(dev, new_mtu, &extack); ++ if (err && extack._msg) ++ net_err_ratelimited("%s: %s\n", dev->name, extack._msg); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_mtu); ++ ++/** ++ * dev_change_tx_queue_len - Change TX queue length of a netdevice ++ * @dev: device ++ * @new_len: new tx queue length ++ */ ++int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) ++{ ++ unsigned int orig_len = dev->tx_queue_len; ++ int res; ++ ++ if (new_len != (unsigned int)new_len) ++ return -ERANGE; ++ ++ if (new_len != orig_len) { ++ dev->tx_queue_len = new_len; ++ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); ++ res = notifier_to_errno(res); ++ if (res) ++ goto err_rollback; ++ res = dev_qdisc_change_tx_queue_len(dev); ++ if (res) ++ goto err_rollback; ++ } ++ ++ return 0; ++ ++err_rollback: ++ netdev_err(dev, "refused to change device tx_queue_len\n"); ++ dev->tx_queue_len = orig_len; ++ return res; ++} ++ ++/** ++ * dev_set_group - Change group this device belongs to ++ * @dev: device ++ * @new_group: group this device should belong to ++ */ ++void dev_set_group(struct net_device *dev, int new_group) ++{ ++ dev->group = new_group; ++} ++ ++/** ++ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. 
++ * @dev: device ++ * @addr: new address ++ * @extack: netlink extended ack ++ */ ++int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_pre_changeaddr_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .dev_addr = addr, ++ }; ++ int rc; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); ++ return notifier_to_errno(rc); ++} ++EXPORT_SYMBOL(dev_pre_changeaddr_notify); ++ ++/** ++ * dev_set_mac_address - Change Media Access Control Address ++ * @dev: device ++ * @sa: new address ++ * @extack: netlink extended ack ++ * ++ * Change the hardware (MAC) address of the device ++ */ ++int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int err; ++ ++ if (!ops->ndo_set_mac_address) ++ return -EOPNOTSUPP; ++ if (sa->sa_family != dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); ++ if (err) ++ return err; ++ err = ops->ndo_set_mac_address(dev, sa); ++ if (err) ++ return err; ++ dev->addr_assign_type = NET_ADDR_SET; ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ return 0; ++} ++EXPORT_SYMBOL(dev_set_mac_address); ++ ++static DECLARE_RWSEM(dev_addr_sem); ++ ++int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ down_write(&dev_addr_sem); ++ ret = dev_set_mac_address(dev, sa, extack); ++ up_write(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_set_mac_address_user); ++ ++int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) ++{ ++ size_t size = sizeof(sa->sa_data); ++ struct net_device *dev; ++ int ret = 0; ++ ++ down_read(&dev_addr_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_name_rcu(net, dev_name); ++ if (!dev) { ++ ret = -ENODEV; ++ goto unlock; ++ } ++ if (!dev->addr_len) ++ memset(sa->sa_data, 0, size); ++ else ++ memcpy(sa->sa_data, dev->dev_addr, ++ min_t(size_t, size, dev->addr_len)); ++ sa->sa_family = dev->type; ++ ++unlock: ++ rcu_read_unlock(); ++ up_read(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_get_mac_address); ++ ++/** ++ * dev_change_carrier - Change device carrier ++ * @dev: device ++ * @new_carrier: new value ++ * ++ * Change device carrier ++ */ ++int dev_change_carrier(struct net_device *dev, bool new_carrier) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_change_carrier) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return ops->ndo_change_carrier(dev, new_carrier); ++} ++ ++/** ++ * dev_get_phys_port_id - Get device physical port ID ++ * @dev: device ++ * @ppid: port ID ++ * ++ * Get device physical port ID ++ */ ++int dev_get_phys_port_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_phys_port_id) ++ return -EOPNOTSUPP; ++ return ops->ndo_get_phys_port_id(dev, ppid); ++} ++ ++/** ++ * dev_get_phys_port_name - Get device physical port name ++ * @dev: device ++ * @name: port name ++ * @len: limit of bytes to copy to name ++ * ++ * Get device physical port name ++ */ ++int dev_get_phys_port_name(struct net_device *dev, ++ char *name, size_t len) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ 
int err; ++ ++ if (ops->ndo_get_phys_port_name) { ++ err = ops->ndo_get_phys_port_name(dev, name, len); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ return devlink_compat_phys_port_name_get(dev, name, len); ++} ++ ++/** ++ * dev_get_port_parent_id - Get the device's port parent identifier ++ * @dev: network device ++ * @ppid: pointer to a storage for the port's parent identifier ++ * @recurse: allow/disallow recursion to lower devices ++ * ++ * Get the devices's port parent identifier ++ */ ++int dev_get_port_parent_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid, ++ bool recurse) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ struct netdev_phys_item_id first = { }; ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ int err; ++ ++ if (ops->ndo_get_port_parent_id) { ++ err = ops->ndo_get_port_parent_id(dev, ppid); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ err = devlink_compat_switch_id_get(dev, ppid); ++ if (!recurse || err != -EOPNOTSUPP) ++ return err; ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) { ++ err = dev_get_port_parent_id(lower_dev, ppid, true); ++ if (err) ++ break; ++ if (!first.id_len) ++ first = *ppid; ++ else if (memcmp(&first, ppid, sizeof(*ppid))) ++ return -EOPNOTSUPP; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_get_port_parent_id); ++ ++/** ++ * netdev_port_same_parent_id - Indicate if two network devices have ++ * the same port parent identifier ++ * @a: first network device ++ * @b: second network device ++ */ ++bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) ++{ ++ struct netdev_phys_item_id a_id = { }; ++ struct netdev_phys_item_id b_id = { }; ++ ++ if (dev_get_port_parent_id(a, &a_id, true) || ++ dev_get_port_parent_id(b, &b_id, true)) ++ return false; ++ ++ return netdev_phys_item_id_same(&a_id, &b_id); ++} ++EXPORT_SYMBOL(netdev_port_same_parent_id); ++ ++/** ++ * dev_change_proto_down - set carrier according to proto_down. ++ * ++ * @dev: device ++ * @proto_down: new value ++ */ ++int dev_change_proto_down(struct net_device *dev, bool proto_down) ++{ ++ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ if (proto_down) ++ netif_carrier_off(dev); ++ else ++ netif_carrier_on(dev); ++ dev->proto_down = proto_down; ++ return 0; ++} ++ ++/** ++ * dev_change_proto_down_reason - proto down reason ++ * ++ * @dev: device ++ * @mask: proto down mask ++ * @value: proto down value ++ */ ++void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, ++ u32 value) ++{ ++ int b; ++ ++ if (!mask) { ++ dev->proto_down_reason = value; ++ } else { ++ for_each_set_bit(b, &mask, 32) { ++ if (value & (1 << b)) ++ dev->proto_down_reason |= BIT(b); ++ else ++ dev->proto_down_reason &= ~BIT(b); ++ } ++ } ++} ++ ++struct bpf_xdp_link { ++ struct bpf_link link; ++ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ ++ int flags; ++}; ++ ++static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) ++{ ++ if (flags & XDP_FLAGS_HW_MODE) ++ return XDP_MODE_HW; ++ if (flags & XDP_FLAGS_DRV_MODE) ++ return XDP_MODE_DRV; ++ if (flags & XDP_FLAGS_SKB_MODE) ++ return XDP_MODE_SKB; ++ return dev->netdev_ops->ndo_bpf ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++} ++ ++static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ switch (mode) { ++ case XDP_MODE_SKB: ++ return generic_xdp_install; ++ case XDP_MODE_DRV: ++ case XDP_MODE_HW: ++ return dev->netdev_ops->ndo_bpf; ++ default: ++ return NULL; ++ } ++} ++ ++static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ return dev->xdp_state[mode].link; ++} ++ ++static struct bpf_prog *dev_xdp_prog(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ struct bpf_xdp_link *link = dev_xdp_link(dev, mode); ++ ++ if (link) ++ return link->link.prog; ++ return dev->xdp_state[mode].prog; ++} ++ ++u8 dev_xdp_prog_count(struct net_device *dev) ++{ ++ u8 count = 0; ++ int i; ++ ++ for (i = 0; i < __MAX_XDP_MODE; i++) ++ if (dev->xdp_state[i].prog || dev->xdp_state[i].link) ++ count++; ++ return count; ++} ++EXPORT_SYMBOL_GPL(dev_xdp_prog_count); ++ ++u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ struct bpf_prog *prog = dev_xdp_prog(dev, mode); ++ ++ return prog ? prog->aux->id : 0; ++} ++ ++static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_xdp_link *link) ++{ ++ dev->xdp_state[mode].link = link; ++ dev->xdp_state[mode].prog = NULL; ++} ++ ++static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_prog *prog) ++{ ++ dev->xdp_state[mode].link = NULL; ++ dev->xdp_state[mode].prog = prog; ++} ++ ++static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, ++ bpf_op_t bpf_op, struct netlink_ext_ack *extack, ++ u32 flags, struct bpf_prog *prog) ++{ ++ struct netdev_bpf xdp; ++ int err; ++ ++ memset(&xdp, 0, sizeof(xdp)); ++ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; ++ xdp.extack = extack; ++ xdp.flags = flags; ++ xdp.prog = prog; ++ ++ /* Drivers assume refcnt is already incremented (i.e, prog pointer is ++ * "moved" into driver), so they don't increment it on their own, but ++ * they do decrement refcnt when program is detached or replaced. ++ * Given net_device also owns link/prog, we need to bump refcnt here ++ * to prevent drivers from underflowing it. 
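To make the refcount convention in the comment above concrete, here is a hedged driver-side sketch of an ndo_bpf handler: the core has already taken a reference on the incoming program, so the driver only swaps pointers and drops the reference it held on the old one. struct example_priv and the handler are hypothetical; real drivers also reconfigure their RX path here.

    #include <linux/netdevice.h>
    #include <linux/bpf.h>

    struct example_priv {
            struct bpf_prog *xdp_prog;      /* reference owned by the driver */
    };

    static int example_ndo_bpf(struct net_device *dev, struct netdev_bpf *xdp)
    {
            struct example_priv *priv = netdev_priv(dev);
            struct bpf_prog *old;

            switch (xdp->command) {
            case XDP_SETUP_PROG:
                    /* xdp->prog arrives with its refcount already bumped */
                    old = xchg(&priv->xdp_prog, xdp->prog);
                    if (old)
                            bpf_prog_put(old);
                    return 0;
            default:
                    return -EINVAL;
            }
    }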
++ */ ++ if (prog) ++ bpf_prog_inc(prog); ++ err = bpf_op(dev, &xdp); ++ if (err) { ++ if (prog) ++ bpf_prog_put(prog); ++ return err; ++ } ++ ++ if (mode != XDP_MODE_HW) ++ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); ++ ++ return 0; ++} ++ ++static void dev_xdp_uninstall(struct net_device *dev) ++{ ++ struct bpf_xdp_link *link; ++ struct bpf_prog *prog; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { ++ prog = dev_xdp_prog(dev, mode); ++ if (!prog) ++ continue; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) ++ continue; ++ ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ ++ /* auto-detach link from net device */ ++ link = dev_xdp_link(dev, mode); ++ if (link) ++ link->dev = NULL; ++ else ++ bpf_prog_put(prog); ++ ++ dev_xdp_set_link(dev, mode, NULL); ++ } ++} ++ ++static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog, u32 flags) ++{ ++ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); ++ struct bpf_prog *cur_prog; ++ struct net_device *upper; ++ struct list_head *iter; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ /* either link or prog attachment, never both */ ++ if (link && (new_prog || old_prog)) ++ return -EINVAL; ++ /* link supports only XDP mode flags */ ++ if (link && (flags & ~XDP_FLAGS_MODES)) { ++ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); ++ return -EINVAL; ++ } ++ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ ++ if (num_modes > 1) { ++ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); ++ return -EINVAL; ++ } ++ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ ++ if (!num_modes && dev_xdp_prog_count(dev) > 1) { ++ NL_SET_ERR_MSG(extack, ++ "More than one program loaded, unset mode is ambiguous"); ++ return -EINVAL; ++ } ++ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ ++ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { ++ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); ++ return -EINVAL; ++ } ++ ++ mode = dev_xdp_mode(dev, flags); ++ /* can't replace attached link */ ++ if (dev_xdp_link(dev, mode)) { ++ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); ++ return -EBUSY; ++ } ++ ++ /* don't allow if an upper device already has a program */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) { ++ if (dev_xdp_prog_count(upper) > 0) { ++ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); ++ return -EEXIST; ++ } ++ } ++ ++ cur_prog = dev_xdp_prog(dev, mode); ++ /* can't replace attached prog with link */ ++ if (link && cur_prog) { ++ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); ++ return -EBUSY; ++ } ++ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { ++ NL_SET_ERR_MSG(extack, "Active program does not match expected"); ++ return -EEXIST; ++ } ++ ++ /* put effective new program into new_prog */ ++ if (link) ++ new_prog = link->link.prog; ++ ++ if (new_prog) { ++ bool offload = mode == XDP_MODE_HW; ++ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ++ ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++ ++ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { ++ NL_SET_ERR_MSG(extack, "XDP program already attached"); ++ return -EBUSY; ++ } ++ if (!offload && dev_xdp_prog(dev, other_mode)) { ++ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); ++ return -EEXIST; ++ } ++ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { ++ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ } ++ ++ /* don't call drivers if the effective program didn't change */ ++ if (new_prog != cur_prog) { ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) { ++ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); ++ return -EOPNOTSUPP; ++ } ++ ++ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); ++ if (err) ++ return err; ++ } ++ ++ if (link) ++ dev_xdp_set_link(dev, mode, link); ++ else ++ dev_xdp_set_prog(dev, mode, new_prog); ++ if (cur_prog) ++ bpf_prog_put(cur_prog); ++ ++ return 0; ++} ++ ++static int dev_xdp_attach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); ++} ++ ++static int dev_xdp_detach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ mode = dev_xdp_mode(dev, link->flags); ++ if (dev_xdp_link(dev, mode) != link) ++ return -EINVAL; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ dev_xdp_set_link(dev, mode, NULL); ++ return 0; ++} ++ ++static void bpf_xdp_link_release(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ rtnl_lock(); ++ ++ /* if racing with net_device's tear down, xdp_link->dev might be ++ * already NULL, in which case link was already auto-detached ++ */ ++ if (xdp_link->dev) { ++ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); ++ xdp_link->dev = NULL; ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int bpf_xdp_link_detach(struct bpf_link *link) ++{ ++ bpf_xdp_link_release(link); ++ return 0; ++} ++ ++static void bpf_xdp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ kfree(xdp_link); ++} ++ ++static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ seq_printf(seq, "ifindex:\t%u\n", ifindex); ++} ++ ++static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ info->xdp.ifindex = ifindex; ++ return 0; ++} ++ ++static int bpf_xdp_link_update(struct 
bpf_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err = 0; ++ ++ rtnl_lock(); ++ ++ /* link might have been auto-released already, so fail */ ++ if (!xdp_link->dev) { ++ err = -ENOLINK; ++ goto out_unlock; ++ } ++ ++ if (old_prog && link->prog != old_prog) { ++ err = -EPERM; ++ goto out_unlock; ++ } ++ old_prog = link->prog; ++ if (old_prog->type != new_prog->type || ++ old_prog->expected_attach_type != new_prog->expected_attach_type) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ ++ if (old_prog == new_prog) { ++ /* no-op, don't disturb drivers */ ++ bpf_prog_put(new_prog); ++ goto out_unlock; ++ } ++ ++ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); ++ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); ++ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, ++ xdp_link->flags, new_prog); ++ if (err) ++ goto out_unlock; ++ ++ old_prog = xchg(&link->prog, new_prog); ++ bpf_prog_put(old_prog); ++ ++out_unlock: ++ rtnl_unlock(); ++ return err; ++} ++ ++static const struct bpf_link_ops bpf_xdp_link_lops = { ++ .release = bpf_xdp_link_release, ++ .dealloc = bpf_xdp_link_dealloc, ++ .detach = bpf_xdp_link_detach, ++ .show_fdinfo = bpf_xdp_link_show_fdinfo, ++ .fill_link_info = bpf_xdp_link_fill_link_info, ++ .update_prog = bpf_xdp_link_update, ++}; ++ ++int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct net *net = current->nsproxy->net_ns; ++ struct bpf_link_primer link_primer; ++ struct bpf_xdp_link *link; ++ struct net_device *dev; ++ int err, fd; ++ ++ rtnl_lock(); ++ dev = dev_get_by_index(net, attr->link_create.target_ifindex); ++ if (!dev) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto unlock; ++ } ++ ++ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); ++ link->dev = dev; ++ link->flags = attr->link_create.flags; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto unlock; ++ } ++ ++ err = dev_xdp_attach_link(dev, NULL, link); ++ rtnl_unlock(); ++ ++ if (err) { ++ link->dev = NULL; ++ bpf_link_cleanup(&link_primer); ++ goto out_put_dev; ++ } ++ ++ fd = bpf_link_settle(&link_primer); ++ /* link itself doesn't hold dev's refcnt to not complicate shutdown */ ++ dev_put(dev); ++ return fd; ++ ++unlock: ++ rtnl_unlock(); ++ ++out_put_dev: ++ dev_put(dev); ++ return err; ++} ++ ++/** ++ * dev_change_xdp_fd - set or clear a bpf program for a device rx path ++ * @dev: device ++ * @extack: netlink extended ack ++ * @fd: new program fd or negative value to clear ++ * @expected_fd: old program fd that userspace expects to replace or clear ++ * @flags: xdp-related flags ++ * ++ * Set or clear a bpf program for a device ++ */ ++int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, ++ int fd, int expected_fd, u32 flags) ++{ ++ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); ++ struct bpf_prog *new_prog = NULL, *old_prog = NULL; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ if (fd >= 0) { ++ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(new_prog)) ++ return PTR_ERR(new_prog); ++ } ++ ++ if (expected_fd >= 0) { ++ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(old_prog)) { ++ err = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto 
err_out; ++ } ++ } ++ ++ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); ++ ++err_out: ++ if (err && new_prog) ++ bpf_prog_put(new_prog); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ return err; ++} ++ ++/** ++ * dev_new_index - allocate an ifindex ++ * @net: the applicable net namespace ++ * ++ * Returns a suitable unique value for a new device interface ++ * number. The caller must hold the rtnl semaphore or the ++ * dev_base_lock to be sure it remains unique. ++ */ ++static int dev_new_index(struct net *net) ++{ ++ int ifindex = net->ifindex; ++ ++ for (;;) { ++ if (++ifindex <= 0) ++ ifindex = 1; ++ if (!__dev_get_by_index(net, ifindex)) ++ return net->ifindex = ifindex; ++ } ++} ++ ++/* Delayed registration/unregisteration */ ++LIST_HEAD(net_todo_list); ++DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); ++ ++static void net_set_todo(struct net_device *dev) ++{ ++ list_add_tail(&dev->todo_list, &net_todo_list); ++ atomic_inc(&dev_net(dev)->dev_unreg_count); ++} ++ ++static netdev_features_t netdev_sync_upper_features(struct net_device *lower, ++ struct net_device *upper, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(upper->wanted_features & feature) ++ && (features & feature)) { ++ netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", ++ &feature, upper->name); ++ features &= ~feature; ++ } ++ } ++ ++ return features; ++} ++ ++static void netdev_sync_lower_features(struct net_device *upper, ++ struct net_device *lower, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(features & feature) && (lower->features & feature)) { ++ netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", ++ &feature, lower->name); ++ lower->wanted_features &= ~feature; ++ __netdev_update_features(lower); ++ ++ if (unlikely(lower->features & feature)) ++ netdev_WARN(upper, "failed to disable %pNF on %s!\n", ++ &feature, lower->name); ++ else ++ netdev_features_change(lower); ++ } ++ } ++} ++ ++static netdev_features_t netdev_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ /* Fix illegal checksum combinations */ ++ if ((features & NETIF_F_HW_CSUM) && ++ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { ++ netdev_warn(dev, "mixed HW and IP checksum settings.\n"); ++ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); ++ } ++ ++ /* TSO requires that SG is present as well. 
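The core feature fix-ups in netdev_fix_features() and the upper/lower sync helpers above have a driver-side counterpart in ndo_fix_features(). A hedged sketch of such a hook for hypothetical hardware that cannot enable LRO while RX checksumming is off; the hook name and the dependency are assumptions for illustration.

    #include <linux/netdevice.h>

    /* Illustrative only: a driver-level ndo_fix_features() hook enforcing a
     * feature dependency, in the same style as netdev_fix_features().
     */
    static netdev_features_t example_fix_features(struct net_device *dev,
                                                  netdev_features_t features)
    {
            if ((features & NETIF_F_LRO) && !(features & NETIF_F_RXCSUM)) {
                    netdev_dbg(dev, "dropping LRO, RXCSUM is disabled\n");
                    features &= ~NETIF_F_LRO;
            }
            return features;
    }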
*/ ++ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); ++ features &= ~NETIF_F_ALL_TSO; ++ } ++ ++ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IP_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO; ++ features &= ~NETIF_F_TSO_ECN; ++ } ++ ++ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IPV6_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO6; ++ } ++ ++ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ ++ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ ++ /* TSO ECN requires that TSO is present as well. */ ++ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) ++ features &= ~NETIF_F_TSO_ECN; ++ ++ /* Software GSO depends on SG. */ ++ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); ++ features &= ~NETIF_F_GSO; ++ } ++ ++ /* GSO partial features require GSO partial be set */ ++ if ((features & dev->gso_partial_features) && ++ !(features & NETIF_F_GSO_PARTIAL)) { ++ netdev_dbg(dev, ++ "Dropping partially supported GSO features since no GSO partial.\n"); ++ features &= ~dev->gso_partial_features; ++ } ++ ++ if (!(features & NETIF_F_RXCSUM)) { ++ /* NETIF_F_GRO_HW implies doing RXCSUM since every packet ++ * successfully merged by hardware must also have the ++ * checksum verified by hardware. If the user does not ++ * want to enable RXCSUM, logically, we should disable GRO_HW. ++ */ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ /* LRO/HW-GRO features cannot be combined with RX-FCS */ ++ if (features & NETIF_F_RXFCS) { ++ if (features & NETIF_F_LRO) { ++ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { ++ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_HW_TLS_TX) { ++ bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == ++ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++ bool hw_csum = features & NETIF_F_HW_CSUM; ++ ++ if (!ip_csum && !hw_csum) { ++ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_TX; ++ } ++ } ++ ++ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { ++ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_RX; ++ } ++ ++ return features; ++} ++ ++int __netdev_update_features(struct net_device *dev) ++{ ++ struct net_device *upper, *lower; ++ netdev_features_t features; ++ struct list_head *iter; ++ int err = -1; ++ ++ ASSERT_RTNL(); ++ ++ features = netdev_get_wanted_features(dev); ++ ++ if (dev->netdev_ops->ndo_fix_features) ++ features = dev->netdev_ops->ndo_fix_features(dev, features); ++ ++ /* driver might be less strict about feature dependencies */ ++ features = netdev_fix_features(dev, features); ++ ++ /* some features can't 
be enabled if they're off on an upper device */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) ++ features = netdev_sync_upper_features(dev, upper, features); ++ ++ if (dev->features == features) ++ goto sync_lower; ++ ++ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", ++ &dev->features, &features); ++ ++ if (dev->netdev_ops->ndo_set_features) ++ err = dev->netdev_ops->ndo_set_features(dev, features); ++ else ++ err = 0; ++ ++ if (unlikely(err < 0)) { ++ netdev_err(dev, ++ "set_features() failed (%d); wanted %pNF, left %pNF\n", ++ err, &features, &dev->features); ++ /* return non-0 since some features might have changed and ++ * it's better to fire a spurious notification than miss it ++ */ ++ return -1; ++ } ++ ++sync_lower: ++ /* some features must be disabled on lower devices when disabled ++ * on an upper device (think: bonding master or bridge) ++ */ ++ netdev_for_each_lower_dev(dev, lower, iter) ++ netdev_sync_lower_features(dev, lower, features); ++ ++ if (!err) { ++ netdev_features_t diff = features ^ dev->features; ++ ++ if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ /* udp_tunnel_{get,drop}_rx_info both need ++ * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the ++ * device, or they won't do anything. ++ * Thus we need to update dev->features ++ * *before* calling udp_tunnel_get_rx_info, ++ * but *after* calling udp_tunnel_drop_rx_info. ++ */ ++ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ dev->features = features; ++ udp_tunnel_get_rx_info(dev); ++ } else { ++ udp_tunnel_drop_rx_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_ctag_filter_info(dev); ++ } else { ++ vlan_drop_rx_ctag_filter_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_STAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_stag_filter_info(dev); ++ } else { ++ vlan_drop_rx_stag_filter_info(dev); ++ } ++ } ++ ++ dev->features = features; ++ } ++ ++ return err < 0 ? 0 : 1; ++} ++ ++/** ++ * netdev_update_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications if it ++ * has changed. Should be called after driver or hardware dependent ++ * conditions might have changed that influence the features. ++ */ ++void netdev_update_features(struct net_device *dev) ++{ ++ if (__netdev_update_features(dev)) ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_update_features); ++ ++/** ++ * netdev_change_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications even ++ * if they have not changed. Should be called instead of ++ * netdev_update_features() if also dev->vlan_features might ++ * have changed to allow the changes to be propagated to stacked ++ * VLAN devices. ++ */ ++void netdev_change_features(struct net_device *dev) ++{ ++ __netdev_update_features(dev); ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_change_features); ++ ++/** ++ * netif_stacked_transfer_operstate - transfer operstate ++ * @rootdev: the root or lower level device to transfer state from ++ * @dev: the device to transfer operstate to ++ * ++ * Transfer operational state from root to device. This is normally ++ * called when a stacking relationship exists between the root ++ * device and the device(a leaf device). 
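Usage note for netdev_update_features() documented above: drivers call it, under RTNL, whenever a hardware- or firmware-dependent condition changes what they can offload. A hedged sketch; the "firmware lost TSO" condition and the function name are assumptions.

    #include <linux/netdevice.h>
    #include <linux/rtnetlink.h>

    /* Illustrative only: re-evaluate dev->features after the hardware's
     * capabilities changed at runtime.
     */
    static void example_recheck_features(struct net_device *dev, bool fw_lost_tso)
    {
            rtnl_lock();
            if (fw_lost_tso)
                    dev->hw_features &= ~NETIF_F_ALL_TSO;
            netdev_update_features(dev);
            rtnl_unlock();
    }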
++ */ ++void netif_stacked_transfer_operstate(const struct net_device *rootdev, ++ struct net_device *dev) ++{ ++ if (rootdev->operstate == IF_OPER_DORMANT) ++ netif_dormant_on(dev); ++ else ++ netif_dormant_off(dev); ++ ++ if (rootdev->operstate == IF_OPER_TESTING) ++ netif_testing_on(dev); ++ else ++ netif_testing_off(dev); ++ ++ if (netif_carrier_ok(rootdev)) ++ netif_carrier_on(dev); ++ else ++ netif_carrier_off(dev); ++} ++EXPORT_SYMBOL(netif_stacked_transfer_operstate); ++ ++static int netif_alloc_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ struct netdev_rx_queue *rx; ++ size_t sz = count * sizeof(*rx); ++ int err = 0; ++ ++ BUG_ON(count < 1); ++ ++ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!rx) ++ return -ENOMEM; ++ ++ dev->_rx = rx; ++ ++ for (i = 0; i < count; i++) { ++ rx[i].dev = dev; ++ ++ /* XDP RX-queue setup */ ++ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); ++ if (err < 0) ++ goto err_rxq_info; ++ } ++ return 0; ++ ++err_rxq_info: ++ /* Rollback successful reg's and free other resources */ ++ while (i--) ++ xdp_rxq_info_unreg(&rx[i].xdp_rxq); ++ kvfree(dev->_rx); ++ dev->_rx = NULL; ++ return err; ++} ++ ++static void netif_free_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ ++ /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ ++ if (!dev->_rx) ++ return; ++ ++ for (i = 0; i < count; i++) ++ xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); ++ ++ kvfree(dev->_rx); ++} ++ ++static void netdev_init_one_queue(struct net_device *dev, ++ struct netdev_queue *queue, void *_unused) ++{ ++ /* Initialize queue lock */ ++ spin_lock_init(&queue->_xmit_lock); ++ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); ++ queue->xmit_lock_owner = -1; ++ netdev_queue_numa_node_write(queue, NUMA_NO_NODE); ++ queue->dev = dev; ++#ifdef CONFIG_BQL ++ dql_init(&queue->dql, HZ); ++#endif ++} ++ ++static void netif_free_tx_queues(struct net_device *dev) ++{ ++ kvfree(dev->_tx); ++} ++ ++static int netif_alloc_netdev_queues(struct net_device *dev) ++{ ++ unsigned int count = dev->num_tx_queues; ++ struct netdev_queue *tx; ++ size_t sz = count * sizeof(*tx); ++ ++ if (count < 1 || count > 0xffff) ++ return -EINVAL; ++ ++ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!tx) ++ return -ENOMEM; ++ ++ dev->_tx = tx; ++ ++ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); ++ spin_lock_init(&dev->tx_global_lock); ++ ++ return 0; ++} ++ ++void netif_tx_stop_all_queues(struct net_device *dev) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < dev->num_tx_queues; i++) { ++ struct netdev_queue *txq = netdev_get_tx_queue(dev, i); ++ ++ netif_tx_stop_queue(txq); ++ } ++} ++EXPORT_SYMBOL(netif_tx_stop_all_queues); ++ ++/** ++ * register_netdevice() - register a network device ++ * @dev: device to register ++ * ++ * Take a prepared network device structure and make it externally accessible. ++ * A %NETDEV_REGISTER message is sent to the netdev notifier chain. ++ * Callers must hold the rtnl lock - you may want register_netdev() ++ * instead of this. ++ */ ++int register_netdevice(struct net_device *dev) ++{ ++ int ret; ++ struct net *net = dev_net(dev); ++ ++ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < ++ NETDEV_FEATURE_COUNT); ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ might_sleep(); ++ ++ /* When net_device's are persistent, this will be fatal. 
*/ ++ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); ++ BUG_ON(!net); ++ ++ ret = ethtool_check_ops(dev->ethtool_ops); ++ if (ret) ++ return ret; ++ ++ spin_lock_init(&dev->addr_list_lock); ++ netdev_set_addr_lockdep_class(dev); ++ ++ ret = dev_get_valid_name(net, dev, dev->name); ++ if (ret < 0) ++ goto out; ++ ++ ret = -ENOMEM; ++ dev->name_node = netdev_name_node_head_alloc(dev); ++ if (!dev->name_node) ++ goto out; ++ ++ /* Init, if this function is available */ ++ if (dev->netdev_ops->ndo_init) { ++ ret = dev->netdev_ops->ndo_init(dev); ++ if (ret) { ++ if (ret > 0) ++ ret = -EIO; ++ goto err_free_name; ++ } ++ } ++ ++ if (((dev->hw_features | dev->features) & ++ NETIF_F_HW_VLAN_CTAG_FILTER) && ++ (!dev->netdev_ops->ndo_vlan_rx_add_vid || ++ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { ++ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ++ ret = -EINVAL; ++ goto err_uninit; ++ } ++ ++ ret = -EBUSY; ++ if (!dev->ifindex) ++ dev->ifindex = dev_new_index(net); ++ else if (__dev_get_by_index(net, dev->ifindex)) ++ goto err_uninit; ++ ++ /* Transfer changeable features to wanted_features and enable ++ * software offloads (GSO and GRO). ++ */ ++ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); ++ dev->features |= NETIF_F_SOFT_FEATURES; ++ ++ if (dev->udp_tunnel_nic_info) { ++ dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ } ++ ++ dev->wanted_features = dev->features & dev->hw_features; ++ ++ if (!(dev->flags & IFF_LOOPBACK)) ++ dev->hw_features |= NETIF_F_NOCACHE_COPY; ++ ++ /* If IPv4 TCP segmentation offload is supported we should also ++ * allow the device to enable segmenting the frame with the option ++ * of ignoring a static IP ID value. This doesn't enable the ++ * feature itself but allows the user to enable it later. ++ */ ++ if (dev->hw_features & NETIF_F_TSO) ++ dev->hw_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->vlan_features & NETIF_F_TSO) ++ dev->vlan_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->mpls_features & NETIF_F_TSO) ++ dev->mpls_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->hw_enc_features & NETIF_F_TSO) ++ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; ++ ++ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. ++ */ ++ dev->vlan_features |= NETIF_F_HIGHDMA; ++ ++ /* Make NETIF_F_SG inheritable to tunnel devices. ++ */ ++ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; ++ ++ /* Make NETIF_F_SG inheritable to MPLS. ++ */ ++ dev->mpls_features |= NETIF_F_SG; ++ ++ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto err_uninit; ++ ++ ret = netdev_register_kobject(dev); ++ write_lock(&dev_base_lock); ++ dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; ++ write_unlock(&dev_base_lock); ++ if (ret) ++ goto err_uninit; ++ ++ __netdev_update_features(dev); ++ ++ /* ++ * Default initial state at registry is that the ++ * device is present. ++ */ ++ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ ++ linkwatch_init_dev(dev); ++ ++ dev_init_scheduler(dev); ++ ++ netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); ++ list_netdevice(dev); ++ ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ ++ /* If the device has permanent device address, driver should ++ * set dev_addr and also addr_assign_type should be set to ++ * NET_ADDR_PERM (default value). 
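The NET_ADDR_PERM handling above is why drivers install their hardware MAC before registration. A hedged sketch; the EEPROM-provided hw_mac and the function name are assumptions.

    #include <linux/etherdevice.h>

    /* Illustrative only: set the permanent hardware address prior to
     * register_netdevice(); addr_assign_type stays NET_ADDR_PERM, so the
     * core copies it into dev->perm_addr as described above.
     */
    static int example_install_hw_addr(struct net_device *dev, const u8 *hw_mac)
    {
            if (!is_valid_ether_addr(hw_mac))
                    return -EADDRNOTAVAIL;

            eth_hw_addr_set(dev, hw_mac);
            return 0;
    }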
++ */ ++ if (dev->addr_assign_type == NET_ADDR_PERM) ++ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); ++ ++ /* Notify protocols, that a new device appeared. */ ++ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) { ++ /* Expect explicit free_netdev() on failure */ ++ dev->needs_free_netdev = false; ++ unregister_netdevice_queue(dev, NULL); ++ goto out; ++ } ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++out: ++ return ret; ++ ++err_uninit: ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++err_free_name: ++ netdev_name_node_free(dev->name_node); ++ goto out; ++} ++EXPORT_SYMBOL(register_netdevice); ++ ++/** ++ * init_dummy_netdev - init a dummy network device for NAPI ++ * @dev: device to init ++ * ++ * This takes a network device structure and initialize the minimum ++ * amount of fields so it can be used to schedule NAPI polls without ++ * registering a full blown interface. This is to be used by drivers ++ * that need to tie several hardware interfaces to a single NAPI ++ * poll scheduler due to HW limitations. ++ */ ++int init_dummy_netdev(struct net_device *dev) ++{ ++ /* Clear everything. Note we don't initialize spinlocks ++ * are they aren't supposed to be taken by any of the ++ * NAPI code and this dummy netdev is supposed to be ++ * only ever used for NAPI polls ++ */ ++ memset(dev, 0, sizeof(struct net_device)); ++ ++ /* make sure we BUG if trying to hit standard ++ * register/unregister code path ++ */ ++ dev->reg_state = NETREG_DUMMY; ++ ++ /* NAPI wants this */ ++ INIT_LIST_HEAD(&dev->napi_list); ++ ++ /* a dummy interface is started by default */ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ /* napi_busy_loop stats accounting wants this */ ++ dev_net_set(dev, &init_net); ++ ++ /* Note : We dont allocate pcpu_refcnt for dummy devices, ++ * because users of this 'device' dont need to change ++ * its refcount. ++ */ ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(init_dummy_netdev); ++ ++ ++/** ++ * register_netdev - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * This is a wrapper around register_netdevice that takes the rtnl semaphore ++ * and expands the device name if you passed a format string to ++ * alloc_netdev. 
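For context, the usual probe-time pattern around the register_netdev() wrapper described above, as a hedged sketch. alloc_etherdev() supplies the "eth%d" name template that register_netdev() expands; private data and ndo setup are omitted for illustration.

    #include <linux/etherdevice.h>
    #include <linux/netdevice.h>

    /* Illustrative only: allocate, register and, on failure, free a netdev. */
    static struct net_device *example_create_netdev(void)
    {
            struct net_device *dev;

            dev = alloc_etherdev(0);        /* name template "eth%d" */
            if (!dev)
                    return NULL;

            if (register_netdev(dev)) {
                    free_netdev(dev);
                    return NULL;
            }
            return dev;
    }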
++ */ ++int register_netdev(struct net_device *dev) ++{ ++ int err; ++ ++ if (rtnl_lock_killable()) ++ return -EINTR; ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdev); ++ ++int netdev_refcnt_read(const struct net_device *dev) ++{ ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ int i, refcnt = 0; ++ ++ for_each_possible_cpu(i) ++ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); ++ return refcnt; ++#else ++ return refcount_read(&dev->dev_refcnt); ++#endif ++} ++EXPORT_SYMBOL(netdev_refcnt_read); ++ ++int netdev_unregister_timeout_secs __read_mostly = 10; ++ ++#define WAIT_REFS_MIN_MSECS 1 ++#define WAIT_REFS_MAX_MSECS 250 ++/** ++ * netdev_wait_allrefs_any - wait until all references are gone. ++ * @list: list of net_devices to wait on ++ * ++ * This is called when unregistering network devices. ++ * ++ * Any protocol or device that holds a reference should register ++ * for netdevice notification, and cleanup and put back the ++ * reference if they receive an UNREGISTER event. ++ * We can get stuck here if buggy protocols don't correctly ++ * call dev_put. ++ */ ++static struct net_device *netdev_wait_allrefs_any(struct list_head *list) ++{ ++ unsigned long rebroadcast_time, warning_time; ++ struct net_device *dev; ++ int wait = 0; ++ ++ rebroadcast_time = warning_time = jiffies; ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ while (true) { ++ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { ++ rtnl_lock(); ++ ++ /* Rebroadcast unregister notification */ ++ list_for_each_entry(dev, list, todo_list) ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ __rtnl_unlock(); ++ rcu_barrier(); ++ rtnl_lock(); ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, ++ &dev->state)) { ++ /* We must not have linkwatch events ++ * pending on unregister. If this ++ * happens, we simply run the queue ++ * unscheduled, resulting in a noop ++ * for this device. ++ */ ++ linkwatch_run_queue(); ++ break; ++ } ++ ++ __rtnl_unlock(); ++ ++ rebroadcast_time = jiffies; ++ } ++ ++ if (!wait) { ++ rcu_barrier(); ++ wait = WAIT_REFS_MIN_MSECS; ++ } else { ++ msleep(wait); ++ wait = min(wait << 1, WAIT_REFS_MAX_MSECS); ++ } ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ if (time_after(jiffies, warning_time + ++ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { ++ list_for_each_entry(dev, list, todo_list) { ++ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", ++ dev->name, netdev_refcnt_read(dev)); ++ ref_tracker_dir_print(&dev->refcnt_tracker, 10); ++ } ++ ++ warning_time = jiffies; ++ } ++ } ++} ++ ++/* The sequence is: ++ * ++ * rtnl_lock(); ++ * ... ++ * register_netdevice(x1); ++ * register_netdevice(x2); ++ * ... ++ * unregister_netdevice(y1); ++ * unregister_netdevice(y2); ++ * ... ++ * rtnl_unlock(); ++ * free_netdev(y1); ++ * free_netdev(y2); ++ * ++ * We are invoked by rtnl_unlock(). ++ * This allows us to deal with problems: ++ * 1) We can delete sysfs objects which invoke hotplug ++ * without deadlocking with linkwatch via keventd. ++ * 2) Since we run with the RTNL semaphore not held, we can sleep ++ * safely in order to wait for the netdev refcnt to drop to zero. ++ * ++ * We must not return until all unregister events added during ++ * the interval the lock was held have been completed. 
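The register/unregister/free sequence described in the comment above, written out as a hedged sketch; the function and parameter names are assumptions, and error handling for the new device is left to the caller.

    /* Illustrative only: register a replacement device and retire the old
     * one in a single RTNL section; netdev_run_todo() runs from
     * rtnl_unlock(), and the caller frees the old device afterwards.
     */
    static int example_replace_device(struct net_device *dev_new,
                                      struct net_device *dev_old)
    {
            int err;

            rtnl_lock();
            err = register_netdevice(dev_new);   /* caller frees dev_new on error */
            unregister_netdevice(dev_old);
            rtnl_unlock();
            free_netdev(dev_old);
            return err;
    }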
++ */ ++void netdev_run_todo(void) ++{ ++ struct net_device *dev, *tmp; ++ struct list_head list; ++#ifdef CONFIG_LOCKDEP ++ struct list_head unlink_list; ++ ++ list_replace_init(&net_unlink_list, &unlink_list); ++ ++ while (!list_empty(&unlink_list)) { ++ struct net_device *dev = list_first_entry(&unlink_list, ++ struct net_device, ++ unlink_list); ++ list_del_init(&dev->unlink_list); ++ dev->nested_level = dev->lower_level - 1; ++ } ++#endif ++ ++ /* Snapshot list, allow later requests */ ++ list_replace_init(&net_todo_list, &list); ++ ++ __rtnl_unlock(); ++ ++ /* Wait for rcu callbacks to finish before next phase */ ++ if (!list_empty(&list)) ++ rcu_barrier(); ++ ++ list_for_each_entry_safe(dev, tmp, &list, todo_list) { ++ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { ++ netdev_WARN(dev, "run_todo but not unregistering\n"); ++ list_del(&dev->todo_list); ++ continue; ++ } ++ ++ write_lock(&dev_base_lock); ++ dev->reg_state = NETREG_UNREGISTERED; ++ write_unlock(&dev_base_lock); ++ linkwatch_forget_dev(dev); ++ } ++ ++ while (!list_empty(&list)) { ++ dev = netdev_wait_allrefs_any(&list); ++ list_del(&dev->todo_list); ++ ++ /* paranoia */ ++ BUG_ON(netdev_refcnt_read(dev) != 1); ++ BUG_ON(!list_empty(&dev->ptype_all)); ++ BUG_ON(!list_empty(&dev->ptype_specific)); ++ WARN_ON(rcu_access_pointer(dev->ip_ptr)); ++ WARN_ON(rcu_access_pointer(dev->ip6_ptr)); ++#if IS_ENABLED(CONFIG_DECNET) ++ WARN_ON(dev->dn_ptr); ++#endif ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++ if (dev->needs_free_netdev) ++ free_netdev(dev); ++ ++ if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) ++ wake_up(&netdev_unregistering_wq); ++ ++ /* Free network device */ ++ kobject_put(&dev->dev.kobj); ++ } ++} ++ ++/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has ++ * all the same fields in the same order as net_device_stats, with only ++ * the type differing, but rtnl_link_stats64 may have additional fields ++ * at the end for newer counters. ++ */ ++void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, ++ const struct net_device_stats *netdev_stats) ++{ ++#if BITS_PER_LONG == 64 ++ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); ++ memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + sizeof(*netdev_stats), 0, ++ sizeof(*stats64) - sizeof(*netdev_stats)); ++#else ++ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); ++ const unsigned long *src = (const unsigned long *)netdev_stats; ++ u64 *dst = (u64 *)stats64; ++ ++ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); ++ for (i = 0; i < n; i++) ++ dst[i] = src[i]; ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + n * sizeof(u64), 0, ++ sizeof(*stats64) - n * sizeof(u64)); ++#endif ++} ++EXPORT_SYMBOL(netdev_stats_to_stats64); ++ ++struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) ++{ ++ struct net_device_core_stats __percpu *p; ++ ++ p = alloc_percpu_gfp(struct net_device_core_stats, ++ GFP_ATOMIC | __GFP_NOWARN); ++ ++ if (p && cmpxchg(&dev->core_stats, NULL, p)) ++ free_percpu(p); ++ ++ /* This READ_ONCE() pairs with the cmpxchg() above */ ++ return READ_ONCE(dev->core_stats); ++} ++EXPORT_SYMBOL(netdev_core_stats_alloc); ++ ++/** ++ * dev_get_stats - get network device statistics ++ * @dev: device to get statistics from ++ * @storage: place to store stats ++ * ++ * Get network statistics from device. Return @storage. 
++ * The device driver may provide its own method by setting ++ * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; ++ * otherwise the internal statistics structure is used. ++ */ ++struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ const struct net_device_core_stats __percpu *p; ++ ++ if (ops->ndo_get_stats64) { ++ memset(storage, 0, sizeof(*storage)); ++ ops->ndo_get_stats64(dev, storage); ++ } else if (ops->ndo_get_stats) { ++ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); ++ } else { ++ netdev_stats_to_stats64(storage, &dev->stats); ++ } ++ ++ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ ++ p = READ_ONCE(dev->core_stats); ++ if (p) { ++ const struct net_device_core_stats *core_stats; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ core_stats = per_cpu_ptr(p, i); ++ storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); ++ storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); ++ storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); ++ storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); ++ } ++ } ++ return storage; ++} ++EXPORT_SYMBOL(dev_get_stats); ++ ++/** ++ * dev_fetch_sw_netstats - get per-cpu network device statistics ++ * @s: place to store stats ++ * @netstats: per-cpu network stats to read from ++ * ++ * Read per-cpu network statistics and populate the related fields in @s. ++ */ ++void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, ++ const struct pcpu_sw_netstats __percpu *netstats) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ u64 rx_packets, rx_bytes, tx_packets, tx_bytes; ++ const struct pcpu_sw_netstats *stats; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(netstats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ rx_packets = u64_stats_read(&stats->rx_packets); ++ rx_bytes = u64_stats_read(&stats->rx_bytes); ++ tx_packets = u64_stats_read(&stats->tx_packets); ++ tx_bytes = u64_stats_read(&stats->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ s->rx_packets += rx_packets; ++ s->rx_bytes += rx_bytes; ++ s->tx_packets += tx_packets; ++ s->tx_bytes += tx_bytes; ++ } ++} ++EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); ++ ++/** ++ * dev_get_tstats64 - ndo_get_stats64 implementation ++ * @dev: device to get statistics from ++ * @s: place to store stats ++ * ++ * Populate @s from dev->stats and dev->tstats. Can be used as ++ * ndo_get_stats64() callback. 
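A hedged sketch of how a driver wires up the helper documented above: keep counters in per-cpu dev->tstats and point ndo_get_stats64 at dev_get_tstats64(). The ops structure and init function are illustrative; the matching free_percpu() on teardown is omitted.

    #include <linux/netdevice.h>

    static const struct net_device_ops example_netdev_ops = {
            .ndo_get_stats64 = dev_get_tstats64,
            /* ... remaining ndo_* callbacks elided ... */
    };

    /* Illustrative only: allocate the per-cpu counters dev_get_tstats64() reads. */
    static int example_init_stats(struct net_device *dev)
    {
            dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
            return dev->tstats ? 0 : -ENOMEM;
    }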
++ */ ++void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) ++{ ++ netdev_stats_to_stats64(s, &dev->stats); ++ dev_fetch_sw_netstats(s, dev->tstats); ++} ++EXPORT_SYMBOL_GPL(dev_get_tstats64); ++ ++struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) ++{ ++ struct netdev_queue *queue = dev_ingress_queue(dev); ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (queue) ++ return queue; ++ queue = kzalloc(sizeof(*queue), GFP_KERNEL); ++ if (!queue) ++ return NULL; ++ netdev_init_one_queue(dev, queue, NULL); ++ RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); ++ queue->qdisc_sleeping = &noop_qdisc; ++ rcu_assign_pointer(dev->ingress_queue, queue); ++#endif ++ return queue; ++} ++ ++static const struct ethtool_ops default_ethtool_ops; ++ ++void netdev_set_default_ethtool_ops(struct net_device *dev, ++ const struct ethtool_ops *ops) ++{ ++ if (dev->ethtool_ops == &default_ethtool_ops) ++ dev->ethtool_ops = ops; ++} ++EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); ++ ++void netdev_freemem(struct net_device *dev) ++{ ++ char *addr = (char *)dev - dev->padded; ++ ++ kvfree(addr); ++} ++ ++/** ++ * alloc_netdev_mqs - allocate network device ++ * @sizeof_priv: size of private data to allocate space for ++ * @name: device name format string ++ * @name_assign_type: origin of device name ++ * @setup: callback to initialize device ++ * @txqs: the number of TX subqueues to allocate ++ * @rxqs: the number of RX subqueues to allocate ++ * ++ * Allocates a struct net_device with private data area for driver use ++ * and performs basic initialization. Also allocates subqueue structs ++ * for each queue on the device. ++ */ ++struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, ++ unsigned char name_assign_type, ++ void (*setup)(struct net_device *), ++ unsigned int txqs, unsigned int rxqs) ++{ ++ struct net_device *dev; ++ unsigned int alloc_size; ++ struct net_device *p; ++ ++ BUG_ON(strlen(name) >= sizeof(dev->name)); ++ ++ if (txqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); ++ return NULL; ++ } ++ ++ if (rxqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); ++ return NULL; ++ } ++ ++ alloc_size = sizeof(struct net_device); ++ if (sizeof_priv) { ++ /* ensure 32-byte alignment of private area */ ++ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); ++ alloc_size += sizeof_priv; ++ } ++ /* ensure 32-byte alignment of whole construct */ ++ alloc_size += NETDEV_ALIGN - 1; ++ ++ p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!p) ++ return NULL; ++ ++ dev = PTR_ALIGN(p, NETDEV_ALIGN); ++ dev->padded = (char *)dev - (char *)p; ++ ++ ref_tracker_dir_init(&dev->refcnt_tracker, 128); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ dev->pcpu_refcnt = alloc_percpu(int); ++ if (!dev->pcpu_refcnt) ++ goto free_dev; ++ __dev_hold(dev); ++#else ++ refcount_set(&dev->dev_refcnt, 1); ++#endif ++ ++ if (dev_addr_init(dev)) ++ goto free_pcpu; ++ ++ dev_mc_init(dev); ++ dev_uc_init(dev); ++ ++ dev_net_set(dev, &init_net); ++ ++ dev->gso_max_size = GSO_LEGACY_MAX_SIZE; ++ dev->gso_max_segs = GSO_MAX_SEGS; ++ dev->gro_max_size = GRO_LEGACY_MAX_SIZE; ++ dev->tso_max_size = TSO_LEGACY_MAX_SIZE; ++ dev->tso_max_segs = TSO_MAX_SEGS; ++ dev->upper_level = 1; ++ dev->lower_level = 1; ++#ifdef CONFIG_LOCKDEP ++ dev->nested_level = 0; ++ INIT_LIST_HEAD(&dev->unlink_list); ++#endif ++ ++ INIT_LIST_HEAD(&dev->napi_list); ++ INIT_LIST_HEAD(&dev->unreg_list); ++ INIT_LIST_HEAD(&dev->close_list); ++ 
INIT_LIST_HEAD(&dev->link_watch_list); ++ INIT_LIST_HEAD(&dev->adj_list.upper); ++ INIT_LIST_HEAD(&dev->adj_list.lower); ++ INIT_LIST_HEAD(&dev->ptype_all); ++ INIT_LIST_HEAD(&dev->ptype_specific); ++ INIT_LIST_HEAD(&dev->net_notifier_list); ++#ifdef CONFIG_NET_SCHED ++ hash_init(dev->qdisc_hash); ++#endif ++ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; ++ setup(dev); ++ ++ if (!dev->tx_queue_len) { ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; ++ } ++ ++ dev->num_tx_queues = txqs; ++ dev->real_num_tx_queues = txqs; ++ if (netif_alloc_netdev_queues(dev)) ++ goto free_all; ++ ++ dev->num_rx_queues = rxqs; ++ dev->real_num_rx_queues = rxqs; ++ if (netif_alloc_rx_queues(dev)) ++ goto free_all; ++ ++ strcpy(dev->name, name); ++ dev->name_assign_type = name_assign_type; ++ dev->group = INIT_NETDEV_GROUP; ++ if (!dev->ethtool_ops) ++ dev->ethtool_ops = &default_ethtool_ops; ++ ++ nf_hook_netdev_init(dev); ++ ++ return dev; ++ ++free_all: ++ free_netdev(dev); ++ return NULL; ++ ++free_pcpu: ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++free_dev: ++#endif ++ netdev_freemem(dev); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_netdev_mqs); ++ ++/** ++ * free_netdev - free network device ++ * @dev: device ++ * ++ * This function does the last stage of destroying an allocated device ++ * interface. The reference to the device object is released. If this ++ * is the last reference then it will be freed.Must be called in process ++ * context. ++ */ ++void free_netdev(struct net_device *dev) ++{ ++ struct napi_struct *p, *n; ++ ++ might_sleep(); ++ ++ /* When called immediately after register_netdevice() failed the unwind ++ * handling may still be dismantling the device. Handle that case by ++ * deferring the free. ++ */ ++ if (dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ dev->needs_free_netdev = true; ++ return; ++ } ++ ++ netif_free_tx_queues(dev); ++ netif_free_rx_queues(dev); ++ ++ kfree(rcu_dereference_protected(dev->ingress_queue, 1)); ++ ++ /* Flush device addresses */ ++ dev_addr_flush(dev); ++ ++ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) ++ netif_napi_del(p); ++ ++ ref_tracker_dir_exit(&dev->refcnt_tracker); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++ dev->pcpu_refcnt = NULL; ++#endif ++ free_percpu(dev->core_stats); ++ dev->core_stats = NULL; ++ free_percpu(dev->xdp_bulkq); ++ dev->xdp_bulkq = NULL; ++ ++ /* Compatibility with error handling in drivers */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ netdev_freemem(dev); ++ return; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_UNREGISTERED); ++ dev->reg_state = NETREG_RELEASED; ++ ++ /* will free via device release */ ++ put_device(&dev->dev); ++} ++EXPORT_SYMBOL(free_netdev); ++ ++/** ++ * synchronize_net - Synchronize with packet receive processing ++ * ++ * Wait for packets currently being received to be done. ++ * Does not block later packets from starting. ++ */ ++void synchronize_net(void) ++{ ++ might_sleep(); ++ if (rtnl_is_locked()) ++ synchronize_rcu_expedited(); ++ else ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(synchronize_net); ++ ++/** ++ * unregister_netdevice_queue - remove device from the kernel ++ * @dev: device ++ * @head: list ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * If head not NULL, device is queued to be unregistered later. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * unregister_netdev() instead of this. 
++ */ ++ ++void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) ++{ ++ ASSERT_RTNL(); ++ ++ if (head) { ++ list_move_tail(&dev->unreg_list, head); ++ } else { ++ LIST_HEAD(single); ++ ++ list_add(&dev->unreg_list, &single); ++ unregister_netdevice_many(&single); ++ } ++} ++EXPORT_SYMBOL(unregister_netdevice_queue); ++ ++/** ++ * unregister_netdevice_many - unregister many devices ++ * @head: list of devices ++ * ++ * Note: As most callers use a stack allocated list_head, ++ * we force a list_del() to make sure stack wont be corrupted later. ++ */ ++void unregister_netdevice_many(struct list_head *head) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(close_head); ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ if (list_empty(head)) ++ return; ++ ++ list_for_each_entry_safe(dev, tmp, head, unreg_list) { ++ /* Some devices call without registering ++ * for initialization unwind. Remove those ++ * devices and proceed with the remaining. ++ */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ pr_debug("unregister_netdevice: device %s/%p never was registered\n", ++ dev->name, dev); ++ ++ WARN_ON(1); ++ list_del(&dev->unreg_list); ++ continue; ++ } ++ dev->dismantle = true; ++ BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ } ++ ++ /* If device is running, close it first. */ ++ list_for_each_entry(dev, head, unreg_list) ++ list_add_tail(&dev->close_list, &close_head); ++ dev_close_many(&close_head, true); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ /* And unlink it from device chain. */ ++ write_lock(&dev_base_lock); ++ unlist_netdevice(dev, false); ++ dev->reg_state = NETREG_UNREGISTERING; ++ write_unlock(&dev_base_lock); ++ } ++ flush_all_backlogs(); ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ struct sk_buff *skb = NULL; ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ dev_xdp_uninstall(dev); ++ ++ netdev_offload_xstats_disable_all(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, ++ GFP_KERNEL, NULL, 0); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ netdev_name_node_alt_flush(dev); ++ netdev_name_node_free(dev->name_node); ++ ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ ++ if (skb) ++ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); ++ ++ /* Notifier chain MUST detach us all upper devices. */ ++ WARN_ON(netdev_has_any_upper_dev(dev)); ++ WARN_ON(netdev_has_any_lower_dev(dev)); ++ ++ /* Remove entries from kobject tree */ ++ netdev_unregister_kobject(dev); ++#ifdef CONFIG_XPS ++ /* Remove XPS queueing entries */ ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ } ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ netdev_put(dev, &dev->dev_registered_tracker); ++ net_set_todo(dev); ++ } ++ ++ list_del(head); ++} ++EXPORT_SYMBOL(unregister_netdevice_many); ++ ++/** ++ * unregister_netdev - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * ++ * This is just a wrapper for unregister_netdevice that takes ++ * the rtnl semaphore. In general you want to use this and not ++ * unregister_netdevice. 
++ */ ++void unregister_netdev(struct net_device *dev) ++{ ++ rtnl_lock(); ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(unregister_netdev); ++ ++/** ++ * __dev_change_net_namespace - move device to different nethost namespace ++ * @dev: device ++ * @net: network namespace ++ * @pat: If not NULL name pattern to try if the current device name ++ * is already taken in the destination network namespace. ++ * @new_ifindex: If not zero, specifies device index in the target ++ * namespace. ++ * ++ * This function shuts down a device interface and moves it ++ * to a new network namespace. On success 0 is returned, on ++ * a failure a netagive errno code is returned. ++ * ++ * Callers must hold the rtnl semaphore. ++ */ ++ ++int __dev_change_net_namespace(struct net_device *dev, struct net *net, ++ const char *pat, int new_ifindex) ++{ ++ struct net *net_old = dev_net(dev); ++ int err, new_nsid; ++ ++ ASSERT_RTNL(); ++ ++ /* Don't allow namespace local devices to be moved. */ ++ err = -EINVAL; ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ goto out; ++ ++ /* Ensure the device has been registrered */ ++ if (dev->reg_state != NETREG_REGISTERED) ++ goto out; ++ ++ /* Get out if there is nothing todo */ ++ err = 0; ++ if (net_eq(net_old, net)) ++ goto out; ++ ++ /* Pick the destination device name, and ensure ++ * we can use it in the destination network namespace. ++ */ ++ err = -EEXIST; ++ if (netdev_name_in_use(net, dev->name)) { ++ /* We get here if we can't use the current device name */ ++ if (!pat) ++ goto out; ++ err = dev_get_valid_name(net, dev, pat); ++ if (err < 0) ++ goto out; ++ } ++ ++ /* Check that new_ifindex isn't used yet. */ ++ err = -EBUSY; ++ if (new_ifindex && __dev_get_by_index(net, new_ifindex)) ++ goto out; ++ ++ /* ++ * And now a mini version of register_netdevice unregister_netdevice. ++ */ ++ ++ /* If device is running close it first. */ ++ dev_close(dev); ++ ++ /* And unlink it from device chain */ ++ unlist_netdevice(dev, true); ++ ++ synchronize_net(); ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ * ++ * Note that dev->reg_state stays at NETREG_REGISTERED. ++ * This is wanted because this way 8021q and macvlan know ++ * the device is just moving and can keep their slaves up. 
++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ rcu_barrier(); ++ ++ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); ++ /* If there is an ifindex conflict assign a new one */ ++ if (!new_ifindex) { ++ if (__dev_get_by_index(net, dev->ifindex)) ++ new_ifindex = dev_new_index(net); ++ else ++ new_ifindex = dev->ifindex; ++ } ++ ++ rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, ++ new_ifindex); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ /* Send a netdev-removed uevent to the old namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); ++ netdev_adjacent_del_links(dev); ++ ++ /* Move per-net netdevice notifiers that are following the netdevice */ ++ move_netdevice_notifiers_dev_net(dev, net); ++ ++ /* Actually switch the network namespace */ ++ dev_net_set(dev, net); ++ dev->ifindex = new_ifindex; ++ ++ /* Send a netdev-add uevent to the new namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); ++ netdev_adjacent_add_links(dev); ++ ++ /* Fixup kobjects */ ++ err = device_rename(&dev->dev, dev->name); ++ WARN_ON(err); ++ ++ /* Adapt owner in case owning user namespace of target network ++ * namespace is different from the original one. ++ */ ++ err = netdev_change_owner(dev, net_old, net); ++ WARN_ON(err); ++ ++ /* Add the device back in the hashes */ ++ list_netdevice(dev); ++ ++ /* Notify protocols, that a new device appeared. */ ++ call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++ synchronize_net(); ++ err = 0; ++out: ++ return err; ++} ++EXPORT_SYMBOL_GPL(__dev_change_net_namespace); ++ ++static int dev_cpu_dead(unsigned int oldcpu) ++{ ++ struct sk_buff **list_skb; ++ struct sk_buff *skb; ++ unsigned int cpu; ++ struct softnet_data *sd, *oldsd, *remsd = NULL; ++ ++ local_irq_disable(); ++ cpu = smp_processor_id(); ++ sd = &per_cpu(softnet_data, cpu); ++ oldsd = &per_cpu(softnet_data, oldcpu); ++ ++ /* Find end of our completion_queue. */ ++ list_skb = &sd->completion_queue; ++ while (*list_skb) ++ list_skb = &(*list_skb)->next; ++ /* Append completion queue from offline CPU. */ ++ *list_skb = oldsd->completion_queue; ++ oldsd->completion_queue = NULL; ++ ++ /* Append output queue from offline CPU. */ ++ if (oldsd->output_queue) { ++ *sd->output_queue_tailp = oldsd->output_queue; ++ sd->output_queue_tailp = oldsd->output_queue_tailp; ++ oldsd->output_queue = NULL; ++ oldsd->output_queue_tailp = &oldsd->output_queue; ++ } ++ /* Append NAPI poll list from offline CPU, with one exception : ++ * process_backlog() must be called by cpu owning percpu backlog. ++ * We properly handle process_queue & input_pkt_queue later. 
++ */ ++ while (!list_empty(&oldsd->poll_list)) { ++ struct napi_struct *napi = list_first_entry(&oldsd->poll_list, ++ struct napi_struct, ++ poll_list); ++ ++ list_del_init(&napi->poll_list); ++ if (napi->poll == process_backlog) ++ napi->state = 0; ++ else ++ ____napi_schedule(sd, napi); ++ } ++ ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_enable(); ++ ++#ifdef CONFIG_RPS ++ remsd = oldsd->rps_ipi_list; ++ oldsd->rps_ipi_list = NULL; ++#endif ++ /* send out pending IPI's on offline CPU */ ++ net_rps_send_ipi(remsd); ++ ++ /* Process offline CPU's input_pkt_queue */ ++ while ((skb = __skb_dequeue(&oldsd->process_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ ++ return 0; ++} ++ ++/** ++ * netdev_increment_features - increment feature set by one ++ * @all: current feature set ++ * @one: new feature set ++ * @mask: mask feature set ++ * ++ * Computes a new feature set after adding a device with feature set ++ * @one to the master device with current feature set @all. Will not ++ * enable anything that is off in @mask. Returns the new feature set. ++ */ ++netdev_features_t netdev_increment_features(netdev_features_t all, ++ netdev_features_t one, netdev_features_t mask) ++{ ++ if (mask & NETIF_F_HW_CSUM) ++ mask |= NETIF_F_CSUM_MASK; ++ mask |= NETIF_F_VLAN_CHALLENGED; ++ ++ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; ++ all &= one | ~NETIF_F_ALL_FOR_ALL; ++ ++ /* If one device supports hw checksumming, set for all. */ ++ if (all & NETIF_F_HW_CSUM) ++ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); ++ ++ return all; ++} ++EXPORT_SYMBOL(netdev_increment_features); ++ ++static struct hlist_head * __net_init netdev_create_hash(void) ++{ ++ int i; ++ struct hlist_head *hash; ++ ++ hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); ++ if (hash != NULL) ++ for (i = 0; i < NETDEV_HASHENTRIES; i++) ++ INIT_HLIST_HEAD(&hash[i]); ++ ++ return hash; ++} ++ ++/* Initialize per network namespace state */ ++static int __net_init netdev_init(struct net *net) ++{ ++ BUILD_BUG_ON(GRO_HASH_BUCKETS > ++ 8 * sizeof_field(struct napi_struct, gro_bitmask)); ++ ++ INIT_LIST_HEAD(&net->dev_base_head); ++ ++ net->dev_name_head = netdev_create_hash(); ++ if (net->dev_name_head == NULL) ++ goto err_name; ++ ++ net->dev_index_head = netdev_create_hash(); ++ if (net->dev_index_head == NULL) ++ goto err_idx; ++ ++ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); ++ ++ return 0; ++ ++err_idx: ++ kfree(net->dev_name_head); ++err_name: ++ return -ENOMEM; ++} ++ ++/** ++ * netdev_drivername - network driver for the device ++ * @dev: network device ++ * ++ * Determine network driver for device. 
++ */ ++const char *netdev_drivername(const struct net_device *dev) ++{ ++ const struct device_driver *driver; ++ const struct device *parent; ++ const char *empty = ""; ++ ++ parent = dev->dev.parent; ++ if (!parent) ++ return empty; ++ ++ driver = parent->driver; ++ if (driver && driver->name) ++ return driver->name; ++ return empty; ++} ++ ++static void __netdev_printk(const char *level, const struct net_device *dev, ++ struct va_format *vaf) ++{ ++ if (dev && dev->dev.parent) { ++ dev_printk_emit(level[1] - '0', ++ dev->dev.parent, ++ "%s %s %s%s: %pV", ++ dev_driver_string(dev->dev.parent), ++ dev_name(dev->dev.parent), ++ netdev_name(dev), netdev_reg_state(dev), ++ vaf); ++ } else if (dev) { ++ printk("%s%s%s: %pV", ++ level, netdev_name(dev), netdev_reg_state(dev), vaf); ++ } else { ++ printk("%s(NULL net_device): %pV", level, vaf); ++ } ++} ++ ++void netdev_printk(const char *level, const struct net_device *dev, ++ const char *format, ...) ++{ ++ struct va_format vaf; ++ va_list args; ++ ++ va_start(args, format); ++ ++ vaf.fmt = format; ++ vaf.va = &args; ++ ++ __netdev_printk(level, dev, &vaf); ++ ++ va_end(args); ++} ++EXPORT_SYMBOL(netdev_printk); ++ ++#define define_netdev_printk_level(func, level) \ ++void func(const struct net_device *dev, const char *fmt, ...) \ ++{ \ ++ struct va_format vaf; \ ++ va_list args; \ ++ \ ++ va_start(args, fmt); \ ++ \ ++ vaf.fmt = fmt; \ ++ vaf.va = &args; \ ++ \ ++ __netdev_printk(level, dev, &vaf); \ ++ \ ++ va_end(args); \ ++} \ ++EXPORT_SYMBOL(func); ++ ++define_netdev_printk_level(netdev_emerg, KERN_EMERG); ++define_netdev_printk_level(netdev_alert, KERN_ALERT); ++define_netdev_printk_level(netdev_crit, KERN_CRIT); ++define_netdev_printk_level(netdev_err, KERN_ERR); ++define_netdev_printk_level(netdev_warn, KERN_WARNING); ++define_netdev_printk_level(netdev_notice, KERN_NOTICE); ++define_netdev_printk_level(netdev_info, KERN_INFO); ++ ++static void __net_exit netdev_exit(struct net *net) ++{ ++ kfree(net->dev_name_head); ++ kfree(net->dev_index_head); ++ if (net != &init_net) ++ WARN_ON_ONCE(!list_empty(&net->dev_base_head)); ++} ++ ++static struct pernet_operations __net_initdata netdev_net_ops = { ++ .init = netdev_init, ++ .exit = netdev_exit, ++}; ++ ++static void __net_exit default_device_exit_net(struct net *net) ++{ ++ struct net_device *dev, *aux; ++ /* ++ * Push all migratable network devices back to the ++ * initial network namespace ++ */ ++ ASSERT_RTNL(); ++ for_each_netdev_safe(net, dev, aux) { ++ int err; ++ char fb_name[IFNAMSIZ]; ++ ++ /* Ignore unmoveable devices (i.e. loopback) */ ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ continue; ++ ++ /* Leave virtual devices for the generic cleanup */ ++ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) ++ continue; ++ ++ /* Push remaining network devices to init_net */ ++ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); ++ if (netdev_name_in_use(&init_net, fb_name)) ++ snprintf(fb_name, IFNAMSIZ, "dev%%d"); ++ err = dev_change_net_namespace(dev, &init_net, fb_name); ++ if (err) { ++ pr_emerg("%s: failed to move %s to init_net: %d\n", ++ __func__, dev->name, err); ++ BUG(); ++ } ++ } ++} ++ ++static void __net_exit default_device_exit_batch(struct list_head *net_list) ++{ ++ /* At exit all network devices most be removed from a network ++ * namespace. Do this in the reverse order of registration. ++ * Do this across as many network namespaces as possible to ++ * improve batching efficiency. 
++ */ ++ struct net_device *dev; ++ struct net *net; ++ LIST_HEAD(dev_kill_list); ++ ++ rtnl_lock(); ++ list_for_each_entry(net, net_list, exit_list) { ++ default_device_exit_net(net); ++ cond_resched(); ++ } ++ ++ list_for_each_entry(net, net_list, exit_list) { ++ for_each_netdev_reverse(net, dev) { ++ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) ++ dev->rtnl_link_ops->dellink(dev, &dev_kill_list); ++ else ++ unregister_netdevice_queue(dev, &dev_kill_list); ++ } ++ } ++ unregister_netdevice_many(&dev_kill_list); ++ rtnl_unlock(); ++} ++ ++static struct pernet_operations __net_initdata default_device_ops = { ++ .exit_batch = default_device_exit_batch, ++}; ++ ++/* ++ * Initialize the DEV module. At boot time this walks the device list and ++ * unhooks any devices that fail to initialise (normally hardware not ++ * present) and leaves us with a valid list of present and active devices. ++ * ++ */ ++ ++/* ++ * This is called single threaded during boot, so no need ++ * to take the rtnl semaphore. ++ */ ++static int __init net_dev_init(void) ++{ ++ int i, rc = -ENOMEM; ++ ++ BUG_ON(!dev_boot_phase); ++ ++ if (dev_proc_init()) ++ goto out; ++ ++ if (netdev_kobject_init()) ++ goto out; ++ ++ INIT_LIST_HEAD(&ptype_all); ++ for (i = 0; i < PTYPE_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ptype_base[i]); ++ ++ if (register_pernet_subsys(&netdev_net_ops)) ++ goto out; ++ ++ /* ++ * Initialise the packet receive queues. ++ */ ++ ++ for_each_possible_cpu(i) { ++ struct work_struct *flush = per_cpu_ptr(&flush_works, i); ++ struct softnet_data *sd = &per_cpu(softnet_data, i); ++ ++ INIT_WORK(flush, flush_backlog); ++ ++ skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init(&sd->process_queue); ++#ifdef CONFIG_XFRM_OFFLOAD ++ skb_queue_head_init(&sd->xfrm_backlog); ++#endif ++ INIT_LIST_HEAD(&sd->poll_list); ++ sd->output_queue_tailp = &sd->output_queue; ++#ifdef CONFIG_RPS ++ INIT_CSD(&sd->csd, rps_trigger_softirq, sd); ++ sd->cpu = i; ++#endif ++ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); ++ spin_lock_init(&sd->defer_lock); ++ ++ init_gro_hash(&sd->backlog); ++ sd->backlog.poll = process_backlog; ++ sd->backlog.weight = weight_p; ++ } ++ ++ dev_boot_phase = 0; ++ ++ /* The loopback device is special if any other network devices ++ * is present in a network namespace the loopback device must ++ * be present. Since we now dynamically allocate and free the ++ * loopback device ensure this invariant is maintained by ++ * keeping the loopback device as the first device on the ++ * list of network devices. Ensuring the loopback devices ++ * is the first device that appears and the last network device ++ * that disappears. 
++ */ ++ if (register_pernet_device(&loopback_net_ops)) ++ goto out; ++ ++ if (register_pernet_device(&default_device_ops)) ++ goto out; ++ ++ open_softirq(NET_TX_SOFTIRQ, net_tx_action); ++ open_softirq(NET_RX_SOFTIRQ, net_rx_action); ++ ++ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", ++ NULL, dev_cpu_dead); ++ WARN_ON(rc < 0); ++ rc = 0; ++out: ++ return rc; ++} ++ ++subsys_initcall(net_dev_init); +diff -rupN linux.orig/net/core/devlink.c linux/net/core/devlink.c +--- linux.orig/net/core/devlink.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/devlink.c 2022-12-04 10:40:26.732034003 -0500 +@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(stru cpu_stats = per_cpu_ptr(trap_stats, i); do { @@ -8713,11 +50605,10 @@ index b50bcc18b8d9e..cfa6a099457ae 100644 u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); -diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c -index 75501e1bdd25b..dfcaf61d972c7 100644 ---- a/net/core/drop_monitor.c -+++ b/net/core/drop_monitor.c -@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats) +diff -rupN linux.orig/net/core/drop_monitor.c linux/net/core/drop_monitor.c +--- linux.orig/net/core/drop_monitor.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/drop_monitor.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net u64 dropped; do { @@ -8729,7 +50620,7 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats) +@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct u64 dropped; do { @@ -8741,11 +50632,10 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c -index c8d137ef5980e..b71ccaec09914 100644 ---- a/net/core/gen_stats.c -+++ b/net/core/gen_stats.c -@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, +diff -rupN linux.orig/net/core/gen_stats.c linux/net/core/gen_stats.c +--- linux.orig/net/core/gen_stats.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/gen_stats.c 2022-12-04 10:40:26.732034003 -0500 +@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(str u64 bytes, packets; do { @@ -8758,7 +50648,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, +@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_st } do { if (running) @@ -8771,7 +50661,7 @@ index c8d137ef5980e..b71ccaec09914 100644 _bstats_update(bstats, bytes, packets); } -@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *r u64 bytes, packets; do { @@ -8784,7 +50674,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *r } do { if (running) @@ -8797,11 +50687,10 @@ index c8d137ef5980e..b71ccaec09914 100644 } static int -diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 417463da4fac7..505c72a9b1534 100644 ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -6555,6 +6555,11 @@ nodefer: __kfree_skb(skb); +diff -rupN linux.orig/net/core/skbuff.c 
linux/net/core/skbuff.c +--- linux.orig/net/core/skbuff.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/skbuff.c 2022-12-04 10:40:26.732034003 -0500 +@@ -6557,6 +6557,11 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ @@ -8814,11 +50703,6576 @@ index 417463da4fac7..505c72a9b1534 100644 +#endif + } } -diff --git a/net/dsa/slave.c b/net/dsa/slave.c -index 1291c2431d440..dcc550b871623 100644 ---- a/net/dsa/slave.c -+++ b/net/dsa/slave.c -@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/net/core/skbuff.c.orig linux/net/core/skbuff.c.orig +--- linux.orig/net/core/skbuff.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/skbuff.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,6562 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Routines having to do with the 'struct sk_buff' memory handlers. ++ * ++ * Authors: Alan Cox ++ * Florian La Roche ++ * ++ * Fixes: ++ * Alan Cox : Fixed the worst of the load ++ * balancer bugs. ++ * Dave Platt : Interrupt stacking fix. ++ * Richard Kooijman : Timestamp fixes. ++ * Alan Cox : Changed buffer format. ++ * Alan Cox : destructor hook for AF_UNIX etc. ++ * Linus Torvalds : Better skb_clone. ++ * Alan Cox : Added skb_copy. ++ * Alan Cox : Added all the changed routines Linus ++ * only put in the headers ++ * Ray VanTassle : Fixed --skb->lock in free ++ * Alan Cox : skb_copy copy arp field ++ * Andi Kleen : slabified it. ++ * Robert Olsson : Removed skb_head_pool ++ * ++ * NOTE: ++ * The __skb_ routines should be called with interrupts ++ * disabled, or you better be *real* sure that the operation is atomic ++ * with respect to whatever list is being frobbed (e.g. via lock_sock() ++ * or via disabling bottom half handlers, etc). ++ */ ++ ++/* ++ * The functions in this file will not compile correctly with gcc 2.4.x ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_NET_CLS_ACT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "sock_destructor.h" ++ ++struct kmem_cache *skbuff_head_cache __ro_after_init; ++static struct kmem_cache *skbuff_fclone_cache __ro_after_init; ++#ifdef CONFIG_SKB_EXTENSIONS ++static struct kmem_cache *skbuff_ext_cache __ro_after_init; ++#endif ++int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; ++EXPORT_SYMBOL(sysctl_max_skb_frags); ++ ++#undef FN ++#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, ++const char * const drop_reasons[] = { ++ DEFINE_DROP_REASON(FN, FN) ++}; ++EXPORT_SYMBOL(drop_reasons); ++ ++/** ++ * skb_panic - private function for out-of-line support ++ * @skb: buffer ++ * @sz: size ++ * @addr: address ++ * @msg: skb_over_panic or skb_under_panic ++ * ++ * Out-of-line support for skb_put() and skb_push(). ++ * Called via the wrapper skb_over_panic() or skb_under_panic(). ++ * Keep out of line to prevent kernel bloat. ++ * __builtin_return_address is not used because it is not always reliable. 
++ */ ++static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, ++ const char msg[]) ++{ ++ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", ++ msg, addr, skb->len, sz, skb->head, skb->data, ++ (unsigned long)skb->tail, (unsigned long)skb->end, ++ skb->dev ? skb->dev->name : ""); ++ BUG(); ++} ++ ++static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++#define NAPI_SKB_CACHE_SIZE 64 ++#define NAPI_SKB_CACHE_BULK 16 ++#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) ++ ++struct napi_alloc_cache { ++ struct page_frag_cache page; ++ unsigned int skb_count; ++ void *skb_cache[NAPI_SKB_CACHE_SIZE]; ++}; ++ ++static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); ++static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); ++ ++void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ ++ return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++} ++EXPORT_SYMBOL(__napi_alloc_frag_align); ++ ++void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ void *data; ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ if (in_hardirq() || irqs_disabled()) { ++ struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); ++ ++ data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); ++ } else { ++ struct napi_alloc_cache *nc; ++ ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++ local_bh_enable(); ++ } ++ return data; ++} ++EXPORT_SYMBOL(__netdev_alloc_frag_align); ++ ++static struct sk_buff *napi_skb_cache_get(void) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ struct sk_buff *skb; ++ ++ if (unlikely(!nc->skb_count)) { ++ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, ++ GFP_ATOMIC, ++ NAPI_SKB_CACHE_BULK, ++ nc->skb_cache); ++ if (unlikely(!nc->skb_count)) ++ return NULL; ++ } ++ ++ skb = nc->skb_cache[--nc->skb_count]; ++ kasan_unpoison_object_data(skbuff_head_cache, skb); ++ ++ return skb; ++} ++ ++/* Caller must provide SKB that is memset cleared */ ++static void __build_skb_around(struct sk_buff *skb, void *data, ++ unsigned int frag_size) ++{ ++ struct skb_shared_info *shinfo; ++ unsigned int size = frag_size ? : ksize(data); ++ ++ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ /* Assumes caller memset cleared SKB */ ++ skb->truesize = SKB_TRUESIZE(size); ++ refcount_set(&skb->users, 1); ++ skb->head = data; ++ skb->data = data; ++ skb_reset_tail_pointer(skb); ++ skb_set_end_offset(skb, size); ++ skb->mac_header = (typeof(skb->mac_header))~0U; ++ skb->transport_header = (typeof(skb->transport_header))~0U; ++ skb->alloc_cpu = raw_smp_processor_id(); ++ /* make sure we initialize shinfo sequentially */ ++ shinfo = skb_shinfo(skb); ++ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); ++ atomic_set(&shinfo->dataref, 1); ++ ++ skb_set_kcov_handle(skb, kcov_common_handle()); ++} ++ ++/** ++ * __build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Allocate a new &sk_buff. 
Caller provides space holding head and ++ * skb_shared_info. @data must have been allocated by kmalloc() only if ++ * @frag_size is 0, otherwise data should come from the page allocator ++ * or vmalloc() ++ * The return is the new skb buffer. ++ * On a failure the return is %NULL, and @data is not freed. ++ * Notes : ++ * Before IO, driver allocates only data buffer where NIC put incoming frame ++ * Driver should add room at head (NET_SKB_PAD) and ++ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) ++ * After IO, driver calls build_skb(), to allocate sk_buff and populate it ++ * before giving packet to stack. ++ * RX rings only contains data buffers, not full skbs. ++ */ ++struct sk_buff *__build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/* build_skb() is wrapper over __build_skb(), that specifically ++ * takes care of skb->head and skb->pfmemalloc ++ * This means that if @frag_size is not zero, then @data must be backed ++ * by a page fragment, not kmalloc() or vmalloc() ++ */ ++struct sk_buff *build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __build_skb(data, frag_size); ++ ++ if (skb && frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb); ++ ++/** ++ * build_skb_around - build a network buffer around provided skb ++ * @skb: sk_buff provide by caller, must be memset cleared ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ */ ++struct sk_buff *build_skb_around(struct sk_buff *skb, ++ void *data, unsigned int frag_size) ++{ ++ if (unlikely(!skb)) ++ return NULL; ++ ++ __build_skb_around(skb, data, frag_size); ++ ++ if (frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb_around); ++ ++/** ++ * __napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __build_skb() that uses NAPI percpu caches to obtain ++ * skbuff_head instead of inplace allocation. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. ++ */ ++static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = napi_skb_cache_get(); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/** ++ * napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __napi_build_skb() that takes care of skb->head_frag ++ * and skb->pfmemalloc when the data is a page or page fragment. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. 
++ */ ++struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __napi_build_skb(data, frag_size); ++ ++ if (likely(skb) && frag_size) { ++ skb->head_frag = 1; ++ skb_propagate_pfmemalloc(virt_to_head_page(data), skb); ++ } ++ ++ return skb; ++} ++EXPORT_SYMBOL(napi_build_skb); ++ ++/* ++ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells ++ * the caller if emergency pfmemalloc reserves are being used. If it is and ++ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves ++ * may be used. Otherwise, the packet data may be discarded until enough ++ * memory is free ++ */ ++static void *kmalloc_reserve(size_t size, gfp_t flags, int node, ++ bool *pfmemalloc) ++{ ++ void *obj; ++ bool ret_pfmemalloc = false; ++ ++ /* ++ * Try a regular allocation, when that fails and we're not entitled ++ * to the reserves, fail. ++ */ ++ obj = kmalloc_node_track_caller(size, ++ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, ++ node); ++ if (obj || !(gfp_pfmemalloc_allowed(flags))) ++ goto out; ++ ++ /* Try again but now we are using pfmemalloc reserves */ ++ ret_pfmemalloc = true; ++ obj = kmalloc_node_track_caller(size, flags, node); ++ ++out: ++ if (pfmemalloc) ++ *pfmemalloc = ret_pfmemalloc; ++ ++ return obj; ++} ++ ++/* Allocate a new skbuff. We do this ourselves so we can fill in a few ++ * 'private' fields and also do memory statistics to find all the ++ * [BEEP] leaks. ++ * ++ */ ++ ++/** ++ * __alloc_skb - allocate a network buffer ++ * @size: size to allocate ++ * @gfp_mask: allocation mask ++ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache ++ * instead of head cache and allocate a cloned (child) skb. ++ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for ++ * allocations in case the data is required for writeback ++ * @node: numa node to allocate memory on ++ * ++ * Allocate a new &sk_buff. The returned buffer has no headroom and a ++ * tail room of at least size bytes. The object has a reference count ++ * of one. The return is the buffer. On a failure the return is %NULL. ++ * ++ * Buffers may only be allocated from interrupts using a @gfp_mask of ++ * %GFP_ATOMIC. ++ */ ++struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, ++ int flags, int node) ++{ ++ struct kmem_cache *cache; ++ struct sk_buff *skb; ++ unsigned int osize; ++ bool pfmemalloc; ++ u8 *data; ++ ++ cache = (flags & SKB_ALLOC_FCLONE) ++ ? skbuff_fclone_cache : skbuff_head_cache; ++ ++ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ /* Get the HEAD */ ++ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && ++ likely(node == NUMA_NO_NODE || node == numa_mem_id())) ++ skb = napi_skb_cache_get(); ++ else ++ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); ++ if (unlikely(!skb)) ++ return NULL; ++ prefetchw(skb); ++ ++ /* We do our best to align skb_shared_info on a separate cache ++ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives ++ * aligned memory blocks, unless SLUB/SLAB debug is enabled. ++ * Both skb->head and skb_shared_info are cache line aligned. ++ */ ++ size = SKB_DATA_ALIGN(size); ++ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ++ if (unlikely(!data)) ++ goto nodata; ++ /* kmalloc(size) might give us more room than requested. ++ * Put skb_shared_info exactly at the end of allocated zone, ++ * to allow max possible filling before reallocation. 
++ */ ++ osize = ksize(data); ++ size = SKB_WITH_OVERHEAD(osize); ++ prefetchw(data + size); ++ ++ /* ++ * Only clear those fields we need to clear, not those that we will ++ * actually initialise below. Hence, don't put any more fields after ++ * the tail pointer in struct sk_buff! ++ */ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, osize); ++ skb->pfmemalloc = pfmemalloc; ++ ++ if (flags & SKB_ALLOC_FCLONE) { ++ struct sk_buff_fclones *fclones; ++ ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ skb->fclone = SKB_FCLONE_ORIG; ++ refcount_set(&fclones->fclone_ref, 1); ++ } ++ ++ return skb; ++ ++nodata: ++ kmem_cache_free(cache, skb); ++ return NULL; ++} ++EXPORT_SYMBOL(__alloc_skb); ++ ++/** ++ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device ++ * @dev: network device to receive on ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb ++ * ++ * Allocate a new &sk_buff and assign it a usage count of one. The ++ * buffer has NET_SKB_PAD headroom built in. Users should allocate ++ * the headroom they think they need without accounting for the ++ * built in space. The built in space is used for optimisations. ++ * ++ * %NULL is returned if there is no free memory. ++ */ ++struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct page_frag_cache *nc; ++ struct sk_buff *skb; ++ bool pfmemalloc; ++ void *data; ++ ++ len += NET_SKB_PAD; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ if (in_hardirq() || irqs_disabled()) { ++ nc = this_cpu_ptr(&netdev_alloc_cache); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ } else { ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache.page); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ local_bh_enable(); ++ } ++ ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD); ++ skb->dev = dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__netdev_alloc_skb); ++ ++/** ++ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance ++ * @napi: napi instance this buffer was allocated for ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages ++ * ++ * Allocate a new sk_buff for use in NAPI receive. This buffer will ++ * attempt to allocate the head from a special reserved region used ++ * only for NAPI Rx allocation. By doing this we can save several ++ * CPU cycles by avoiding having to disable and re-enable IRQs. ++ * ++ * %NULL is returned if there is no free memory. 
++ */ ++struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct napi_alloc_cache *nc; ++ struct sk_buff *skb; ++ void *data; ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ len += NET_SKB_PAD + NET_IP_ALIGN; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, ++ NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ data = page_frag_alloc(&nc->page, len, gfp_mask); ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __napi_build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (nc->page.pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb->dev = napi->dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__napi_alloc_skb); ++ ++void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, ++ int size, unsigned int truesize) ++{ ++ skb_fill_page_desc(skb, i, page, off, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_add_rx_frag); ++ ++void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, ++ unsigned int truesize) ++{ ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ skb_frag_size_add(frag, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_coalesce_rx_frag); ++ ++static void skb_drop_list(struct sk_buff **listp) ++{ ++ kfree_skb_list(*listp); ++ *listp = NULL; ++} ++ ++static inline void skb_drop_fraglist(struct sk_buff *skb) ++{ ++ skb_drop_list(&skb_shinfo(skb)->frag_list); ++} ++ ++static void skb_clone_fraglist(struct sk_buff *skb) ++{ ++ struct sk_buff *list; ++ ++ skb_walk_frags(skb, list) ++ skb_get(list); ++} ++ ++static void skb_free_head(struct sk_buff *skb) ++{ ++ unsigned char *head = skb->head; ++ ++ if (skb->head_frag) { ++ if (skb_pp_recycle(skb, head)) ++ return; ++ skb_free_frag(head); ++ } else { ++ kfree(head); ++ } ++} ++ ++static void skb_release_data(struct sk_buff *skb) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int i; ++ ++ if (skb->cloned && ++ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, ++ &shinfo->dataref)) ++ goto exit; ++ ++ if (skb_zcopy(skb)) { ++ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; ++ ++ skb_zcopy_clear(skb, true); ++ if (skip_unref) ++ goto free_head; ++ } ++ ++ for (i = 0; i < shinfo->nr_frags; i++) ++ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); ++ ++free_head: ++ if (shinfo->frag_list) ++ kfree_skb_list(shinfo->frag_list); ++ ++ skb_free_head(skb); ++exit: ++ /* When we clone an SKB we copy the reycling bit. The pp_recycle ++ * bit is only set on the head though, so in order to avoid races ++ * while trying to recycle fragments on __skb_frag_unref() we need ++ * to make one SKB responsible for triggering the recycle path. ++ * So disable the recycling bit if an SKB is cloned and we have ++ * additional references to the fragmented part of the SKB. 
++ * Eventually the last SKB will have the recycling bit set and it's ++ * dataref set to 0, which will trigger the recycling ++ */ ++ skb->pp_recycle = 0; ++} ++ ++/* ++ * Free an skbuff by memory without cleaning the state. ++ */ ++static void kfree_skbmem(struct sk_buff *skb) ++{ ++ struct sk_buff_fclones *fclones; ++ ++ switch (skb->fclone) { ++ case SKB_FCLONE_UNAVAILABLE: ++ kmem_cache_free(skbuff_head_cache, skb); ++ return; ++ ++ case SKB_FCLONE_ORIG: ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ /* We usually free the clone (TX completion) before original skb ++ * This test would have no chance to be true for the clone, ++ * while here, branch prediction will be good. ++ */ ++ if (refcount_read(&fclones->fclone_ref) == 1) ++ goto fastpath; ++ break; ++ ++ default: /* SKB_FCLONE_CLONE */ ++ fclones = container_of(skb, struct sk_buff_fclones, skb2); ++ break; ++ } ++ if (!refcount_dec_and_test(&fclones->fclone_ref)) ++ return; ++fastpath: ++ kmem_cache_free(skbuff_fclone_cache, fclones); ++} ++ ++void skb_release_head_state(struct sk_buff *skb) ++{ ++ skb_dst_drop(skb); ++ if (skb->destructor) { ++ DEBUG_NET_WARN_ON_ONCE(in_hardirq()); ++ skb->destructor(skb); ++ } ++#if IS_ENABLED(CONFIG_NF_CONNTRACK) ++ nf_conntrack_put(skb_nfct(skb)); ++#endif ++ skb_ext_put(skb); ++} ++ ++/* Free everything but the sk_buff shell. */ ++static void skb_release_all(struct sk_buff *skb) ++{ ++ skb_release_head_state(skb); ++ if (likely(skb->head)) ++ skb_release_data(skb); ++} ++ ++/** ++ * __kfree_skb - private function ++ * @skb: buffer ++ * ++ * Free an sk_buff. Release anything attached to the buffer. ++ * Clean the state. This is an internal helper function. Users should ++ * always call kfree_skb ++ */ ++ ++void __kfree_skb(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ kfree_skbmem(skb); ++} ++EXPORT_SYMBOL(__kfree_skb); ++ ++/** ++ * kfree_skb_reason - free an sk_buff with special reason ++ * @skb: buffer to free ++ * @reason: reason why this skb is dropped ++ * ++ * Drop a reference to the buffer and free it if the usage count has ++ * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' ++ * tracepoint. ++ */ ++void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); ++ ++ trace_kfree_skb(skb, __builtin_return_address(0), reason); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(kfree_skb_reason); ++ ++void kfree_skb_list_reason(struct sk_buff *segs, ++ enum skb_drop_reason reason) ++{ ++ while (segs) { ++ struct sk_buff *next = segs->next; ++ ++ kfree_skb_reason(segs, reason); ++ segs = next; ++ } ++} ++EXPORT_SYMBOL(kfree_skb_list_reason); ++ ++/* Dump skb information and contents. ++ * ++ * Must only be called from net_ratelimit()-ed paths. ++ * ++ * Dumps whole packets if full_pkt, only headers otherwise. 
++ */ ++void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) ++{ ++ struct skb_shared_info *sh = skb_shinfo(skb); ++ struct net_device *dev = skb->dev; ++ struct sock *sk = skb->sk; ++ struct sk_buff *list_skb; ++ bool has_mac, has_trans; ++ int headroom, tailroom; ++ int i, len, seg_len; ++ ++ if (full_pkt) ++ len = skb->len; ++ else ++ len = min_t(int, skb->len, MAX_HEADER + 128); ++ ++ headroom = skb_headroom(skb); ++ tailroom = skb_tailroom(skb); ++ ++ has_mac = skb_mac_header_was_set(skb); ++ has_trans = skb_transport_header_was_set(skb); ++ ++ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" ++ "mac=(%d,%d) net=(%d,%d) trans=%d\n" ++ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" ++ "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" ++ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", ++ level, skb->len, headroom, skb_headlen(skb), tailroom, ++ has_mac ? skb->mac_header : -1, ++ has_mac ? skb_mac_header_len(skb) : -1, ++ skb->network_header, ++ has_trans ? skb_network_header_len(skb) : -1, ++ has_trans ? skb->transport_header : -1, ++ sh->tx_flags, sh->nr_frags, ++ sh->gso_size, sh->gso_type, sh->gso_segs, ++ skb->csum, skb->ip_summed, skb->csum_complete_sw, ++ skb->csum_valid, skb->csum_level, ++ skb->hash, skb->sw_hash, skb->l4_hash, ++ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); ++ ++ if (dev) ++ printk("%sdev name=%s feat=%pNF\n", ++ level, dev->name, &dev->features); ++ if (sk) ++ printk("%ssk family=%hu type=%u proto=%u\n", ++ level, sk->sk_family, sk->sk_type, sk->sk_protocol); ++ ++ if (full_pkt && headroom) ++ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->head, headroom, false); ++ ++ seg_len = min_t(int, skb_headlen(skb), len); ++ if (seg_len) ++ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->data, seg_len, false); ++ len -= seg_len; ++ ++ if (full_pkt && tailroom) ++ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb_tail_pointer(skb), tailroom, false); ++ ++ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(frag, skb_frag_off(frag), ++ skb_frag_size(frag), p, p_off, p_len, ++ copied) { ++ seg_len = min_t(int, p_len, len); ++ vaddr = kmap_atomic(p); ++ print_hex_dump(level, "skb frag: ", ++ DUMP_PREFIX_OFFSET, ++ 16, 1, vaddr + p_off, seg_len, false); ++ kunmap_atomic(vaddr); ++ len -= seg_len; ++ if (!len) ++ break; ++ } ++ } ++ ++ if (full_pkt && skb_has_frag_list(skb)) { ++ printk("skb fraglist:\n"); ++ skb_walk_frags(skb, list_skb) ++ skb_dump(level, list_skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_dump); ++ ++/** ++ * skb_tx_error - report an sk_buff xmit error ++ * @skb: buffer that triggered an error ++ * ++ * Report xmit error if a device callback is tracking this skb. ++ * skb must be freed afterwards. 
++ */ ++void skb_tx_error(struct sk_buff *skb) ++{ ++ if (skb) { ++ skb_zcopy_downgrade_managed(skb); ++ skb_zcopy_clear(skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_tx_error); ++ ++#ifdef CONFIG_TRACEPOINTS ++/** ++ * consume_skb - free an skbuff ++ * @skb: buffer to free ++ * ++ * Drop a ref to the buffer and free it if the usage count has hit zero ++ * Functions identically to kfree_skb, but kfree_skb assumes that the frame ++ * is being dropped after a failure and notes that ++ */ ++void consume_skb(struct sk_buff *skb) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ trace_consume_skb(skb); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(consume_skb); ++#endif ++ ++/** ++ * __consume_stateless_skb - free an skbuff, assuming it is stateless ++ * @skb: buffer to free ++ * ++ * Alike consume_skb(), but this variant assumes that this is the last ++ * skb reference and all the head states have been already dropped ++ */ ++void __consume_stateless_skb(struct sk_buff *skb) ++{ ++ trace_consume_skb(skb); ++ skb_release_data(skb); ++ kfree_skbmem(skb); ++} ++ ++static void napi_skb_cache_put(struct sk_buff *skb) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ u32 i; ++ ++ kasan_poison_object_data(skbuff_head_cache, skb); ++ nc->skb_cache[nc->skb_count++] = skb; ++ ++ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { ++ for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) ++ kasan_unpoison_object_data(skbuff_head_cache, ++ nc->skb_cache[i]); ++ ++ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, ++ nc->skb_cache + NAPI_SKB_CACHE_HALF); ++ nc->skb_count = NAPI_SKB_CACHE_HALF; ++ } ++} ++ ++void __kfree_skb_defer(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++ ++void napi_skb_free_stolen_head(struct sk_buff *skb) ++{ ++ if (unlikely(skb->slow_gro)) { ++ nf_reset_ct(skb); ++ skb_dst_drop(skb); ++ skb_ext_put(skb); ++ skb_orphan(skb); ++ skb->slow_gro = 0; ++ } ++ napi_skb_cache_put(skb); ++} ++ ++void napi_consume_skb(struct sk_buff *skb, int budget) ++{ ++ /* Zero budget indicate non-NAPI context called us, like netpoll */ ++ if (unlikely(!budget)) { ++ dev_consume_skb_any(skb); ++ return; ++ } ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ ++ if (!skb_unref(skb)) ++ return; ++ ++ /* if reaching here SKB is ready to free */ ++ trace_consume_skb(skb); ++ ++ /* if SKB is a clone, don't handle this case */ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { ++ __kfree_skb(skb); ++ return; ++ } ++ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++EXPORT_SYMBOL(napi_consume_skb); ++ ++/* Make sure a field is contained by headers group */ ++#define CHECK_SKB_FIELD(field) \ ++ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ ++ offsetof(struct sk_buff, headers.field)); \ ++ ++static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ new->tstamp = old->tstamp; ++ /* We do not copy old->sk */ ++ new->dev = old->dev; ++ memcpy(new->cb, old->cb, sizeof(old->cb)); ++ skb_dst_copy(new, old); ++ __skb_ext_copy(new, old); ++ __nf_copy(new, old, false); ++ ++ /* Note : this field could be in the headers group. 
++ * It is not yet because we do not want to have a 16 bit hole ++ */ ++ new->queue_mapping = old->queue_mapping; ++ ++ memcpy(&new->headers, &old->headers, sizeof(new->headers)); ++ CHECK_SKB_FIELD(protocol); ++ CHECK_SKB_FIELD(csum); ++ CHECK_SKB_FIELD(hash); ++ CHECK_SKB_FIELD(priority); ++ CHECK_SKB_FIELD(skb_iif); ++ CHECK_SKB_FIELD(vlan_proto); ++ CHECK_SKB_FIELD(vlan_tci); ++ CHECK_SKB_FIELD(transport_header); ++ CHECK_SKB_FIELD(network_header); ++ CHECK_SKB_FIELD(mac_header); ++ CHECK_SKB_FIELD(inner_protocol); ++ CHECK_SKB_FIELD(inner_transport_header); ++ CHECK_SKB_FIELD(inner_network_header); ++ CHECK_SKB_FIELD(inner_mac_header); ++ CHECK_SKB_FIELD(mark); ++#ifdef CONFIG_NETWORK_SECMARK ++ CHECK_SKB_FIELD(secmark); ++#endif ++#ifdef CONFIG_NET_RX_BUSY_POLL ++ CHECK_SKB_FIELD(napi_id); ++#endif ++ CHECK_SKB_FIELD(alloc_cpu); ++#ifdef CONFIG_XPS ++ CHECK_SKB_FIELD(sender_cpu); ++#endif ++#ifdef CONFIG_NET_SCHED ++ CHECK_SKB_FIELD(tc_index); ++#endif ++ ++} ++ ++/* ++ * You should not add any new code to this function. Add it to ++ * __copy_skb_header above instead. ++ */ ++static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) ++{ ++#define C(x) n->x = skb->x ++ ++ n->next = n->prev = NULL; ++ n->sk = NULL; ++ __copy_skb_header(n, skb); ++ ++ C(len); ++ C(data_len); ++ C(mac_len); ++ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; ++ n->cloned = 1; ++ n->nohdr = 0; ++ n->peeked = 0; ++ C(pfmemalloc); ++ C(pp_recycle); ++ n->destructor = NULL; ++ C(tail); ++ C(end); ++ C(head); ++ C(head_frag); ++ C(data); ++ C(truesize); ++ refcount_set(&n->users, 1); ++ ++ atomic_inc(&(skb_shinfo(skb)->dataref)); ++ skb->cloned = 1; ++ ++ return n; ++#undef C ++} ++ ++/** ++ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg ++ * @first: first sk_buff of the msg ++ */ ++struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) ++{ ++ struct sk_buff *n; ++ ++ n = alloc_skb(0, GFP_ATOMIC); ++ if (!n) ++ return NULL; ++ ++ n->len = first->len; ++ n->data_len = first->len; ++ n->truesize = first->truesize; ++ ++ skb_shinfo(n)->frag_list = first; ++ ++ __copy_skb_header(n, first); ++ n->destructor = NULL; ++ ++ return n; ++} ++EXPORT_SYMBOL_GPL(alloc_skb_for_msg); ++ ++/** ++ * skb_morph - morph one skb into another ++ * @dst: the skb to receive the contents ++ * @src: the skb to supply the contents ++ * ++ * This is identical to skb_clone except that the target skb is ++ * supplied by the user. ++ * ++ * The target skb is returned upon exit. ++ */ ++struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) ++{ ++ skb_release_all(dst); ++ return __skb_clone(dst, src); ++} ++EXPORT_SYMBOL_GPL(skb_morph); ++ ++int mm_account_pinned_pages(struct mmpin *mmp, size_t size) ++{ ++ unsigned long max_pg, num_pg, new_pg, old_pg; ++ struct user_struct *user; ++ ++ if (capable(CAP_IPC_LOCK) || !size) ++ return 0; ++ ++ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ ++ max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; ++ user = mmp->user ? 
: current_user(); ++ ++ do { ++ old_pg = atomic_long_read(&user->locked_vm); ++ new_pg = old_pg + num_pg; ++ if (new_pg > max_pg) ++ return -ENOBUFS; ++ } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != ++ old_pg); ++ ++ if (!mmp->user) { ++ mmp->user = get_uid(user); ++ mmp->num_pg = num_pg; ++ } else { ++ mmp->num_pg += num_pg; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(mm_account_pinned_pages); ++ ++void mm_unaccount_pinned_pages(struct mmpin *mmp) ++{ ++ if (mmp->user) { ++ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); ++ free_uid(mmp->user); ++ } ++} ++EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); ++ ++static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) ++{ ++ struct ubuf_info *uarg; ++ struct sk_buff *skb; ++ ++ WARN_ON_ONCE(!in_task()); ++ ++ skb = sock_omalloc(sk, 0, GFP_KERNEL); ++ if (!skb) ++ return NULL; ++ ++ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); ++ uarg = (void *)skb->cb; ++ uarg->mmp.user = NULL; ++ ++ if (mm_account_pinned_pages(&uarg->mmp, size)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ uarg->callback = msg_zerocopy_callback; ++ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; ++ uarg->len = 1; ++ uarg->bytelen = size; ++ uarg->zerocopy = 1; ++ uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; ++ refcount_set(&uarg->refcnt, 1); ++ sock_hold(sk); ++ ++ return uarg; ++} ++ ++static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) ++{ ++ return container_of((void *)uarg, struct sk_buff, cb); ++} ++ ++struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, ++ struct ubuf_info *uarg) ++{ ++ if (uarg) { ++ const u32 byte_limit = 1 << 19; /* limit to a few TSO */ ++ u32 bytelen, next; ++ ++ /* there might be non MSG_ZEROCOPY users */ ++ if (uarg->callback != msg_zerocopy_callback) ++ return NULL; ++ ++ /* realloc only when socket is locked (TCP, UDP cork), ++ * so uarg->len and sk_zckey access is serialized ++ */ ++ if (!sock_owned_by_user(sk)) { ++ WARN_ON_ONCE(1); ++ return NULL; ++ } ++ ++ bytelen = uarg->bytelen + size; ++ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { ++ /* TCP can create new skb to attach new uarg */ ++ if (sk->sk_type == SOCK_STREAM) ++ goto new_alloc; ++ return NULL; ++ } ++ ++ next = (u32)atomic_read(&sk->sk_zckey); ++ if ((u32)(uarg->id + uarg->len) == next) { ++ if (mm_account_pinned_pages(&uarg->mmp, size)) ++ return NULL; ++ uarg->len++; ++ uarg->bytelen = bytelen; ++ atomic_set(&sk->sk_zckey, ++next); ++ ++ /* no extra ref when appending to datagram (MSG_MORE) */ ++ if (sk->sk_type == SOCK_STREAM) ++ net_zcopy_get(uarg); ++ ++ return uarg; ++ } ++ } ++ ++new_alloc: ++ return msg_zerocopy_alloc(sk, size); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); ++ ++static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) ++{ ++ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); ++ u32 old_lo, old_hi; ++ u64 sum_len; ++ ++ old_lo = serr->ee.ee_info; ++ old_hi = serr->ee.ee_data; ++ sum_len = old_hi - old_lo + 1ULL + len; ++ ++ if (sum_len >= (1ULL << 32)) ++ return false; ++ ++ if (lo != old_hi + 1) ++ return false; ++ ++ serr->ee.ee_data += len; ++ return true; ++} ++ ++static void __msg_zerocopy_callback(struct ubuf_info *uarg) ++{ ++ struct sk_buff *tail, *skb = skb_from_uarg(uarg); ++ struct sock_exterr_skb *serr; ++ struct sock *sk = skb->sk; ++ struct sk_buff_head *q; ++ unsigned long flags; ++ bool is_zerocopy; ++ u32 lo, hi; ++ u16 len; ++ ++ mm_unaccount_pinned_pages(&uarg->mmp); ++ ++ /* if !len, there was only 1 call, and 
it was aborted ++ * so do not queue a completion notification ++ */ ++ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) ++ goto release; ++ ++ len = uarg->len; ++ lo = uarg->id; ++ hi = uarg->id + len - 1; ++ is_zerocopy = uarg->zerocopy; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = 0; ++ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; ++ serr->ee.ee_data = hi; ++ serr->ee.ee_info = lo; ++ if (!is_zerocopy) ++ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; ++ ++ q = &sk->sk_error_queue; ++ spin_lock_irqsave(&q->lock, flags); ++ tail = skb_peek_tail(q); ++ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || ++ !skb_zerocopy_notify_extend(tail, lo, len)) { ++ __skb_queue_tail(q, skb); ++ skb = NULL; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ sk_error_report(sk); ++ ++release: ++ consume_skb(skb); ++ sock_put(sk); ++} ++ ++void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, ++ bool success) ++{ ++ uarg->zerocopy = uarg->zerocopy & success; ++ ++ if (refcount_dec_and_test(&uarg->refcnt)) ++ __msg_zerocopy_callback(uarg); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_callback); ++ ++void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) ++{ ++ struct sock *sk = skb_from_uarg(uarg)->sk; ++ ++ atomic_dec(&sk->sk_zckey); ++ uarg->len--; ++ ++ if (have_uref) ++ msg_zerocopy_callback(NULL, uarg, true); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); ++ ++int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, ++ struct msghdr *msg, int len, ++ struct ubuf_info *uarg) ++{ ++ struct ubuf_info *orig_uarg = skb_zcopy(skb); ++ int err, orig_len = skb->len; ++ ++ /* An skb can only point to one uarg. This edge case happens when ++ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. ++ */ ++ if (orig_uarg && uarg != orig_uarg) ++ return -EEXIST; ++ ++ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); ++ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { ++ struct sock *save_sk = skb->sk; ++ ++ /* Streams do not free skb on error. Reset to prev state. */ ++ iov_iter_revert(&msg->msg_iter, skb->len - orig_len); ++ skb->sk = sk; ++ ___pskb_trim(skb, orig_len); ++ skb->sk = save_sk; ++ return err; ++ } ++ ++ skb_zcopy_set(skb, uarg, NULL); ++ return skb->len - orig_len; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); ++ ++void __skb_zcopy_downgrade_managed(struct sk_buff *skb) ++{ ++ int i; ++ ++ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++} ++EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); ++ ++static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, ++ gfp_t gfp_mask) ++{ ++ if (skb_zcopy(orig)) { ++ if (skb_zcopy(nskb)) { ++ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ ++ if (!gfp_mask) { ++ WARN_ON_ONCE(1); ++ return -ENOMEM; ++ } ++ if (skb_uarg(nskb) == skb_uarg(orig)) ++ return 0; ++ if (skb_copy_ubufs(nskb, GFP_ATOMIC)) ++ return -EIO; ++ } ++ skb_zcopy_set(nskb, skb_uarg(orig), NULL); ++ } ++ return 0; ++} ++ ++/** ++ * skb_copy_ubufs - copy userspace skb frags buffers to kernel ++ * @skb: the skb to modify ++ * @gfp_mask: allocation priority ++ * ++ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. ++ * It will copy all frags into kernel and drop the reference ++ * to userspace pages. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ * ++ * Returns 0 on success or a negative error code on failure ++ * to allocate kernel memory to copy to. ++ */ ++int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int num_frags = skb_shinfo(skb)->nr_frags; ++ struct page *page, *head = NULL; ++ int i, new_frags; ++ u32 d_off; ++ ++ if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) ++ return -EINVAL; ++ ++ if (!num_frags) ++ goto release; ++ ++ new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ for (i = 0; i < new_frags; i++) { ++ page = alloc_page(gfp_mask); ++ if (!page) { ++ while (head) { ++ struct page *next = (struct page *)page_private(head); ++ put_page(head); ++ head = next; ++ } ++ return -ENOMEM; ++ } ++ set_page_private(page, (unsigned long)head); ++ head = page; ++ } ++ ++ page = head; ++ d_off = 0; ++ for (i = 0; i < num_frags; i++) { ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), ++ p, p_off, p_len, copied) { ++ u32 copy, done = 0; ++ vaddr = kmap_atomic(p); ++ ++ while (done < p_len) { ++ if (d_off == PAGE_SIZE) { ++ d_off = 0; ++ page = (struct page *)page_private(page); ++ } ++ copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); ++ memcpy(page_address(page) + d_off, ++ vaddr + p_off + done, copy); ++ done += copy; ++ d_off += copy; ++ } ++ kunmap_atomic(vaddr); ++ } ++ } ++ ++ /* skb frags release userspace buffers */ ++ for (i = 0; i < num_frags; i++) ++ skb_frag_unref(skb, i); ++ ++ /* skb frags point to kernel buffers */ ++ for (i = 0; i < new_frags - 1; i++) { ++ __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); ++ head = (struct page *)page_private(head); ++ } ++ __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); ++ skb_shinfo(skb)->nr_frags = new_frags; ++ ++release: ++ skb_zcopy_clear(skb, false); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_copy_ubufs); ++ ++/** ++ * skb_clone - duplicate an sk_buff ++ * @skb: buffer to clone ++ * @gfp_mask: allocation priority ++ * ++ * Duplicate an &sk_buff. The new one is not owned by a socket. Both ++ * copies share the same packet data but not structure. The new ++ * buffer has a reference count of 1. If the allocation fails the ++ * function returns %NULL otherwise the new buffer is returned. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ */ ++ ++struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ struct sk_buff_fclones *fclones = container_of(skb, ++ struct sk_buff_fclones, ++ skb1); ++ struct sk_buff *n; ++ ++ if (skb_orphan_frags(skb, gfp_mask)) ++ return NULL; ++ ++ if (skb->fclone == SKB_FCLONE_ORIG && ++ refcount_read(&fclones->fclone_ref) == 1) { ++ n = &fclones->skb2; ++ refcount_set(&fclones->fclone_ref, 2); ++ n->fclone = SKB_FCLONE_CLONE; ++ } else { ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); ++ if (!n) ++ return NULL; ++ ++ n->fclone = SKB_FCLONE_UNAVAILABLE; ++ } ++ ++ return __skb_clone(n, skb); ++} ++EXPORT_SYMBOL(skb_clone); ++ ++void skb_headers_offset_update(struct sk_buff *skb, int off) ++{ ++ /* Only adjust this if it actually is csum_start rather than csum */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ skb->csum_start += off; ++ /* {transport,network,mac}_header and tail are relative to skb->head */ ++ skb->transport_header += off; ++ skb->network_header += off; ++ if (skb_mac_header_was_set(skb)) ++ skb->mac_header += off; ++ skb->inner_transport_header += off; ++ skb->inner_network_header += off; ++ skb->inner_mac_header += off; ++} ++EXPORT_SYMBOL(skb_headers_offset_update); ++ ++void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ __copy_skb_header(new, old); ++ ++ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; ++ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; ++ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; ++} ++EXPORT_SYMBOL(skb_copy_header); ++ ++static inline int skb_alloc_rx_flag(const struct sk_buff *skb) ++{ ++ if (skb_pfmemalloc(skb)) ++ return SKB_ALLOC_RX; ++ return 0; ++} ++ ++/** ++ * skb_copy - create private copy of an sk_buff ++ * @skb: buffer to copy ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data. This is used when the ++ * caller wishes to modify the data and needs a private copy of the ++ * data to alter. Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * As by-product this function converts non-linear &sk_buff to linear ++ * one, so that &sk_buff becomes completely private and caller is allowed ++ * to modify all the data of returned buffer. This means that this ++ * function is not recommended for use in circumstances when only ++ * header is going to be modified. Use pskb_copy() instead. ++ */ ++ ++struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int headerlen = skb_headroom(skb); ++ unsigned int size = skb_end_offset(skb) + skb->data_len; ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, ++ skb_alloc_rx_flag(skb), NUMA_NO_NODE); ++ ++ if (!n) ++ return NULL; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headerlen); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); ++ ++ skb_copy_header(n, skb); ++ return n; ++} ++EXPORT_SYMBOL(skb_copy); ++ ++/** ++ * __pskb_copy_fclone - create copy of an sk_buff with private head. ++ * @skb: buffer to copy ++ * @headroom: headroom of new skb ++ * @gfp_mask: allocation priority ++ * @fclone: if true allocate the copy of the skb from the fclone ++ * cache instead of the head cache; it is recommended to set this ++ * to true for the cases where the copy will likely be cloned ++ * ++ * Make a copy of both an &sk_buff and part of its data, located ++ * in header. 
Fragmented data remain shared. This is used when ++ * the caller wishes to modify only header of &sk_buff and needs ++ * private copy of the header to alter. Returns %NULL on failure ++ * or the pointer to the buffer on success. ++ * The returned buffer has a reference count of 1. ++ */ ++ ++struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, ++ gfp_t gfp_mask, bool fclone) ++{ ++ unsigned int size = skb_headlen(skb) + headroom; ++ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); ++ ++ if (!n) ++ goto out; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headroom); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb_headlen(skb)); ++ /* Copy the bytes */ ++ skb_copy_from_linear_data(skb, n->data, n->len); ++ ++ n->truesize += skb->data_len; ++ n->data_len = skb->data_len; ++ n->len = skb->len; ++ ++ if (skb_shinfo(skb)->nr_frags) { ++ int i; ++ ++ if (skb_orphan_frags(skb, gfp_mask) || ++ skb_zerocopy_clone(n, skb, gfp_mask)) { ++ kfree_skb(n); ++ n = NULL; ++ goto out; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; ++ skb_frag_ref(skb, i); ++ } ++ skb_shinfo(n)->nr_frags = i; ++ } ++ ++ if (skb_has_frag_list(skb)) { ++ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; ++ skb_clone_fraglist(n); ++ } ++ ++ skb_copy_header(n, skb); ++out: ++ return n; ++} ++EXPORT_SYMBOL(__pskb_copy_fclone); ++ ++/** ++ * pskb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @nhead: room to add at head ++ * @ntail: room to add at tail ++ * @gfp_mask: allocation priority ++ * ++ * Expands (or creates identical copy, if @nhead and @ntail are zero) ++ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have ++ * reference count of 1. Returns zero in the case of success or error, ++ * if expansion failed. In the last case, &sk_buff is not changed. ++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, ++ gfp_t gfp_mask) ++{ ++ int i, osize = skb_end_offset(skb); ++ int size = osize + nhead + ntail; ++ long off; ++ u8 *data; ++ ++ BUG_ON(nhead < 0); ++ ++ BUG_ON(skb_shared(skb)); ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ goto nodata; ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy only real data... and, alas, header. This should be ++ * optimized for the cases when header is void. 
++ */ ++ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); ++ ++ /* ++ * if shinfo is shared we must drop the old head gracefully, but if it ++ * is not we can just drop the old head and let the existing refcount ++ * be since all we did is relocate the values ++ */ ++ if (skb_cloned(skb)) { ++ if (skb_orphan_frags(skb, gfp_mask)) ++ goto nofrags; ++ if (skb_zcopy(skb)) ++ refcount_inc(&skb_uarg(skb)->refcnt); ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ skb_release_data(skb); ++ } else { ++ skb_free_head(skb); ++ } ++ off = (data + nhead) - skb->head; ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data += off; ++ ++ skb_set_end_offset(skb, size); ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ off = nhead; ++#endif ++ skb->tail += off; ++ skb_headers_offset_update(skb, nhead); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ skb_metadata_clear(skb); ++ ++ /* It is not generally safe to change skb->truesize. ++ * For the moment, we really care of rx path, or ++ * when skb is orphaned (not attached to a socket). ++ */ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb->truesize += size - osize; ++ ++ return 0; ++ ++nofrags: ++ kfree(data); ++nodata: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(pskb_expand_head); ++ ++/* Make private copy of skb with writable head and some headroom */ ++ ++struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) ++{ ++ struct sk_buff *skb2; ++ int delta = headroom - skb_headroom(skb); ++ ++ if (delta <= 0) ++ skb2 = pskb_copy(skb, GFP_ATOMIC); ++ else { ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ++ GFP_ATOMIC)) { ++ kfree_skb(skb2); ++ skb2 = NULL; ++ } ++ } ++ return skb2; ++} ++EXPORT_SYMBOL(skb_realloc_headroom); ++ ++int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) ++{ ++ unsigned int saved_end_offset, saved_truesize; ++ struct skb_shared_info *shinfo; ++ int res; ++ ++ saved_end_offset = skb_end_offset(skb); ++ saved_truesize = skb->truesize; ++ ++ res = pskb_expand_head(skb, 0, 0, pri); ++ if (res) ++ return res; ++ ++ skb->truesize = saved_truesize; ++ ++ if (likely(skb_end_offset(skb) == saved_end_offset)) ++ return 0; ++ ++ shinfo = skb_shinfo(skb); ++ ++ /* We are about to change back skb->end, ++ * we need to move skb_shinfo() to its new location. ++ */ ++ memmove(skb->head + saved_end_offset, ++ shinfo, ++ offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); ++ ++ skb_set_end_offset(skb, saved_end_offset); ++ ++ return 0; ++} ++ ++/** ++ * skb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @headroom: needed headroom ++ * ++ * Unlike skb_realloc_headroom, this one does not allocate a new skb ++ * if possible; copies skb->sk to new skb as needed ++ * and frees original skb in case of failures. ++ * ++ * It expect increased headroom and generates warning otherwise. 
++ */ ++ ++struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) ++{ ++ int delta = headroom - skb_headroom(skb); ++ int osize = skb_end_offset(skb); ++ struct sock *sk = skb->sk; ++ ++ if (WARN_ONCE(delta <= 0, ++ "%s is expecting an increase in the headroom", __func__)) ++ return skb; ++ ++ delta = SKB_DATA_ALIGN(delta); ++ /* pskb_expand_head() might crash, if skb is shared. */ ++ if (skb_shared(skb) || !is_skb_wmem(skb)) { ++ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); ++ ++ if (unlikely(!nskb)) ++ goto fail; ++ ++ if (sk) ++ skb_set_owner_w(nskb, sk); ++ consume_skb(skb); ++ skb = nskb; ++ } ++ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) ++ goto fail; ++ ++ if (sk && is_skb_wmem(skb)) { ++ delta = skb_end_offset(skb) - osize; ++ refcount_add(delta, &sk->sk_wmem_alloc); ++ skb->truesize += delta; ++ } ++ return skb; ++ ++fail: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_expand_head); ++ ++/** ++ * skb_copy_expand - copy and expand sk_buff ++ * @skb: buffer to copy ++ * @newheadroom: new free bytes at head ++ * @newtailroom: new free bytes at tail ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data and while doing so ++ * allocate additional space. ++ * ++ * This is used when the caller wishes to modify the data and needs a ++ * private copy of the data to alter as well as more space for new fields. ++ * Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * You must pass %GFP_ATOMIC as the allocation priority if this function ++ * is called from an interrupt. ++ */ ++struct sk_buff *skb_copy_expand(const struct sk_buff *skb, ++ int newheadroom, int newtailroom, ++ gfp_t gfp_mask) ++{ ++ /* ++ * Allocate the copy buffer ++ */ ++ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, ++ gfp_mask, skb_alloc_rx_flag(skb), ++ NUMA_NO_NODE); ++ int oldheadroom = skb_headroom(skb); ++ int head_copy_len, head_copy_off; ++ ++ if (!n) ++ return NULL; ++ ++ skb_reserve(n, newheadroom); ++ ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ head_copy_len = oldheadroom; ++ head_copy_off = 0; ++ if (newheadroom <= head_copy_len) ++ head_copy_len = newheadroom; ++ else ++ head_copy_off = newheadroom - head_copy_len; ++ ++ /* Copy the linear header and data. */ ++ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, ++ skb->len + head_copy_len)); ++ ++ skb_copy_header(n, skb); ++ ++ skb_headers_offset_update(n, newheadroom - oldheadroom); ++ ++ return n; ++} ++EXPORT_SYMBOL(skb_copy_expand); ++ ++/** ++ * __skb_pad - zero pad the tail of an skb ++ * @skb: buffer to pad ++ * @pad: space to pad ++ * @free_on_error: free buffer on error ++ * ++ * Ensure that a buffer is followed by a padding area that is zero ++ * filled. Used by network drivers which may DMA or transfer data ++ * beyond the buffer end onto the wire. ++ * ++ * May return error in out of memory cases. The skb is freed on error ++ * if @free_on_error is true. ++ */ ++ ++int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) ++{ ++ int err; ++ int ntail; ++ ++ /* If the skbuff is non linear tailroom is always zero.. 
*/ ++ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { ++ memset(skb->data+skb->len, 0, pad); ++ return 0; ++ } ++ ++ ntail = skb->data_len + pad - (skb->end - skb->tail); ++ if (likely(skb_cloned(skb) || ntail > 0)) { ++ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto free_skb; ++ } ++ ++ /* FIXME: The use of this function with non-linear skb's really needs ++ * to be audited. ++ */ ++ err = skb_linearize(skb); ++ if (unlikely(err)) ++ goto free_skb; ++ ++ memset(skb->data + skb->len, 0, pad); ++ return 0; ++ ++free_skb: ++ if (free_on_error) ++ kfree_skb(skb); ++ return err; ++} ++EXPORT_SYMBOL(__skb_pad); ++ ++/** ++ * pskb_put - add data to the tail of a potentially fragmented buffer ++ * @skb: start of the buffer to use ++ * @tail: tail fragment of the buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the potentially ++ * fragmented buffer. @tail must be the last fragment of @skb -- or ++ * @skb itself. If this would exceed the total buffer size the kernel ++ * will panic. A pointer to the first byte of the extra data is ++ * returned. ++ */ ++ ++void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) ++{ ++ if (tail != skb) { ++ skb->data_len += len; ++ skb->len += len; ++ } ++ return skb_put(tail, len); ++} ++EXPORT_SYMBOL_GPL(pskb_put); ++ ++/** ++ * skb_put - add data to a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer. If this would ++ * exceed the total buffer size the kernel will panic. A pointer to the ++ * first byte of the extra data is returned. ++ */ ++void *skb_put(struct sk_buff *skb, unsigned int len) ++{ ++ void *tmp = skb_tail_pointer(skb); ++ SKB_LINEAR_ASSERT(skb); ++ skb->tail += len; ++ skb->len += len; ++ if (unlikely(skb->tail > skb->end)) ++ skb_over_panic(skb, len, __builtin_return_address(0)); ++ return tmp; ++} ++EXPORT_SYMBOL(skb_put); ++ ++/** ++ * skb_push - add data to the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer at the buffer ++ * start. If this would exceed the total buffer headroom the kernel will ++ * panic. A pointer to the first byte of the extra data is returned. ++ */ ++void *skb_push(struct sk_buff *skb, unsigned int len) ++{ ++ skb->data -= len; ++ skb->len += len; ++ if (unlikely(skb->data < skb->head)) ++ skb_under_panic(skb, len, __builtin_return_address(0)); ++ return skb->data; ++} ++EXPORT_SYMBOL(skb_push); ++ ++/** ++ * skb_pull - remove data from the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the next data in the buffer ++ * is returned. Once the data has been pulled future pushes will overwrite ++ * the old data. ++ */ ++void *skb_pull(struct sk_buff *skb, unsigned int len) ++{ ++ return skb_pull_inline(skb, len); ++} ++EXPORT_SYMBOL(skb_pull); ++ ++/** ++ * skb_pull_data - remove data from the start of a buffer returning its ++ * original position. ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the original data in the buffer ++ * is returned after checking if there is enough data to pull. Once the ++ * data has been pulled future pushes will overwrite the old data. 
++ */ ++void *skb_pull_data(struct sk_buff *skb, size_t len) ++{ ++ void *data = skb->data; ++ ++ if (skb->len < len) ++ return NULL; ++ ++ skb_pull(skb, len); ++ ++ return data; ++} ++EXPORT_SYMBOL(skb_pull_data); ++ ++/** ++ * skb_trim - remove end from a buffer ++ * @skb: buffer to alter ++ * @len: new length ++ * ++ * Cut the length of a buffer down by removing data from the tail. If ++ * the buffer is already under the length specified it is not modified. ++ * The skb must be linear. ++ */ ++void skb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->len > len) ++ __skb_trim(skb, len); ++} ++EXPORT_SYMBOL(skb_trim); ++ ++/* Trims skb to length len. It can change skb pointers. ++ */ ++ ++int ___pskb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ struct sk_buff **fragp; ++ struct sk_buff *frag; ++ int offset = skb_headlen(skb); ++ int nfrags = skb_shinfo(skb)->nr_frags; ++ int i; ++ int err; ++ ++ if (skb_cloned(skb) && ++ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) ++ return err; ++ ++ i = 0; ++ if (offset >= len) ++ goto drop_pages; ++ ++ for (; i < nfrags; i++) { ++ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); ++ ++drop_pages: ++ skb_shinfo(skb)->nr_frags = i; ++ ++ for (; i < nfrags; i++) ++ skb_frag_unref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_drop_fraglist(skb); ++ goto done; ++ } ++ ++ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); ++ fragp = &frag->next) { ++ int end = offset + frag->len; ++ ++ if (skb_shared(frag)) { ++ struct sk_buff *nfrag; ++ ++ nfrag = skb_clone(frag, GFP_ATOMIC); ++ if (unlikely(!nfrag)) ++ return -ENOMEM; ++ ++ nfrag->next = frag->next; ++ consume_skb(frag); ++ frag = nfrag; ++ *fragp = frag; ++ } ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ if (end > len && ++ unlikely((err = pskb_trim(frag, len - offset)))) ++ return err; ++ ++ if (frag->next) ++ skb_drop_list(&frag->next); ++ break; ++ } ++ ++done: ++ if (len > skb_headlen(skb)) { ++ skb->data_len -= skb->len - len; ++ skb->len = len; ++ } else { ++ skb->len = len; ++ skb->data_len = 0; ++ skb_set_tail_pointer(skb, len); ++ } ++ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb_condense(skb); ++ return 0; ++} ++EXPORT_SYMBOL(___pskb_trim); ++ ++/* Note : use pskb_trim_rcsum() instead of calling this directly ++ */ ++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ int delta = skb->len - len; ++ ++ skb->csum = csum_block_sub(skb->csum, ++ skb_checksum(skb, len, delta, 0), ++ len); ++ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; ++ int offset = skb_checksum_start_offset(skb) + skb->csum_offset; ++ ++ if (offset + sizeof(__sum16) > hdlen) ++ return -EINVAL; ++ } ++ return __pskb_trim(skb, len); ++} ++EXPORT_SYMBOL(pskb_trim_rcsum_slow); ++ ++/** ++ * __pskb_pull_tail - advance tail of skb header ++ * @skb: buffer to reallocate ++ * @delta: number of bytes to advance tail ++ * ++ * The function makes a sense only on a fragmented &sk_buff, ++ * it expands header moving its tail forward and copying necessary ++ * data from fragmented part. ++ * ++ * &sk_buff MUST have reference count of 1. ++ * ++ * Returns %NULL (and &sk_buff does not change) if pull failed ++ * or value of new tail of skb in the case of success. 
++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++/* Moves tail of skb head forward, copying data from fragmented part, ++ * when it is necessary. ++ * 1. It may fail due to malloc failure. ++ * 2. It may change skb pointers. ++ * ++ * It is pretty complicated. Luckily, it is called only in exceptional cases. ++ */ ++void *__pskb_pull_tail(struct sk_buff *skb, int delta) ++{ ++ /* If skb has not enough free space at tail, get new one ++ * plus 128 bytes for future expansions. If we have enough ++ * room at tail, reallocate without expansion only if skb is cloned. ++ */ ++ int i, k, eat = (skb->tail + delta) - skb->end; ++ ++ if (eat > 0 || skb_cloned(skb)) { ++ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, ++ GFP_ATOMIC)) ++ return NULL; ++ } ++ ++ BUG_ON(skb_copy_bits(skb, skb_headlen(skb), ++ skb_tail_pointer(skb), delta)); ++ ++ /* Optimization: no fragments, no reasons to preestimate ++ * size of pulled pages. Superb. ++ */ ++ if (!skb_has_frag_list(skb)) ++ goto pull_pages; ++ ++ /* Estimate size of pulled pages. */ ++ eat = delta; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size >= eat) ++ goto pull_pages; ++ eat -= size; ++ } ++ ++ /* If we need update frag list, we are in troubles. ++ * Certainly, it is possible to add an offset to skb data, ++ * but taking into account that pulling is expected to ++ * be very rare operation, it is worth to fight against ++ * further bloating skb head and crucify ourselves here instead. ++ * Pure masohism, indeed. 8)8) ++ */ ++ if (eat) { ++ struct sk_buff *list = skb_shinfo(skb)->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ ++ if (skb_shared(list)) { ++ /* Sucks! We need to fork list. :-( */ ++ clone = skb_clone(list, GFP_ATOMIC); ++ if (!clone) ++ return NULL; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without ++ * problems. */ ++ insp = list; ++ } ++ if (!pskb_pull(list, eat)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. */ ++ while ((list = skb_shinfo(skb)->frag_list) != insp) { ++ skb_shinfo(skb)->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ skb_shinfo(skb)->frag_list = clone; ++ } ++ } ++ /* Success! Now we may commit changes to skb data. 
*/ ++ ++pull_pages: ++ eat = delta; ++ k = 0; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size <= eat) { ++ skb_frag_unref(skb, i); ++ eat -= size; ++ } else { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; ++ ++ *frag = skb_shinfo(skb)->frags[i]; ++ if (eat) { ++ skb_frag_off_add(frag, eat); ++ skb_frag_size_sub(frag, eat); ++ if (!i) ++ goto end; ++ eat = 0; ++ } ++ k++; ++ } ++ } ++ skb_shinfo(skb)->nr_frags = k; ++ ++end: ++ skb->tail += delta; ++ skb->data_len -= delta; ++ ++ if (!skb->data_len) ++ skb_zcopy_clear(skb, false); ++ ++ return skb_tail_pointer(skb); ++} ++EXPORT_SYMBOL(__pskb_pull_tail); ++ ++/** ++ * skb_copy_bits - copy bits from skb to kernel buffer ++ * @skb: source skb ++ * @offset: offset in source ++ * @to: destination buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source skb to the ++ * destination buffer. ++ * ++ * CAUTION ! : ++ * If its prototype is ever changed, ++ * check arch/{*}/net/{*}.S files, ++ * since it is called from BPF assembly code. ++ */ ++int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ /* Copy header. */ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_from_linear_data_offset(skb, offset, to, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(f); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(f, ++ skb_frag_off(f) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(to + copied, vaddr + p_off, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_copy_bits(frag_iter, offset - start, to, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_copy_bits); ++ ++/* ++ * Callback from splice_to_pipe(), if we need to release some pages ++ * at the end of the spd in case we error'ed out in filling the pipe. 
++ */ ++static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) ++{ ++ put_page(spd->pages[i]); ++} ++ ++static struct page *linear_to_page(struct page *page, unsigned int *len, ++ unsigned int *offset, ++ struct sock *sk) ++{ ++ struct page_frag *pfrag = sk_page_frag(sk); ++ ++ if (!sk_page_frag_refill(sk, pfrag)) ++ return NULL; ++ ++ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); ++ ++ memcpy(page_address(pfrag->page) + pfrag->offset, ++ page_address(page) + *offset, *len); ++ *offset = pfrag->offset; ++ pfrag->offset += *len; ++ ++ return pfrag->page; ++} ++ ++static bool spd_can_coalesce(const struct splice_pipe_desc *spd, ++ struct page *page, ++ unsigned int offset) ++{ ++ return spd->nr_pages && ++ spd->pages[spd->nr_pages - 1] == page && ++ (spd->partial[spd->nr_pages - 1].offset + ++ spd->partial[spd->nr_pages - 1].len == offset); ++} ++ ++/* ++ * Fill page/offset/length into spd, if it can hold more pages. ++ */ ++static bool spd_fill_page(struct splice_pipe_desc *spd, ++ struct pipe_inode_info *pipe, struct page *page, ++ unsigned int *len, unsigned int offset, ++ bool linear, ++ struct sock *sk) ++{ ++ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) ++ return true; ++ ++ if (linear) { ++ page = linear_to_page(page, len, &offset, sk); ++ if (!page) ++ return true; ++ } ++ if (spd_can_coalesce(spd, page, offset)) { ++ spd->partial[spd->nr_pages - 1].len += *len; ++ return false; ++ } ++ get_page(page); ++ spd->pages[spd->nr_pages] = page; ++ spd->partial[spd->nr_pages].len = *len; ++ spd->partial[spd->nr_pages].offset = offset; ++ spd->nr_pages++; ++ ++ return false; ++} ++ ++static bool __splice_segment(struct page *page, unsigned int poff, ++ unsigned int plen, unsigned int *off, ++ unsigned int *len, ++ struct splice_pipe_desc *spd, bool linear, ++ struct sock *sk, ++ struct pipe_inode_info *pipe) ++{ ++ if (!*len) ++ return true; ++ ++ /* skip this segment if already processed */ ++ if (*off >= plen) { ++ *off -= plen; ++ return false; ++ } ++ ++ /* ignore any bits we already processed */ ++ poff += *off; ++ plen -= *off; ++ *off = 0; ++ ++ do { ++ unsigned int flen = min(*len, plen); ++ ++ if (spd_fill_page(spd, pipe, page, &flen, poff, ++ linear, sk)) ++ return true; ++ poff += flen; ++ plen -= flen; ++ *len -= flen; ++ } while (*len && plen); ++ ++ return false; ++} ++ ++/* ++ * Map linear and fragment data from the skb to spd. It reports true if the ++ * pipe is full or if we already spliced the requested length. ++ */ ++static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, ++ unsigned int *offset, unsigned int *len, ++ struct splice_pipe_desc *spd, struct sock *sk) ++{ ++ int seg; ++ struct sk_buff *iter; ++ ++ /* map the linear part : ++ * If skb->head_frag is set, this 'linear' part is backed by a ++ * fragment, and if the head is not shared with any clones then ++ * we can avoid a copy since we own the head portion of this page. 
++ */ ++ if (__splice_segment(virt_to_page(skb->data), ++ (unsigned long) skb->data & (PAGE_SIZE - 1), ++ skb_headlen(skb), ++ offset, len, spd, ++ skb_head_is_locked(skb), ++ sk, pipe)) ++ return true; ++ ++ /* ++ * then map the fragments ++ */ ++ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { ++ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; ++ ++ if (__splice_segment(skb_frag_page(f), ++ skb_frag_off(f), skb_frag_size(f), ++ offset, len, spd, false, sk, pipe)) ++ return true; ++ } ++ ++ skb_walk_frags(skb, iter) { ++ if (*offset >= iter->len) { ++ *offset -= iter->len; ++ continue; ++ } ++ /* __skb_splice_bits() only fails if the output has no room ++ * left, so no point in going over the frag_list for the error ++ * case. ++ */ ++ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Map data from the skb to a pipe. Should handle both the linear part, ++ * the fragments, and the frag list. ++ */ ++int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, ++ struct pipe_inode_info *pipe, unsigned int tlen, ++ unsigned int flags) ++{ ++ struct partial_page partial[MAX_SKB_FRAGS]; ++ struct page *pages[MAX_SKB_FRAGS]; ++ struct splice_pipe_desc spd = { ++ .pages = pages, ++ .partial = partial, ++ .nr_pages_max = MAX_SKB_FRAGS, ++ .ops = &nosteal_pipe_buf_ops, ++ .spd_release = sock_spd_release, ++ }; ++ int ret = 0; ++ ++ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); ++ ++ if (spd.nr_pages) ++ ret = splice_to_pipe(pipe, &spd); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(skb_splice_bits); ++ ++static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendmsg(sock, msg, vec, num, size); ++} ++ ++static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendpage(sock, page, offset, size, flags); ++} ++ ++typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size); ++typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags); ++static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, ++ int len, sendmsg_func sendmsg, sendpage_func sendpage) ++{ ++ unsigned int orig_len = len; ++ struct sk_buff *head = skb; ++ unsigned short fragidx; ++ int slen, ret; ++ ++do_frag_list: ++ ++ /* Deal with head data */ ++ while (offset < skb_headlen(skb) && len) { ++ struct kvec kv; ++ struct msghdr msg; ++ ++ slen = min_t(int, len, skb_headlen(skb) - offset); ++ kv.iov_base = skb->data + offset; ++ kv.iov_len = slen; ++ memset(&msg, 0, sizeof(msg)); ++ msg.msg_flags = MSG_DONTWAIT; ++ ++ ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, ++ sendmsg_unlocked, sk, &msg, &kv, 1, slen); ++ if (ret <= 0) ++ goto error; ++ ++ offset += ret; ++ len -= ret; ++ } ++ ++ /* All the data was skb head? 
*/ ++ if (!len) ++ goto out; ++ ++ /* Make offset relative to start of frags */ ++ offset -= skb_headlen(skb); ++ ++ /* Find where we are in frag list */ ++ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ if (offset < skb_frag_size(frag)) ++ break; ++ ++ offset -= skb_frag_size(frag); ++ } ++ ++ for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ slen = min_t(size_t, len, skb_frag_size(frag) - offset); ++ ++ while (slen) { ++ ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, ++ sendpage_unlocked, sk, ++ skb_frag_page(frag), ++ skb_frag_off(frag) + offset, ++ slen, MSG_DONTWAIT); ++ if (ret <= 0) ++ goto error; ++ ++ len -= ret; ++ offset += ret; ++ slen -= ret; ++ } ++ ++ offset = 0; ++ } ++ ++ if (len) { ++ /* Process any frag lists */ ++ ++ if (skb == head) { ++ if (skb_has_frag_list(skb)) { ++ skb = skb_shinfo(skb)->frag_list; ++ goto do_frag_list; ++ } ++ } else if (skb->next) { ++ skb = skb->next; ++ goto do_frag_list; ++ } ++ } ++ ++out: ++ return orig_len - len; ++ ++error: ++ return orig_len == len ? ret : orig_len - len; ++} ++ ++/* Send skb data on a socket. Socket must be locked. */ ++int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, ++ int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, ++ kernel_sendpage_locked); ++} ++EXPORT_SYMBOL_GPL(skb_send_sock_locked); ++ ++/* Send skb data on a socket. Socket must be unlocked. */ ++int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, ++ sendpage_unlocked); ++} ++ ++/** ++ * skb_store_bits - store bits from kernel buffer to skb ++ * @skb: destination buffer ++ * @offset: offset in destination ++ * @from: source buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source buffer to the ++ * destination skb. This function handles all the messy bits of ++ * traversing fragment lists and such. 
++ */ ++ ++int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_to_linear_data_offset(skb, offset, from, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(vaddr + p_off, from + copied, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_store_bits(frag_iter, offset - start, ++ from, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_store_bits); ++ ++/* Checksum skb data. */ ++__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, ++ __wsum csum, const struct skb_checksum_ops *ops) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ ++ /* Checksum header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, ++ skb->data + offset, copy, csum); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = INDIRECT_CALL_1(ops->update, ++ csum_partial_ext, ++ vaddr + p_off, p_len, 0); ++ kunmap_atomic(vaddr); ++ csum = INDIRECT_CALL_1(ops->combine, ++ csum_block_add_ext, csum, ++ csum2, pos, p_len); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ __wsum csum2; ++ if (copy > len) ++ copy = len; ++ csum2 = __skb_checksum(frag_iter, offset - start, ++ copy, 0, ops); ++ csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, ++ csum, csum2, pos, copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ ++ return csum; ++} ++EXPORT_SYMBOL(__skb_checksum); ++ ++__wsum skb_checksum(const struct sk_buff *skb, int offset, ++ int len, __wsum csum) ++{ ++ const struct skb_checksum_ops ops = { ++ .update = csum_partial_ext, ++ .combine = csum_block_add_ext, ++ }; ++ ++ return __skb_checksum(skb, offset, len, csum, &ops); ++} ++EXPORT_SYMBOL(skb_checksum); ++ ++/* Both of above in one bottle. */ ++ ++__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, ++ u8 *to, int len) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ __wsum csum = 0; ++ ++ /* Copy header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = csum_partial_copy_nocheck(skb->data + offset, to, ++ copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = csum_partial_copy_nocheck(vaddr + p_off, ++ to + copied, ++ p_len); ++ kunmap_atomic(vaddr); ++ csum = csum_block_add(csum, csum2, pos); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ __wsum csum2; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ csum2 = skb_copy_and_csum_bits(frag_iter, ++ offset - start, ++ to, copy); ++ csum = csum_block_add(csum, csum2, pos); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return csum; ++} ++EXPORT_SYMBOL(skb_copy_and_csum_bits); ++ ++__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) ++{ ++ __sum16 sum; ++ ++ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); ++ /* See comments in __skb_checksum_complete(). */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ if (!skb_shared(skb)) ++ skb->csum_valid = !sum; ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete_head); ++ ++/* This function assumes skb->csum already holds pseudo header's checksum, ++ * which has been changed from the hardware checksum, for example, by ++ * __skb_checksum_validate_complete(). And, the original skb->csum must ++ * have been validated unsuccessfully for CHECKSUM_COMPLETE case. ++ * ++ * It returns non-zero if the recomputed checksum is still invalid, otherwise ++ * zero. The new checksum is stored back into skb->csum unless the skb is ++ * shared. ++ */ ++__sum16 __skb_checksum_complete(struct sk_buff *skb) ++{ ++ __wsum csum; ++ __sum16 sum; ++ ++ csum = skb_checksum(skb, 0, skb->len, 0); ++ ++ sum = csum_fold(csum_add(skb->csum, csum)); ++ /* This check is inverted, because we already knew the hardware ++ * checksum is invalid before calling this function. So, if the ++ * re-computed checksum is valid instead, then we have a mismatch ++ * between the original skb->csum and skb_checksum(). This means either ++ * the original hardware checksum is incorrect or we screw up skb->csum ++ * when moving skb->data around. 
++ */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ ++ if (!skb_shared(skb)) { ++ /* Save full packet checksum */ ++ skb->csum = csum; ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum_complete_sw = 1; ++ skb->csum_valid = !sum; ++ } ++ ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete); ++ ++static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, ++ int offset, int len) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static const struct skb_checksum_ops default_crc32c_ops = { ++ .update = warn_crc32c_csum_update, ++ .combine = warn_crc32c_csum_combine, ++}; ++ ++const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = ++ &default_crc32c_ops; ++EXPORT_SYMBOL(crc32c_csum_stub); ++ ++ /** ++ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() ++ * @from: source buffer ++ * ++ * Calculates the amount of linear headroom needed in the 'to' skb passed ++ * into skb_zerocopy(). ++ */ ++unsigned int ++skb_zerocopy_headlen(const struct sk_buff *from) ++{ ++ unsigned int hlen = 0; ++ ++ if (!from->head_frag || ++ skb_headlen(from) < L1_CACHE_BYTES || ++ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { ++ hlen = skb_headlen(from); ++ if (!hlen) ++ hlen = from->len; ++ } ++ ++ if (skb_has_frag_list(from)) ++ hlen = from->len; ++ ++ return hlen; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); ++ ++/** ++ * skb_zerocopy - Zero copy skb to skb ++ * @to: destination buffer ++ * @from: source buffer ++ * @len: number of bytes to copy from source buffer ++ * @hlen: size of linear headroom in destination buffer ++ * ++ * Copies up to `len` bytes from `from` to `to` by creating references ++ * to the frags in the source buffer. ++ * ++ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the ++ * headroom in the `to` buffer. 
++ * ++ * Return value: ++ * 0: everything is OK ++ * -ENOMEM: couldn't orphan frags of @from due to lack of memory ++ * -EFAULT: skb_copy_bits() found some problem with skb geometry ++ */ ++int ++skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) ++{ ++ int i, j = 0; ++ int plen = 0; /* length of skb->head fragment */ ++ int ret; ++ struct page *page; ++ unsigned int offset; ++ ++ BUG_ON(!from->head_frag && !hlen); ++ ++ /* dont bother with small payloads */ ++ if (len <= skb_tailroom(to)) ++ return skb_copy_bits(from, 0, skb_put(to, len), len); ++ ++ if (hlen) { ++ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); ++ if (unlikely(ret)) ++ return ret; ++ len -= hlen; ++ } else { ++ plen = min_t(int, skb_headlen(from), len); ++ if (plen) { ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ __skb_fill_page_desc(to, 0, page, offset, plen); ++ get_page(page); ++ j = 1; ++ len -= plen; ++ } ++ } ++ ++ skb_len_add(to, len + plen); ++ ++ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { ++ skb_tx_error(from); ++ return -ENOMEM; ++ } ++ skb_zerocopy_clone(to, from, GFP_ATOMIC); ++ ++ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { ++ int size; ++ ++ if (!len) ++ break; ++ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; ++ size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), ++ len); ++ skb_frag_size_set(&skb_shinfo(to)->frags[j], size); ++ len -= size; ++ skb_frag_ref(to, j); ++ j++; ++ } ++ skb_shinfo(to)->nr_frags = j; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy); ++ ++void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) ++{ ++ __wsum csum; ++ long csstart; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ csstart = skb_checksum_start_offset(skb); ++ else ++ csstart = skb_headlen(skb); ++ ++ BUG_ON(csstart > skb_headlen(skb)); ++ ++ skb_copy_from_linear_data(skb, to, csstart); ++ ++ csum = 0; ++ if (csstart != skb->len) ++ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, ++ skb->len - csstart); ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ long csstuff = csstart + skb->csum_offset; ++ ++ *((__sum16 *)(to + csstuff)) = csum_fold(csum); ++ } ++} ++EXPORT_SYMBOL(skb_copy_and_csum_dev); ++ ++/** ++ * skb_dequeue - remove from the head of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the head of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The head item is ++ * returned or %NULL if the list is empty. ++ */ ++ ++struct sk_buff *skb_dequeue(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue); ++ ++/** ++ * skb_dequeue_tail - remove from the tail of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the tail of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The tail item is ++ * returned or %NULL if the list is empty. 
++ */ ++struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue_tail(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue_tail); ++ ++/** ++ * skb_queue_purge - empty a list ++ * @list: list to empty ++ * ++ * Delete all buffers on an &sk_buff list. Each buffer is removed from ++ * the list and one reference dropped. This function takes the list ++ * lock and is atomic with respect to other list locking functions. ++ */ ++void skb_queue_purge(struct sk_buff_head *list) ++{ ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(list)) != NULL) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL(skb_queue_purge); ++ ++/** ++ * skb_rbtree_purge - empty a skb rbtree ++ * @root: root of the rbtree to empty ++ * Return value: the sum of truesizes of all purged skbs. ++ * ++ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from ++ * the list and one reference dropped. This function does not take ++ * any lock. Synchronization should be handled by the caller (e.g., TCP ++ * out-of-order queue is protected by the socket lock). ++ */ ++unsigned int skb_rbtree_purge(struct rb_root *root) ++{ ++ struct rb_node *p = rb_first(root); ++ unsigned int sum = 0; ++ ++ while (p) { ++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); ++ ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, root); ++ sum += skb->truesize; ++ kfree_skb(skb); ++ } ++ return sum; ++} ++ ++/** ++ * skb_queue_head - queue a buffer at the list head ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the start of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_head(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_head); ++ ++/** ++ * skb_queue_tail - queue a buffer at the list tail ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the tail of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_tail(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_tail); ++ ++/** ++ * skb_unlink - remove a buffer from a list ++ * @skb: buffer to remove ++ * @list: list to use ++ * ++ * Remove a packet from a list. The list locks are taken and this ++ * function is atomic with respect to other list locked calls ++ * ++ * You must know what list the SKB is on. ++ */ ++void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_unlink(skb, list); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_unlink); ++ ++/** ++ * skb_append - append a buffer ++ * @old: buffer to insert after ++ * @newsk: buffer to insert ++ * @list: list to use ++ * ++ * Place a packet after a given packet in a list. 
The list locks are taken ++ * and this function is atomic with respect to other list locked calls. ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_after(list, old, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_append); ++ ++static inline void skb_split_inside_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, const int pos) ++{ ++ int i; ++ ++ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), ++ pos - len); ++ /* And move data appendix as is. */ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; ++ ++ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->data_len = skb->data_len; ++ skb1->len += skb1->data_len; ++ skb->data_len = 0; ++ skb->len = len; ++ skb_set_tail_pointer(skb, len); ++} ++ ++static inline void skb_split_no_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, int pos) ++{ ++ int i, k = 0; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->len = skb1->data_len = skb->len - len; ++ skb->len = len; ++ skb->data_len = len - pos; ++ ++ for (i = 0; i < nfrags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + size > len) { ++ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < len) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_ref(skb, i); ++ skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); ++ skb_shinfo(skb)->nr_frags++; ++ } ++ k++; ++ } else ++ skb_shinfo(skb)->nr_frags++; ++ pos += size; ++ } ++ skb_shinfo(skb1)->nr_frags = k; ++} ++ ++/** ++ * skb_split - Split fragmented skb to two parts at length len. ++ * @skb: the buffer to split ++ * @skb1: the buffer to receive the second part ++ * @len: new length for skb ++ */ ++void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) ++{ ++ int pos = skb_headlen(skb); ++ const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; ++ skb_zerocopy_clone(skb1, skb, 0); ++ if (len < pos) /* Split line is inside header. */ ++ skb_split_inside_header(skb, skb1, len, pos); ++ else /* Second chunk has no header, nothing to copy. */ ++ skb_split_no_header(skb, skb1, len, pos); ++} ++EXPORT_SYMBOL(skb_split); ++ ++/* Shifting from/to a cloned skb is a no-go. ++ * ++ * Caller cannot keep skb_shinfo related pointers past calling here! 
++ */ ++static int skb_prepare_for_shift(struct sk_buff *skb) ++{ ++ return skb_unclone_keeptruesize(skb, GFP_ATOMIC); ++} ++ ++/** ++ * skb_shift - Shifts paged data partially from skb to another ++ * @tgt: buffer into which tail data gets added ++ * @skb: buffer from which the paged data comes from ++ * @shiftlen: shift up to this many bytes ++ * ++ * Attempts to shift up to shiftlen worth of bytes, which may be less than ++ * the length of the skb, from skb to tgt. Returns number bytes shifted. ++ * It's up to caller to free skb if everything was shifted. ++ * ++ * If @tgt runs out of frags, the whole operation is aborted. ++ * ++ * Skb cannot include anything else but paged data while tgt is allowed ++ * to have non-paged data as well. ++ * ++ * TODO: full sized shift could be optimized but that would need ++ * specialized skb free'er to handle frags without up-to-date nr_frags. ++ */ ++int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) ++{ ++ int from, to, merge, todo; ++ skb_frag_t *fragfrom, *fragto; ++ ++ BUG_ON(shiftlen > skb->len); ++ ++ if (skb_headlen(skb)) ++ return 0; ++ if (skb_zcopy(tgt) || skb_zcopy(skb)) ++ return 0; ++ ++ todo = shiftlen; ++ from = 0; ++ to = skb_shinfo(tgt)->nr_frags; ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ ++ /* Actual merge is delayed until the point when we know we can ++ * commit all, so that we don't have to undo partial changes ++ */ ++ if (!to || ++ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), ++ skb_frag_off(fragfrom))) { ++ merge = -1; ++ } else { ++ merge = to - 1; ++ ++ todo -= skb_frag_size(fragfrom); ++ if (todo < 0) { ++ if (skb_prepare_for_shift(skb) || ++ skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ /* All previous frag pointers might be stale! */ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, shiftlen); ++ skb_frag_size_sub(fragfrom, shiftlen); ++ skb_frag_off_add(fragfrom, shiftlen); ++ ++ goto onlymerged; ++ } ++ ++ from++; ++ } ++ ++ /* Skip full, not-fitting skb to avoid expensive operations */ ++ if ((shiftlen == skb->len) && ++ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) ++ return 0; ++ ++ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { ++ if (to == MAX_SKB_FRAGS) ++ return 0; ++ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[to]; ++ ++ if (todo >= skb_frag_size(fragfrom)) { ++ *fragto = *fragfrom; ++ todo -= skb_frag_size(fragfrom); ++ from++; ++ to++; ++ ++ } else { ++ __skb_frag_ref(fragfrom); ++ skb_frag_page_copy(fragto, fragfrom); ++ skb_frag_off_copy(fragto, fragfrom); ++ skb_frag_size_set(fragto, todo); ++ ++ skb_frag_off_add(fragfrom, todo); ++ skb_frag_size_sub(fragfrom, todo); ++ todo = 0; ++ ++ to++; ++ break; ++ } ++ } ++ ++ /* Ready to "commit" this state change to tgt */ ++ skb_shinfo(tgt)->nr_frags = to; ++ ++ if (merge >= 0) { ++ fragfrom = &skb_shinfo(skb)->frags[0]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, skb_frag_size(fragfrom)); ++ __skb_frag_unref(fragfrom, skb->pp_recycle); ++ } ++ ++ /* Reposition in the original skb */ ++ to = 0; ++ while (from < skb_shinfo(skb)->nr_frags) ++ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; ++ skb_shinfo(skb)->nr_frags = to; ++ ++ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); ++ ++onlymerged: ++ /* Most likely the tgt won't ever need its checksum anymore, skb on ++ * the 
other hand might need it if it needs to be resent ++ */ ++ tgt->ip_summed = CHECKSUM_PARTIAL; ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ skb_len_add(skb, -shiftlen); ++ skb_len_add(tgt, shiftlen); ++ ++ return shiftlen; ++} ++ ++/** ++ * skb_prepare_seq_read - Prepare a sequential read of skb data ++ * @skb: the buffer to read ++ * @from: lower offset of data to be read ++ * @to: upper offset of data to be read ++ * @st: state variable ++ * ++ * Initializes the specified state variable. Must be called before ++ * invoking skb_seq_read() for the first time. ++ */ ++void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct skb_seq_state *st) ++{ ++ st->lower_offset = from; ++ st->upper_offset = to; ++ st->root_skb = st->cur_skb = skb; ++ st->frag_idx = st->stepped_offset = 0; ++ st->frag_data = NULL; ++ st->frag_off = 0; ++} ++EXPORT_SYMBOL(skb_prepare_seq_read); ++ ++/** ++ * skb_seq_read - Sequentially read skb data ++ * @consumed: number of bytes consumed by the caller so far ++ * @data: destination pointer for data to be returned ++ * @st: state variable ++ * ++ * Reads a block of skb data at @consumed relative to the ++ * lower offset specified to skb_prepare_seq_read(). Assigns ++ * the head of the data block to @data and returns the length ++ * of the block or 0 if the end of the skb data or the upper ++ * offset has been reached. ++ * ++ * The caller is not required to consume all of the data ++ * returned, i.e. @consumed is typically set to the number ++ * of bytes already consumed and the next call to ++ * skb_seq_read() will return the remaining part of the block. ++ * ++ * Note 1: The size of each block of data returned can be arbitrary, ++ * this limitation is the cost for zerocopy sequential ++ * reads of potentially non linear data. ++ * ++ * Note 2: Fragment lists within fragments are not implemented ++ * at the moment, state->root_skb could be replaced with ++ * a stack for this purpose. 
++ */ ++unsigned int skb_seq_read(unsigned int consumed, const u8 **data, ++ struct skb_seq_state *st) ++{ ++ unsigned int block_limit, abs_offset = consumed + st->lower_offset; ++ skb_frag_t *frag; ++ ++ if (unlikely(abs_offset >= st->upper_offset)) { ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ return 0; ++ } ++ ++next_skb: ++ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; ++ ++ if (abs_offset < block_limit && !st->frag_data) { ++ *data = st->cur_skb->data + (abs_offset - st->stepped_offset); ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_idx == 0 && !st->frag_data) ++ st->stepped_offset += skb_headlen(st->cur_skb); ++ ++ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { ++ unsigned int pg_idx, pg_off, pg_sz; ++ ++ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; ++ ++ pg_idx = 0; ++ pg_off = skb_frag_off(frag); ++ pg_sz = skb_frag_size(frag); ++ ++ if (skb_frag_must_loop(skb_frag_page(frag))) { ++ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; ++ pg_off = offset_in_page(pg_off + st->frag_off); ++ pg_sz = min_t(unsigned int, pg_sz - st->frag_off, ++ PAGE_SIZE - pg_off); ++ } ++ ++ block_limit = pg_sz + st->stepped_offset; ++ if (abs_offset < block_limit) { ++ if (!st->frag_data) ++ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); ++ ++ *data = (u8 *)st->frag_data + pg_off + ++ (abs_offset - st->stepped_offset); ++ ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ st->stepped_offset += pg_sz; ++ st->frag_off += pg_sz; ++ if (st->frag_off == skb_frag_size(frag)) { ++ st->frag_off = 0; ++ st->frag_idx++; ++ } ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { ++ st->cur_skb = skb_shinfo(st->root_skb)->frag_list; ++ st->frag_idx = 0; ++ goto next_skb; ++ } else if (st->cur_skb->next) { ++ st->cur_skb = st->cur_skb->next; ++ st->frag_idx = 0; ++ goto next_skb; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_seq_read); ++ ++/** ++ * skb_abort_seq_read - Abort a sequential read of skb data ++ * @st: state variable ++ * ++ * Must be called if skb_seq_read() was not called until it ++ * returned 0. ++ */ ++void skb_abort_seq_read(struct skb_seq_state *st) ++{ ++ if (st->frag_data) ++ kunmap_atomic(st->frag_data); ++} ++EXPORT_SYMBOL(skb_abort_seq_read); ++ ++#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) ++ ++static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, ++ struct ts_config *conf, ++ struct ts_state *state) ++{ ++ return skb_seq_read(offset, text, TS_SKB_CB(state)); ++} ++ ++static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) ++{ ++ skb_abort_seq_read(TS_SKB_CB(state)); ++} ++ ++/** ++ * skb_find_text - Find a text pattern in skb data ++ * @skb: the buffer to look in ++ * @from: search offset ++ * @to: search limit ++ * @config: textsearch configuration ++ * ++ * Finds a pattern in the skb data according to the specified ++ * textsearch configuration. Use textsearch_next() to retrieve ++ * subsequent occurrences of the pattern. Returns the offset ++ * to the first occurrence or UINT_MAX if no match was found. 
++ */ ++unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct ts_config *config) ++{ ++ struct ts_state state; ++ unsigned int ret; ++ ++ BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); ++ ++ config->get_next_block = skb_ts_get_next_block; ++ config->finish = skb_ts_finish; ++ ++ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ++ ++ ret = textsearch_find(config, &state); ++ return (ret <= to - from ? ret : UINT_MAX); ++} ++EXPORT_SYMBOL(skb_find_text); ++ ++int skb_append_pagefrags(struct sk_buff *skb, struct page *page, ++ int offset, size_t size) ++{ ++ int i = skb_shinfo(skb)->nr_frags; ++ ++ if (skb_can_coalesce(skb, i, page, offset)) { ++ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); ++ } else if (i < MAX_SKB_FRAGS) { ++ skb_zcopy_downgrade_managed(skb); ++ get_page(page); ++ skb_fill_page_desc_noacc(skb, i, page, offset, size); ++ } else { ++ return -EMSGSIZE; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_append_pagefrags); ++ ++/** ++ * skb_pull_rcsum - pull skb and update receive checksum ++ * @skb: buffer to update ++ * @len: length of data pulled ++ * ++ * This function performs an skb_pull on the packet and updates ++ * the CHECKSUM_COMPLETE checksum. It should be used on ++ * receive path processing instead of skb_pull unless you know ++ * that the checksum difference is zero (e.g., a valid IP header) ++ * or you are setting ip_summed to CHECKSUM_NONE. ++ */ ++void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) ++{ ++ unsigned char *data = skb->data; ++ ++ BUG_ON(len > skb->len); ++ __skb_pull(skb, len); ++ skb_postpull_rcsum(skb, data, len); ++ return skb->data; ++} ++EXPORT_SYMBOL_GPL(skb_pull_rcsum); ++ ++static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) ++{ ++ skb_frag_t head_frag; ++ struct page *page; ++ ++ page = virt_to_head_page(frag_skb->head); ++ __skb_frag_set_page(&head_frag, page); ++ skb_frag_off_set(&head_frag, frag_skb->data - ++ (unsigned char *)page_address(page)); ++ skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); ++ return head_frag; ++} ++ ++struct sk_buff *skb_segment_list(struct sk_buff *skb, ++ netdev_features_t features, ++ unsigned int offset) ++{ ++ struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; ++ unsigned int tnl_hlen = skb_tnl_header_len(skb); ++ unsigned int delta_truesize = 0; ++ unsigned int delta_len = 0; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *nskb, *tmp; ++ int len_diff, err; ++ ++ skb_push(skb, -skb_network_offset(skb) + offset); ++ ++ skb_shinfo(skb)->frag_list = NULL; ++ ++ do { ++ nskb = list_skb; ++ list_skb = list_skb->next; ++ ++ err = 0; ++ delta_truesize += nskb->truesize; ++ if (skb_shared(nskb)) { ++ tmp = skb_clone(nskb, GFP_ATOMIC); ++ if (tmp) { ++ consume_skb(nskb); ++ nskb = tmp; ++ err = skb_unclone(nskb, GFP_ATOMIC); ++ } else { ++ err = -ENOMEM; ++ } ++ } ++ ++ if (!tail) ++ skb->next = nskb; ++ else ++ tail->next = nskb; ++ ++ if (unlikely(err)) { ++ nskb->next = list_skb; ++ goto err_linearize; ++ } ++ ++ tail = nskb; ++ ++ delta_len += nskb->len; ++ ++ skb_push(nskb, -skb_network_offset(nskb) + offset); ++ ++ skb_release_head_state(nskb); ++ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); ++ __copy_skb_header(nskb, skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); ++ nskb->transport_header += len_diff; ++ skb_copy_from_linear_data_offset(skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ offset + tnl_hlen); ++ ++ if 
(skb_needs_linearize(nskb, features) && ++ __skb_linearize(nskb)) ++ goto err_linearize; ++ ++ } while (list_skb); ++ ++ skb->truesize = skb->truesize - delta_truesize; ++ skb->data_len = skb->data_len - delta_len; ++ skb->len = skb->len - delta_len; ++ ++ skb_gso_reset(skb); ++ ++ skb->prev = tail; ++ ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto err_linearize; ++ ++ skb_get(skb); ++ ++ return skb; ++ ++err_linearize: ++ kfree_skb_list(skb->next); ++ skb->next = NULL; ++ return ERR_PTR(-ENOMEM); ++} ++EXPORT_SYMBOL_GPL(skb_segment_list); ++ ++/** ++ * skb_segment - Perform protocol segmentation on skb. ++ * @head_skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * ++ * This function performs segmentation on the given skb. It returns ++ * a pointer to the first in a list of new skbs for the segments. ++ * In case of error it returns ERR_PTR(err). ++ */ ++struct sk_buff *skb_segment(struct sk_buff *head_skb, ++ netdev_features_t features) ++{ ++ struct sk_buff *segs = NULL; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; ++ skb_frag_t *frag = skb_shinfo(head_skb)->frags; ++ unsigned int mss = skb_shinfo(head_skb)->gso_size; ++ unsigned int doffset = head_skb->data - skb_mac_header(head_skb); ++ struct sk_buff *frag_skb = head_skb; ++ unsigned int offset = doffset; ++ unsigned int tnl_hlen = skb_tnl_header_len(head_skb); ++ unsigned int partial_segs = 0; ++ unsigned int headroom; ++ unsigned int len = head_skb->len; ++ __be16 proto; ++ bool csum, sg; ++ int nfrags = skb_shinfo(head_skb)->nr_frags; ++ int err = -ENOMEM; ++ int i = 0; ++ int pos; ++ ++ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && ++ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { ++ struct sk_buff *check_skb; ++ ++ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { ++ if (skb_headlen(check_skb) && !check_skb->head_frag) { ++ /* gso_size is untrusted, and we have a frag_list with ++ * a linear non head_frag item. ++ * ++ * If head_skb's headlen does not fit requested gso_size, ++ * it means that the frag_list members do NOT terminate ++ * on exact gso_size boundaries. Hence we cannot perform ++ * skb_frag_t page sharing. Therefore we must fallback to ++ * copying the frag_list skbs; we do so by disabling SG. ++ */ ++ features &= ~NETIF_F_SG; ++ break; ++ } ++ } ++ } ++ ++ __skb_push(head_skb, doffset); ++ proto = skb_network_protocol(head_skb, NULL); ++ if (unlikely(!proto)) ++ return ERR_PTR(-EINVAL); ++ ++ sg = !!(features & NETIF_F_SG); ++ csum = !!can_checksum_protocol(features, proto); ++ ++ if (sg && csum && (mss != GSO_BY_FRAGS)) { ++ if (!(features & NETIF_F_GSO_PARTIAL)) { ++ struct sk_buff *iter; ++ unsigned int frag_len; ++ ++ if (!list_skb || ++ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) ++ goto normal; ++ ++ /* If we get here then all the required ++ * GSO features except frag_list are supported. ++ * Try to split the SKB to multiple GSO SKBs ++ * with no frag_list. ++ * Currently we can do that only when the buffers don't ++ * have a linear part and all the buffers except ++ * the last are of the same length. 
++ */ ++ frag_len = list_skb->len; ++ skb_walk_frags(head_skb, iter) { ++ if (frag_len != iter->len && iter->next) ++ goto normal; ++ if (skb_headlen(iter) && !iter->head_frag) ++ goto normal; ++ ++ len -= iter->len; ++ } ++ ++ if (len != frag_len) ++ goto normal; ++ } ++ ++ /* GSO partial only requires that we trim off any excess that ++ * doesn't fit into an MSS sized block, so take care of that ++ * now. ++ */ ++ partial_segs = len / mss; ++ if (partial_segs > 1) ++ mss *= partial_segs; ++ else ++ partial_segs = 0; ++ } ++ ++normal: ++ headroom = skb_headroom(head_skb); ++ pos = skb_headlen(head_skb); ++ ++ do { ++ struct sk_buff *nskb; ++ skb_frag_t *nskb_frag; ++ int hsize; ++ int size; ++ ++ if (unlikely(mss == GSO_BY_FRAGS)) { ++ len = list_skb->len; ++ } else { ++ len = head_skb->len - offset; ++ if (len > mss) ++ len = mss; ++ } ++ ++ hsize = skb_headlen(head_skb) - offset; ++ ++ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && ++ (skb_headlen(list_skb) == len || sg)) { ++ BUG_ON(skb_headlen(list_skb) > len); ++ ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ pos += skb_headlen(list_skb); ++ ++ while (pos < offset + len) { ++ BUG_ON(i >= nfrags); ++ ++ size = skb_frag_size(frag); ++ if (pos + size > offset + len) ++ break; ++ ++ i++; ++ pos += size; ++ frag++; ++ } ++ ++ nskb = skb_clone(list_skb, GFP_ATOMIC); ++ list_skb = list_skb->next; ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ if (unlikely(pskb_trim(nskb, len))) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ hsize = skb_end_offset(nskb); ++ if (skb_cow_head(nskb, doffset + headroom)) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ nskb->truesize += skb_end_offset(nskb) - hsize; ++ skb_release_head_state(nskb); ++ __skb_push(nskb, doffset); ++ } else { ++ if (hsize < 0) ++ hsize = 0; ++ if (hsize > len || !sg) ++ hsize = len; ++ ++ nskb = __alloc_skb(hsize + doffset + headroom, ++ GFP_ATOMIC, skb_alloc_rx_flag(head_skb), ++ NUMA_NO_NODE); ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, headroom); ++ __skb_put(nskb, doffset); ++ } ++ ++ if (segs) ++ tail->next = nskb; ++ else ++ segs = nskb; ++ tail = nskb; ++ ++ __copy_skb_header(nskb, head_skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); ++ skb_reset_mac_len(nskb); ++ ++ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ doffset + tnl_hlen); ++ ++ if (nskb->len == len + doffset) ++ goto perform_csum_check; ++ ++ if (!sg) { ++ if (!csum) { ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_copy_and_csum_bits(head_skb, offset, ++ skb_put(nskb, ++ len), ++ len); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } else { ++ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) ++ goto err; ++ } ++ continue; ++ } ++ ++ nskb_frag = skb_shinfo(nskb)->frags; ++ ++ skb_copy_from_linear_data_offset(head_skb, offset, ++ skb_put(nskb, hsize), hsize); ++ ++ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & ++ SKBFL_SHARED_FRAG; ++ ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) ++ goto err; ++ ++ while (pos < offset + len) { ++ if (i >= nfrags) { ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ if (!skb_headlen(list_skb)) { ++ BUG_ON(!nfrags); ++ } else { ++ BUG_ON(!list_skb->head_frag); ++ ++ /* to make room for head_frag. 
*/ ++ i--; ++ frag--; ++ } ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, ++ GFP_ATOMIC)) ++ goto err; ++ ++ list_skb = list_skb->next; ++ } ++ ++ if (unlikely(skb_shinfo(nskb)->nr_frags >= ++ MAX_SKB_FRAGS)) { ++ net_warn_ratelimited( ++ "skb_segment: too many frags: %u %u\n", ++ pos, mss); ++ err = -EINVAL; ++ goto err; ++ } ++ ++ *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; ++ __skb_frag_ref(nskb_frag); ++ size = skb_frag_size(nskb_frag); ++ ++ if (pos < offset) { ++ skb_frag_off_add(nskb_frag, offset - pos); ++ skb_frag_size_sub(nskb_frag, offset - pos); ++ } ++ ++ skb_shinfo(nskb)->nr_frags++; ++ ++ if (pos + size <= offset + len) { ++ i++; ++ frag++; ++ pos += size; ++ } else { ++ skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); ++ goto skip_fraglist; ++ } ++ ++ nskb_frag++; ++ } ++ ++skip_fraglist: ++ nskb->data_len = len - hsize; ++ nskb->len += nskb->data_len; ++ nskb->truesize += nskb->data_len; ++ ++perform_csum_check: ++ if (!csum) { ++ if (skb_has_shared_frag(nskb) && ++ __skb_linearize(nskb)) ++ goto err; ++ ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_checksum(nskb, doffset, ++ nskb->len - doffset, 0); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } ++ } while ((offset += len) < head_skb->len); ++ ++ /* Some callers want to get the end of the list. ++ * Put it in segs->prev to avoid walking the list. ++ * (see validate_xmit_skb_list() for example) ++ */ ++ segs->prev = tail; ++ ++ if (partial_segs) { ++ struct sk_buff *iter; ++ int type = skb_shinfo(head_skb)->gso_type; ++ unsigned short gso_size = skb_shinfo(head_skb)->gso_size; ++ ++ /* Update type to add partial and then remove dodgy if set */ ++ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; ++ type &= ~SKB_GSO_DODGY; ++ ++ /* Update GSO info and prepare to start updating headers on ++ * our way back down the stack of protocols. ++ */ ++ for (iter = segs; iter; iter = iter->next) { ++ skb_shinfo(iter)->gso_size = gso_size; ++ skb_shinfo(iter)->gso_segs = partial_segs; ++ skb_shinfo(iter)->gso_type = type; ++ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; ++ } ++ ++ if (tail->len - doffset <= gso_size) ++ skb_shinfo(tail)->gso_size = 0; ++ else if (tail != segs) ++ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); ++ } ++ ++ /* Following permits correct backpressure, for protocols ++ * using skb_set_owner_w(). ++ * Idea is to tranfert ownership from head_skb to last segment. 
++ */ ++ if (head_skb->destructor == sock_wfree) { ++ swap(tail->truesize, head_skb->truesize); ++ swap(tail->destructor, head_skb->destructor); ++ swap(tail->sk, head_skb->sk); ++ } ++ return segs; ++ ++err: ++ kfree_skb_list(segs); ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(skb_segment); ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++#define SKB_EXT_ALIGN_VALUE 8 ++#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) ++ ++static const u8 skb_ext_type_len[] = { ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), ++#endif ++#ifdef CONFIG_XFRM ++ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), ++#endif ++}; ++ ++static __always_inline unsigned int skb_ext_total_length(void) ++{ ++ return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ skb_ext_type_len[SKB_EXT_BRIDGE_NF] + ++#endif ++#ifdef CONFIG_XFRM ++ skb_ext_type_len[SKB_EXT_SEC_PATH] + ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ skb_ext_type_len[TC_SKB_EXT] + ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ skb_ext_type_len[SKB_EXT_MPTCP] + ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ skb_ext_type_len[SKB_EXT_MCTP] + ++#endif ++ 0; ++} ++ ++static void skb_extensions_init(void) ++{ ++ BUILD_BUG_ON(SKB_EXT_NUM >= 8); ++ BUILD_BUG_ON(skb_ext_total_length() > 255); ++ ++ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", ++ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++} ++#else ++static void skb_extensions_init(void) {} ++#endif ++ ++void __init skb_init(void) ++{ ++ skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", ++ sizeof(struct sk_buff), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ offsetof(struct sk_buff, cb), ++ sizeof_field(struct sk_buff, cb), ++ NULL); ++ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", ++ sizeof(struct sk_buff_fclones), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++ skb_extensions_init(); ++} ++ ++static int ++__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, ++ unsigned int recursion_level) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int elt = 0; ++ ++ if (unlikely(recursion_level >= 24)) ++ return -EMSGSIZE; ++ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ sg_set_buf(sg, skb->data + offset, copy); ++ elt++; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ sg_set_page(&sg[elt], skb_frag_page(frag), copy, ++ skb_frag_off(frag) + offset - start); ++ elt++; ++ if (!(len -= copy)) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end, ret; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) 
{ ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, ++ copy, recursion_level + 1); ++ if (unlikely(ret < 0)) ++ return ret; ++ elt += ret; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return elt; ++} ++ ++/** ++ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer ++ * @skb: Socket buffer containing the buffers to be mapped ++ * @sg: The scatter-gather list to map into ++ * @offset: The offset into the buffer's contents to start mapping ++ * @len: Length of buffer space to be mapped ++ * ++ * Fill the specified scatter-gather list with mappings/pointers into a ++ * region of the buffer space attached to a socket buffer. Returns either ++ * the number of scatterlist items used, or -EMSGSIZE if the contents ++ * could not fit. ++ */ ++int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) ++{ ++ int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); ++ ++ if (nsg <= 0) ++ return nsg; ++ ++ sg_mark_end(&sg[nsg - 1]); ++ ++ return nsg; ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec); ++ ++/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given ++ * sglist without mark the sg which contain last skb data as the end. ++ * So the caller can mannipulate sg list as will when padding new data after ++ * the first call without calling sg_unmark_end to expend sg list. ++ * ++ * Scenario to use skb_to_sgvec_nomark: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec_nomark(payload1) ++ * 3. skb_to_sgvec_nomark(payload2) ++ * ++ * This is equivalent to: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec(payload1) ++ * 3. sg_unmark_end ++ * 4. skb_to_sgvec(payload2) ++ * ++ * When mapping mutilple payload conditionally, skb_to_sgvec_nomark ++ * is more preferable. ++ */ ++int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, ++ int offset, int len) ++{ ++ return __skb_to_sgvec(skb, sg, offset, len, 0); ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); ++ ++ ++ ++/** ++ * skb_cow_data - Check that a socket buffer's data buffers are writable ++ * @skb: The socket buffer to check. ++ * @tailbits: Amount of trailing space to be added ++ * @trailer: Returned pointer to the skb where the @tailbits space begins ++ * ++ * Make sure that the data buffers attached to a socket buffer are ++ * writable. If they are not, private copies are made of the data buffers ++ * and the socket buffer is set to use these instead. ++ * ++ * If @tailbits is given, make sure that there is space to write @tailbits ++ * bytes of data beyond current end of socket buffer. @trailer will be ++ * set to point to the skb in which this space begins. ++ * ++ * The number of scatterlist elements required to completely map the ++ * COW'd and extended socket buffer will be returned. ++ */ ++int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) ++{ ++ int copyflag; ++ int elt; ++ struct sk_buff *skb1, **skb_p; ++ ++ /* If skb is cloned or its head is paged, reallocate ++ * head pulling out all the pages (pages are considered not writable ++ * at the moment even if they are anonymous). ++ */ ++ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && ++ !__pskb_pull_tail(skb, __skb_pagelen(skb))) ++ return -ENOMEM; ++ ++ /* Easy case. Most of packets will go this way. */ ++ if (!skb_has_frag_list(skb)) { ++ /* A little of trouble, not enough of space for trailer. 
++ * This should not happen, when stack is tuned to generate ++ * good frames. OK, on miss we reallocate and reserve even more ++ * space, 128 bytes is fair. */ ++ ++ if (skb_tailroom(skb) < tailbits && ++ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) ++ return -ENOMEM; ++ ++ /* Voila! */ ++ *trailer = skb; ++ return 1; ++ } ++ ++ /* Misery. We are in troubles, going to mincer fragments... */ ++ ++ elt = 1; ++ skb_p = &skb_shinfo(skb)->frag_list; ++ copyflag = 0; ++ ++ while ((skb1 = *skb_p) != NULL) { ++ int ntail = 0; ++ ++ /* The fragment is partially pulled by someone, ++ * this can happen on input. Copy it and everything ++ * after it. */ ++ ++ if (skb_shared(skb1)) ++ copyflag = 1; ++ ++ /* If the skb is the last, worry about trailer. */ ++ ++ if (skb1->next == NULL && tailbits) { ++ if (skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1) || ++ skb_tailroom(skb1) < tailbits) ++ ntail = tailbits + 128; ++ } ++ ++ if (copyflag || ++ skb_cloned(skb1) || ++ ntail || ++ skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1)) { ++ struct sk_buff *skb2; ++ ++ /* Fuck, we are miserable poor guys... */ ++ if (ntail == 0) ++ skb2 = skb_copy(skb1, GFP_ATOMIC); ++ else ++ skb2 = skb_copy_expand(skb1, ++ skb_headroom(skb1), ++ ntail, ++ GFP_ATOMIC); ++ if (unlikely(skb2 == NULL)) ++ return -ENOMEM; ++ ++ if (skb1->sk) ++ skb_set_owner_w(skb2, skb1->sk); ++ ++ /* Looking around. Are we still alive? ++ * OK, link new skb, drop old one */ ++ ++ skb2->next = skb1->next; ++ *skb_p = skb2; ++ kfree_skb(skb1); ++ skb1 = skb2; ++ } ++ elt++; ++ *trailer = skb1; ++ skb_p = &skb1->next; ++ } ++ ++ return elt; ++} ++EXPORT_SYMBOL_GPL(skb_cow_data); ++ ++static void sock_rmem_free(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ ++ atomic_sub(skb->truesize, &sk->sk_rmem_alloc); ++} ++ ++static void skb_set_err_queue(struct sk_buff *skb) ++{ ++ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. ++ * So, it is safe to (mis)use it to mark skbs on the error queue. 
++ */ ++ skb->pkt_type = PACKET_OUTGOING; ++ BUILD_BUG_ON(PACKET_OUTGOING == 0); ++} ++ ++/* ++ * Note: We dont mem charge error packets (no sk_forward_alloc changes) ++ */ ++int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= ++ (unsigned int)READ_ONCE(sk->sk_rcvbuf)) ++ return -ENOMEM; ++ ++ skb_orphan(skb); ++ skb->sk = sk; ++ skb->destructor = sock_rmem_free; ++ atomic_add(skb->truesize, &sk->sk_rmem_alloc); ++ skb_set_err_queue(skb); ++ ++ /* before exiting rcu section, make sure dst is refcounted */ ++ skb_dst_force(skb); ++ ++ skb_queue_tail(&sk->sk_error_queue, skb); ++ if (!sock_flag(sk, SOCK_DEAD)) ++ sk_error_report(sk); ++ return 0; ++} ++EXPORT_SYMBOL(sock_queue_err_skb); ++ ++static bool is_icmp_err_skb(const struct sk_buff *skb) ++{ ++ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || ++ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); ++} ++ ++struct sk_buff *sock_dequeue_err_skb(struct sock *sk) ++{ ++ struct sk_buff_head *q = &sk->sk_error_queue; ++ struct sk_buff *skb, *skb_next = NULL; ++ bool icmp_next = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ skb = __skb_dequeue(q); ++ if (skb && (skb_next = skb_peek(q))) { ++ icmp_next = is_icmp_err_skb(skb_next); ++ if (icmp_next) ++ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ if (is_icmp_err_skb(skb) && !icmp_next) ++ sk->sk_err = 0; ++ ++ if (skb_next) ++ sk_error_report(sk); ++ ++ return skb; ++} ++EXPORT_SYMBOL(sock_dequeue_err_skb); ++ ++/** ++ * skb_clone_sk - create clone of skb, and take reference to socket ++ * @skb: the skb to clone ++ * ++ * This function creates a clone of a buffer that holds a reference on ++ * sk_refcnt. Buffers created via this function are meant to be ++ * returned using sock_queue_err_skb, or free via kfree_skb. ++ * ++ * When passing buffers allocated with this function to sock_queue_err_skb ++ * it is necessary to wrap the call with sock_hold/sock_put in order to ++ * prevent the socket from being released prior to being enqueued on ++ * the sk_error_queue. ++ */ ++struct sk_buff *skb_clone_sk(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ struct sk_buff *clone; ++ ++ if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) ++ return NULL; ++ ++ clone = skb_clone(skb, GFP_ATOMIC); ++ if (!clone) { ++ sock_put(sk); ++ return NULL; ++ } ++ ++ clone->sk = sk; ++ clone->destructor = sock_efree; ++ ++ return clone; ++} ++EXPORT_SYMBOL(skb_clone_sk); ++ ++static void __skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct sock *sk, ++ int tstype, ++ bool opt_stats) ++{ ++ struct sock_exterr_skb *serr; ++ int err; ++ ++ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; ++ serr->ee.ee_info = tstype; ++ serr->opt_stats = opt_stats; ++ serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; ++ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { ++ serr->ee.ee_data = skb_shinfo(skb)->tskey; ++ if (sk_is_tcp(sk)) ++ serr->ee.ee_data -= atomic_read(&sk->sk_tskey); ++ } ++ ++ err = sock_queue_err_skb(sk, skb); ++ ++ if (err) ++ kfree_skb(skb); ++} ++ ++static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) ++{ ++ bool ret; ++ ++ if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) ++ return true; ++ ++ read_lock_bh(&sk->sk_callback_lock); ++ ret = sk->sk_socket && sk->sk_socket->file && ++ file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); ++ read_unlock_bh(&sk->sk_callback_lock); ++ return ret; ++} ++ ++void skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ struct sock *sk = skb->sk; ++ ++ if (!skb_may_tx_timestamp(sk, false)) ++ goto err; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. ++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ *skb_hwtstamps(skb) = *hwtstamps; ++ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); ++ sock_put(sk); ++ return; ++ } ++ ++err: ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); ++ ++void __skb_tstamp_tx(struct sk_buff *orig_skb, ++ const struct sk_buff *ack_skb, ++ struct skb_shared_hwtstamps *hwtstamps, ++ struct sock *sk, int tstype) ++{ ++ struct sk_buff *skb; ++ bool tsonly, opt_stats = false; ++ ++ if (!sk) ++ return; ++ ++ if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && ++ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) ++ return; ++ ++ tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; ++ if (!skb_may_tx_timestamp(sk, tsonly)) ++ return; ++ ++ if (tsonly) { ++#ifdef CONFIG_INET ++ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && ++ sk_is_tcp(sk)) { ++ skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ++ ack_skb); ++ opt_stats = true; ++ } else ++#endif ++ skb = alloc_skb(0, GFP_ATOMIC); ++ } else { ++ skb = skb_clone(orig_skb, GFP_ATOMIC); ++ } ++ if (!skb) ++ return; ++ ++ if (tsonly) { ++ skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & ++ SKBTX_ANY_TSTAMP; ++ skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; ++ } ++ ++ if (hwtstamps) ++ *skb_hwtstamps(skb) = *hwtstamps; ++ else ++ __net_timestamp(skb); ++ ++ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); ++} ++EXPORT_SYMBOL_GPL(__skb_tstamp_tx); ++ ++void skb_tstamp_tx(struct sk_buff *orig_skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, ++ SCM_TSTAMP_SND); ++} ++EXPORT_SYMBOL_GPL(skb_tstamp_tx); ++ ++void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) ++{ ++ struct sock *sk = skb->sk; ++ struct sock_exterr_skb *serr; ++ int err = 1; ++ ++ skb->wifi_acked_valid = 1; ++ skb->wifi_acked = acked; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. 
++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ err = sock_queue_err_skb(sk, skb); ++ sock_put(sk); ++ } ++ if (err) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); ++ ++/** ++ * skb_partial_csum_set - set up and verify partial csum values for packet ++ * @skb: the skb to set ++ * @start: the number of bytes after skb->data to start checksumming. ++ * @off: the offset from start to place the checksum. ++ * ++ * For untrusted partially-checksummed packets, we need to make sure the values ++ * for skb->csum_start and skb->csum_offset are valid so we don't oops. ++ * ++ * This function checks and sets those values and skb->ip_summed: if this ++ * returns false you should drop the packet. ++ */ ++bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) ++{ ++ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); ++ u32 csum_start = skb_headroom(skb) + (u32)start; ++ ++ if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { ++ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", ++ start, off, skb_headroom(skb), skb_headlen(skb)); ++ return false; ++ } ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum_start = csum_start; ++ skb->csum_offset = off; ++ skb_set_transport_header(skb, start); ++ return true; ++} ++EXPORT_SYMBOL_GPL(skb_partial_csum_set); ++ ++static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, ++ unsigned int max) ++{ ++ if (skb_headlen(skb) >= len) ++ return 0; ++ ++ /* If we need to pullup then pullup to the max, so we ++ * won't need to do it again. ++ */ ++ if (max > skb->len) ++ max = skb->len; ++ ++ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) ++ return -ENOMEM; ++ ++ if (skb_headlen(skb) < len) ++ return -EPROTO; ++ ++ return 0; ++} ++ ++#define MAX_TCP_HDR_LEN (15 * 4) ++ ++static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, ++ typeof(IPPROTO_IP) proto, ++ unsigned int off) ++{ ++ int err; ++ ++ switch (proto) { ++ case IPPROTO_TCP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), ++ off + MAX_TCP_HDR_LEN); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct tcphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; ++ ++ case IPPROTO_UDP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), ++ off + sizeof(struct udphdr)); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct udphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &udp_hdr(skb)->check; ++ } ++ ++ return ERR_PTR(-EPROTO); ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * maximally sized IP and TCP or UDP headers. 
++ */ ++#define MAX_IP_HDR_LEN 128 ++ ++static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) ++{ ++ unsigned int off; ++ bool fragment; ++ __sum16 *csum; ++ int err; ++ ++ fragment = false; ++ ++ err = skb_maybe_pull_tail(skb, ++ sizeof(struct iphdr), ++ MAX_IP_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ if (ip_is_fragment(ip_hdr(skb))) ++ fragment = true; ++ ++ off = ip_hdrlen(skb); ++ ++ err = -EPROTO; ++ ++ if (fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ++ ip_hdr(skb)->daddr, ++ skb->len - off, ++ ip_hdr(skb)->protocol, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * an IPv6 header, all options, and a maximal TCP or UDP header. ++ */ ++#define MAX_IPV6_HDR_LEN 256 ++ ++#define OPT_HDR(type, skb, off) \ ++ (type *)(skb_network_header(skb) + (off)) ++ ++static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ u8 nexthdr; ++ unsigned int off; ++ unsigned int len; ++ bool fragment; ++ bool done; ++ __sum16 *csum; ++ ++ fragment = false; ++ done = false; ++ ++ off = sizeof(struct ipv6hdr); ++ ++ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ nexthdr = ipv6_hdr(skb)->nexthdr; ++ ++ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); ++ while (off <= len && !done) { ++ switch (nexthdr) { ++ case IPPROTO_DSTOPTS: ++ case IPPROTO_HOPOPTS: ++ case IPPROTO_ROUTING: { ++ struct ipv6_opt_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ipv6_opt_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_optlen(hp); ++ break; ++ } ++ case IPPROTO_AH: { ++ struct ip_auth_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ip_auth_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ip_auth_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_authlen(hp); ++ break; ++ } ++ case IPPROTO_FRAGMENT: { ++ struct frag_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct frag_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct frag_hdr, skb, off); ++ ++ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) ++ fragment = true; ++ ++ nexthdr = hp->nexthdr; ++ off += sizeof(struct frag_hdr); ++ break; ++ } ++ default: ++ done = true; ++ break; ++ } ++ } ++ ++ err = -EPROTO; ++ ++ if (!done || fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, nexthdr, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, ++ &ipv6_hdr(skb)->daddr, ++ skb->len - off, nexthdr, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/** ++ * skb_checksum_setup - set up partial checksum offset ++ * @skb: the skb to set up ++ * @recalculate: if true the pseudo-header checksum will be recalculated ++ */ ++int skb_checksum_setup(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ ++ switch (skb->protocol) { ++ case htons(ETH_P_IP): ++ err = skb_checksum_setup_ipv4(skb, recalculate); ++ break; ++ ++ case htons(ETH_P_IPV6): ++ err = skb_checksum_setup_ipv6(skb, recalculate); ++ break; ++ ++ default: ++ err = -EPROTO; ++ break; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(skb_checksum_setup); ++ 
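(For context, and not part of the patch itself: skb_checksum_setup() added above is the entry point a paravirtualised receive path typically calls, e.g. xen-netback, to validate untrusted CHECKSUM_PARTIAL metadata before a packet enters the stack. Below is a minimal sketch under that assumption; the function name and the csum_from_guest flag are invented for illustration only.)

#include <linux/skbuff.h>
#include <linux/netdevice.h>

/* Hypothetical rx completion helper: the frontend claimed the packet is
 * partially checksummed, so verify/repair csum_start/csum_offset and
 * recompute the pseudo-header checksum before handing the skb up.
 */
static void example_rx_finish(struct sk_buff *skb, bool csum_from_guest)
{
	if (csum_from_guest && skb_checksum_setup(skb, true)) {
		/* Bad checksum offsets from an untrusted source: drop. */
		kfree_skb(skb);
		return;
	}

	netif_receive_skb(skb);
}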
++/** ++ * skb_checksum_maybe_trim - maybe trims the given skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * ++ * Checks whether the given skb has data beyond the given transport length. ++ * If so, returns a cloned skb trimmed to this transport length. ++ * Otherwise returns the provided skb. Returns NULL in error cases ++ * (e.g. transport_len exceeds skb length or out-of-memory). ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, ++ unsigned int transport_len) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int len = skb_transport_offset(skb) + transport_len; ++ int ret; ++ ++ if (skb->len < len) ++ return NULL; ++ else if (skb->len == len) ++ return skb; ++ ++ skb_chk = skb_clone(skb, GFP_ATOMIC); ++ if (!skb_chk) ++ return NULL; ++ ++ ret = pskb_trim_rcsum(skb_chk, len); ++ if (ret) { ++ kfree_skb(skb_chk); ++ return NULL; ++ } ++ ++ return skb_chk; ++} ++ ++/** ++ * skb_checksum_trimmed - validate checksum of an skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * @skb_chkf: checksum function to use ++ * ++ * Applies the given checksum function skb_chkf to the provided skb. ++ * Returns a checked and maybe trimmed skb. Returns NULL on error. ++ * ++ * If the skb has data beyond the given transport length, then a ++ * trimmed & cloned skb is checked and returned. ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, ++ unsigned int transport_len, ++ __sum16(*skb_chkf)(struct sk_buff *skb)) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int offset = skb_transport_offset(skb); ++ __sum16 ret; ++ ++ skb_chk = skb_checksum_maybe_trim(skb, transport_len); ++ if (!skb_chk) ++ goto err; ++ ++ if (!pskb_may_pull(skb_chk, offset)) ++ goto err; ++ ++ skb_pull_rcsum(skb_chk, offset); ++ ret = skb_chkf(skb_chk); ++ skb_push_rcsum(skb_chk, offset); ++ ++ if (ret) ++ goto err; ++ ++ return skb_chk; ++ ++err: ++ if (skb_chk && skb_chk != skb) ++ kfree_skb(skb_chk); ++ ++ return NULL; ++ ++} ++EXPORT_SYMBOL(skb_checksum_trimmed); ++ ++void __skb_warn_lro_forwarding(const struct sk_buff *skb) ++{ ++ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", ++ skb->dev->name); ++} ++EXPORT_SYMBOL(__skb_warn_lro_forwarding); ++ ++void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) ++{ ++ if (head_stolen) { ++ skb_release_head_state(skb); ++ kmem_cache_free(skbuff_head_cache, skb); ++ } else { ++ __kfree_skb(skb); ++ } ++} ++EXPORT_SYMBOL(kfree_skb_partial); ++ ++/** ++ * skb_try_coalesce - try to merge skb to prior one ++ * @to: prior buffer ++ * @from: buffer to add ++ * @fragstolen: pointer to boolean ++ * @delta_truesize: how much more was allocated than was requested ++ */ ++bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, ++ bool *fragstolen, int *delta_truesize) ++{ ++ struct skb_shared_info *to_shinfo, *from_shinfo; ++ int i, delta, len = from->len; ++ ++ *fragstolen = false; ++ ++ if (skb_cloned(to)) ++ return false; ++ ++ /* In general, avoid mixing slab allocated and page_pool allocated ++ * pages within the same SKB. However when @to is not pp_recycle and ++ * @from is cloned, we can transition frag pages from page_pool to ++ * reference counted. 
++ * ++ * On the other hand, don't allow coalescing two pp_recycle SKBs if ++ * @from is cloned, in case the SKB is using page_pool fragment ++ * references (PP_FLAG_PAGE_FRAG). Since we only take full page ++ * references for cloned SKBs at the moment that would result in ++ * inconsistent reference counts. ++ */ ++ if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) ++ return false; ++ ++ if (len <= skb_tailroom(to)) { ++ if (len) ++ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); ++ *delta_truesize = 0; ++ return true; ++ } ++ ++ to_shinfo = skb_shinfo(to); ++ from_shinfo = skb_shinfo(from); ++ if (to_shinfo->frag_list || from_shinfo->frag_list) ++ return false; ++ if (skb_zcopy(to) || skb_zcopy(from)) ++ return false; ++ ++ if (skb_headlen(from) != 0) { ++ struct page *page; ++ unsigned int offset; ++ ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags >= MAX_SKB_FRAGS) ++ return false; ++ ++ if (skb_head_is_locked(from)) ++ return false; ++ ++ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ ++ skb_fill_page_desc(to, to_shinfo->nr_frags, ++ page, offset, skb_headlen(from)); ++ *fragstolen = true; ++ } else { ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags > MAX_SKB_FRAGS) ++ return false; ++ ++ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); ++ } ++ ++ WARN_ON_ONCE(delta < len); ++ ++ memcpy(to_shinfo->frags + to_shinfo->nr_frags, ++ from_shinfo->frags, ++ from_shinfo->nr_frags * sizeof(skb_frag_t)); ++ to_shinfo->nr_frags += from_shinfo->nr_frags; ++ ++ if (!skb_cloned(from)) ++ from_shinfo->nr_frags = 0; ++ ++ /* if the skb is not cloned this does nothing ++ * since we set nr_frags to 0. ++ */ ++ for (i = 0; i < from_shinfo->nr_frags; i++) ++ __skb_frag_ref(&from_shinfo->frags[i]); ++ ++ to->truesize += delta; ++ to->len += len; ++ to->data_len += len; ++ ++ *delta_truesize = delta; ++ return true; ++} ++EXPORT_SYMBOL(skb_try_coalesce); ++ ++/** ++ * skb_scrub_packet - scrub an skb ++ * ++ * @skb: buffer to clean ++ * @xnet: packet is crossing netns ++ * ++ * skb_scrub_packet can be used after encapsulating or decapsulting a packet ++ * into/from a tunnel. Some information have to be cleared during these ++ * operations. ++ * skb_scrub_packet can also be used to clean a skb before injecting it in ++ * another namespace (@xnet == true). We have to clear all information in the ++ * skb that could impact namespace isolation. ++ */ ++void skb_scrub_packet(struct sk_buff *skb, bool xnet) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->skb_iif = 0; ++ skb->ignore_df = 0; ++ skb_dst_drop(skb); ++ skb_ext_reset(skb); ++ nf_reset_ct(skb); ++ nf_reset_trace(skb); ++ ++#ifdef CONFIG_NET_SWITCHDEV ++ skb->offload_fwd_mark = 0; ++ skb->offload_l3_fwd_mark = 0; ++#endif ++ ++ if (!xnet) ++ return; ++ ++ ipvs_reset(skb); ++ skb->mark = 0; ++ skb_clear_tstamp(skb); ++} ++EXPORT_SYMBOL_GPL(skb_scrub_packet); ++ ++/** ++ * skb_gso_transport_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_transport_seglen is used to determine the real size of the ++ * individual segments, including Layer4 headers (TCP/UDP). ++ * ++ * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
++ */ ++static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ unsigned int thlen = 0; ++ ++ if (skb->encapsulation) { ++ thlen = skb_inner_transport_header(skb) - ++ skb_transport_header(skb); ++ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) ++ thlen += inner_tcp_hdrlen(skb); ++ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ thlen = tcp_hdrlen(skb); ++ } else if (unlikely(skb_is_gso_sctp(skb))) { ++ thlen = sizeof(struct sctphdr); ++ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { ++ thlen = sizeof(struct udphdr); ++ } ++ /* UFO sets gso_size to the size of the fragmentation ++ * payload, i.e. the size of the L4 (UDP) header is already ++ * accounted for. ++ */ ++ return thlen + shinfo->gso_size; ++} ++ ++/** ++ * skb_gso_network_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_network_seglen is used to determine the real size of the ++ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). ++ * ++ * The MAC/L2 header is not accounted for. ++ */ ++static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - ++ skb_network_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_mac_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_mac_seglen is used to determine the real size of the ++ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 ++ * headers (TCP/UDP). ++ */ ++static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS ++ * ++ * There are a couple of instances where we have a GSO skb, and we ++ * want to determine what size it would be after it is segmented. ++ * ++ * We might want to check: ++ * - L3+L4+payload size (e.g. IP forwarding) ++ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) ++ * ++ * This is a helper to do that correctly considering GSO_BY_FRAGS. ++ * ++ * @skb: GSO skb ++ * ++ * @seg_len: The segmented length (from skb_gso_*_seglen). In the ++ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. ++ * ++ * @max_len: The maximum permissible length. ++ * ++ * Returns true if the segmented length <= max length. ++ */ ++static inline bool skb_gso_size_check(const struct sk_buff *skb, ++ unsigned int seg_len, ++ unsigned int max_len) { ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ const struct sk_buff *iter; ++ ++ if (shinfo->gso_size != GSO_BY_FRAGS) ++ return seg_len <= max_len; ++ ++ /* Undo this so we can re-use header sizes */ ++ seg_len -= GSO_BY_FRAGS; ++ ++ skb_walk_frags(skb, iter) { ++ if (seg_len + skb_headlen(iter) > max_len) ++ return false; ++ } ++ ++ return true; ++} ++ ++/** ++ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? ++ * ++ * @skb: GSO skb ++ * @mtu: MTU to validate against ++ * ++ * skb_gso_validate_network_len validates if a given skb will fit a ++ * wanted MTU once split. It considers L3 headers, L4 headers, and the ++ * payload. 
++ */ ++bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) ++{ ++ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); ++ ++/** ++ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? ++ * ++ * @skb: GSO skb ++ * @len: length to validate against ++ * ++ * skb_gso_validate_mac_len validates if a given skb will fit a wanted ++ * length once split, including L2, L3 and L4 headers and the payload. ++ */ ++bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) ++{ ++ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); ++ ++static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) ++{ ++ int mac_len, meta_len; ++ void *meta; ++ ++ if (skb_cow(skb, skb_headroom(skb)) < 0) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ mac_len = skb->data - skb_mac_header(skb); ++ if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { ++ memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), ++ mac_len - VLAN_HLEN - ETH_TLEN); ++ } ++ ++ meta_len = skb_metadata_len(skb); ++ if (meta_len) { ++ meta = skb_metadata_end(skb) - meta_len; ++ memmove(meta + VLAN_HLEN, meta, meta_len); ++ } ++ ++ skb->mac_header += VLAN_HLEN; ++ return skb; ++} ++ ++struct sk_buff *skb_vlan_untag(struct sk_buff *skb) ++{ ++ struct vlan_hdr *vhdr; ++ u16 vlan_tci; ++ ++ if (unlikely(skb_vlan_tag_present(skb))) { ++ /* vlan_tci is already set-up so leave this for another time */ ++ return skb; ++ } ++ ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ goto err_free; ++ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ ++ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) ++ goto err_free; ++ ++ vhdr = (struct vlan_hdr *)skb->data; ++ vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); ++ ++ skb_pull_rcsum(skb, VLAN_HLEN); ++ vlan_set_encap_proto(skb, vhdr); ++ ++ skb = skb_reorder_vlan_header(skb); ++ if (unlikely(!skb)) ++ goto err_free; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return skb; ++ ++err_free: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_vlan_untag); ++ ++int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) ++{ ++ if (!pskb_may_pull(skb, write_len)) ++ return -ENOMEM; ++ ++ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) ++ return 0; ++ ++ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); ++} ++EXPORT_SYMBOL(skb_ensure_writable); ++ ++/* remove VLAN header from packet and update csum accordingly. 
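skb_vlan_untag() above lifts the TCI out of the in-band 802.1Q header before moving the tag into hw-accel metadata; the snippet below (illustrative only, frame bytes invented) shows the same field extraction on a raw frame laid out as dst(6) src(6) TPID(2) TCI(2).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical tagged frame header: TPID 0x8100, TCI 0x6064 (PCP 3, VID 100) */
    uint8_t frame[18] = {
        0x02, 0x00, 0x00, 0x00, 0x00, 0x01,   /* destination MAC */
        0x02, 0x00, 0x00, 0x00, 0x00, 0x02,   /* source MAC */
        0x81, 0x00,                           /* TPID: 802.1Q */
        0x60, 0x64,                           /* TCI */
        0x08, 0x00                            /* encapsulated ethertype: IPv4 */
    };
    uint16_t tpid = (uint16_t)(frame[12] << 8 | frame[13]);
    uint16_t tci  = (uint16_t)(frame[14] << 8 | frame[15]);

    if (tpid == 0x8100)      /* the check eth_type_vlan() performs on skb->protocol */
        printf("PCP %u, DEI %u, VID %u\n",
               tci >> 13, (tci >> 12) & 1, tci & 0x0fff);
    return 0;
}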
++ * expects a non skb_vlan_tag_present skb with a vlan tag payload ++ */ ++int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) ++{ ++ struct vlan_hdr *vhdr; ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = skb_ensure_writable(skb, VLAN_ETH_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ ++ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); ++ *vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ ++ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); ++ __skb_pull(skb, VLAN_HLEN); ++ ++ vlan_set_encap_proto(skb, vhdr); ++ skb->mac_header += VLAN_HLEN; ++ ++ if (skb_network_offset(skb) < ETH_HLEN) ++ skb_set_network_header(skb, ETH_HLEN); ++ ++ skb_reset_mac_len(skb); ++ ++ return err; ++} ++EXPORT_SYMBOL(__skb_vlan_pop); ++ ++/* Pop a vlan tag either from hwaccel or from payload. ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_pop(struct sk_buff *skb) ++{ ++ u16 vlan_tci; ++ __be16 vlan_proto; ++ int err; ++ ++ if (likely(skb_vlan_tag_present(skb))) { ++ __vlan_hwaccel_clear_tag(skb); ++ } else { ++ if (unlikely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (err) ++ return err; ++ } ++ /* move next vlan tag to hw accel tag */ ++ if (likely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ vlan_proto = skb->protocol; ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (unlikely(err)) ++ return err; ++ ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_pop); ++ ++/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) ++{ ++ if (skb_vlan_tag_present(skb)) { ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = __vlan_insert_tag(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (err) ++ return err; ++ ++ skb->protocol = skb->vlan_proto; ++ skb->mac_len += VLAN_HLEN; ++ ++ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ } ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_push); ++ ++/** ++ * skb_eth_pop() - Drop the Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * ++ * Drop the Ethernet header of @skb. ++ * ++ * Expects that skb->data points to the mac header and that no VLAN tags are ++ * present. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_eth_pop(struct sk_buff *skb) ++{ ++ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || ++ skb_network_offset(skb) < ETH_HLEN) ++ return -EPROTO; ++ ++ skb_pull_rcsum(skb, ETH_HLEN); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_pop); ++ ++/** ++ * skb_eth_push() - Add a new Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * @dst: Destination MAC address of the new header ++ * @src: Source MAC address of the new header ++ * ++ * Prepend @skb with a new Ethernet header. ++ * ++ * Expects that skb->data points to the mac header, which must be empty. ++ * ++ * Returns 0 on success, -errno otherwise. 
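__skb_vlan_pop() above removes the tag by sliding the two MAC addresses over it and then pulling VLAN_HLEN bytes; the sketch below does only that memmove on a flat buffer (illustrative, invented frame), leaving out the rcsum and header-offset bookkeeping the kernel also performs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ETH_ALEN  6
#define VLAN_HLEN 4

int main(void)
{
    /* dst(6) src(6), 802.1Q tag, inner ethertype 0x0800, first payload byte */
    uint8_t frame[] = {
        0x02, 0, 0, 0, 0, 1,  0x02, 0, 0, 0, 0, 2,
        0x81, 0x00, 0x00, 0x64,
        0x08, 0x00,
        0x45
    };

    /* Same trick as __skb_vlan_pop(): move the addresses VLAN_HLEN bytes towards
     * the payload, then treat frame + VLAN_HLEN as the new start of the frame. */
    memmove(frame + VLAN_HLEN, frame, 2 * ETH_ALEN);
    uint8_t *untagged = frame + VLAN_HLEN;

    printf("new ethertype: 0x%02x%02x\n", untagged[12], untagged[13]); /* 0x0800 */
    return 0;
}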
++ */ ++int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, ++ const unsigned char *src) ++{ ++ struct ethhdr *eth; ++ int err; ++ ++ if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) ++ return -EPROTO; ++ ++ err = skb_cow_head(skb, sizeof(*eth)); ++ if (err < 0) ++ return err; ++ ++ skb_push(skb, sizeof(*eth)); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ eth = eth_hdr(skb); ++ ether_addr_copy(eth->h_dest, dst); ++ ether_addr_copy(eth->h_source, src); ++ eth->h_proto = skb->protocol; ++ ++ skb_postpush_rcsum(skb, eth, sizeof(*eth)); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_push); ++ ++/* Update the ethertype of hdr and the skb csum value if required. */ ++static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, ++ __be16 ethertype) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be16 diff[] = { ~hdr->h_proto, ethertype }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ hdr->h_proto = ethertype; ++} ++ ++/** ++ * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of ++ * the packet ++ * ++ * @skb: buffer ++ * @mpls_lse: MPLS label stack entry to push ++ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is ++ * ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, ++ int mac_len, bool ethernet) ++{ ++ struct mpls_shim_hdr *lse; ++ int err; ++ ++ if (unlikely(!eth_p_mpls(mpls_proto))) ++ return -EINVAL; ++ ++ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ ++ if (skb->encapsulation) ++ return -EINVAL; ++ ++ err = skb_cow_head(skb, MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (!skb->inner_protocol) { ++ skb_set_inner_network_header(skb, skb_network_offset(skb)); ++ skb_set_inner_protocol(skb, skb->protocol); ++ } ++ ++ skb_push(skb, MPLS_HLEN); ++ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ skb_reset_mac_len(skb); ++ ++ lse = mpls_hdr(skb); ++ lse->label_stack_entry = mpls_lse; ++ skb_postpush_rcsum(skb, lse, MPLS_HLEN); ++ ++ if (ethernet && mac_len >= ETH_HLEN) ++ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); ++ skb->protocol = mpls_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_push); ++ ++/** ++ * skb_mpls_pop() - pop the outermost MPLS header ++ * ++ * @skb: buffer ++ * @next_proto: ethertype of header after popped MPLS header ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the packet is ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, ++ bool ethernet) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return 0; ++ ++ err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); ++ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ ++ __skb_pull(skb, MPLS_HLEN); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ ++ if (ethernet && mac_len >= ETH_HLEN) { ++ struct ethhdr *hdr; ++ ++ /* use mpls_hdr() to get ethertype to account for VLANs. 
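skb_mpls_push() above writes a single label stack entry right after the MAC header; the sketch below only shows how such an LSE is packed bit-wise (label 20 bits, TC 3, bottom-of-stack 1, TTL 8). The values are arbitrary, and on the wire the kernel stores the word big-endian.

#include <stdint.h>
#include <stdio.h>

/* MPLS label stack entry: label(20) | TC(3) | S(1) | TTL(8) */
static uint32_t mpls_lse(uint32_t label, uint32_t tc, uint32_t bos, uint32_t ttl)
{
    return (label << 12) | (tc << 9) | (bos << 8) | ttl;
}

int main(void)
{
    uint32_t lse = mpls_lse(16, 0, 1, 64);  /* label 16, bottom of stack, TTL 64 */

    printf("lse = 0x%08x (label %u, tc %u, s %u, ttl %u)\n",
           lse, lse >> 12, (lse >> 9) & 7, (lse >> 8) & 1, lse & 0xff);
    return 0;
}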
*/ ++ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); ++ skb_mod_eth_type(skb, hdr, next_proto); ++ } ++ skb->protocol = next_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_pop); ++ ++/** ++ * skb_mpls_update_lse() - modify outermost MPLS header and update csum ++ * ++ * @skb: buffer ++ * @mpls_lse: new MPLS label stack entry to update to ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ mpls_hdr(skb)->label_stack_entry = mpls_lse; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_update_lse); ++ ++/** ++ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header ++ * ++ * @skb: buffer ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_dec_ttl(struct sk_buff *skb) ++{ ++ u32 lse; ++ u8 ttl; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) ++ return -ENOMEM; ++ ++ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ++ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; ++ if (!--ttl) ++ return -EINVAL; ++ ++ lse &= ~MPLS_LS_TTL_MASK; ++ lse |= ttl << MPLS_LS_TTL_SHIFT; ++ ++ return skb_mpls_update_lse(skb, cpu_to_be32(lse)); ++} ++EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); ++ ++/** ++ * alloc_skb_with_frags - allocate skb with page frags ++ * ++ * @header_len: size of linear part ++ * @data_len: needed length in frags ++ * @max_page_order: max page order desired. ++ * @errcode: pointer to error code if any ++ * @gfp_mask: allocation mask ++ * ++ * This can be used to allocate a paged skb, given a maximal order for frags. ++ */ ++struct sk_buff *alloc_skb_with_frags(unsigned long header_len, ++ unsigned long data_len, ++ int max_page_order, ++ int *errcode, ++ gfp_t gfp_mask) ++{ ++ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; ++ unsigned long chunk; ++ struct sk_buff *skb; ++ struct page *page; ++ int i; ++ ++ *errcode = -EMSGSIZE; ++ /* Note this test could be relaxed, if we succeed to allocate ++ * high order pages... 
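skb_mpls_dec_ttl() above is plain mask-and-shift arithmetic on that LSE; a standalone version of the same logic follows (the mask and shift are restated here to match the layout used in the sketch above, and the kernel's -EINVAL path becomes a flag).

#include <stdint.h>
#include <stdio.h>

#define TTL_MASK  0x000000ffu   /* TTL is the low byte of the LSE */
#define TTL_SHIFT 0

/* Returns the updated LSE; sets *expired when the TTL reaches zero. */
static uint32_t mpls_dec_ttl(uint32_t lse, int *expired)
{
    uint8_t ttl = (lse & TTL_MASK) >> TTL_SHIFT;

    if (!--ttl) {
        *expired = 1;           /* the kernel returns -EINVAL here */
        return lse;
    }
    *expired = 0;
    lse &= ~TTL_MASK;
    lse |= (uint32_t)ttl << TTL_SHIFT;
    return lse;
}

int main(void)
{
    int expired;
    uint32_t lse = mpls_dec_ttl(0x00010140, &expired);  /* label 16, TTL 64 */

    printf("new lse 0x%08x, ttl %u, expired %d\n", lse, lse & TTL_MASK, expired);
    return 0;
}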
++ */ ++ if (npages > MAX_SKB_FRAGS) ++ return NULL; ++ ++ *errcode = -ENOBUFS; ++ skb = alloc_skb(header_len, gfp_mask); ++ if (!skb) ++ return NULL; ++ ++ skb->truesize += npages << PAGE_SHIFT; ++ ++ for (i = 0; npages > 0; i++) { ++ int order = max_page_order; ++ ++ while (order) { ++ if (npages >= 1 << order) { ++ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | ++ __GFP_COMP | ++ __GFP_NOWARN, ++ order); ++ if (page) ++ goto fill_page; ++ /* Do not retry other high order allocations */ ++ order = 1; ++ max_page_order = 0; ++ } ++ order--; ++ } ++ page = alloc_page(gfp_mask); ++ if (!page) ++ goto failure; ++fill_page: ++ chunk = min_t(unsigned long, data_len, ++ PAGE_SIZE << order); ++ skb_fill_page_desc(skb, i, page, 0, chunk); ++ data_len -= chunk; ++ npages -= 1 << order; ++ } ++ return skb; ++ ++failure: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_skb_with_frags); ++ ++/* carve out the first off bytes from skb when off < headlen */ ++static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, ++ const int headlen, gfp_t gfp_mask) ++{ ++ int i; ++ int size = skb_end_offset(skb); ++ int new_hlen = headlen - off; ++ u8 *data; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy real data, and all frags */ ++ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); ++ skb->len -= off; ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, ++ frags[skb_shinfo(skb)->nr_frags])); ++ if (skb_cloned(skb)) { ++ /* drop the old head gracefully */ ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ skb_release_data(skb); ++ } else { ++ /* we can reuse existing recount- all we did was ++ * relocate values ++ */ ++ skb_free_head(skb); ++ } ++ ++ skb->head = data; ++ skb->data = data; ++ skb->head_frag = 0; ++ skb_set_end_offset(skb, size); ++ skb_set_tail_pointer(skb, skb_headlen(skb)); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ return 0; ++} ++ ++static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); ++ ++/* carve out the first eat bytes from skb's frag_list. May recurse into ++ * pskb_carve() ++ */ ++static int pskb_carve_frag_list(struct sk_buff *skb, ++ struct skb_shared_info *shinfo, int eat, ++ gfp_t gfp_mask) ++{ ++ struct sk_buff *list = shinfo->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (!list) { ++ pr_err("Not enough bytes to eat. Want %d\n", eat); ++ return -EFAULT; ++ } ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ if (skb_shared(list)) { ++ clone = skb_clone(list, gfp_mask); ++ if (!clone) ++ return -ENOMEM; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without problems. */ ++ insp = list; ++ } ++ if (pskb_carve(list, eat, gfp_mask) < 0) { ++ kfree_skb(clone); ++ return -ENOMEM; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. 
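The loop in alloc_skb_with_frags() above prefers high-order page blocks and falls back to single pages; the allocation-free sketch below shows how the order is picked per fragment for a given data_len (it leaves out the "stop retrying high orders after a failure" behaviour, and the sizes are arbitrary).

#include <stdio.h>

#define PAGE_SIZE  4096u
#define PAGE_SHIFT 12

int main(void)
{
    unsigned long data_len = 40000;                               /* bytes wanted in frags */
    unsigned int npages = (data_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
    int max_order = 3;                                            /* up to 8-page blocks */
    int frag = 0;

    while (npages > 0) {
        int order = max_order;

        /* only use a high-order block if it does not overshoot what is left */
        while (order && npages < (1u << order))
            order--;

        unsigned long chunk = data_len < (PAGE_SIZE << order)
                            ? data_len : (PAGE_SIZE << order);
        printf("frag %d: order %d, %lu bytes\n", frag++, order, chunk);
        data_len -= chunk;
        npages -= 1u << order;
    }
    return 0;
}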
*/ ++ while ((list = shinfo->frag_list) != insp) { ++ shinfo->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ shinfo->frag_list = clone; ++ } ++ return 0; ++} ++ ++/* carve off first len bytes from skb. Split line (off) is in the ++ * non-linear part of skb ++ */ ++static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, ++ int pos, gfp_t gfp_mask) ++{ ++ int i, k = 0; ++ int size = skb_end_offset(skb); ++ u8 *data; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ struct skb_shared_info *shinfo; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ shinfo = (struct skb_shared_info *)(data + size); ++ for (i = 0; i < nfrags; i++) { ++ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + fsize > off) { ++ shinfo->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < off) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_off_add(&shinfo->frags[0], off - pos); ++ skb_frag_size_sub(&shinfo->frags[0], off - pos); ++ } ++ skb_frag_ref(skb, i); ++ k++; ++ } ++ pos += fsize; ++ } ++ shinfo->nr_frags = k; ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ /* split line is in frag list */ ++ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { ++ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ ++ if (skb_has_frag_list(skb)) ++ kfree_skb_list(skb_shinfo(skb)->frag_list); ++ kfree(data); ++ return -ENOMEM; ++ } ++ skb_release_data(skb); ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data = data; ++ skb_set_end_offset(skb, size); ++ skb_reset_tail_pointer(skb); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ skb->len -= off; ++ skb->data_len = skb->len; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ return 0; ++} ++ ++/* remove len bytes from the beginning of the skb */ ++static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) ++{ ++ int headlen = skb_headlen(skb); ++ ++ if (len < headlen) ++ return pskb_carve_inside_header(skb, len, headlen, gfp); ++ else ++ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); ++} ++ ++/* Extract to_copy bytes starting at off from skb, and return this in ++ * a new skb ++ */ ++struct sk_buff *pskb_extract(struct sk_buff *skb, int off, ++ int to_copy, gfp_t gfp) ++{ ++ struct sk_buff *clone = skb_clone(skb, gfp); ++ ++ if (!clone) ++ return NULL; ++ ++ if (pskb_carve(clone, off, gfp) < 0 || ++ pskb_trim(clone, to_copy)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ return clone; ++} ++EXPORT_SYMBOL(pskb_extract); ++ ++/** ++ * skb_condense - try to get rid of fragments/frag_list if possible ++ * @skb: buffer ++ * ++ * Can be used to save memory before skb is added to a busy queue. 
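pskb_carve() above has to decide whether the requested split point sits in the linear header or inside one of the fragments, trimming the first fragment that survives; the walk below reproduces that decision with invented sizes (illustrative only).

#include <stdio.h>

int main(void)
{
    unsigned int headlen = 64;                     /* linear part of the skb */
    unsigned int frags[] = { 1000, 2000, 500 };    /* fragment sizes */
    unsigned int off = 1500;                       /* bytes to carve off the front */
    unsigned int pos = headlen;

    if (off < headlen) {
        printf("split inside the linear header, keep %u linear bytes\n", headlen - off);
        return 0;
    }

    for (unsigned int i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
        if (pos + frags[i] > off) {
            /* same amounts as the skb_frag_off_add()/skb_frag_size_sub() pair above */
            printf("split in frag %u: drop %u bytes, keep %u\n",
                   i, off - pos, frags[i] - (off - pos));
            return 0;
        }
        pos += frags[i];
    }
    printf("split falls into the frag_list\n");
    return 0;
}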
++ * If packet has bytes in frags and enough tail room in skb->head, ++ * pull all of them, so that we can free the frags right now and adjust ++ * truesize. ++ * Notes: ++ * We do not reallocate skb->head thus can not fail. ++ * Caller must re-evaluate skb->truesize if needed. ++ */ ++void skb_condense(struct sk_buff *skb) ++{ ++ if (skb->data_len) { ++ if (skb->data_len > skb->end - skb->tail || ++ skb_cloned(skb)) ++ return; ++ ++ /* Nice, we can free page frag(s) right now */ ++ __pskb_pull_tail(skb, skb->data_len); ++ } ++ /* At this point, skb->truesize might be over estimated, ++ * because skb had a fragment, and fragments do not tell ++ * their truesize. ++ * When we pulled its content into skb->head, fragment ++ * was freed, but __pskb_pull_tail() could not possibly ++ * adjust skb->truesize, not knowing the frag truesize. ++ */ ++ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); ++} ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) ++{ ++ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); ++} ++ ++/** ++ * __skb_ext_alloc - allocate a new skb extensions storage ++ * ++ * @flags: See kmalloc(). ++ * ++ * Returns the newly allocated pointer. The pointer can later attached to a ++ * skb via __skb_ext_set(). ++ * Note: caller must handle the skb_ext as an opaque data. ++ */ ++struct skb_ext *__skb_ext_alloc(gfp_t flags) ++{ ++ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); ++ ++ if (new) { ++ memset(new->offset, 0, sizeof(new->offset)); ++ refcount_set(&new->refcnt, 1); ++ } ++ ++ return new; ++} ++ ++static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, ++ unsigned int old_active) ++{ ++ struct skb_ext *new; ++ ++ if (refcount_read(&old->refcnt) == 1) ++ return old; ++ ++ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ ++ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); ++ refcount_set(&new->refcnt, 1); ++ ++#ifdef CONFIG_XFRM ++ if (old_active & (1 << SKB_EXT_SEC_PATH)) { ++ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_hold(sp->xvec[i]); ++ } ++#endif ++ __skb_ext_put(old); ++ return new; ++} ++ ++/** ++ * __skb_ext_set - attach the specified extension storage to this skb ++ * @skb: buffer ++ * @id: extension id ++ * @ext: extension storage previously allocated via __skb_ext_alloc() ++ * ++ * Existing extensions, if any, are cleared. ++ * ++ * Returns the pointer to the extension. ++ */ ++void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, ++ struct skb_ext *ext) ++{ ++ unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); ++ ++ skb_ext_put(skb); ++ newlen = newoff + skb_ext_type_len[id]; ++ ext->chunks = newlen; ++ ext->offset[id] = newoff; ++ skb->extensions = ext; ++ skb->active_extensions = 1 << id; ++ return skb_ext_get_ptr(ext, id); ++} ++ ++/** ++ * skb_ext_add - allocate space for given extension, COW if needed ++ * @skb: buffer ++ * @id: extension to allocate space for ++ * ++ * Allocates enough space for the given extension. ++ * If the extension is already present, a pointer to that extension ++ * is returned. ++ * ++ * If the skb was cloned, COW applies and the returned memory can be ++ * modified without changing the extension space of clones buffers. ++ * ++ * Returns pointer to the extension or NULL on allocation failure. 
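The extension code above keeps everything in one allocation with a per-id offset table measured in SKB_EXT_ALIGN_VALUE units; the toy layout below (names, sizes and the 8-byte alignment are invented for the demo) shows the same offset arithmetic that skb_ext_get_ptr() and __skb_ext_set() rely on.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EXT_ALIGN 8       /* stand-in for SKB_EXT_ALIGN_VALUE */
#define EXT_IDS   4

struct ext_hdr {
    uint8_t offset[EXT_IDS];   /* per-id offset, in EXT_ALIGN units */
    uint8_t chunks;            /* total size, in EXT_ALIGN units */
};

static void *ext_add(struct ext_hdr *h, unsigned int id, unsigned int len)
{
    unsigned int off = h->chunks;

    h->offset[id] = (uint8_t)off;
    h->chunks = (uint8_t)(off + (len + EXT_ALIGN - 1) / EXT_ALIGN);
    return (uint8_t *)h + off * EXT_ALIGN;        /* same math as skb_ext_get_ptr() */
}

static void *ext_get(struct ext_hdr *h, unsigned int id)
{
    return (uint8_t *)h + h->offset[id] * EXT_ALIGN;
}

int main(void)
{
    uint8_t storage[256] = { 0 };
    struct ext_hdr *h = (struct ext_hdr *)storage;

    h->chunks = (sizeof(*h) + EXT_ALIGN - 1) / EXT_ALIGN;  /* header takes the first chunk */
    strcpy(ext_add(h, 1, 16), "ext one");
    strcpy(ext_add(h, 2, 24), "ext two");
    printf("%s / %s\n", (char *)ext_get(h, 1), (char *)ext_get(h, 2));
    return 0;
}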
++ */ ++void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *new, *old = NULL; ++ unsigned int newlen, newoff; ++ ++ if (skb->active_extensions) { ++ old = skb->extensions; ++ ++ new = skb_ext_maybe_cow(old, skb->active_extensions); ++ if (!new) ++ return NULL; ++ ++ if (__skb_ext_exist(new, id)) ++ goto set_active; ++ ++ newoff = new->chunks; ++ } else { ++ newoff = SKB_EXT_CHUNKSIZEOF(*new); ++ ++ new = __skb_ext_alloc(GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ } ++ ++ newlen = newoff + skb_ext_type_len[id]; ++ new->chunks = newlen; ++ new->offset[id] = newoff; ++set_active: ++ skb->slow_gro = 1; ++ skb->extensions = new; ++ skb->active_extensions |= 1 << id; ++ return skb_ext_get_ptr(new, id); ++} ++EXPORT_SYMBOL(skb_ext_add); ++ ++#ifdef CONFIG_XFRM ++static void skb_ext_put_sp(struct sec_path *sp) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_put(sp->xvec[i]); ++} ++#endif ++ ++#ifdef CONFIG_MCTP_FLOWS ++static void skb_ext_put_mctp(struct mctp_flow *flow) ++{ ++ if (flow->key) ++ mctp_key_unref(flow->key); ++} ++#endif ++ ++void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *ext = skb->extensions; ++ ++ skb->active_extensions &= ~(1 << id); ++ if (skb->active_extensions == 0) { ++ skb->extensions = NULL; ++ __skb_ext_put(ext); ++#ifdef CONFIG_XFRM ++ } else if (id == SKB_EXT_SEC_PATH && ++ refcount_read(&ext->refcnt) == 1) { ++ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); ++ ++ skb_ext_put_sp(sp); ++ sp->len = 0; ++#endif ++ } ++} ++EXPORT_SYMBOL(__skb_ext_del); ++ ++void __skb_ext_put(struct skb_ext *ext) ++{ ++ /* If this is last clone, nothing can increment ++ * it after check passes. Avoids one atomic op. ++ */ ++ if (refcount_read(&ext->refcnt) == 1) ++ goto free_now; ++ ++ if (!refcount_dec_and_test(&ext->refcnt)) ++ return; ++free_now: ++#ifdef CONFIG_XFRM ++ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) ++ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); ++#endif ++#ifdef CONFIG_MCTP_FLOWS ++ if (__skb_ext_exist(ext, SKB_EXT_MCTP)) ++ skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); ++#endif ++ ++ kmem_cache_free(skbuff_ext_cache, ext); ++} ++EXPORT_SYMBOL(__skb_ext_put); ++#endif /* CONFIG_SKB_EXTENSIONS */ ++ ++/** ++ * skb_attempt_defer_free - queue skb for remote freeing ++ * @skb: buffer ++ * ++ * Put @skb in a per-cpu list, using the cpu which ++ * allocated the skb/pages to reduce false sharing ++ * and memory zone spinlock contention. ++ */ ++void skb_attempt_defer_free(struct sk_buff *skb) ++{ ++ int cpu = skb->alloc_cpu; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int defer_max; ++ bool kick; ++ ++ if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || ++ !cpu_online(cpu) || ++ cpu == raw_smp_processor_id()) { ++nodefer: __kfree_skb(skb); ++ return; ++ } ++ ++ sd = &per_cpu(softnet_data, cpu); ++ defer_max = READ_ONCE(sysctl_skb_defer_max); ++ if (READ_ONCE(sd->defer_count) >= defer_max) ++ goto nodefer; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ /* Send an IPI every time queue reaches half capacity. 
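__skb_ext_put() above (and skb_ext_maybe_cow() before it) read the refcount first and treat a value of 1 as "we are the last holder", skipping the atomic decrement; a minimal C11 sketch of that release pattern follows (the kernel's refcount_t adds saturation and ordering guarantees this toy version does not).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Returns true when the caller should free the object now. */
static bool ref_put(atomic_uint *refcnt)
{
    /* Last holder: nothing else can raise the count once this test passes,
     * so the atomic decrement can be skipped entirely. */
    if (atomic_load(refcnt) == 1)
        return true;

    return atomic_fetch_sub(refcnt, 1) == 1;
}

int main(void)
{
    atomic_uint r = 2;

    printf("first put frees:  %d\n", ref_put(&r));   /* 0: another holder remains */
    printf("second put frees: %d\n", ref_put(&r));   /* 1: fast path, free now */
    return 0;
}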
*/ ++ kick = sd->defer_count == (defer_max >> 1); ++ /* Paired with the READ_ONCE() few lines above */ ++ WRITE_ONCE(sd->defer_count, sd->defer_count + 1); ++ ++ skb->next = sd->defer_list; ++ /* Paired with READ_ONCE() in skb_defer_free_flush() */ ++ WRITE_ONCE(sd->defer_list, skb); ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU ++ * if we are unlucky enough (this seems very unlikely). ++ */ ++ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) ++ smp_call_function_single_async(cpu, &sd->defer_csd); ++} +diff -rupN linux.orig/net/dsa/slave.c linux/net/dsa/slave.c +--- linux.orig/net/dsa/slave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/dsa/slave.c 2022-12-04 10:40:26.732034003 -0500 +@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats( s = per_cpu_ptr(dev->tstats, i); do { @@ -8833,11 +57287,10 @@ index 1291c2431d440..dcc550b871623 100644 data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; -diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c -index 3ca0cc4678862..dbae0c79d5cfb 100644 ---- a/net/ipv4/af_inet.c -+++ b/net/ipv4/af_inet.c -@@ -1684,9 +1684,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, +diff -rupN linux.orig/net/ipv4/af_inet.c linux/net/ipv4/af_inet.c +--- linux.orig/net/ipv4/af_inet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv4/af_inet.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1686,9 +1686,9 @@ u64 snmp_get_cpu_field64(void __percpu * bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { @@ -8849,11 +57302,2095 @@ index 3ca0cc4678862..dbae0c79d5cfb 100644 return v; } -diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c -index b7de5e46fdd8f..f84da849819cc 100644 ---- a/net/ipv6/seg6_local.c -+++ b/net/ipv6/seg6_local.c -@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt) +diff -rupN linux.orig/net/ipv4/af_inet.c.orig linux/net/ipv4/af_inet.c.orig +--- linux.orig/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/ipv4/af_inet.c.orig 2022-12-04 10:40:18.732054506 -0500 +@@ -0,0 +1,2081 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * INET An implementation of the TCP/IP protocol suite for the LINUX ++ * operating system. INET is implemented using the BSD Socket ++ * interface as the means of communication with the user level. ++ * ++ * PF_INET protocol family socket handler. ++ * ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Florian La Roche, ++ * Alan Cox, ++ * ++ * Changes (see also sock.c) ++ * ++ * piggy, ++ * Karl Knutson : Socket protocol table ++ * A.N.Kuznetsov : Socket death error in accept(). ++ * John Richardson : Fix non blocking error in connect() ++ * so sockets that fail to connect ++ * don't return -EINPROGRESS. ++ * Alan Cox : Asynchronous I/O support ++ * Alan Cox : Keep correct socket pointer on sock ++ * structures ++ * when accept() ed ++ * Alan Cox : Semantics of SO_LINGER aren't state ++ * moved to close when you look carefully. ++ * With this fixed and the accept bug fixed ++ * some RPC stuff seems happier. ++ * Niibe Yutaka : 4.4BSD style write async I/O ++ * Alan Cox, ++ * Tony Gale : Fixed reuse semantics. ++ * Alan Cox : bind() shouldn't abort existing but dead ++ * sockets. Stops FTP netin:.. I hope. ++ * Alan Cox : bind() works correctly for RAW sockets. ++ * Note that FreeBSD at least was broken ++ * in this respect so be careful with ++ * compatibility tests... 
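The net/dsa and net/ipv4 hunks above only switch the statistics readers from the u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() pair to the plain begin/retry variants; the retry loop itself is the usual sequence-counter read, sketched here with C11 atomics (single writer, simplified memory ordering, counter names invented).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;                 /* even: stable, odd: writer in progress */
static uint64_t tx_packets, tx_bytes;   /* the per-cpu counters being snapshotted */

static unsigned int stats_read_begin(void)
{
    unsigned int s;

    while ((s = atomic_load(&seq)) & 1)
        ;                               /* writer active, wait for an even value */
    return s;
}

static int stats_read_retry(unsigned int s)
{
    return atomic_load(&seq) != s;      /* changed underneath us: re-read */
}

int main(void)
{
    uint64_t pkts, bytes;
    unsigned int s;

    /* writer side (another CPU in the kernel): bump to odd, update, bump to even */
    atomic_fetch_add(&seq, 1);
    tx_packets += 1;
    tx_bytes += 1500;
    atomic_fetch_add(&seq, 1);

    do {
        s = stats_read_begin();
        pkts = tx_packets;
        bytes = tx_bytes;
    } while (stats_read_retry(s));

    printf("%llu packets, %llu bytes\n",
           (unsigned long long)pkts, (unsigned long long)bytes);
    return 0;
}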
++ * Alan Cox : routing cache support ++ * Alan Cox : memzero the socket structure for ++ * compactness. ++ * Matt Day : nonblock connect error handler ++ * Alan Cox : Allow large numbers of pending sockets ++ * (eg for big web sites), but only if ++ * specifically application requested. ++ * Alan Cox : New buffering throughout IP. Used ++ * dumbly. ++ * Alan Cox : New buffering now used smartly. ++ * Alan Cox : BSD rather than common sense ++ * interpretation of listen. ++ * Germano Caronni : Assorted small races. ++ * Alan Cox : sendmsg/recvmsg basic support. ++ * Alan Cox : Only sendmsg/recvmsg now supported. ++ * Alan Cox : Locked down bind (see security list). ++ * Alan Cox : Loosened bind a little. ++ * Mike McLagan : ADD/DEL DLCI Ioctls ++ * Willy Konynenberg : Transparent proxying support. ++ * David S. Miller : New socket lookup architecture. ++ * Some other random speedups. ++ * Cyrus Durgin : Cleaned up file for kmod hacks. ++ * Andi Kleen : Fix inet_stream_connect TCP race. ++ */ ++ ++#define pr_fmt(fmt) "IPv4: " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_IP_MROUTE ++#include ++#endif ++#include ++#include ++ ++#include ++ ++/* The inetsw table contains everything that inet_create needs to ++ * build a new socket. ++ */ ++static struct list_head inetsw[SOCK_MAX]; ++static DEFINE_SPINLOCK(inetsw_lock); ++ ++/* New destruction routine */ ++ ++void inet_sock_destruct(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ ++ __skb_queue_purge(&sk->sk_receive_queue); ++ __skb_queue_purge(&sk->sk_error_queue); ++ ++ sk_mem_reclaim_final(sk); ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { ++ pr_err("Attempt to release TCP socket in state %d %p\n", ++ sk->sk_state, sk); ++ return; ++ } ++ if (!sock_flag(sk, SOCK_DEAD)) { ++ pr_err("Attempt to release alive inet socket %p\n", sk); ++ return; ++ } ++ ++ WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); ++ WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); ++ WARN_ON_ONCE(sk->sk_wmem_queued); ++ WARN_ON_ONCE(sk_forward_alloc_get(sk)); ++ ++ kfree(rcu_dereference_protected(inet->inet_opt, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); ++ sk_refcnt_debug_dec(sk); ++} ++EXPORT_SYMBOL(inet_sock_destruct); ++ ++/* ++ * The routines beyond this point handle the behaviour of an AF_INET ++ * socket object. Mostly it punts to the subprotocols of IP to do ++ * the work. ++ */ ++ ++/* ++ * Automatically bind an unbound socket. ++ */ ++ ++static int inet_autobind(struct sock *sk) ++{ ++ struct inet_sock *inet; ++ /* We may need to bind the socket. */ ++ lock_sock(sk); ++ inet = inet_sk(sk); ++ if (!inet->inet_num) { ++ if (sk->sk_prot->get_port(sk, 0)) { ++ release_sock(sk); ++ return -EAGAIN; ++ } ++ inet->inet_sport = htons(inet->inet_num); ++ } ++ release_sock(sk); ++ return 0; ++} ++ ++/* ++ * Move a socket into listening state. 
++ */ ++int inet_listen(struct socket *sock, int backlog) ++{ ++ struct sock *sk = sock->sk; ++ unsigned char old_state; ++ int err, tcp_fastopen; ++ ++ lock_sock(sk); ++ ++ err = -EINVAL; ++ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) ++ goto out; ++ ++ old_state = sk->sk_state; ++ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) ++ goto out; ++ ++ WRITE_ONCE(sk->sk_max_ack_backlog, backlog); ++ /* Really, if the socket is already in listen state ++ * we can only allow the backlog to be adjusted. ++ */ ++ if (old_state != TCP_LISTEN) { ++ /* Enable TFO w/o requiring TCP_FASTOPEN socket option. ++ * Note that only TCP sockets (SOCK_STREAM) will reach here. ++ * Also fastopen backlog may already been set via the option ++ * because the socket was in TCP_LISTEN state previously but ++ * was shutdown() rather than close(). ++ */ ++ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); ++ if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && ++ (tcp_fastopen & TFO_SERVER_ENABLE) && ++ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { ++ fastopen_queue_tune(sk, backlog); ++ tcp_fastopen_init_key_once(sock_net(sk)); ++ } ++ ++ err = inet_csk_listen_start(sk); ++ if (err) ++ goto out; ++ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); ++ } ++ err = 0; ++ ++out: ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_listen); ++ ++/* ++ * Create an inet socket. ++ */ ++ ++static int inet_create(struct net *net, struct socket *sock, int protocol, ++ int kern) ++{ ++ struct sock *sk; ++ struct inet_protosw *answer; ++ struct inet_sock *inet; ++ struct proto *answer_prot; ++ unsigned char answer_flags; ++ int try_loading_module = 0; ++ int err; ++ ++ if (protocol < 0 || protocol >= IPPROTO_MAX) ++ return -EINVAL; ++ ++ sock->state = SS_UNCONNECTED; ++ ++ /* Look for the requested type/protocol pair. */ ++lookup_protocol: ++ err = -ESOCKTNOSUPPORT; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { ++ ++ err = 0; ++ /* Check the non-wild match. */ ++ if (protocol == answer->protocol) { ++ if (protocol != IPPROTO_IP) ++ break; ++ } else { ++ /* Check for the two wild cases. */ ++ if (IPPROTO_IP == protocol) { ++ protocol = answer->protocol; ++ break; ++ } ++ if (IPPROTO_IP == answer->protocol) ++ break; ++ } ++ err = -EPROTONOSUPPORT; ++ } ++ ++ if (unlikely(err)) { ++ if (try_loading_module < 2) { ++ rcu_read_unlock(); ++ /* ++ * Be more specific, e.g. net-pf-2-proto-132-type-1 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) ++ */ ++ if (++try_loading_module == 1) ++ request_module("net-pf-%d-proto-%d-type-%d", ++ PF_INET, protocol, sock->type); ++ /* ++ * Fall back to generic, e.g. 
net-pf-2-proto-132 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP) ++ */ ++ else ++ request_module("net-pf-%d-proto-%d", ++ PF_INET, protocol); ++ goto lookup_protocol; ++ } else ++ goto out_rcu_unlock; ++ } ++ ++ err = -EPERM; ++ if (sock->type == SOCK_RAW && !kern && ++ !ns_capable(net->user_ns, CAP_NET_RAW)) ++ goto out_rcu_unlock; ++ ++ sock->ops = answer->ops; ++ answer_prot = answer->prot; ++ answer_flags = answer->flags; ++ rcu_read_unlock(); ++ ++ WARN_ON(!answer_prot->slab); ++ ++ err = -ENOMEM; ++ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); ++ if (!sk) ++ goto out; ++ ++ err = 0; ++ if (INET_PROTOSW_REUSE & answer_flags) ++ sk->sk_reuse = SK_CAN_REUSE; ++ ++ inet = inet_sk(sk); ++ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; ++ ++ inet->nodefrag = 0; ++ ++ if (SOCK_RAW == sock->type) { ++ inet->inet_num = protocol; ++ if (IPPROTO_RAW == protocol) ++ inet->hdrincl = 1; ++ } ++ ++ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) ++ inet->pmtudisc = IP_PMTUDISC_DONT; ++ else ++ inet->pmtudisc = IP_PMTUDISC_WANT; ++ ++ inet->inet_id = 0; ++ ++ sock_init_data(sock, sk); ++ ++ sk->sk_destruct = inet_sock_destruct; ++ sk->sk_protocol = protocol; ++ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; ++ ++ inet->uc_ttl = -1; ++ inet->mc_loop = 1; ++ inet->mc_ttl = 1; ++ inet->mc_all = 1; ++ inet->mc_index = 0; ++ inet->mc_list = NULL; ++ inet->rcv_tos = 0; ++ ++ sk_refcnt_debug_inc(sk); ++ ++ if (inet->inet_num) { ++ /* It assumes that any protocol which allows ++ * the user to assign a number at socket ++ * creation time automatically ++ * shares. ++ */ ++ inet->inet_sport = htons(inet->inet_num); ++ /* Add to protocol hash chains. */ ++ err = sk->sk_prot->hash(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (sk->sk_prot->init) { ++ err = sk->sk_prot->init(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (!kern) { ++ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++out: ++ return err; ++out_rcu_unlock: ++ rcu_read_unlock(); ++ goto out; ++} ++ ++ ++/* ++ * The peer socket should always be NULL (or else). When we call this ++ * function we are destroying the object and from then on nobody ++ * should refer to it. ++ */ ++int inet_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (sk) { ++ long timeout; ++ ++ if (!sk->sk_kern_sock) ++ BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); ++ ++ /* Applications forget to leave groups before exiting */ ++ ip_mc_drop_socket(sk); ++ ++ /* If linger is set, we don't return until the close ++ * is complete. Otherwise we return immediately. The ++ * actually closing is done the same either way. ++ * ++ * If the close is due to the process exiting, we never ++ * linger.. ++ */ ++ timeout = 0; ++ if (sock_flag(sk, SOCK_LINGER) && ++ !(current->flags & PF_EXITING)) ++ timeout = sk->sk_lingertime; ++ sk->sk_prot->close(sk, timeout); ++ sock->sk = NULL; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(inet_release); ++ ++int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sock *sk = sock->sk; ++ u32 flags = BIND_WITH_LOCK; ++ int err; ++ ++ /* If the socket has its own bind function then use it. (RAW) */ ++ if (sk->sk_prot->bind) { ++ return sk->sk_prot->bind(sk, uaddr, addr_len); ++ } ++ if (addr_len < sizeof(struct sockaddr_in)) ++ return -EINVAL; ++ ++ /* BPF prog is run before any checks are done so that if the prog ++ * changes context in a wrong way it will be caught. 
++ */ ++ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, ++ CGROUP_INET4_BIND, &flags); ++ if (err) ++ return err; ++ ++ return __inet_bind(sk, uaddr, addr_len, flags); ++} ++EXPORT_SYMBOL(inet_bind); ++ ++int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, ++ u32 flags) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; ++ struct inet_sock *inet = inet_sk(sk); ++ struct net *net = sock_net(sk); ++ unsigned short snum; ++ int chk_addr_ret; ++ u32 tb_id = RT_TABLE_LOCAL; ++ int err; ++ ++ if (addr->sin_family != AF_INET) { ++ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) ++ * only if s_addr is INADDR_ANY. ++ */ ++ err = -EAFNOSUPPORT; ++ if (addr->sin_family != AF_UNSPEC || ++ addr->sin_addr.s_addr != htonl(INADDR_ANY)) ++ goto out; ++ } ++ ++ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; ++ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ++ ++ /* Not specified by any standard per-se, however it breaks too ++ * many applications when removed. It is unfortunate since ++ * allowing applications to make a non-local bind solves ++ * several problems with systems using dynamic addressing. ++ * (ie. your servers still start up even if your ISDN link ++ * is temporarily down) ++ */ ++ err = -EADDRNOTAVAIL; ++ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, ++ chk_addr_ret)) ++ goto out; ++ ++ snum = ntohs(addr->sin_port); ++ err = -EACCES; ++ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && ++ snum && inet_port_requires_bind_service(net, snum) && ++ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) ++ goto out; ++ ++ /* We keep a pair of addresses. rcv_saddr is the one ++ * used by hash lookups, and saddr is used for transmit. ++ * ++ * In the BSD API these are the same except where it ++ * would be illegal to use them (multicast/broadcast) in ++ * which case the sending device address is used. ++ */ ++ if (flags & BIND_WITH_LOCK) ++ lock_sock(sk); ++ ++ /* Check these errors (active socket, double bind). */ ++ err = -EINVAL; ++ if (sk->sk_state != TCP_CLOSE || inet->inet_num) ++ goto out_release_sock; ++ ++ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) ++ inet->inet_saddr = 0; /* Use device */ ++ ++ /* Make sure we are allowed to bind here. 
*/ ++ if (snum || !(inet->bind_address_no_port || ++ (flags & BIND_FORCE_ADDRESS_NO_PORT))) { ++ if (sk->sk_prot->get_port(sk, snum)) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ err = -EADDRINUSE; ++ goto out_release_sock; ++ } ++ if (!(flags & BIND_FROM_BPF)) { ++ err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); ++ if (err) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ if (sk->sk_prot->put_port) ++ sk->sk_prot->put_port(sk); ++ goto out_release_sock; ++ } ++ } ++ } ++ ++ if (inet->inet_rcv_saddr) ++ sk->sk_userlocks |= SOCK_BINDADDR_LOCK; ++ if (snum) ++ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; ++ inet->inet_sport = htons(inet->inet_num); ++ inet->inet_daddr = 0; ++ inet->inet_dport = 0; ++ sk_dst_reset(sk); ++ err = 0; ++out_release_sock: ++ if (flags & BIND_WITH_LOCK) ++ release_sock(sk); ++out: ++ return err; ++} ++ ++int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ if (uaddr->sa_family == AF_UNSPEC) ++ return sk->sk_prot->disconnect(sk, flags); ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ return err; ++ } ++ ++ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) ++ return -EAGAIN; ++ return sk->sk_prot->connect(sk, uaddr, addr_len); ++} ++EXPORT_SYMBOL(inet_dgram_connect); ++ ++static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) ++{ ++ DEFINE_WAIT_FUNC(wait, woken_wake_function); ++ ++ add_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending += writebias; ++ ++ /* Basic assumption: if someone sets sk->sk_err, he _must_ ++ * change state of the socket from TCP_SYN_*. ++ * Connect() does not allow to get error notifications ++ * without closing the socket. ++ */ ++ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ release_sock(sk); ++ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ++ lock_sock(sk); ++ if (signal_pending(current) || !timeo) ++ break; ++ } ++ remove_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending -= writebias; ++ return timeo; ++} ++ ++/* ++ * Connect to a remote host. There is regrettably still a little ++ * TCP 'magic' in here. ++ */ ++int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags, int is_sendmsg) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ long timeo; ++ ++ /* ++ * uaddr can be NULL and addr_len can be 0 if: ++ * sk is a TCP fastopen active socket and ++ * TCP_FASTOPEN_CONNECT sockopt is set and ++ * we already have a valid cookie for this socket. ++ * In this case, user can call write() after connect(). ++ * write() will invoke tcp_sendmsg_fastopen() which calls ++ * __inet_stream_connect(). ++ */ ++ if (uaddr) { ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ ++ if (uaddr->sa_family == AF_UNSPEC) { ++ err = sk->sk_prot->disconnect(sk, flags); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ goto out; ++ } ++ } ++ ++ switch (sock->state) { ++ default: ++ err = -EINVAL; ++ goto out; ++ case SS_CONNECTED: ++ err = -EISCONN; ++ goto out; ++ case SS_CONNECTING: ++ if (inet_sk(sk)->defer_connect) ++ err = is_sendmsg ? 
-EINPROGRESS : -EISCONN; ++ else ++ err = -EALREADY; ++ /* Fall out of switch with err, set for this state */ ++ break; ++ case SS_UNCONNECTED: ++ err = -EISCONN; ++ if (sk->sk_state != TCP_CLOSE) ++ goto out; ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ goto out; ++ } ++ ++ err = sk->sk_prot->connect(sk, uaddr, addr_len); ++ if (err < 0) ++ goto out; ++ ++ sock->state = SS_CONNECTING; ++ ++ if (!err && inet_sk(sk)->defer_connect) ++ goto out; ++ ++ /* Just entered SS_CONNECTING state; the only ++ * difference is that return value in non-blocking ++ * case is EINPROGRESS, rather than EALREADY. ++ */ ++ err = -EINPROGRESS; ++ break; ++ } ++ ++ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); ++ ++ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ int writebias = (sk->sk_protocol == IPPROTO_TCP) && ++ tcp_sk(sk)->fastopen_req && ++ tcp_sk(sk)->fastopen_req->data ? 1 : 0; ++ ++ /* Error code is set above */ ++ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) ++ goto out; ++ ++ err = sock_intr_errno(timeo); ++ if (signal_pending(current)) ++ goto out; ++ } ++ ++ /* Connection was closed by RST, timeout, ICMP error ++ * or another process disconnected us. ++ */ ++ if (sk->sk_state == TCP_CLOSE) ++ goto sock_error; ++ ++ /* sk->sk_err may be not zero now, if RECVERR was ordered by user ++ * and error was received after socket entered established state. ++ * Hence, it is handled normally after connect() return successfully. ++ */ ++ ++ sock->state = SS_CONNECTED; ++ err = 0; ++out: ++ return err; ++ ++sock_error: ++ err = sock_error(sk) ? : -ECONNABORTED; ++ sock->state = SS_UNCONNECTED; ++ if (sk->sk_prot->disconnect(sk, flags)) ++ sock->state = SS_DISCONNECTING; ++ goto out; ++} ++EXPORT_SYMBOL(__inet_stream_connect); ++ ++int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ int err; ++ ++ lock_sock(sock->sk); ++ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); ++ release_sock(sock->sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_stream_connect); ++ ++/* ++ * Accept a pending connection. The TCP layer now gives BSD semantics. ++ */ ++ ++int inet_accept(struct socket *sock, struct socket *newsock, int flags, ++ bool kern) ++{ ++ struct sock *sk1 = sock->sk; ++ int err = -EINVAL; ++ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); ++ ++ if (!sk2) ++ goto do_err; ++ ++ lock_sock(sk2); ++ ++ sock_rps_record_flow(sk2); ++ WARN_ON(!((1 << sk2->sk_state) & ++ (TCPF_ESTABLISHED | TCPF_SYN_RECV | ++ TCPF_CLOSE_WAIT | TCPF_CLOSE))); ++ ++ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) ++ set_bit(SOCK_SUPPORT_ZC, &newsock->flags); ++ sock_graft(sk2, newsock); ++ ++ newsock->state = SS_CONNECTED; ++ err = 0; ++ release_sock(sk2); ++do_err: ++ return err; ++} ++EXPORT_SYMBOL(inet_accept); ++ ++/* ++ * This does both peername and sockname. 
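__inet_stream_connect() above is the kernel half of the familiar non-blocking connect sequence: EINPROGRESS on the first call, EALREADY while the handshake is pending, EISCONN afterwards. For context, this is the matching caller-side pattern in userspace (127.0.0.1 port 9 is an arbitrary placeholder, so the printed result depends on the machine).

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(9) };

    if (fd < 0)
        return 1;
    inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);
    fcntl(fd, F_SETFL, O_NONBLOCK);

    if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0) {
        puts("connected immediately");
    } else if (errno == EINPROGRESS) {
        /* the socket is now in SS_CONNECTING; wait for writability and read
         * the deferred result with SO_ERROR */
        struct pollfd pfd = { .fd = fd, .events = POLLOUT };
        int soerr = 0;
        socklen_t len = sizeof(soerr);

        poll(&pfd, 1, 1000);
        getsockopt(fd, SOL_SOCKET, SO_ERROR, &soerr, &len);
        printf("deferred result: %s\n", soerr ? strerror(soerr) : "connected");
    } else {
        printf("connect: %s\n", strerror(errno));
    }
    close(fd);
    return 0;
}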
++ */ ++int inet_getname(struct socket *sock, struct sockaddr *uaddr, ++ int peer) ++{ ++ struct sock *sk = sock->sk; ++ struct inet_sock *inet = inet_sk(sk); ++ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); ++ ++ sin->sin_family = AF_INET; ++ lock_sock(sk); ++ if (peer) { ++ if (!inet->inet_dport || ++ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && ++ peer == 1)) { ++ release_sock(sk); ++ return -ENOTCONN; ++ } ++ sin->sin_port = inet->inet_dport; ++ sin->sin_addr.s_addr = inet->inet_daddr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETPEERNAME); ++ } else { ++ __be32 addr = inet->inet_rcv_saddr; ++ if (!addr) ++ addr = inet->inet_saddr; ++ sin->sin_port = inet->inet_sport; ++ sin->sin_addr.s_addr = addr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETSOCKNAME); ++ } ++ release_sock(sk); ++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); ++ return sizeof(*sin); ++} ++EXPORT_SYMBOL(inet_getname); ++ ++int inet_send_prepare(struct sock *sk) ++{ ++ sock_rps_record_flow(sk); ++ ++ /* We may need to bind the socket. */ ++ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && ++ inet_autobind(sk)) ++ return -EAGAIN; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(inet_send_prepare); ++ ++int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, ++ sk, msg, size); ++} ++EXPORT_SYMBOL(inet_sendmsg); ++ ++ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ if (sk->sk_prot->sendpage) ++ return sk->sk_prot->sendpage(sk, page, offset, size, flags); ++ return sock_no_sendpage(sock, page, offset, size, flags); ++} ++EXPORT_SYMBOL(inet_sendpage); ++ ++INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, ++ size_t, int, int *)); ++int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ++ int flags) ++{ ++ struct sock *sk = sock->sk; ++ int addr_len = 0; ++ int err; ++ ++ if (likely(!(flags & MSG_ERRQUEUE))) ++ sock_rps_record_flow(sk); ++ ++ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, ++ sk, msg, size, flags, &addr_len); ++ if (err >= 0) ++ msg->msg_namelen = addr_len; ++ return err; ++} ++EXPORT_SYMBOL(inet_recvmsg); ++ ++int inet_shutdown(struct socket *sock, int how) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ ++ /* This should really check to make sure ++ * the socket is a TCP socket. (WHY AC...) ++ */ ++ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and ++ 1->2 bit 2 snds. ++ 2->3 */ ++ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ ++ return -EINVAL; ++ ++ lock_sock(sk); ++ if (sock->state == SS_CONNECTING) { ++ if ((1 << sk->sk_state) & ++ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) ++ sock->state = SS_DISCONNECTING; ++ else ++ sock->state = SS_CONNECTED; ++ } ++ ++ switch (sk->sk_state) { ++ case TCP_CLOSE: ++ err = -ENOTCONN; ++ /* Hack to wake up other listeners, who can poll for ++ EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ ++ fallthrough; ++ default: ++ sk->sk_shutdown |= how; ++ if (sk->sk_prot->shutdown) ++ sk->sk_prot->shutdown(sk, how); ++ break; ++ ++ /* Remaining two branches are temporary solution for missing ++ * close() in multithreaded environment. 
It is _not_ a good idea, ++ * but we have no choice until close() is repaired at VFS level. ++ */ ++ case TCP_LISTEN: ++ if (!(how & RCV_SHUTDOWN)) ++ break; ++ fallthrough; ++ case TCP_SYN_SENT: ++ err = sk->sk_prot->disconnect(sk, O_NONBLOCK); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ break; ++ } ++ ++ /* Wake up anyone sleeping in poll. */ ++ sk->sk_state_change(sk); ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_shutdown); ++ ++/* ++ * ioctl() calls you can issue on an INET socket. Most of these are ++ * device configuration and stuff and very rarely used. Some ioctls ++ * pass on to the socket itself. ++ * ++ * NOTE: I like the idea of a module for the config stuff. ie ifconfig ++ * loads the devconfigure module does its configuring and unloads it. ++ * There's a good 20K of config code hanging around the kernel. ++ */ ++ ++int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ struct net *net = sock_net(sk); ++ void __user *p = (void __user *)arg; ++ struct ifreq ifr; ++ struct rtentry rt; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ if (copy_from_user(&rt, p, sizeof(struct rtentry))) ++ return -EFAULT; ++ err = ip_rt_ioctl(net, cmd, &rt); ++ break; ++ case SIOCRTMSG: ++ err = -EINVAL; ++ break; ++ case SIOCDARP: ++ case SIOCGARP: ++ case SIOCSARP: ++ err = arp_ioctl(net, cmd, (void __user *)arg); ++ break; ++ case SIOCGIFADDR: ++ case SIOCGIFBRDADDR: ++ case SIOCGIFNETMASK: ++ case SIOCGIFDSTADDR: ++ case SIOCGIFPFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ if (!err && put_user_ifreq(&ifr, p)) ++ err = -EFAULT; ++ break; ++ ++ case SIOCSIFADDR: ++ case SIOCSIFBRDADDR: ++ case SIOCSIFNETMASK: ++ case SIOCSIFDSTADDR: ++ case SIOCSIFPFLAGS: ++ case SIOCSIFFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ break; ++ default: ++ if (sk->sk_prot->ioctl) ++ err = sk->sk_prot->ioctl(sk, cmd, arg); ++ else ++ err = -ENOIOCTLCMD; ++ break; ++ } ++ return err; ++} ++EXPORT_SYMBOL(inet_ioctl); ++ ++#ifdef CONFIG_COMPAT ++static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, ++ struct compat_rtentry __user *ur) ++{ ++ compat_uptr_t rtdev; ++ struct rtentry rt; ++ ++ if (copy_from_user(&rt.rt_dst, &ur->rt_dst, ++ 3 * sizeof(struct sockaddr)) || ++ get_user(rt.rt_flags, &ur->rt_flags) || ++ get_user(rt.rt_metric, &ur->rt_metric) || ++ get_user(rt.rt_mtu, &ur->rt_mtu) || ++ get_user(rt.rt_window, &ur->rt_window) || ++ get_user(rt.rt_irtt, &ur->rt_irtt) || ++ get_user(rtdev, &ur->rt_dev)) ++ return -EFAULT; ++ ++ rt.rt_dev = compat_ptr(rtdev); ++ return ip_rt_ioctl(sock_net(sk), cmd, &rt); ++} ++ ++static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ void __user *argp = compat_ptr(arg); ++ struct sock *sk = sock->sk; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ return inet_compat_routing_ioctl(sk, cmd, argp); ++ default: ++ if (!sk->sk_prot->compat_ioctl) ++ return -ENOIOCTLCMD; ++ return sk->sk_prot->compat_ioctl(sk, cmd, arg); ++ } ++} ++#endif /* CONFIG_COMPAT */ ++ ++const struct proto_ops inet_stream_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_stream_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = inet_accept, ++ .getname = inet_getname, ++ .poll = tcp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp 
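The "how++" comment in inet_shutdown() above is easier to see as a table: SHUT_RD, SHUT_WR and SHUT_RDWR (0, 1, 2) become the bitmasks 1, 2 and 3, so bit 0 means "receives shut down" and bit 1 "sends shut down". The constants are restated below with their conventional values just for the demo.

#include <stdio.h>

#define RCV_SHUTDOWN  1
#define SEND_SHUTDOWN 2

int main(void)
{
    for (int how = 0; how <= 2; how++) {   /* SHUT_RD, SHUT_WR, SHUT_RDWR */
        int bits = how + 1;                /* the how++ in inet_shutdown() */

        printf("shutdown(fd, %d) -> rcv:%d snd:%d\n", how,
               !!(bits & RCV_SHUTDOWN), !!(bits & SEND_SHUTDOWN));
    }
    return 0;
}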
= sock_gettstamp, ++ .listen = inet_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++#ifdef CONFIG_MMU ++ .mmap = tcp_mmap, ++#endif ++ .sendpage = inet_sendpage, ++ .splice_read = tcp_splice_read, ++ .read_sock = tcp_read_sock, ++ .read_skb = tcp_read_skb, ++ .sendmsg_locked = tcp_sendmsg_locked, ++ .sendpage_locked = tcp_sendpage_locked, ++ .peek_len = tcp_peek_len, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++ .set_rcvlowat = tcp_set_rcvlowat, ++}; ++EXPORT_SYMBOL(inet_stream_ops); ++ ++const struct proto_ops inet_dgram_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = udp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .read_skb = udp_read_skb, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++ .set_peek_off = sk_set_peek_off, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++EXPORT_SYMBOL(inet_dgram_ops); ++ ++/* ++ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without ++ * udp_poll ++ */ ++static const struct proto_ops inet_sockraw_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = datagram_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++ ++static const struct net_proto_family inet_family_ops = { ++ .family = PF_INET, ++ .create = inet_create, ++ .owner = THIS_MODULE, ++}; ++ ++/* Upon startup we insert all the elements in inetsw_array[] into ++ * the linked list inetsw. ++ */ ++static struct inet_protosw inetsw_array[] = ++{ ++ { ++ .type = SOCK_STREAM, ++ .protocol = IPPROTO_TCP, ++ .prot = &tcp_prot, ++ .ops = &inet_stream_ops, ++ .flags = INET_PROTOSW_PERMANENT | ++ INET_PROTOSW_ICSK, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_UDP, ++ .prot = &udp_prot, ++ .ops = &inet_dgram_ops, ++ .flags = INET_PROTOSW_PERMANENT, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_ICMP, ++ .prot = &ping_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ }, ++ ++ { ++ .type = SOCK_RAW, ++ .protocol = IPPROTO_IP, /* wild card */ ++ .prot = &raw_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ } ++}; ++ ++#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) ++ ++void inet_register_protosw(struct inet_protosw *p) ++{ ++ struct list_head *lh; ++ struct inet_protosw *answer; ++ int protocol = p->protocol; ++ struct list_head *last_perm; ++ ++ spin_lock_bh(&inetsw_lock); ++ ++ if (p->type >= SOCK_MAX) ++ goto out_illegal; ++ ++ /* If we are trying to override a permanent protocol, bail. 
*/ ++ last_perm = &inetsw[p->type]; ++ list_for_each(lh, &inetsw[p->type]) { ++ answer = list_entry(lh, struct inet_protosw, list); ++ /* Check only the non-wild match. */ ++ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) ++ break; ++ if (protocol == answer->protocol) ++ goto out_permanent; ++ last_perm = lh; ++ } ++ ++ /* Add the new entry after the last permanent entry if any, so that ++ * the new entry does not override a permanent entry when matched with ++ * a wild-card protocol. But it is allowed to override any existing ++ * non-permanent entry. This means that when we remove this entry, the ++ * system automatically returns to the old behavior. ++ */ ++ list_add_rcu(&p->list, last_perm); ++out: ++ spin_unlock_bh(&inetsw_lock); ++ ++ return; ++ ++out_permanent: ++ pr_err("Attempt to override permanent protocol %d\n", protocol); ++ goto out; ++ ++out_illegal: ++ pr_err("Ignoring attempt to register invalid socket type %d\n", ++ p->type); ++ goto out; ++} ++EXPORT_SYMBOL(inet_register_protosw); ++ ++void inet_unregister_protosw(struct inet_protosw *p) ++{ ++ if (INET_PROTOSW_PERMANENT & p->flags) { ++ pr_err("Attempt to unregister permanent protocol %d\n", ++ p->protocol); ++ } else { ++ spin_lock_bh(&inetsw_lock); ++ list_del_rcu(&p->list); ++ spin_unlock_bh(&inetsw_lock); ++ ++ synchronize_net(); ++ } ++} ++EXPORT_SYMBOL(inet_unregister_protosw); ++ ++static int inet_sk_reselect_saddr(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ __be32 old_saddr = inet->inet_saddr; ++ __be32 daddr = inet->inet_daddr; ++ struct flowi4 *fl4; ++ struct rtable *rt; ++ __be32 new_saddr; ++ struct ip_options_rcu *inet_opt; ++ ++ inet_opt = rcu_dereference_protected(inet->inet_opt, ++ lockdep_sock_is_held(sk)); ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ ++ /* Query new route. */ ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, ++ sk->sk_protocol, inet->inet_sport, ++ inet->inet_dport, sk); ++ if (IS_ERR(rt)) ++ return PTR_ERR(rt); ++ ++ sk_setup_caps(sk, &rt->dst); ++ ++ new_saddr = fl4->saddr; ++ ++ if (new_saddr == old_saddr) ++ return 0; ++ ++ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { ++ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", ++ __func__, &old_saddr, &new_saddr); ++ } ++ ++ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; ++ ++ /* ++ * XXX The only one ugly spot where we need to ++ * XXX really change the sockets identity after ++ * XXX it has entered the hashes. -DaveM ++ * ++ * Besides that, it does not check for connection ++ * uniqueness. Wait for troubles. ++ */ ++ return __sk_prot_rehash(sk); ++} ++ ++int inet_sk_rebuild_header(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); ++ __be32 daddr; ++ struct ip_options_rcu *inet_opt; ++ struct flowi4 *fl4; ++ int err; ++ ++ /* Route is OK, nothing to do. */ ++ if (rt) ++ return 0; ++ ++ /* Reroute. */ ++ rcu_read_lock(); ++ inet_opt = rcu_dereference(inet->inet_opt); ++ daddr = inet->inet_daddr; ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ rcu_read_unlock(); ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, ++ inet->inet_dport, inet->inet_sport, ++ sk->sk_protocol, RT_CONN_FLAGS(sk), ++ sk->sk_bound_dev_if); ++ if (!IS_ERR(rt)) { ++ err = 0; ++ sk_setup_caps(sk, &rt->dst); ++ } else { ++ err = PTR_ERR(rt); ++ ++ /* Routing failed... 
*/ ++ sk->sk_route_caps = 0; ++ /* ++ * Other protocols have to map its equivalent state to TCP_SYN_SENT. ++ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme ++ */ ++ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || ++ sk->sk_state != TCP_SYN_SENT || ++ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || ++ (err = inet_sk_reselect_saddr(sk)) != 0) ++ sk->sk_err_soft = -err; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(inet_sk_rebuild_header); ++ ++void inet_sk_set_state(struct sock *sk, int state) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, state); ++ sk->sk_state = state; ++} ++EXPORT_SYMBOL(inet_sk_set_state); ++ ++void inet_sk_state_store(struct sock *sk, int newstate) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, newstate); ++ smp_store_release(&sk->sk_state, newstate); ++} ++ ++struct sk_buff *inet_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ bool udpfrag = false, fixedid = false, gso_partial, encap; ++ struct sk_buff *segs = ERR_PTR(-EINVAL); ++ const struct net_offload *ops; ++ unsigned int offset = 0; ++ struct iphdr *iph; ++ int proto, tot_len; ++ int nhoff; ++ int ihl; ++ int id; ++ ++ skb_reset_network_header(skb); ++ nhoff = skb_network_header(skb) - skb_mac_header(skb); ++ if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) ++ goto out; ++ ++ iph = ip_hdr(skb); ++ ihl = iph->ihl * 4; ++ if (ihl < sizeof(*iph)) ++ goto out; ++ ++ id = ntohs(iph->id); ++ proto = iph->protocol; ++ ++ /* Warning: after this point, iph might be no longer valid */ ++ if (unlikely(!pskb_may_pull(skb, ihl))) ++ goto out; ++ __skb_pull(skb, ihl); ++ ++ encap = SKB_GSO_CB(skb)->encap_level > 0; ++ if (encap) ++ features &= skb->dev->hw_enc_features; ++ SKB_GSO_CB(skb)->encap_level += ihl; ++ ++ skb_reset_transport_header(skb); ++ ++ segs = ERR_PTR(-EPROTONOSUPPORT); ++ ++ if (!skb->encapsulation || encap) { ++ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ++ fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); ++ ++ /* fixed ID is invalid if DF bit is not set */ ++ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) ++ goto out; ++ } ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (likely(ops && ops->callbacks.gso_segment)) { ++ segs = ops->callbacks.gso_segment(skb, features); ++ if (!segs) ++ skb->network_header = skb_mac_header(skb) + nhoff - skb->head; ++ } ++ ++ if (IS_ERR_OR_NULL(segs)) ++ goto out; ++ ++ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); ++ ++ skb = segs; ++ do { ++ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); ++ if (udpfrag) { ++ iph->frag_off = htons(offset >> 3); ++ if (skb->next) ++ iph->frag_off |= htons(IP_MF); ++ offset += skb->len - nhoff - ihl; ++ tot_len = skb->len - nhoff; ++ } else if (skb_is_gso(skb)) { ++ if (!fixedid) { ++ iph->id = htons(id); ++ id += skb_shinfo(skb)->gso_segs; ++ } ++ ++ if (gso_partial) ++ tot_len = skb_shinfo(skb)->gso_size + ++ SKB_GSO_CB(skb)->data_offset + ++ skb->head - (unsigned char *)iph; ++ else ++ tot_len = skb->len - nhoff; ++ } else { ++ if (!fixedid) ++ iph->id = htons(id++); ++ tot_len = skb->len - nhoff; ++ } ++ iph->tot_len = htons(tot_len); ++ ip_send_check(iph); ++ if (encap) ++ skb_reset_inner_headers(skb); ++ skb->network_header = (u8 *)iph - skb->head; ++ skb_reset_mac_len(skb); ++ } while ((skb = skb->next)); ++ ++out: ++ return segs; ++} ++ ++static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) ++ return ERR_PTR(-EINVAL); ++ ++ 
return inet_gso_segment(skb, features); ++} ++ ++struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) ++{ ++ const struct net_offload *ops; ++ struct sk_buff *pp = NULL; ++ const struct iphdr *iph; ++ struct sk_buff *p; ++ unsigned int hlen; ++ unsigned int off; ++ unsigned int id; ++ int flush = 1; ++ int proto; ++ ++ off = skb_gro_offset(skb); ++ hlen = off + sizeof(*iph); ++ iph = skb_gro_header_fast(skb, off); ++ if (skb_gro_header_hard(skb, hlen)) { ++ iph = skb_gro_header_slow(skb, hlen, off); ++ if (unlikely(!iph)) ++ goto out; ++ } ++ ++ proto = iph->protocol; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (!ops || !ops->callbacks.gro_receive) ++ goto out; ++ ++ if (*(u8 *)iph != 0x45) ++ goto out; ++ ++ if (ip_is_fragment(iph)) ++ goto out; ++ ++ if (unlikely(ip_fast_csum((u8 *)iph, 5))) ++ goto out; ++ ++ id = ntohl(*(__be32 *)&iph->id); ++ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); ++ id >>= 16; ++ ++ list_for_each_entry(p, head, list) { ++ struct iphdr *iph2; ++ u16 flush_id; ++ ++ if (!NAPI_GRO_CB(p)->same_flow) ++ continue; ++ ++ iph2 = (struct iphdr *)(p->data + off); ++ /* The above works because, with the exception of the top ++ * (inner most) layer, we only aggregate pkts with the same ++ * hdr length so all the hdrs we'll need to verify will start ++ * at the same offset. ++ */ ++ if ((iph->protocol ^ iph2->protocol) | ++ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ++ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { ++ NAPI_GRO_CB(p)->same_flow = 0; ++ continue; ++ } ++ ++ /* All fields must match except length and checksum. */ ++ NAPI_GRO_CB(p)->flush |= ++ (iph->ttl ^ iph2->ttl) | ++ (iph->tos ^ iph2->tos) | ++ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); ++ ++ NAPI_GRO_CB(p)->flush |= flush; ++ ++ /* We need to store of the IP ID check to be included later ++ * when we can verify that this packet does in fact belong ++ * to a given flow. ++ */ ++ flush_id = (u16)(id - ntohs(iph2->id)); ++ ++ /* This bit of code makes it much easier for us to identify ++ * the cases where we are doing atomic vs non-atomic IP ID ++ * checks. Specifically an atomic check can return IP ID ++ * values 0 - 0xFFFF, while a non-atomic check can only ++ * return 0 or 0xFFFF. ++ */ ++ if (!NAPI_GRO_CB(p)->is_atomic || ++ !(iph->frag_off & htons(IP_DF))) { ++ flush_id ^= NAPI_GRO_CB(p)->count; ++ flush_id = flush_id ? 0xFFFF : 0; ++ } ++ ++ /* If the previous IP ID value was based on an atomic ++ * datagram we can overwrite the value and ignore it. ++ */ ++ if (NAPI_GRO_CB(skb)->is_atomic) ++ NAPI_GRO_CB(p)->flush_id = flush_id; ++ else ++ NAPI_GRO_CB(p)->flush_id |= flush_id; ++ } ++ ++ NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); ++ NAPI_GRO_CB(skb)->flush |= flush; ++ skb_set_network_header(skb, off); ++ /* The above will be needed by the transport layer if there is one ++ * immediately following this IP hdr. 
++ */ ++ ++ /* Note : No need to call skb_gro_postpull_rcsum() here, ++ * as we already checked checksum over ipv4 header was 0 ++ */ ++ skb_gro_pull(skb, sizeof(*iph)); ++ skb_set_transport_header(skb, skb_gro_offset(skb)); ++ ++ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ++ ops->callbacks.gro_receive, head, skb); ++ ++out: ++ skb_gro_flush_final(skb, pp, flush); ++ ++ return pp; ++} ++ ++static struct sk_buff *ipip_gro_receive(struct list_head *head, ++ struct sk_buff *skb) ++{ ++ if (NAPI_GRO_CB(skb)->encap_mark) { ++ NAPI_GRO_CB(skb)->flush = 1; ++ return NULL; ++ } ++ ++ NAPI_GRO_CB(skb)->encap_mark = 1; ++ ++ return inet_gro_receive(head, skb); ++} ++ ++#define SECONDS_PER_DAY 86400 ++ ++/* inet_current_timestamp - Return IP network timestamp ++ * ++ * Return milliseconds since midnight in network byte order. ++ */ ++__be32 inet_current_timestamp(void) ++{ ++ u32 secs; ++ u32 msecs; ++ struct timespec64 ts; ++ ++ ktime_get_real_ts64(&ts); ++ ++ /* Get secs since midnight. */ ++ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); ++ /* Convert to msecs. */ ++ msecs = secs * MSEC_PER_SEC; ++ /* Convert nsec to msec. */ ++ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; ++ ++ /* Convert to network byte order. */ ++ return htonl(msecs); ++} ++EXPORT_SYMBOL(inet_current_timestamp); ++ ++int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) ++{ ++ if (sk->sk_family == AF_INET) ++ return ip_recv_error(sk, msg, len, addr_len); ++#if IS_ENABLED(CONFIG_IPV6) ++ if (sk->sk_family == AF_INET6) ++ return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); ++#endif ++ return -EINVAL; ++} ++ ++int inet_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ __be16 newlen = htons(skb->len - nhoff); ++ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); ++ const struct net_offload *ops; ++ int proto = iph->protocol; ++ int err = -ENOSYS; ++ ++ if (skb->encapsulation) { ++ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); ++ skb_set_inner_network_header(skb, nhoff); ++ } ++ ++ csum_replace2(&iph->check, iph->tot_len, newlen); ++ iph->tot_len = newlen; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (WARN_ON(!ops || !ops->callbacks.gro_complete)) ++ goto out; ++ ++ /* Only need to add sizeof(*iph) to get to the next hdr below ++ * because any hdr with option will have been flushed in ++ * inet_gro_receive(). ++ */ ++ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, ++ tcp4_gro_complete, udp4_gro_complete, ++ skb, nhoff + sizeof(*iph)); ++ ++out: ++ return err; ++} ++ ++static int ipip_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ skb->encapsulation = 1; ++ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; ++ return inet_gro_complete(skb, nhoff); ++} ++ ++int inet_ctl_sock_create(struct sock **sk, unsigned short family, ++ unsigned short type, unsigned char protocol, ++ struct net *net) ++{ ++ struct socket *sock; ++ int rc = sock_create_kern(net, family, type, protocol, &sock); ++ ++ if (rc == 0) { ++ *sk = sock->sk; ++ (*sk)->sk_allocation = GFP_ATOMIC; ++ /* ++ * Unhash it so that IP input processing does not even see it, ++ * we do not wish this socket to see incoming packets. 
++ */ ++ (*sk)->sk_prot->unhash(*sk); ++ } ++ return rc; ++} ++EXPORT_SYMBOL_GPL(inet_ctl_sock_create); ++ ++unsigned long snmp_fold_field(void __percpu *mib, int offt) ++{ ++ unsigned long res = 0; ++ int i; ++ ++ for_each_possible_cpu(i) ++ res += snmp_get_cpu_field(mib, i, offt); ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field); ++ ++#if BITS_PER_LONG==32 ++ ++u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, ++ size_t syncp_offset) ++{ ++ void *bhptr; ++ struct u64_stats_sync *syncp; ++ u64 v; ++ unsigned int start; ++ ++ bhptr = per_cpu_ptr(mib, cpu); ++ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); ++ do { ++ start = u64_stats_fetch_begin_irq(syncp); ++ v = *(((u64 *)bhptr) + offt); ++ } while (u64_stats_fetch_retry_irq(syncp, start)); ++ ++ return v; ++} ++EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); ++ ++u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) ++{ ++ u64 res = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); ++ } ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field64); ++#endif ++ ++#ifdef CONFIG_IP_MULTICAST ++static const struct net_protocol igmp_protocol = { ++ .handler = igmp_rcv, ++}; ++#endif ++ ++static const struct net_protocol tcp_protocol = { ++ .handler = tcp_v4_rcv, ++ .err_handler = tcp_v4_err, ++ .no_policy = 1, ++ .icmp_strict_tag_validation = 1, ++}; ++ ++static const struct net_protocol udp_protocol = { ++ .handler = udp_rcv, ++ .err_handler = udp_err, ++ .no_policy = 1, ++}; ++ ++static const struct net_protocol icmp_protocol = { ++ .handler = icmp_rcv, ++ .err_handler = icmp_err, ++ .no_policy = 1, ++}; ++ ++static __net_init int ipv4_mib_init_net(struct net *net) ++{ ++ int i; ++ ++ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); ++ if (!net->mib.tcp_statistics) ++ goto err_tcp_mib; ++ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); ++ if (!net->mib.ip_statistics) ++ goto err_ip_mib; ++ ++ for_each_possible_cpu(i) { ++ struct ipstats_mib *af_inet_stats; ++ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); ++ u64_stats_init(&af_inet_stats->syncp); ++ } ++ ++ net->mib.net_statistics = alloc_percpu(struct linux_mib); ++ if (!net->mib.net_statistics) ++ goto err_net_mib; ++ net->mib.udp_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udp_statistics) ++ goto err_udp_mib; ++ net->mib.udplite_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udplite_statistics) ++ goto err_udplite_mib; ++ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); ++ if (!net->mib.icmp_statistics) ++ goto err_icmp_mib; ++ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), ++ GFP_KERNEL); ++ if (!net->mib.icmpmsg_statistics) ++ goto err_icmpmsg_mib; ++ ++ tcp_mib_init(net); ++ return 0; ++ ++err_icmpmsg_mib: ++ free_percpu(net->mib.icmp_statistics); ++err_icmp_mib: ++ free_percpu(net->mib.udplite_statistics); ++err_udplite_mib: ++ free_percpu(net->mib.udp_statistics); ++err_udp_mib: ++ free_percpu(net->mib.net_statistics); ++err_net_mib: ++ free_percpu(net->mib.ip_statistics); ++err_ip_mib: ++ free_percpu(net->mib.tcp_statistics); ++err_tcp_mib: ++ return -ENOMEM; ++} ++ ++static __net_exit void ipv4_mib_exit_net(struct net *net) ++{ ++ kfree(net->mib.icmpmsg_statistics); ++ free_percpu(net->mib.icmp_statistics); ++ free_percpu(net->mib.udplite_statistics); ++ free_percpu(net->mib.udp_statistics); ++ free_percpu(net->mib.net_statistics); ++ free_percpu(net->mib.ip_statistics); ++ 
free_percpu(net->mib.tcp_statistics); ++#ifdef CONFIG_MPTCP ++ /* allocated on demand, see mptcp_init_sock() */ ++ free_percpu(net->mib.mptcp_statistics); ++#endif ++} ++ ++static __net_initdata struct pernet_operations ipv4_mib_ops = { ++ .init = ipv4_mib_init_net, ++ .exit = ipv4_mib_exit_net, ++}; ++ ++static int __init init_ipv4_mibs(void) ++{ ++ return register_pernet_subsys(&ipv4_mib_ops); ++} ++ ++static __net_init int inet_init_net(struct net *net) ++{ ++ /* ++ * Set defaults for local port range ++ */ ++ seqlock_init(&net->ipv4.ip_local_ports.lock); ++ net->ipv4.ip_local_ports.range[0] = 32768; ++ net->ipv4.ip_local_ports.range[1] = 60999; ++ ++ seqlock_init(&net->ipv4.ping_group_range.lock); ++ /* ++ * Sane defaults - nobody may create ping sockets. ++ * Boot scripts should set this to distro-specific group. ++ */ ++ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); ++ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); ++ ++ /* Default values for sysctl-controlled parameters. ++ * We set them here, in case sysctl is not compiled. ++ */ ++ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; ++ net->ipv4.sysctl_ip_fwd_update_priority = 1; ++ net->ipv4.sysctl_ip_dynaddr = 0; ++ net->ipv4.sysctl_ip_early_demux = 1; ++ net->ipv4.sysctl_udp_early_demux = 1; ++ net->ipv4.sysctl_tcp_early_demux = 1; ++ net->ipv4.sysctl_nexthop_compat_mode = 1; ++#ifdef CONFIG_SYSCTL ++ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; ++#endif ++ ++ /* Some igmp sysctl, whose values are always used */ ++ net->ipv4.sysctl_igmp_max_memberships = 20; ++ net->ipv4.sysctl_igmp_max_msf = 10; ++ /* IGMP reports for link-local multicast groups are enabled by default */ ++ net->ipv4.sysctl_igmp_llm_reports = 1; ++ net->ipv4.sysctl_igmp_qrv = 2; ++ ++ net->ipv4.sysctl_fib_notify_on_flag_change = 0; ++ ++ return 0; ++} ++ ++static __net_initdata struct pernet_operations af_inet_ops = { ++ .init = inet_init_net, ++}; ++ ++static int __init init_inet_pernet_ops(void) ++{ ++ return register_pernet_subsys(&af_inet_ops); ++} ++ ++static int ipv4_proc_init(void); ++ ++/* ++ * IP protocol layer initialiser ++ */ ++ ++static struct packet_offload ip_packet_offload __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .callbacks = { ++ .gso_segment = inet_gso_segment, ++ .gro_receive = inet_gro_receive, ++ .gro_complete = inet_gro_complete, ++ }, ++}; ++ ++static const struct net_offload ipip_offload = { ++ .callbacks = { ++ .gso_segment = ipip_gso_segment, ++ .gro_receive = ipip_gro_receive, ++ .gro_complete = ipip_gro_complete, ++ }, ++}; ++ ++static int __init ipip_offload_init(void) ++{ ++ return inet_add_offload(&ipip_offload, IPPROTO_IPIP); ++} ++ ++static int __init ipv4_offload_init(void) ++{ ++ /* ++ * Add offloads ++ */ ++ if (udpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add UDP protocol offload\n", __func__); ++ if (tcpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add TCP protocol offload\n", __func__); ++ if (ipip_offload_init() < 0) ++ pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); ++ ++ dev_add_offload(&ip_packet_offload); ++ return 0; ++} ++ ++fs_initcall(ipv4_offload_init); ++ ++static struct packet_type ip_packet_type __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .func = ip_rcv, ++ .list_func = ip_list_rcv, ++}; ++ ++static int __init inet_init(void) ++{ ++ struct inet_protosw *q; ++ struct list_head *r; ++ int rc; ++ ++ sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); ++ ++ raw_hashinfo_init(&raw_v4_hashinfo); ++ ++ rc = proto_register(&tcp_prot, 1); ++ if 
(rc) ++ goto out; ++ ++ rc = proto_register(&udp_prot, 1); ++ if (rc) ++ goto out_unregister_tcp_proto; ++ ++ rc = proto_register(&raw_prot, 1); ++ if (rc) ++ goto out_unregister_udp_proto; ++ ++ rc = proto_register(&ping_prot, 1); ++ if (rc) ++ goto out_unregister_raw_proto; ++ ++ /* ++ * Tell SOCKET that we are alive... ++ */ ++ ++ (void)sock_register(&inet_family_ops); ++ ++#ifdef CONFIG_SYSCTL ++ ip_static_sysctl_init(); ++#endif ++ ++ /* ++ * Add all the base protocols. ++ */ ++ ++ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) ++ pr_crit("%s: Cannot add ICMP protocol\n", __func__); ++ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) ++ pr_crit("%s: Cannot add UDP protocol\n", __func__); ++ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) ++ pr_crit("%s: Cannot add TCP protocol\n", __func__); ++#ifdef CONFIG_IP_MULTICAST ++ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) ++ pr_crit("%s: Cannot add IGMP protocol\n", __func__); ++#endif ++ ++ /* Register the socket-side information for inet_create. */ ++ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) ++ INIT_LIST_HEAD(r); ++ ++ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) ++ inet_register_protosw(q); ++ ++ /* ++ * Set the ARP module up ++ */ ++ ++ arp_init(); ++ ++ /* ++ * Set the IP module up ++ */ ++ ++ ip_init(); ++ ++ /* Initialise per-cpu ipv4 mibs */ ++ if (init_ipv4_mibs()) ++ panic("%s: Cannot init ipv4 mibs\n", __func__); ++ ++ /* Setup TCP slab cache for open requests. */ ++ tcp_init(); ++ ++ /* Setup UDP memory threshold */ ++ udp_init(); ++ ++ /* Add UDP-Lite (RFC 3828) */ ++ udplite4_register(); ++ ++ raw_init(); ++ ++ ping_init(); ++ ++ /* ++ * Set the ICMP layer up ++ */ ++ ++ if (icmp_init() < 0) ++ panic("Failed to create the ICMP control socket.\n"); ++ ++ /* ++ * Initialise the multicast router ++ */ ++#if defined(CONFIG_IP_MROUTE) ++ if (ip_mr_init()) ++ pr_crit("%s: Cannot init ipv4 mroute\n", __func__); ++#endif ++ ++ if (init_inet_pernet_ops()) ++ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ++ ++ ipv4_proc_init(); ++ ++ ipfrag_init(); ++ ++ dev_add_pack(&ip_packet_type); ++ ++ ip_tunnel_core_init(); ++ ++ rc = 0; ++out: ++ return rc; ++out_unregister_raw_proto: ++ proto_unregister(&raw_prot); ++out_unregister_udp_proto: ++ proto_unregister(&udp_prot); ++out_unregister_tcp_proto: ++ proto_unregister(&tcp_prot); ++ goto out; ++} ++ ++fs_initcall(inet_init); ++ ++/* ------------------------------------------------------------------------ */ ++ ++#ifdef CONFIG_PROC_FS ++static int __init ipv4_proc_init(void) ++{ ++ int rc = 0; ++ ++ if (raw_proc_init()) ++ goto out_raw; ++ if (tcp4_proc_init()) ++ goto out_tcp; ++ if (udp4_proc_init()) ++ goto out_udp; ++ if (ping_proc_init()) ++ goto out_ping; ++ if (ip_misc_proc_init()) ++ goto out_misc; ++out: ++ return rc; ++out_misc: ++ ping_proc_exit(); ++out_ping: ++ udp4_proc_exit(); ++out_udp: ++ tcp4_proc_exit(); ++out_tcp: ++ raw_proc_exit(); ++out_raw: ++ rc = -ENOMEM; ++ goto out; ++} ++ ++#else /* CONFIG_PROC_FS */ ++static int __init ipv4_proc_init(void) ++{ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -rupN linux.orig/net/ipv6/seg6_local.c linux/net/ipv6/seg6_local.c +--- linux.orig/net/ipv6/seg6_local.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv6/seg6_local.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_bu pcounters = per_cpu_ptr(slwt->pcpu_counters, i); do { @@ -8869,11 +59406,10 @@ index b7de5e46fdd8f..f84da849819cc 100644 
counters.packets += packets; counters.bytes += bytes; -diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c -index 9d7b238a67372..965b9cb2ef3f2 100644 ---- a/net/mac80211/sta_info.c -+++ b/net/mac80211/sta_info.c -@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, +diff -rupN linux.orig/net/mac80211/sta_info.c linux/net/mac80211/sta_info.c +--- linux.orig/net/mac80211/sta_info.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mac80211/sta_info.c 2022-12-04 10:40:26.732034003 -0500 +@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu( u64 value; do { @@ -8885,7 +59421,7 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) +@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(st u64 value; do { @@ -8897,11 +59433,10 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c -index b52afe316dc41..35b5f806fdda1 100644 ---- a/net/mpls/af_mpls.c -+++ b/net/mpls/af_mpls.c -@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, +diff -rupN linux.orig/net/mpls/af_mpls.c linux/net/mpls/af_mpls.c +--- linux.orig/net/mpls/af_mpls.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mpls/af_mpls.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_d p = per_cpu_ptr(mdev->stats, i); do { @@ -8913,11 +59448,10 @@ index b52afe316dc41..35b5f806fdda1 100644 stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; -diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c -index efab2b06d3732..5a7349002508e 100644 ---- a/net/netfilter/ipvs/ip_vs_ctl.c -+++ b/net/netfilter/ipvs/ip_vs_ctl.c -@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +diff -rupN linux.orig/net/netfilter/ipvs/ip_vs_ctl.c linux/net/netfilter/ipvs/ip_vs_ctl.c +--- linux.orig/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-04 10:40:26.736033993 -0500 +@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struc u64 conns, inpkts, outpkts, inbytes, outbytes; do { @@ -8933,11 +59467,10 @@ index efab2b06d3732..5a7349002508e 100644 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, -diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c -index 63c70141b3e5d..cde0d9f0d838e 100644 ---- a/net/netfilter/nf_tables_api.c -+++ b/net/netfilter/nf_tables_api.c -@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +diff -rupN linux.orig/net/netfilter/nf_tables_api.c linux/net/netfilter/nf_tables_api.c +--- linux.orig/net/netfilter/nf_tables_api.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/nf_tables_api.c 2022-12-04 10:40:26.736033993 -0500 +@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { @@ -8950,11 +59483,10 @@ index 63c70141b3e5d..cde0d9f0d838e 100644 total.pkts += pkts; total.bytes += bytes; } -diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c -index 93c596e3b22b9..b05458c170484 100644 ---- a/net/openvswitch/datapath.c -+++ b/net/openvswitch/datapath.c -@@ -715,9 +715,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, +diff -rupN 
linux.orig/net/openvswitch/datapath.c linux/net/openvswitch/datapath.c +--- linux.orig/net/openvswitch/datapath.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/datapath.c 2022-12-04 10:40:26.736033993 -0500 +@@ -715,9 +715,9 @@ static void get_dp_stats(const struct da percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { @@ -8966,11 +59498,10 @@ index 93c596e3b22b9..b05458c170484 100644 stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; -diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c -index d4a2db0b22998..0a0e4c283f02e 100644 ---- a/net/openvswitch/flow_table.c -+++ b/net/openvswitch/flow_table.c -@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) +diff -rupN linux.orig/net/openvswitch/flow_table.c linux/net/openvswitch/flow_table.c +--- linux.orig/net/openvswitch/flow_table.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/flow_table.c 2022-12-04 10:40:26.736033993 -0500 +@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counter stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { @@ -8982,7 +59513,7 @@ index d4a2db0b22998..0a0e4c283f02e 100644 ma->masks_usage_zero_cntr[i] += counter; } -@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table) +@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flo stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { diff --git a/packages/jelos/tmpfiles.d/jelos-dirs.conf b/packages/virtual/emulators/tmpfiles.d/jelos-dirs.conf similarity index 100% rename from packages/jelos/tmpfiles.d/jelos-dirs.conf rename to packages/virtual/emulators/tmpfiles.d/jelos-dirs.conf diff --git a/projects/Rockchip/packages/linux/package.mk b/projects/Rockchip/packages/linux/package.mk index 92fe16b78..de7b60fc9 100644 --- a/projects/Rockchip/packages/linux/package.mk +++ b/projects/Rockchip/packages/linux/package.mk @@ -25,7 +25,7 @@ case ${DEVICE} in PKG_GIT_CLONE_BRANCH="main" ;; RG552) - PKG_VERSION="6.0.7" + PKG_VERSION="6.0.11" PKG_URL="https://www.kernel.org/pub/linux/kernel/v6.x/${PKG_NAME}-${PKG_VERSION}.tar.xz" ;; RG353P|RG503) diff --git a/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch b/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch index c0c976eb9..2de168ec2 100644 --- a/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch +++ b/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch @@ -1,7 +1,43 @@ -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index 11ecf09aadc86..98aa5a478719c 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig +diff -rupN linux.orig/arch/arm/include/asm/thread_info.h linux/arch/arm/include/asm/thread_info.h +--- linux.orig/arch/arm/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 +@@ -62,6 +62,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + __u32 cpu; /* cpu */ + __u32 cpu_domain; /* cpu domain */ + struct cpu_context_save cpu_context; /* cpu context */ +@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(stru + #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ + #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ + #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 9 + + 
#define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(stru + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) + + /* Checks for any syscall work in entry-common.S */ +@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(stru + /* + * Change these and you break ASM code in entry-common.S + */ +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) + +diff -rupN linux.orig/arch/arm/Kconfig linux/arch/arm/Kconfig +--- linux.orig/arch/arm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -33,6 +33,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW @@ -35,48 +71,9 @@ index 11ecf09aadc86..98aa5a478719c 100644 select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select THREAD_INFO_IN_TASK -diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h -index aecc403b28804..1b56e56f8f415 100644 ---- a/arch/arm/include/asm/thread_info.h -+++ b/arch/arm/include/asm/thread_info.h -@@ -62,6 +62,7 @@ struct cpu_context_save { - struct thread_info { - unsigned long flags; /* low level flags */ - int preempt_count; /* 0 => preemptable, <0 => bug */ -+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ - __u32 cpu; /* cpu */ - __u32 cpu_domain; /* cpu domain */ - struct cpu_context_save cpu_context; /* cpu context */ -@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ - #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ - #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ -+#define TIF_NEED_RESCHED_LAZY 9 - - #define TIF_USING_IWMMXT 17 - #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) - #define _TIF_SECCOMP (1 << TIF_SECCOMP) - #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) - #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) - - /* Checks for any syscall work in entry-common.S */ -@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - /* - * Change these and you break ASM code in entry-common.S - */ --#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ -+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ -+ _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NOTIFY_SIGNAL) - -diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c -index 2c8d76fd7c662..c3bdec7d2df9c 100644 ---- a/arch/arm/kernel/asm-offsets.c -+++ b/arch/arm/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm/kernel/asm-offsets.c linux/arch/arm/kernel/asm-offsets.c +--- linux.orig/arch/arm/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -43,6 +43,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, 
flags)); @@ -85,11 +82,10 @@ index 2c8d76fd7c662..c3bdec7d2df9c 100644 DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); -diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S -index c39303e5c2347..cfb4660e9feab 100644 ---- a/arch/arm/kernel/entry-armv.S -+++ b/arch/arm/kernel/entry-armv.S -@@ -222,11 +222,18 @@ ENDPROC(__dabt_svc) +diff -rupN linux.orig/arch/arm/kernel/entry-armv.S linux/arch/arm/kernel/entry-armv.S +--- linux.orig/arch/arm/kernel/entry-armv.S 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/entry-armv.S 2022-12-04 10:40:26.676034147 -0500 +@@ -222,11 +222,18 @@ __irq_svc: #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -110,7 +106,7 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif svc_exit r5, irq = 1 @ return from exception -@@ -241,8 +248,14 @@ ENDPROC(__irq_svc) +@@ -241,8 +248,14 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED @@ -126,11 +122,10 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif __und_fault: -diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c -index ea128e32e8ca8..3671a4214d6f4 100644 ---- a/arch/arm/kernel/signal.c -+++ b/arch/arm/kernel/signal.c -@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) +diff -rupN linux.orig/arch/arm/kernel/signal.c linux/arch/arm/kernel/signal.c +--- linux.orig/arch/arm/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/signal.c 2022-12-04 10:40:26.676034147 -0500 +@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, un */ trace_hardirqs_off(); do { @@ -140,11 +135,10 @@ index ea128e32e8ca8..3671a4214d6f4 100644 schedule(); } else { if (unlikely(!user_mode(regs))) -diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c -index 46cccd6bf705a..480a1976a9dce 100644 ---- a/arch/arm/mm/fault.c -+++ b/arch/arm/mm/fault.c -@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +diff -rupN linux.orig/arch/arm/mm/fault.c linux/arch/arm/mm/fault.c +--- linux.orig/arch/arm/mm/fault.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/mm/fault.c 2022-12-04 10:40:26.676034147 -0500 +@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); @@ -154,7 +148,7 @@ index 46cccd6bf705a..480a1976a9dce 100644 if (user_mode(regs)) goto bad_area; -@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -164,31 +158,10 @@ index 46cccd6bf705a..480a1976a9dce 100644 do_bad_area(addr, fsr, regs); return 0; } -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 3795eb5ba1cdd..6922949e61b71 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -93,6 +93,7 @@ config ARM64 - select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 - select ARCH_SUPPORTS_NUMA_BALANCING - select ARCH_SUPPORTS_PAGE_TABLE_CHECK -+ select ARCH_SUPPORTS_RT - select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT - select ARCH_WANT_DEFAULT_BPF_JIT - select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -@@ -200,6 +201,7 @@ config ARM64 - select HAVE_PERF_USER_STACK_DUMP - select HAVE_PREEMPT_DYNAMIC_KEY - select 
HAVE_REGS_AND_STACK_ACCESS_API -+ select HAVE_PREEMPT_LAZY - select HAVE_POSIX_CPU_TIMERS_TASK_WORK - select HAVE_FUNCTION_ARG_ACCESS_API - select MMU_GATHER_RCU_TABLE_FREE -diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h -index 0159b625cc7f0..a5486918e5eeb 100644 ---- a/arch/arm64/include/asm/preempt.h -+++ b/arch/arm64/include/asm/preempt.h -@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_and_test(void) +diff -rupN linux.orig/arch/arm64/include/asm/preempt.h linux/arch/arm64/include/asm/preempt.h +--- linux.orig/arch/arm64/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_a * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ @@ -226,10 +199,9 @@ index 0159b625cc7f0..a5486918e5eeb 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h -index 848739c15de82..4b7148fd5551f 100644 ---- a/arch/arm64/include/asm/thread_info.h -+++ b/arch/arm64/include/asm/thread_info.h +diff -rupN linux.orig/arch/arm64/include/asm/thread_info.h linux/arch/arm64/include/asm/thread_info.h +--- linux.orig/arch/arm64/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -26,6 +26,7 @@ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ @@ -238,7 +210,7 @@ index 848739c15de82..4b7148fd5551f 100644 union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { -@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_str #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ @@ -246,7 +218,7 @@ index 848739c15de82..4b7148fd5551f 100644 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ -@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_str #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) @@ -258,7 +230,7 @@ index 848739c15de82..4b7148fd5551f 100644 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) -@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_str _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) @@ -267,10 +239,28 @@ index 848739c15de82..4b7148fd5551f 100644 #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ -diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c -index 1197e7679882e..e74c0415f67ea 100644 ---- a/arch/arm64/kernel/asm-offsets.c -+++ b/arch/arm64/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm64/Kconfig linux/arch/arm64/Kconfig +--- linux.orig/arch/arm64/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -93,6 +93,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select 
ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_RT + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -200,6 +201,7 @@ config ARM64 + select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_DYNAMIC_KEY + select HAVE_REGS_AND_STACK_ACCESS_API ++ select HAVE_PREEMPT_LAZY + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_FUNCTION_ARG_ACCESS_API + select MMU_GATHER_RCU_TABLE_FREE +diff -rupN linux.orig/arch/arm64/kernel/asm-offsets.c linux/arch/arm64/kernel/asm-offsets.c +--- linux.orig/arch/arm64/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -32,6 +32,7 @@ int main(void) DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); @@ -279,11 +269,10 @@ index 1197e7679882e..e74c0415f67ea 100644 #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif -diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c -index 9ad911f1647c8..545c41a84411e 100644 ---- a/arch/arm64/kernel/signal.c -+++ b/arch/arm64/kernel/signal.c -@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *regs) +diff -rupN linux.orig/arch/arm64/kernel/signal.c linux/arch/arm64/kernel/signal.c +--- linux.orig/arch/arm64/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/signal.c 2022-12-04 10:40:26.676034147 -0500 +@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *re void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { @@ -292,34 +281,10 @@ index 9ad911f1647c8..545c41a84411e 100644 /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index cbe7bb029aec8..ad5bcc255f4e3 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -149,6 +149,7 @@ config PPC - select ARCH_STACKWALK - select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x -+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if PPC64 - select ARCH_USE_MEMTEST -@@ -241,8 +242,10 @@ config PPC - select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE -+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM - select HAVE_RSEQ - select HAVE_SETUP_PER_CPU_AREA if PPC64 - select HAVE_SOFTIRQ_ON_OWN_STACK -diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h -index 1c8460e235838..b1653c160bab9 100644 ---- a/arch/powerpc/include/asm/stackprotector.h -+++ b/arch/powerpc/include/asm/stackprotector.h -@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) +diff -rupN linux.orig/arch/powerpc/include/asm/stackprotector.h linux/arch/powerpc/include/asm/stackprotector.h +--- linux.orig/arch/powerpc/include/asm/stackprotector.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/stackprotector.h 2022-12-04 10:40:26.676034147 -0500 +@@ -24,7 +24,11 @@ static __always_inline void boot_init_st unsigned long canary; /* Try to get a semi random initial value. 
*/ @@ -331,10 +296,9 @@ index 1c8460e235838..b1653c160bab9 100644 canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; -diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h -index af58f1ed3952e..520864de8bb27 100644 ---- a/arch/powerpc/include/asm/thread_info.h -+++ b/arch/powerpc/include/asm/thread_info.h +diff -rupN linux.orig/arch/powerpc/include/asm/thread_info.h linux/arch/powerpc/include/asm/thread_info.h +--- linux.orig/arch/powerpc/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -53,6 +53,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, @@ -389,11 +353,32 @@ index af58f1ed3952e..520864de8bb27 100644 /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ -diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c -index f9db0a172401a..38aa3d06c632c 100644 ---- a/arch/powerpc/kernel/interrupt.c -+++ b/arch/powerpc/kernel/interrupt.c -@@ -184,7 +184,7 @@ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) +diff -rupN linux.orig/arch/powerpc/Kconfig linux/arch/powerpc/Kconfig +--- linux.orig/arch/powerpc/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -149,6 +149,7 @@ config PPC + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_MEMTEST +@@ -241,8 +242,10 @@ config PPC + select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_RSEQ + select HAVE_SETUP_PER_CPU_AREA if PPC64 + select HAVE_SOFTIRQ_ON_OWN_STACK +diff -rupN linux.orig/arch/powerpc/kernel/interrupt.c linux/arch/powerpc/kernel/interrupt.c +--- linux.orig/arch/powerpc/kernel/interrupt.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/interrupt.c 2022-12-04 10:40:26.676034147 -0500 +@@ -184,7 +184,7 @@ again: ti_flags = read_thread_flags(); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); @@ -402,7 +387,7 @@ index f9db0a172401a..38aa3d06c632c 100644 schedule(); } else { /* -@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) +@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_ker /* Returning to a kernel context with local irqs enabled. 
*/ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: @@ -419,10 +404,9 @@ index f9db0a172401a..38aa3d06c632c 100644 } } -diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c -index dadfcef5d6db4..3bfe55d82b042 100644 ---- a/arch/powerpc/kernel/traps.c -+++ b/arch/powerpc/kernel/traps.c +diff -rupN linux.orig/arch/powerpc/kernel/traps.c linux/arch/powerpc/kernel/traps.c +--- linux.orig/arch/powerpc/kernel/traps.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/traps.c 2022-12-04 10:40:26.676034147 -0500 @@ -260,12 +260,17 @@ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) @@ -442,10 +426,9 @@ index dadfcef5d6db4..3bfe55d82b042 100644 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", -diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig -index dcb398d5e0093..2cfa432afdb12 100644 ---- a/arch/powerpc/kvm/Kconfig -+++ b/arch/powerpc/kvm/Kconfig +diff -rupN linux.orig/arch/powerpc/kvm/Kconfig linux/arch/powerpc/kvm/Kconfig +--- linux.orig/arch/powerpc/kvm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kvm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -221,6 +221,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" @@ -454,10 +437,9 @@ index dcb398d5e0093..2cfa432afdb12 100644 select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING -diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c -index 561adac690229..61c4c0610aa6a 100644 ---- a/arch/powerpc/platforms/pseries/iommu.c -+++ b/arch/powerpc/platforms/pseries/iommu.c +diff -rupN linux.orig/arch/powerpc/platforms/pseries/iommu.c linux/arch/powerpc/platforms/pseries/iommu.c +--- linux.orig/arch/powerpc/platforms/pseries/iommu.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/platforms/pseries/iommu.c 2022-12-04 10:40:26.676034147 -0500 @@ -24,6 +24,7 @@ #include #include @@ -466,7 +448,7 @@ index 561adac690229..61c4c0610aa6a 100644 #include #include #include -@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, +@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned return ret; } @@ -481,7 +463,7 @@ index 561adac690229..61c4c0610aa6a 100644 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, -@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(stru direction, attrs); } @@ -494,7 +476,7 @@ index 561adac690229..61c4c0610aa6a 100644 /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() -@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(stru tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { @@ -509,7 +491,7 @@ index 561adac690229..61c4c0610aa6a 100644 } rpn = __pa(uaddr) >> tceshift; -@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(stru tcenum += limit; } while (npages > 0 && !rc); @@ -518,7 +500,7 @@ index 561adac690229..61c4c0610aa6a 100644 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; -@@ -440,16 +448,17 @@ 
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -440,16 +448,17 @@ static int tce_setrange_multi_pSeriesLP( DMA_BIDIRECTIONAL, 0); } @@ -540,7 +522,7 @@ index 561adac690229..61c4c0610aa6a 100644 } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; -@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP( /* error cleanup: caller will clear whole range */ @@ -549,31 +531,10 @@ index 561adac690229..61c4c0610aa6a 100644 return rc; } -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 159c025ebb03e..4d62ceece1bb0 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -109,6 +109,7 @@ config X86 - select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_LTO_CLANG - select ARCH_SUPPORTS_LTO_CLANG_THIN -+ select ARCH_SUPPORTS_RT - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_MEMTEST - select ARCH_USE_QUEUED_RWLOCKS -@@ -243,6 +244,7 @@ config X86 - select HAVE_PCI - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT - select MMU_GATHER_MERGE_VMAS - select HAVE_POSIX_CPU_TIMERS_TASK_WORK -diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h -index 5f6daea1ee248..cd20b4a5719a4 100644 ---- a/arch/x86/include/asm/preempt.h -+++ b/arch/x86/include/asm/preempt.h -@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val) +diff -rupN linux.orig/arch/x86/include/asm/preempt.h linux/arch/x86/include/asm/preempt.h +--- linux.orig/arch/x86/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -90,17 +90,48 @@ static __always_inline void __preempt_co * a decrement which hits zero means we have no preempt_count and should * reschedule. 
*/ @@ -623,10 +584,9 @@ index 5f6daea1ee248..cd20b4a5719a4 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index f0cb881c1d690..fd8fb76f324fc 100644 ---- a/arch/x86/include/asm/thread_info.h -+++ b/arch/x86/include/asm/thread_info.h +diff -rupN linux.orig/arch/x86/include/asm/thread_info.h linux/arch/x86/include/asm/thread_info.h +--- linux.orig/arch/x86/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -57,6 +57,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ @@ -660,11 +620,29 @@ index f0cb881c1d690..fd8fb76f324fc 100644 #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c -index fac8ff983aec8..65fb9bad1577a 100644 ---- a/drivers/bcma/driver_gpio.c -+++ b/drivers/bcma/driver_gpio.c -@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/arch/x86/Kconfig linux/arch/x86/Kconfig +--- linux.orig/arch/x86/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -109,6 +109,7 @@ config X86 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_MEMTEST + select ARCH_USE_QUEUED_RWLOCKS +@@ -243,6 +244,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK +diff -rupN linux.orig/drivers/bcma/driver_gpio.c linux/drivers/bcma/driver_gpio.c +--- linux.orig/drivers/bcma/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/bcma/driver_gpio.c 2022-12-04 10:40:26.680034137 -0500 +@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler return IRQ_NONE; for_each_set_bit(gpio, &irqs, gc->ngpio) @@ -673,11 +651,10 @@ index fac8ff983aec8..65fb9bad1577a 100644 bcma_chipco_gpio_polarity(cc, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index 226ea76cc8197..4043d909d41bf 100644 ---- a/drivers/block/zram/zram_drv.c -+++ b/drivers/block/zram/zram_drv.c -@@ -60,6 +60,40 @@ static void zram_free_page(struct zram *zram, size_t index); +diff -rupN linux.orig/drivers/block/zram/zram_drv.c linux/drivers/block/zram/zram_drv.c +--- linux.orig/drivers/block/zram/zram_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.c 2022-12-04 10:40:26.680034137 -0500 +@@ -60,6 +60,40 @@ static void zram_free_page(struct zram * static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); @@ -718,7 +695,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static int zram_slot_trylock(struct zram *zram, u32 index) { -@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) +@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } @@ -726,7 +703,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static inline bool init_done(struct zram *zram) { -@@ 
-1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) +@@ -1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); @@ -734,10 +711,9 @@ index 226ea76cc8197..4043d909d41bf 100644 return true; } -diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h -index 80c3b43b4828f..ff021a9728d1e 100644 ---- a/drivers/block/zram/zram_drv.h -+++ b/drivers/block/zram/zram_drv.h +diff -rupN linux.orig/drivers/block/zram/zram_drv.h linux/drivers/block/zram/zram_drv.h +--- linux.orig/drivers/block/zram/zram_drv.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.h 2022-12-04 10:40:26.680034137 -0500 @@ -63,6 +63,9 @@ struct zram_table_entry { unsigned long element; }; @@ -748,11 +724,10 @@ index 80c3b43b4828f..ff021a9728d1e 100644 #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif -diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c -index bcff6429e0b4f..4a9ae338a2bdf 100644 ---- a/drivers/char/tpm/tpm_tis.c -+++ b/drivers/char/tpm/tpm_tis.c -@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da +diff -rupN linux.orig/drivers/char/tpm/tpm_tis.c linux/drivers/char/tpm/tpm_tis.c +--- linux.orig/drivers/char/tpm/tpm_tis.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/char/tpm/tpm_tis.c 2022-12-04 10:40:26.680034137 -0500 +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to return container_of(data, struct tpm_tis_tcg_phy, priv); } @@ -784,7 +759,7 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); -@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, +@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tp switch (io_mode) { case TPM_TIS_PHYS_8: while (len--) @@ -799,11 +774,10 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 break; } -diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c -index 64cb060d9d753..77a41151c921b 100644 ---- a/drivers/gpio/gpio-mlxbf2.c -+++ b/drivers/gpio/gpio-mlxbf2.c -@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handler(int irq, void *ptr) +diff -rupN linux.orig/drivers/gpio/gpio-mlxbf2.c linux/drivers/gpio/gpio-mlxbf2.c +--- linux.orig/drivers/gpio/gpio-mlxbf2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpio/gpio-mlxbf2.c 2022-12-04 10:40:26.680034137 -0500 +@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handl pending = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_CAUSE_EVTEN0); writel(pending, gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); @@ -816,23 +790,10 @@ index 64cb060d9d753..77a41151c921b 100644 return IRQ_RETVAL(pending); } -diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig -index 7ae3b7d67fcfc..844f54f1daea9 100644 ---- a/drivers/gpu/drm/i915/Kconfig -+++ b/drivers/gpu/drm/i915/Kconfig -@@ -3,7 +3,6 @@ config DRM_I915 - tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" - depends on DRM - depends on X86 && PCI -- depends on !PREEMPT_RT - select INTEL_GTT if X86 - select INTERVAL_TREE - # we need shmfs for the swappable backing store, and in particular -diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c -index 4442aa355f868..23085e82c3ed5 100644 ---- a/drivers/gpu/drm/i915/display/intel_crtc.c -+++ b/drivers/gpu/drm/i915/display/intel_crtc.c -@@ -522,7 +522,8 @@ void 
intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +diff -rupN linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c linux/drivers/gpu/drm/i915/display/intel_crtc.c +--- linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-04 10:40:26.680034137 -0500 +@@ -522,7 +522,8 @@ void intel_pipe_update_start(struct inte */ intel_psr_wait_for_idle_locked(new_crtc_state); @@ -842,7 +803,7 @@ index 4442aa355f868..23085e82c3ed5 100644 crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; -@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct inte break; } @@ -858,7 +819,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } finish_wait(wq, &wait); -@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct inte return; irq_disable: @@ -868,7 +829,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) -@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) +@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_ */ intel_vrr_send_push(new_crtc_state); @@ -878,11 +839,10 @@ index 4442aa355f868..23085e82c3ed5 100644 if (intel_vgpu_active(dev_priv)) return; -diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -index ecc990ec1b952..8d04b10681f0d 100644 ---- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-04 10:40:26.680034137 -0500 +@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct int /* Kick the work once more to drain the signalers, and disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { @@ -895,11 +855,10 @@ index ecc990ec1b952..8d04b10681f0d 100644 } } -diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -index c718e6dc40b51..0e592999b7d60 100644 ---- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct int * and context switches) submission. */ @@ -908,7 +867,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * If the queue is higher priority than the last -@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct int * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. 
*/ @@ -917,7 +876,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; } } -@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct int if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); @@ -926,7 +885,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; /* leave this for another sibling */ } -@@ -1590,7 +1590,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1590,7 +1590,7 @@ done: */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); @@ -935,7 +894,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * We can skip poking the HW if we ended up with exactly the same set -@@ -1616,13 +1616,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1616,13 +1616,6 @@ done: } } @@ -949,7 +908,7 @@ index c718e6dc40b51..0e592999b7d60 100644 static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); -@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) +@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet } if (!engine->execlists.pending[0]) { @@ -958,11 +917,10 @@ index c718e6dc40b51..0e592999b7d60 100644 start_timeslice(engine); } -diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c -index 73cebc6aa6507..98305fb393413 100644 ---- a/drivers/gpu/drm/i915/i915_irq.c -+++ b/drivers/gpu/drm/i915/i915_irq.c -@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_irq.c linux/drivers/gpu/drm/i915/i915_irq.c +--- linux.orig/drivers/gpu/drm/i915/i915_irq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_irq.c 2022-12-04 10:40:26.680034137 -0500 +@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(str */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); @@ -972,7 +930,7 @@ index 73cebc6aa6507..98305fb393413 100644 /* Get optional system timestamp before query. 
*/ if (stime) -@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(str if (etime) *etime = ktime_get(); @@ -982,11 +940,10 @@ index 73cebc6aa6507..98305fb393413 100644 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); -diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c -index 62fad16a55e84..af07927650b24 100644 ---- a/drivers/gpu/drm/i915/i915_request.c -+++ b/drivers/gpu/drm/i915/i915_request.c -@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_request *request) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_request.c linux/drivers/gpu/drm/i915/i915_request.c +--- linux.orig/drivers/gpu/drm/i915/i915_request.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_request.c 2022-12-04 10:40:26.680034137 -0500 +@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_r RQ_TRACE(request, "\n"); @@ -994,7 +951,7 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915_request *request) +@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915 */ RQ_TRACE(request, "\n"); @@ -1002,10 +959,9 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h -index 37b5c9e9d260e..73f29d8008f0c 100644 ---- a/drivers/gpu/drm/i915/i915_trace.h -+++ b/drivers/gpu/drm/i915/i915_trace.h +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_trace.h linux/drivers/gpu/drm/i915/i915_trace.h +--- linux.orig/drivers/gpu/drm/i915/i915_trace.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_trace.h 2022-12-04 10:40:26.680034137 -0500 @@ -6,6 +6,10 @@ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ @@ -1017,7 +973,7 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 #include #include #include -@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_add, +@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_ TP_ARGS(rq) ); @@ -1026,11 +982,10 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) -diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h -index c10d68cdc3ca5..593f3a7e0e4fc 100644 ---- a/drivers/gpu/drm/i915/i915_utils.h -+++ b/drivers/gpu/drm/i915/i915_utils.h -@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_utils.h linux/drivers/gpu/drm/i915/i915_utils.h +--- linux.orig/drivers/gpu/drm/i915/i915_utils.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_utils.h 2022-12-04 10:40:26.680034137 -0500 +@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ @@ -1039,10 +994,20 @@ index c10d68cdc3ca5..593f3a7e0e4fc 100644 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) -diff --git a/drivers/net/ethernet/alacritech/slic.h b/drivers/net/ethernet/alacritech/slic.h -index 4eecbdfff3ff1..82071d0e5f7fc 100644 ---- a/drivers/net/ethernet/alacritech/slic.h -+++ b/drivers/net/ethernet/alacritech/slic.h +diff -rupN linux.orig/drivers/gpu/drm/i915/Kconfig linux/drivers/gpu/drm/i915/Kconfig +--- linux.orig/drivers/gpu/drm/i915/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/Kconfig 2022-12-04 10:40:26.680034137 -0500 +@@ -3,7 +3,6 @@ config DRM_I915 + tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" + depends on DRM + depends on X86 && PCI +- depends on !PREEMPT_RT + select INTEL_GTT if X86 + select INTERVAL_TREE + # we need shmfs for the swappable backing store, and in particular +diff -rupN linux.orig/drivers/net/ethernet/alacritech/slic.h linux/drivers/net/ethernet/alacritech/slic.h +--- linux.orig/drivers/net/ethernet/alacritech/slic.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/alacritech/slic.h 2022-12-04 10:40:26.680034137 -0500 @@ -288,13 +288,13 @@ do { \ u64_stats_update_end(&(st)->syncp); \ } while (0) @@ -1063,11 +1028,10 @@ index 4eecbdfff3ff1..82071d0e5f7fc 100644 } struct slic_upr { -diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -index 39242c5a17290..8f81d288c4880 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c -+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *src, u64 *dst, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *sr unsigned int start; do { @@ -1079,11 +1043,10 @@ index 39242c5a17290..8f81d288c4880 100644 } static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) -diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c -index 6a356a6cee15a..1c5d482990806 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_netdev.c -+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c -@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c linux/drivers/net/ethernet/amazon/ena/ena_netdev.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-04 10:40:26.680034137 -0500 +@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_d tx_ring = &adapter->tx_ring[i]; do { @@ -1096,7 +1059,7 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_device *netdev, +@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_d rx_ring = &adapter->rx_ring[i]; do { @@ -1121,11 +1084,10 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->rx_dropped = rx_drops; stats->tx_dropped = tx_drops; -diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 
-index 25129e723b575..1e8d902e1c8ea 100644 ---- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +--- linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-04 10:40:26.680034137 -0500 +@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(str /* This data should mimic aq_ethtool_queue_rx_stat_names structure */ do { count = 0; @@ -1134,7 +1096,7 @@ index 25129e723b575..1e8d902e1c8ea 100644 data[count] = self->stats.rx.packets; data[++count] = self->stats.rx.jumbo_packets; data[++count] = self->stats.rx.lro_packets; -@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(str data[++count] = self->stats.rx.xdp_tx; data[++count] = self->stats.rx.xdp_invalid; data[++count] = self->stats.rx.xdp_redirect; @@ -1153,11 +1115,10 @@ index 25129e723b575..1e8d902e1c8ea 100644 } return ++count; -diff --git a/drivers/net/ethernet/asix/ax88796c_main.c b/drivers/net/ethernet/asix/ax88796c_main.c -index 6ba5b024a7be7..25e7beb68e515 100644 ---- a/drivers/net/ethernet/asix/ax88796c_main.c -+++ b/drivers/net/ethernet/asix/ax88796c_main.c -@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/asix/ax88796c_main.c linux/drivers/net/ethernet/asix/ax88796c_main.c +--- linux.orig/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-04 10:40:26.680034137 -0500 +@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct s = per_cpu_ptr(ax_local->stats, cpu); do { @@ -1172,11 +1133,10 @@ index 6ba5b024a7be7..25e7beb68e515 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c -index e5857e88c2076..caf1714f36a18 100644 ---- a/drivers/net/ethernet/broadcom/b44.c -+++ b/drivers/net/ethernet/broadcom/b44.c -@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/b44.c linux/drivers/net/ethernet/broadcom/b44.c +--- linux.orig/drivers/net/ethernet/broadcom/b44.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/b44.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_d unsigned int start; do { @@ -1185,7 +1145,7 @@ index e5857e88c2076..caf1714f36a18 100644 /* Convert HW stats into rtnl_link_stats64 stats. 
*/ nstat->rx_packets = hwstat->rx_pkts; -@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_device *dev, +@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_d /* Carrier lost counter seems to be broken for some devices */ nstat->tx_carrier_errors = hwstat->tx_carrier_lost; #endif @@ -1194,7 +1154,7 @@ index e5857e88c2076..caf1714f36a18 100644 } -@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct net_device *dev, +@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct do { data_src = &hwstat->tx_good_octets; data_dst = data; @@ -1209,11 +1169,10 @@ index e5857e88c2076..caf1714f36a18 100644 } static void b44_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c -index 47fc8e6963d59..98d5bd15ee433 100644 ---- a/drivers/net/ethernet/broadcom/bcmsysport.c -+++ b/drivers/net/ethernet/broadcom/bcmsysport.c -@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats(struct bcm_sysport_priv *priv, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c linux/drivers/net/ethernet/broadcom/bcmsysport.c +--- linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-04 10:40:26.680034137 -0500 +@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats( for (q = 0; q < priv->netdev->num_tx_queues; q++) { ring = &priv->tx_rings[q]; do { @@ -1226,7 +1185,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 *tx_bytes += bytes; *tx_packets += packets; -@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct net_device *dev, +@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct if (s->stat_sizeof == sizeof(u64) && s->type == BCM_SYSPORT_STAT_NETDEV64) { do { @@ -1238,7 +1197,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } else data[i] = *(u32 *)p; j++; -@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(struct net_device *dev, +@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(stru &stats->tx_packets); do { @@ -1251,11 +1210,10 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } static void bcm_sysport_netif_start(struct net_device *dev) -diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c -index 6dae768671e3d..9e6de2f968fa3 100644 ---- a/drivers/net/ethernet/cortina/gemini.c -+++ b/drivers/net/ethernet/cortina/gemini.c -@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/cortina/gemini.c linux/drivers/net/ethernet/cortina/gemini.c +--- linux.orig/drivers/net/ethernet/cortina/gemini.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/cortina/gemini.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_ /* Racing with RX NAPI */ do { @@ -1264,7 +1222,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_packets = port->stats.rx_packets; stats->rx_bytes = port->stats.rx_bytes; -@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_ stats->rx_crc_errors = port->stats.rx_crc_errors; stats->rx_frame_errors = port->stats.rx_frame_errors; @@ -1278,7 +1236,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->tx_errors = port->stats.tx_errors; stats->tx_packets = port->stats.tx_packets; -@@ -1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ 
-1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_ stats->rx_missed_errors = port->stats.rx_missed_errors; stats->rx_fifo_errors = port->stats.rx_fifo_errors; @@ -1297,7 +1255,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_dropped += stats->rx_missed_errors; } -@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struc /* Racing with MIB interrupt */ do { p = values; @@ -1319,7 +1277,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < RX_STATUS_NUM; i++) *p++ = port->rx_stats[i]; -@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struc *p++ = port->rx_csum_stats[i]; *p++ = port->rx_napi_exits; @@ -1335,7 +1293,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < TX_MAX_FRAGS; i++) { *values++ = port->tx_frag_stats[i]; -@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struc *values++ = port->tx_frags_linearized; *values++ = port->tx_hw_csummed; @@ -1344,11 +1302,10 @@ index 6dae768671e3d..9e6de2f968fa3 100644 } static int gmac_get_ksettings(struct net_device *netdev, -diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c -index bd0df189d8719..39e7a4a3c15e6 100644 ---- a/drivers/net/ethernet/emulex/benet/be_ethtool.c -+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c -@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c linux/drivers/net/ethernet/emulex/benet/be_ethtool.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct struct be_rx_stats *stats = rx_stats(rxo); do { @@ -1361,7 +1318,7 @@ index bd0df189d8719..39e7a4a3c15e6 100644 for (i = 2; i < ETHTOOL_RXSTATS_NUM; i++) { p = (u8 *)stats + et_rx_stats[i].offset; -@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct net_device *netdev, +@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct struct be_tx_stats *stats = tx_stats(txo); do { @@ -1385,11 +1342,10 @@ index bd0df189d8719..39e7a4a3c15e6 100644 base += ETHTOOL_TXSTATS_NUM; } } -diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c -index 414362febbb9d..9350c901aa27b 100644 ---- a/drivers/net/ethernet/emulex/benet/be_main.c -+++ b/drivers/net/ethernet/emulex/benet/be_main.c -@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_main.c linux/drivers/net/ethernet/emulex/benet/be_main.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_de const struct be_rx_stats *rx_stats = rx_stats(rxo); do { @@ -1402,7 +1358,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->rx_packets += pkts; stats->rx_bytes += bytes; stats->multicast += rx_stats(rxo)->rx_mcast_pkts; -@@ -680,10 +680,10 @@ static void be_get_stats64(struct net_device *netdev, +@@ -680,10 +680,10 @@ static void 
be_get_stats64(struct net_de const struct be_tx_stats *tx_stats = tx_stats(txo); do { @@ -1415,7 +1371,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->tx_packets += pkts; stats->tx_bytes += bytes; } -@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_obj *eqo) +@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_o for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { do { @@ -1436,10 +1392,9 @@ index 414362febbb9d..9350c901aa27b 100644 } /* Skip, if wrapped around or first calculation */ -diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -index 671f51135c269..53b7e95213a85 100644 ---- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -+++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +diff -rupN linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +--- linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-04 10:40:26.684034126 -0500 @@ -206,9 +206,9 @@ struct funeth_rxq { #define FUN_QSTAT_READ(q, seq, stats_copy) \ @@ -1452,11 +1407,10 @@ index 671f51135c269..53b7e95213a85 100644 #define FUN_INT_NAME_LEN (IFNAMSIZ + 16) -diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c -index 7b9a2d9d96243..50b384910c839 100644 ---- a/drivers/net/ethernet/google/gve/gve_ethtool.c -+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c -@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c linux/drivers/net/ethernet/google/gve/gve_ethtool.c +--- linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device struct gve_rx_ring *rx = &priv->rx[ring]; start = @@ -1473,7 +1427,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); rx_pkts += tmp_rx_pkts; rx_bytes += tmp_rx_bytes; -@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device if (priv->tx) { do { start = @@ -1486,7 +1440,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); tx_pkts += tmp_tx_pkts; tx_bytes += tmp_tx_bytes; -@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device data[i++] = rx->fill_cnt - rx->cnt; do { start = @@ -1502,7 +1456,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_rx_bytes; data[i++] = rx->rx_cont_packet_cnt; -@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device } do { start = @@ -1514,11 +1468,10 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_tx_bytes; data[i++] = tx->wake_queue; -diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c -index 044db3ebb071c..6cafee55efc32 100644 ---- a/drivers/net/ethernet/google/gve/gve_main.c -+++ b/drivers/net/ethernet/google/gve/gve_main.c -@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_main.c linux/drivers/net/ethernet/google/gve/gve_main.c +--- 
linux.orig/drivers/net/ethernet/google/gve/gve_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { do { start = @@ -1531,7 +1484,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->rx_packets += packets; s->rx_bytes += bytes; -@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { do { start = @@ -1544,7 +1497,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->tx_packets += packets; s->tx_bytes += bytes; -@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_priv *priv) +@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_ } do { @@ -1556,11 +1509,10 @@ index 044db3ebb071c..6cafee55efc32 100644 stats[stats_idx++] = (struct stats) { .stat_name = cpu_to_be32(TX_WAKE_CNT), .value = cpu_to_be64(priv->tx[idx].wake_queue), -diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -index 35d70041b9e84..f82e98263307a 100644 ---- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -@@ -2486,7 +2486,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-04 10:40:26.684034126 -0500 +@@ -2488,7 +2488,7 @@ static void hns3_fetch_stats(struct rtnl unsigned int start; do { @@ -1569,7 +1521,7 @@ index 35d70041b9e84..f82e98263307a 100644 if (is_tx) { stats->tx_bytes += ring->stats.tx_bytes; stats->tx_packets += ring->stats.tx_pkts; -@@ -2520,7 +2520,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +@@ -2522,7 +2522,7 @@ static void hns3_fetch_stats(struct rtnl stats->multicast += ring->stats.rx_multicast; stats->rx_length_errors += ring->stats.err_pkt_len; } @@ -1578,11 +1530,5909 @@ index 35d70041b9e84..f82e98263307a 100644 } static void hns3_nic_get_stats64(struct net_device *netdev, -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -index e5828a658caf4..a866bea651103 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq, struct hinic_rxq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 2022-12-04 10:40:18.116056079 -0500 +@@ -0,0 +1,5895 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++// Copyright (c) 2016-2017 Hisilicon Limited. 
++ ++#include ++#include ++#include ++#ifdef CONFIG_RFS_ACCEL ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "hnae3.h" ++#include "hns3_enet.h" ++/* All hns3 tracepoints are defined by the include below, which ++ * must be included exactly once across the whole kernel with ++ * CREATE_TRACE_POINTS defined ++ */ ++#define CREATE_TRACE_POINTS ++#include "hns3_trace.h" ++ ++#define hns3_set_field(origin, shift, val) ((origin) |= (val) << (shift)) ++#define hns3_tx_bd_count(S) DIV_ROUND_UP(S, HNS3_MAX_BD_SIZE) ++ ++#define hns3_rl_err(fmt, ...) \ ++ do { \ ++ if (net_ratelimit()) \ ++ netdev_err(fmt, ##__VA_ARGS__); \ ++ } while (0) ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force); ++ ++static const char hns3_driver_name[] = "hns3"; ++static const char hns3_driver_string[] = ++ "Hisilicon Ethernet Network Driver for Hip08 Family"; ++static const char hns3_copyright[] = "Copyright (c) 2017 Huawei Corporation."; ++static struct hnae3_client client; ++ ++static int debug = -1; ++module_param(debug, int, 0); ++MODULE_PARM_DESC(debug, " Network interface message level setting"); ++ ++static unsigned int tx_sgl = 1; ++module_param(tx_sgl, uint, 0600); ++MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping"); ++ ++static bool page_pool_enabled = true; ++module_param(page_pool_enabled, bool, 0400); ++ ++#define HNS3_SGL_SIZE(nfrag) (sizeof(struct scatterlist) * (nfrag) + \ ++ sizeof(struct sg_table)) ++#define HNS3_MAX_SGL_SIZE ALIGN(HNS3_SGL_SIZE(HNS3_MAX_TSO_BD_NUM), \ ++ dma_get_cache_alignment()) ++ ++#define DEFAULT_MSG_LEVEL (NETIF_MSG_PROBE | NETIF_MSG_LINK | \ ++ NETIF_MSG_IFDOWN | NETIF_MSG_IFUP) ++ ++#define HNS3_INNER_VLAN_TAG 1 ++#define HNS3_OUTER_VLAN_TAG 2 ++ ++#define HNS3_MIN_TX_LEN 33U ++#define HNS3_MIN_TUN_PKT_LEN 65U ++ ++/* hns3_pci_tbl - PCI Device ID Table ++ * ++ * Last entry must be all 0s ++ * ++ * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, ++ * Class, Class Mask, private data (not used) } ++ */ ++static const struct pci_device_id hns3_pci_tbl[] = { ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_200G_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_VF), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_RDMA_DCB_PFC_VF), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ /* required last entry */ ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, hns3_pci_tbl); ++ ++#define HNS3_RX_PTYPE_ENTRY(ptype, l, s, t, h) \ ++ { ptype, \ ++ l, \ ++ CHECKSUM_##s, \ ++ HNS3_L3_TYPE_##t, \ ++ 1, \ ++ h} ++ ++#define HNS3_RX_PTYPE_UNUSED_ENTRY(ptype) \ ++ { ptype, 0, CHECKSUM_NONE, HNS3_L3_TYPE_PARSE_FAIL, 0, \ ++ PKT_HASH_TYPE_NONE } ++ ++static const struct hns3_rx_ptype hns3_rx_ptype_tbl[] = { ++ HNS3_RX_PTYPE_UNUSED_ENTRY(0), ++ HNS3_RX_PTYPE_ENTRY(1, 0, COMPLETE, ARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(2, 0, 
COMPLETE, RARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(3, 0, COMPLETE, LLDP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(4, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(5, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(6, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(7, 0, COMPLETE, CNM, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(8, 0, NONE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(9), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(10), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(11), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(12), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(13), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(14), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(15), ++ HNS3_RX_PTYPE_ENTRY(16, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(17, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(18, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(19, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(20, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(21, 0, NONE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(22, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(23, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(24, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(25, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(26), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(27), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(28), ++ HNS3_RX_PTYPE_ENTRY(29, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(30, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(31, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(32, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(33, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(34, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(35, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(36, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(37, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(38), ++ HNS3_RX_PTYPE_ENTRY(39, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(40, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(41, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(42, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(43, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(44, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(45, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(46), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(47), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(48), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(49), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(50), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(51), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(52), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(53), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(54), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(55), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(56), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(57), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(58), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(59), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(60), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(61), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(62), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(63), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(64), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(65), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(66), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(67), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(68), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(69), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(70), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(71), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(72), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(73), 
++ HNS3_RX_PTYPE_UNUSED_ENTRY(74), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(75), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(76), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(77), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(78), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(79), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(80), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(81), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(82), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(83), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(84), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(85), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(86), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(87), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(88), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(89), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(90), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(91), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(92), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(93), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(94), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(95), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(96), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(97), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(98), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(99), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(100), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(101), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(102), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(103), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(104), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(105), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(106), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(107), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(108), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(109), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(110), ++ HNS3_RX_PTYPE_ENTRY(111, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(112, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(113, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(114, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(115, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(116, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(117, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(118, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(119, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(120), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(121), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(122), ++ HNS3_RX_PTYPE_ENTRY(123, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(124, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(125, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(126, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(127, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(128, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(129, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(130, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(131, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(132), ++ HNS3_RX_PTYPE_ENTRY(133, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(134, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(135, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(136, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(137, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(138, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(139, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(140), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(141), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(142), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(143), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(144), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(145), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(146), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(147), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(148), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(149), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(150), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(151), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(152), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(153), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(154), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(155), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(156), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(157), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(158), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(159), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(160), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(161), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(162), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(163), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(164), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(165), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(166), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(167), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(168), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(169), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(170), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(171), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(172), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(173), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(174), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(175), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(176), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(177), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(178), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(179), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(180), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(181), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(182), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(183), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(184), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(185), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(186), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(187), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(188), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(189), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(190), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(191), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(192), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(193), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(194), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(195), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(196), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(197), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(198), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(199), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(200), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(201), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(202), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(203), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(204), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(205), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(206), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(207), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(208), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(209), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(210), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(211), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(212), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(213), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(214), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(215), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(216), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(217), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(218), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(219), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(220), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(221), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(222), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(223), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(224), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(225), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(226), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(227), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(228), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(229), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(230), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(231), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(232), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(233), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(234), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(235), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(236), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(237), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(238), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(239), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(240), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(241), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(242), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(243), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(244), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(245), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(246), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(247), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(248), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(249), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(250), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(251), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(252), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(253), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(254), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(255), ++}; ++ ++#define HNS3_INVALID_PTYPE \ ++ ARRAY_SIZE(hns3_rx_ptype_tbl) ++ ++static irqreturn_t hns3_irq_handle(int irq, void *vector) ++{ ++ struct hns3_enet_tqp_vector *tqp_vector = vector; ++ ++ napi_schedule_irqoff(&tqp_vector->napi); ++ tqp_vector->event_cnt++; ++ ++ return IRQ_HANDLED; ++} ++ ++static void hns3_nic_uninit_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ unsigned int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag != HNS3_VECTOR_INITED) ++ continue; ++ ++ /* clear the affinity mask */ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, NULL); ++ ++ /* release the irq resource */ ++ free_irq(tqp_vectors->vector_irq, tqp_vectors); ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_NOT_INITED; ++ } ++} ++ ++static int hns3_nic_init_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ int txrx_int_idx = 0; ++ int rx_int_idx = 0; ++ int tx_int_idx = 0; ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag == HNS3_VECTOR_INITED) ++ continue; ++ ++ if (tqp_vectors->tx_group.ring && tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "TxRx", txrx_int_idx++); ++ txrx_int_idx++; ++ } else if (tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Rx", rx_int_idx++); ++ } else if (tqp_vectors->tx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Tx", tx_int_idx++); ++ } else { ++ /* Skip this unused q_vector */ ++ continue; ++ } ++ ++ tqp_vectors->name[HNAE3_INT_NAME_LEN - 1] = '\0'; ++ ++ irq_set_status_flags(tqp_vectors->vector_irq, IRQ_NOAUTOEN); ++ ret = request_irq(tqp_vectors->vector_irq, hns3_irq_handle, 0, ++ tqp_vectors->name, tqp_vectors); ++ if (ret) { ++ netdev_err(priv->netdev, "request irq(%d) fail\n", ++ tqp_vectors->vector_irq); ++ hns3_nic_uninit_irq(priv); ++ return ret; ++ } ++ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, ++ &tqp_vectors->affinity_mask); ++ ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_INITED; ++ } ++ ++ return 0; ++} ++ ++static void hns3_mask_vector_irq(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 mask_en) ++{ ++ writel(mask_en, tqp_vector->mask_addr); ++} ++ ++static void hns3_vector_enable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ napi_enable(&tqp_vector->napi); ++ enable_irq(tqp_vector->vector_irq); ++ ++ /* enable vector */ ++ hns3_mask_vector_irq(tqp_vector, 1); ++} ++ ++static void hns3_vector_disable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ /* disable vector */ ++ hns3_mask_vector_irq(tqp_vector, 0); ++ ++ disable_irq(tqp_vector->vector_irq); ++ napi_disable(&tqp_vector->napi); ++ cancel_work_sync(&tqp_vector->rx_group.dim.work); ++ cancel_work_sync(&tqp_vector->tx_group.dim.work); ++} ++ ++void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 rl_value) ++{ ++ u32 rl_reg = hns3_rl_usec_to_reg(rl_value); ++ ++ /* this 
defines the configuration for RL (Interrupt Rate Limiter). ++ * Rl defines rate of interrupts i.e. number of interrupts-per-second ++ * GL and RL(Rate Limiter) are 2 ways to acheive interrupt coalescing ++ */ ++ if (rl_reg > 0 && !tqp_vector->tx_group.coal.adapt_enable && ++ !tqp_vector->rx_group.coal.adapt_enable) ++ /* According to the hardware, the range of rl_reg is ++ * 0-59 and the unit is 4. ++ */ ++ rl_reg |= HNS3_INT_RL_ENABLE_MASK; ++ ++ writel(rl_reg, tqp_vector->mask_addr + HNS3_VECTOR_RL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->rx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->tx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_TX_QL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_RX_QL_OFFSET); ++} ++ ++static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hns3_enet_coalesce *ptx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *prx_coal = &priv->rx_coal; ++ ++ tx_coal->adapt_enable = ptx_coal->adapt_enable; ++ rx_coal->adapt_enable = prx_coal->adapt_enable; ++ ++ tx_coal->int_gl = ptx_coal->int_gl; ++ rx_coal->int_gl = prx_coal->int_gl; ++ ++ rx_coal->flow_level = prx_coal->flow_level; ++ tx_coal->flow_level = ptx_coal->flow_level; ++ ++ /* device version above V3(include V3), GL can configure 1us ++ * unit, so uses 1us unit. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { ++ tx_coal->unit_1us = 1; ++ rx_coal->unit_1us = 1; ++ } ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->ql_enable = 1; ++ rx_coal->ql_enable = 1; ++ tx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ rx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ tx_coal->int_ql = ptx_coal->int_ql; ++ rx_coal->int_ql = prx_coal->int_ql; ++ } ++} ++ ++static void ++hns3_vector_coalesce_init_hw(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, tx_coal->int_gl); ++ hns3_set_vector_coalesce_rx_gl(tqp_vector, rx_coal->int_gl); ++ hns3_set_vector_coalesce_rl(tqp_vector, h->kinfo.int_rl_setting); ++ ++ if (tx_coal->ql_enable) ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, tx_coal->int_ql); ++ ++ if (rx_coal->ql_enable) ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, rx_coal->int_ql); ++} ++ ++static int hns3_nic_set_real_num_queue(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ unsigned int queue_size = kinfo->num_tqps; ++ int i, ret; ++ ++ if (tc_info->num_tc <= 1 && !tc_info->mqprio_active) { ++ netdev_reset_tc(netdev); ++ } else { ++ ret = netdev_set_num_tc(netdev, tc_info->num_tc); ++ if (ret) { ++ netdev_err(netdev, ++ "netdev_set_num_tc fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ for (i = 0; i < tc_info->num_tc; i++) ++ netdev_set_tc_queue(netdev, i, tc_info->tqp_count[i], ++ tc_info->tqp_offset[i]); ++ } ++ ++ ret = netif_set_real_num_tx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_tx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ ret = netif_set_real_num_rx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_rx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++u16 hns3_get_max_available_channels(struct hnae3_handle *h) ++{ ++ u16 alloc_tqps, max_rss_size, rss_size; ++ ++ h->ae_algo->ops->get_tqps_and_rss_info(h, &alloc_tqps, &max_rss_size); ++ rss_size = alloc_tqps / h->kinfo.tc_info.num_tc; ++ ++ return min_t(u16, rss_size, max_rss_size); ++} ++ ++static void hns3_tqp_enable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg |= BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_tqp_disable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg &= ~BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_free_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ free_irq_cpu_rmap(netdev->rx_cpu_rmap); ++ netdev->rx_cpu_rmap = NULL; ++#endif ++} ++ ++static int hns3_set_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i, ret; ++ ++ if (!netdev->rx_cpu_rmap) { ++ netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->vector_num); ++ if (!netdev->rx_cpu_rmap) ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = 
irq_cpu_rmap_add(netdev->rx_cpu_rmap, ++ tqp_vector->vector_irq); ++ if (ret) { ++ hns3_free_rx_cpu_rmap(netdev); ++ return ret; ++ } ++ } ++#endif ++ return 0; ++} ++ ++static int hns3_nic_net_up(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ int i, j; ++ int ret; ++ ++ ret = hns3_nic_reset_all_ring(h); ++ if (ret) ++ return ret; ++ ++ clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ /* enable the vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_enable(&priv->tqp_vector[i]); ++ ++ /* enable rcb */ ++ for (j = 0; j < h->kinfo.num_tqps; j++) ++ hns3_tqp_enable(h->kinfo.tqp[j]); ++ ++ /* start the ae_dev */ ++ ret = h->ae_algo->ops->start ? h->ae_algo->ops->start(h) : 0; ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ while (j--) ++ hns3_tqp_disable(h->kinfo.tqp[j]); ++ ++ for (j = i - 1; j >= 0; j--) ++ hns3_vector_disable(&priv->tqp_vector[j]); ++ } ++ ++ return ret; ++} ++ ++static void hns3_config_xps(struct hns3_nic_priv *priv) ++{ ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector = &priv->tqp_vector[i]; ++ struct hns3_enet_ring *ring = tqp_vector->tx_group.ring; ++ ++ while (ring) { ++ int ret; ++ ++ ret = netif_set_xps_queue(priv->netdev, ++ &tqp_vector->affinity_mask, ++ ring->tqp->tqp_index); ++ if (ret) ++ netdev_warn(priv->netdev, ++ "set xps queue failed: %d", ret); ++ ++ ring = ring->next; ++ } ++ } ++} ++ ++static int hns3_nic_net_open(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo; ++ int i, ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netdev_warn(netdev, "net open repeatedly!\n"); ++ return 0; ++ } ++ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_nic_set_real_num_queue(netdev); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_net_up(netdev); ++ if (ret) { ++ netdev_err(netdev, "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ kinfo = &h->kinfo; ++ for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) ++ netdev_set_prio_tc_map(netdev, i, kinfo->tc_info.prio_tc[i]); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, true); ++ ++ hns3_config_xps(priv); ++ ++ netif_dbg(h, drv, netdev, "net open\n"); ++ ++ return 0; ++} ++ ++static void hns3_reset_tx_queue(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct netdev_queue *dev_queue; ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ dev_queue = netdev_get_tx_queue(ndev, ++ priv->ring[i].queue_index); ++ netdev_tx_reset_queue(dev_queue); ++ } ++} ++ ++static void hns3_nic_net_down(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ const struct hnae3_ae_ops *ops; ++ int i; ++ ++ /* disable vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_disable(&priv->tqp_vector[i]); ++ ++ /* disable rcb */ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_tqp_disable(h->kinfo.tqp[i]); ++ ++ /* stop ae_dev */ ++ ops = priv->ae_handle->ae_algo->ops; ++ if (ops->stop) ++ ops->stop(priv->ae_handle); ++ ++ /* delay ring buffer clearing to hns3_reset_notify_uninit_enet ++ * during reset process, because driver may not be able ++ * to disable 
the ring through firmware when downing the netdev. ++ */ ++ if (!hns3_nic_resetting(netdev)) ++ hns3_clear_all_ring(priv->ae_handle, false); ++ ++ hns3_reset_tx_queue(priv->ae_handle); ++} ++ ++static int hns3_nic_net_stop(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, "net stop\n"); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, false); ++ ++ netif_carrier_off(netdev); ++ netif_tx_disable(netdev); ++ ++ hns3_nic_net_down(netdev); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_uc_addr) ++ return h->ae_algo->ops->add_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ /* need ignore the request of removing device address, because ++ * we store the device address and other addresses of uc list ++ * in the function's mac filter list. ++ */ ++ if (ether_addr_equal(addr, netdev->dev_addr)) ++ return 0; ++ ++ if (h->ae_algo->ops->rm_uc_addr) ++ return h->ae_algo->ops->rm_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_mc_addr) ++ return h->ae_algo->ops->add_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->rm_mc_addr) ++ return h->ae_algo->ops->rm_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static u8 hns3_get_netdev_flags(struct net_device *netdev) ++{ ++ u8 flags = 0; ++ ++ if (netdev->flags & IFF_PROMISC) ++ flags = HNAE3_USER_UPE | HNAE3_USER_MPE | HNAE3_BPE; ++ else if (netdev->flags & IFF_ALLMULTI) ++ flags = HNAE3_USER_MPE; ++ ++ return flags; ++} ++ ++static void hns3_nic_set_rx_mode(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u8 new_flags; ++ ++ new_flags = hns3_get_netdev_flags(netdev); ++ ++ __dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync); ++ __dev_mc_sync(netdev, hns3_nic_mc_sync, hns3_nic_mc_unsync); ++ ++ /* User mode Promisc mode enable and vlan filtering is disabled to ++ * let all packets in. ++ */ ++ h->netdev_flags = new_flags; ++ hns3_request_update_promisc_mode(h); ++} ++ ++void hns3_request_update_promisc_mode(struct hnae3_handle *handle) ++{ ++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops; ++ ++ if (ops->request_update_promisc_mode) ++ ops->request_update_promisc_mode(handle); ++} ++ ++static u32 hns3_tx_spare_space(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc, ntu; ++ ++ /* This smp_load_acquire() pairs with smp_store_release() in ++ * hns3_tx_spare_update() called in tx desc cleaning process. ++ */ ++ ntc = smp_load_acquire(&tx_spare->last_to_clean); ++ ntu = tx_spare->next_to_use; ++ ++ if (ntc > ntu) ++ return ntc - ntu - 1; ++ ++ /* The free tx buffer is divided into two part, so pick the ++ * larger one. 
++ */ ++ return max(ntc, tx_spare->len - ntu) - 1; ++} ++ ++static void hns3_tx_spare_update(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (!tx_spare || ++ tx_spare->last_to_clean == tx_spare->next_to_clean) ++ return; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * hns3_tx_spare_space() called in xmit process. ++ */ ++ smp_store_release(&tx_spare->last_to_clean, ++ tx_spare->next_to_clean); ++} ++ ++static bool hns3_can_use_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ u32 len = skb->len <= ring->tx_copybreak ? skb->len : ++ skb_headlen(skb); ++ ++ if (len > ring->tx_copybreak) ++ return false; ++ ++ if (ALIGN(len, dma_get_cache_alignment()) > space) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ if (skb->len <= ring->tx_copybreak || !tx_sgl || ++ (!skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < tx_sgl)) ++ return false; ++ ++ if (space < HNS3_MAX_SGL_SIZE) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) ++{ ++ u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; ++ struct hns3_tx_spare *tx_spare; ++ struct page *page; ++ dma_addr_t dma; ++ int order; ++ ++ if (!alloc_size) ++ return; ++ ++ order = get_order(alloc_size); ++ if (order >= MAX_ORDER) { ++ if (net_ratelimit()) ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); ++ return; ++ } ++ ++ tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare), ++ GFP_KERNEL); ++ if (!tx_spare) { ++ /* The driver still work without the tx spare buffer */ ++ dev_warn(ring_to_dev(ring), "failed to allocate hns3_tx_spare\n"); ++ goto devm_kzalloc_error; ++ } ++ ++ page = alloc_pages_node(dev_to_node(ring_to_dev(ring)), ++ GFP_KERNEL, order); ++ if (!page) { ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare pages\n"); ++ goto alloc_pages_error; ++ } ++ ++ dma = dma_map_page(ring_to_dev(ring), page, 0, ++ PAGE_SIZE << order, DMA_TO_DEVICE); ++ if (dma_mapping_error(ring_to_dev(ring), dma)) { ++ dev_warn(ring_to_dev(ring), "failed to map pages for tx spare\n"); ++ goto dma_mapping_error; ++ } ++ ++ tx_spare->dma = dma; ++ tx_spare->buf = page_address(page); ++ tx_spare->len = PAGE_SIZE << order; ++ ring->tx_spare = tx_spare; ++ return; ++ ++dma_mapping_error: ++ put_page(page); ++alloc_pages_error: ++ devm_kfree(ring_to_dev(ring), tx_spare); ++devm_kzalloc_error: ++ ring->tqp->handle->kinfo.tx_spare_buf_size = 0; ++} ++ ++/* Use hns3_tx_spare_space() to make sure there is enough buffer ++ * before calling below function to allocate tx buffer. ++ */ ++static void *hns3_tx_spare_alloc(struct hns3_enet_ring *ring, ++ unsigned int size, dma_addr_t *dma, ++ u32 *cb_len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntu = tx_spare->next_to_use; ++ ++ size = ALIGN(size, dma_get_cache_alignment()); ++ *cb_len = size; ++ ++ /* Tx spare buffer wraps back here because the end of ++ * freed tx buffer is not enough. 
++ */ ++ if (ntu + size > tx_spare->len) { ++ *cb_len += (tx_spare->len - ntu); ++ ntu = 0; ++ } ++ ++ tx_spare->next_to_use = ntu + size; ++ if (tx_spare->next_to_use == tx_spare->len) ++ tx_spare->next_to_use = 0; ++ ++ *dma = tx_spare->dma + ntu; ++ ++ return tx_spare->buf + ntu; ++} ++ ++static void hns3_tx_spare_rollback(struct hns3_enet_ring *ring, u32 len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (len > tx_spare->next_to_use) { ++ len -= tx_spare->next_to_use; ++ tx_spare->next_to_use = tx_spare->len - len; ++ } else { ++ tx_spare->next_to_use -= len; ++ } ++} ++ ++static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc = tx_spare->next_to_clean; ++ u32 len = cb->length; ++ ++ tx_spare->next_to_clean += len; ++ ++ if (tx_spare->next_to_clean >= tx_spare->len) { ++ tx_spare->next_to_clean -= tx_spare->len; ++ ++ if (tx_spare->next_to_clean) { ++ ntc = 0; ++ len = tx_spare->next_to_clean; ++ } ++ } ++ ++ /* This tx spare buffer is only really reclaimed after calling ++ * hns3_tx_spare_update(), so it is still safe to use the info in ++ * the tx buffer to do the dma sync or sg unmapping after ++ * tx_spare->next_to_clean is moved forword. ++ */ ++ if (cb->type & (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) { ++ dma_addr_t dma = tx_spare->dma + ntc; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), dma, len, ++ DMA_TO_DEVICE); ++ } else { ++ struct sg_table *sgt = tx_spare->buf + ntc; ++ ++ dma_unmap_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ } ++} ++ ++static int hns3_set_tso(struct sk_buff *skb, u32 *paylen_fdop_ol4cs, ++ u16 *mss, u32 *type_cs_vlan_tso, u32 *send_bytes) ++{ ++ u32 l4_offset, hdr_len; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ u32 l4_paylen; ++ int ret; ++ ++ if (!skb_is_gso(skb)) ++ return 0; ++ ++ ret = skb_cow_head(skb, 0); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when tso is ++ * needed. ++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ ++ /* tunnel packet */ ++ if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | ++ SKB_GSO_GRE_CSUM | ++ SKB_GSO_UDP_TUNNEL | ++ SKB_GSO_UDP_TUNNEL_CSUM)) { ++ /* reset l3&l4 pointers from outer to inner headers */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when ++ * tso is needed. 
++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ } ++ ++ /* normal or tunnel packet */ ++ l4_offset = l4.hdr - skb->data; ++ ++ /* remove payload length from inner pseudo checksum when tso */ ++ l4_paylen = skb->len - l4_offset; ++ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { ++ hdr_len = sizeof(*l4.udp) + l4_offset; ++ csum_replace_by_diff(&l4.udp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } else { ++ hdr_len = (l4.tcp->doff << 2) + l4_offset; ++ csum_replace_by_diff(&l4.tcp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } ++ ++ *send_bytes = (skb_shinfo(skb)->gso_segs - 1) * hdr_len + skb->len; ++ ++ /* find the txbd field values */ ++ *paylen_fdop_ol4cs = skb->len - hdr_len; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_TSO_B, 1); ++ ++ /* offload outer UDP header checksum */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) ++ hns3_set_field(*paylen_fdop_ol4cs, HNS3_TXD_OL4CS_B, 1); ++ ++ /* get MSS for TSO */ ++ *mss = skb_shinfo(skb)->gso_size; ++ ++ trace_hns3_tso(skb); ++ ++ return 0; ++} ++ ++static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto, ++ u8 *il4_proto) ++{ ++ union l3_hdr_info l3; ++ unsigned char *l4_hdr; ++ unsigned char *exthdr; ++ u8 l4_proto_tmp; ++ __be16 frag_off; ++ ++ /* find outer header point */ ++ l3.hdr = skb_network_header(skb); ++ l4_hdr = skb_transport_header(skb); ++ ++ if (skb->protocol == htons(ETH_P_IPV6)) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (skb->protocol == htons(ETH_P_IP)) { ++ l4_proto_tmp = l3.v4->protocol; ++ } else { ++ return -EINVAL; ++ } ++ ++ *ol4_proto = l4_proto_tmp; ++ ++ /* tunnel packet */ ++ if (!skb->encapsulation) { ++ *il4_proto = 0; ++ return 0; ++ } ++ ++ /* find inner header point */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4_hdr = skb_inner_transport_header(skb); ++ ++ if (l3.v6->version == 6) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (l3.v4->version == 4) { ++ l4_proto_tmp = l3.v4->protocol; ++ } ++ ++ *il4_proto = l4_proto_tmp; ++ ++ return 0; ++} ++ ++/* when skb->encapsulation is 0, skb->ip_summed is CHECKSUM_PARTIAL ++ * and it is udp packet, which has a dest port as the IANA assigned. ++ * the hardware is expected to do the checksum offload, but the ++ * hardware will not do the checksum offload when udp dest port is ++ * 4789, 4790 or 6081. ++ */ ++static bool hns3_tunnel_csum_bug(struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(skb->dev); ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ union l4_hdr_info l4; ++ ++ /* device version above V3(include V3), the hardware can ++ * do this checksum offload. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ return false; ++ ++ l4.hdr = skb_transport_header(skb); ++ ++ if (!(!skb->encapsulation && ++ (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) || ++ l4.udp->dest == htons(GENEVE_UDP_PORT) || ++ l4.udp->dest == htons(IANA_VXLAN_GPE_UDP_PORT)))) ++ return false; ++ ++ return true; ++} ++ ++static void hns3_set_outer_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u32 *ol_type_vlan_len_msec) ++{ ++ u32 l2_len, l3_len, l4_len; ++ unsigned char *il2_hdr; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* compute OL2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - skb->data; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute OL3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ il2_hdr = skb_inner_mac_header(skb); ++ /* compute OL4 header size, defined in 4 Bytes */ ++ l4_len = il2_hdr - l4.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L4LEN_S, l4_len >> 2); ++ ++ /* define outer network header type */ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ if (skb_is_gso(skb)) ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_CSUM); ++ else ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_NO_CSUM); ++ } else if (skb->protocol == htons(ETH_P_IPV6)) { ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV6); ++ } ++ ++ if (ol4_proto == IPPROTO_UDP) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_MAC_IN_UDP); ++ else if (ol4_proto == IPPROTO_GRE) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_NVGRE); ++} ++ ++static void hns3_set_l3_type(struct sk_buff *skb, union l3_hdr_info l3, ++ u32 *type_cs_vlan_tso) ++{ ++ if (l3.v4->version == 4) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV4); ++ ++ /* the stack computes the IP header already, the only time we ++ * need the hardware to recompute it is in the case of TSO. ++ */ ++ if (skb_is_gso(skb)) ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3CS_B, 1); ++ } else if (l3.v6->version == 6) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV6); ++ } ++} ++ ++static int hns3_set_l4_csum_length(struct sk_buff *skb, union l4_hdr_info l4, ++ u32 l4_proto, u32 *type_cs_vlan_tso) ++{ ++ /* compute inner(/normal) L4 header size, defined in 4 Bytes */ ++ switch (l4_proto) { ++ case IPPROTO_TCP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_TCP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ l4.tcp->doff); ++ break; ++ case IPPROTO_UDP: ++ if (hns3_tunnel_csum_bug(skb)) { ++ int ret = skb_put_padto(skb, HNS3_MIN_TUN_PKT_LEN); ++ ++ return ret ? 
ret : skb_checksum_help(skb); ++ } ++ ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_UDP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct udphdr) >> 2)); ++ break; ++ case IPPROTO_SCTP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_SCTP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct sctphdr) >> 2)); ++ break; ++ default: ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ return 0; ++} ++ ++static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u8 il4_proto, u32 *type_cs_vlan_tso, ++ u32 *ol_type_vlan_len_msec) ++{ ++ unsigned char *l2_hdr = skb->data; ++ u32 l4_proto = ol4_proto; ++ union l4_hdr_info l4; ++ union l3_hdr_info l3; ++ u32 l2_len, l3_len; ++ ++ l4.hdr = skb_transport_header(skb); ++ l3.hdr = skb_network_header(skb); ++ ++ /* handle encapsulation skb */ ++ if (skb->encapsulation) { ++ /* If this is a not UDP/GRE encapsulation skb */ ++ if (!(ol4_proto == IPPROTO_UDP || ol4_proto == IPPROTO_GRE)) { ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ hns3_set_outer_l2l3l4(skb, ol4_proto, ol_type_vlan_len_msec); ++ ++ /* switch to inner header */ ++ l2_hdr = skb_inner_mac_header(skb); ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ l4_proto = il4_proto; ++ } ++ ++ hns3_set_l3_type(skb, l3, type_cs_vlan_tso); ++ ++ /* compute inner(/normal) L2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - l2_hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute inner(/normal) L3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ return hns3_set_l4_csum_length(skb, l4, l4_proto, type_cs_vlan_tso); ++} ++ ++static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring, ++ struct sk_buff *skb) ++{ ++ struct hnae3_handle *handle = tx_ring->tqp->handle; ++ struct hnae3_ae_dev *ae_dev; ++ struct vlan_ethhdr *vhdr; ++ int rc; ++ ++ if (!(skb->protocol == htons(ETH_P_8021Q) || ++ skb_vlan_tag_present(skb))) ++ return 0; ++ ++ /* For HW limitation on HNAE3_DEVICE_VERSION_V2, if port based insert ++ * VLAN enabled, only one VLAN header is allowed in skb, otherwise it ++ * will cause RAS error. ++ */ ++ ae_dev = pci_get_drvdata(handle->pdev); ++ if (unlikely(skb_vlan_tagged_multi(skb) && ++ ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 && ++ handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_ENABLE)) ++ return -EINVAL; ++ ++ if (skb->protocol == htons(ETH_P_8021Q) && ++ !(handle->kinfo.netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { ++ /* When HW VLAN acceleration is turned off, and the stack ++ * sets the protocol to 802.1q, the driver just need to ++ * set the protocol to the encapsulated ethertype. 
++ */ ++ skb->protocol = vlan_get_protocol(skb); ++ return 0; ++ } ++ ++ if (skb_vlan_tag_present(skb)) { ++ /* Based on hw strategy, use out_vtag in two layer tag case, ++ * and use inner_vtag in one tag case. ++ */ ++ if (skb->protocol == htons(ETH_P_8021Q) && ++ handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ rc = HNS3_OUTER_VLAN_TAG; ++ else ++ rc = HNS3_INNER_VLAN_TAG; ++ ++ skb->protocol = vlan_get_protocol(skb); ++ return rc; ++ } ++ ++ rc = skb_cow_head(skb, 0); ++ if (unlikely(rc < 0)) ++ return rc; ++ ++ vhdr = (struct vlan_ethhdr *)skb->data; ++ vhdr->h_vlan_TCI |= cpu_to_be16((skb->priority << VLAN_PRIO_SHIFT) ++ & VLAN_PRIO_MASK); ++ ++ skb->protocol = vlan_get_protocol(skb); ++ return 0; ++} ++ ++/* check if the hardware is capable of checksum offloading */ ++static bool hns3_check_hw_tx_csum(struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(skb->dev); ++ ++ /* Kindly note, due to backward compatibility of the TX descriptor, ++ * HW checksum of the non-IP packets and GSO packets is handled at ++ * different place in the following code ++ */ ++ if (skb_csum_is_sctp(skb) || skb_is_gso(skb) || ++ !test_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state)) ++ return false; ++ ++ return true; ++} ++ ++struct hns3_desc_param { ++ u32 paylen_ol4cs; ++ u32 ol_type_vlan_len_msec; ++ u32 type_cs_vlan_tso; ++ u16 mss_hw_csum; ++ u16 inner_vtag; ++ u16 out_vtag; ++}; ++ ++static void hns3_init_desc_data(struct sk_buff *skb, struct hns3_desc_param *pa) ++{ ++ pa->paylen_ol4cs = skb->len; ++ pa->ol_type_vlan_len_msec = 0; ++ pa->type_cs_vlan_tso = 0; ++ pa->mss_hw_csum = 0; ++ pa->inner_vtag = 0; ++ pa->out_vtag = 0; ++} ++ ++static int hns3_handle_vlan_info(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_param *param) ++{ ++ int ret; ++ ++ ret = hns3_handle_vtags(ring, skb); ++ if (unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_vlan_err); ++ return ret; ++ } else if (ret == HNS3_INNER_VLAN_TAG) { ++ param->inner_vtag = skb_vlan_tag_get(skb); ++ param->inner_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & ++ VLAN_PRIO_MASK; ++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_VLAN_B, 1); ++ } else if (ret == HNS3_OUTER_VLAN_TAG) { ++ param->out_vtag = skb_vlan_tag_get(skb); ++ param->out_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & ++ VLAN_PRIO_MASK; ++ hns3_set_field(param->ol_type_vlan_len_msec, HNS3_TXD_OVLAN_B, ++ 1); ++ } ++ return 0; ++} ++ ++static int hns3_handle_csum_partial(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_cb *desc_cb, ++ struct hns3_desc_param *param) ++{ ++ u8 ol4_proto, il4_proto; ++ int ret; ++ ++ if (hns3_check_hw_tx_csum(skb)) { ++ /* set checksum start and offset, defined in 2 Bytes */ ++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_CSUM_START_S, ++ skb_checksum_start_offset(skb) >> 1); ++ hns3_set_field(param->ol_type_vlan_len_msec, ++ HNS3_TXD_CSUM_OFFSET_S, ++ skb->csum_offset >> 1); ++ param->mss_hw_csum |= BIT(HNS3_TXD_HW_CS_B); ++ return 0; ++ } ++ ++ skb_reset_mac_len(skb); ++ ++ ret = hns3_get_l4_protocol(skb, &ol4_proto, &il4_proto); ++ if (unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_l4_proto_err); ++ return ret; ++ } ++ ++ ret = hns3_set_l2l3l4(skb, ol4_proto, il4_proto, ++ ¶m->type_cs_vlan_tso, ++ ¶m->ol_type_vlan_len_msec); ++ if (unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_l2l3l4_err); ++ return ret; ++ } ++ ++ ret = hns3_set_tso(skb, ¶m->paylen_ol4cs, ¶m->mss_hw_csum, ++ ¶m->type_cs_vlan_tso, &desc_cb->send_bytes); ++ if 
(unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_tso_err); ++ return ret; ++ } ++ return 0; ++} ++ ++static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, struct hns3_desc *desc, ++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc_param param; ++ int ret; ++ ++ hns3_init_desc_data(skb, ¶m); ++ ret = hns3_handle_vlan_info(ring, skb, ¶m); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ desc_cb->send_bytes = skb->len; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ ret = hns3_handle_csum_partial(ring, skb, desc_cb, ¶m); ++ if (ret) ++ return ret; ++ } ++ ++ /* Set txbd */ ++ desc->tx.ol_type_vlan_len_msec = ++ cpu_to_le32(param.ol_type_vlan_len_msec); ++ desc->tx.type_cs_vlan_tso_len = cpu_to_le32(param.type_cs_vlan_tso); ++ desc->tx.paylen_ol4cs = cpu_to_le32(param.paylen_ol4cs); ++ desc->tx.mss_hw_csum = cpu_to_le16(param.mss_hw_csum); ++ desc->tx.vlan_tag = cpu_to_le16(param.inner_vtag); ++ desc->tx.outer_vlan_tag = cpu_to_le16(param.out_vtag); ++ ++ return 0; ++} ++ ++static int hns3_fill_desc(struct hns3_enet_ring *ring, dma_addr_t dma, ++ unsigned int size) ++{ ++#define HNS3_LIKELY_BD_NUM 1 ++ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_use]; ++ unsigned int frag_buf_num; ++ int k, sizeoflast; ++ ++ if (likely(size <= HNS3_MAX_BD_SIZE)) { ++ desc->addr = cpu_to_le64(dma); ++ desc->tx.send_size = cpu_to_le16(size); ++ desc->tx.bdtp_fe_sc_vld_ra_ri = ++ cpu_to_le16(BIT(HNS3_TXD_VLD_B)); ++ ++ trace_hns3_tx_desc(ring, ring->next_to_use); ++ ring_ptr_move_fw(ring, next_to_use); ++ return HNS3_LIKELY_BD_NUM; ++ } ++ ++ frag_buf_num = hns3_tx_bd_count(size); ++ sizeoflast = size % HNS3_MAX_BD_SIZE; ++ sizeoflast = sizeoflast ? sizeoflast : HNS3_MAX_BD_SIZE; ++ ++ /* When frag size is bigger than hardware limit, split this frag */ ++ for (k = 0; k < frag_buf_num; k++) { ++ /* now, fill the descriptor */ ++ desc->addr = cpu_to_le64(dma + HNS3_MAX_BD_SIZE * k); ++ desc->tx.send_size = cpu_to_le16((k == frag_buf_num - 1) ? ++ (u16)sizeoflast : (u16)HNS3_MAX_BD_SIZE); ++ desc->tx.bdtp_fe_sc_vld_ra_ri = ++ cpu_to_le16(BIT(HNS3_TXD_VLD_B)); ++ ++ trace_hns3_tx_desc(ring, ring->next_to_use); ++ /* move ring pointer to next */ ++ ring_ptr_move_fw(ring, next_to_use); ++ ++ desc = &ring->desc[ring->next_to_use]; ++ } ++ ++ return frag_buf_num; ++} ++ ++static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv, ++ unsigned int type) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ struct device *dev = ring_to_dev(ring); ++ unsigned int size; ++ dma_addr_t dma; ++ ++ if (type & (DESC_TYPE_FRAGLIST_SKB | DESC_TYPE_SKB)) { ++ struct sk_buff *skb = (struct sk_buff *)priv; ++ ++ size = skb_headlen(skb); ++ if (!size) ++ return 0; ++ ++ dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE); ++ } else if (type & DESC_TYPE_BOUNCE_HEAD) { ++ /* Head data has been filled in hns3_handle_tx_bounce(), ++ * just return 0 here. 
++ */ ++ return 0; ++ } else { ++ skb_frag_t *frag = (skb_frag_t *)priv; ++ ++ size = skb_frag_size(frag); ++ if (!size) ++ return 0; ++ ++ dma = skb_frag_dma_map(dev, frag, 0, size, DMA_TO_DEVICE); ++ } ++ ++ if (unlikely(dma_mapping_error(dev, dma))) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = priv; ++ desc_cb->length = size; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ return hns3_fill_desc(ring, dma, size); ++} ++ ++static unsigned int hns3_skb_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num) ++{ ++ unsigned int size; ++ int i; ++ ++ size = skb_headlen(skb); ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ if (size) { ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ size = skb_frag_size(frag); ++ if (!size) ++ continue; ++ ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_tx_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ u8 max_non_tso_bd_num, unsigned int bd_num, ++ unsigned int recursion_level) ++{ ++#define HNS3_MAX_RECURSION_LEVEL 24 ++ ++ struct sk_buff *frag_skb; ++ ++ /* If the total len is within the max bd limit */ ++ if (likely(skb->len <= HNS3_MAX_BD_SIZE && !recursion_level && ++ !skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < max_non_tso_bd_num)) ++ return skb_shinfo(skb)->nr_frags + 1U; ++ ++ if (unlikely(recursion_level >= HNS3_MAX_RECURSION_LEVEL)) ++ return UINT_MAX; ++ ++ bd_num = hns3_skb_bd_num(skb, bd_size, bd_num); ++ if (!skb_has_frag_list(skb) || bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ ++ skb_walk_frags(skb, frag_skb) { ++ bd_num = hns3_tx_bd_num(frag_skb, bd_size, max_non_tso_bd_num, ++ bd_num, recursion_level + 1); ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_gso_hdr_len(struct sk_buff *skb) ++{ ++ if (!skb->encapsulation) ++ return skb_tcp_all_headers(skb); ++ ++ return skb_inner_tcp_all_headers(skb); ++} ++ ++/* HW need every continuous max_non_tso_bd_num buffer data to be larger ++ * than MSS, we simplify it by ensuring skb_headlen + the first continuous ++ * max_non_tso_bd_num - 1 frags to be larger than gso header len + mss, ++ * and the remaining continuous max_non_tso_bd_num - 1 frags to be larger ++ * than MSS except the last max_non_tso_bd_num - 1 frags. ++ */ ++static bool hns3_skb_need_linearized(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num, u8 max_non_tso_bd_num) ++{ ++ unsigned int tot_len = 0; ++ int i; ++ ++ for (i = 0; i < max_non_tso_bd_num - 1U; i++) ++ tot_len += bd_size[i]; ++ ++ /* ensure the first max_non_tso_bd_num frags is greater than ++ * mss + header ++ */ ++ if (tot_len + bd_size[max_non_tso_bd_num - 1U] < ++ skb_shinfo(skb)->gso_size + hns3_gso_hdr_len(skb)) ++ return true; ++ ++ /* ensure every continuous max_non_tso_bd_num - 1 buffer is greater ++ * than mss except the last one. 
++ */ ++ for (i = 0; i < bd_num - max_non_tso_bd_num; i++) { ++ tot_len -= bd_size[i]; ++ tot_len += bd_size[i + max_non_tso_bd_num - 1U]; ++ ++ if (tot_len < skb_shinfo(skb)->gso_size) ++ return true; ++ } ++ ++ return false; ++} ++ ++void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_SKB_FRAGS; i++) ++ size[i] = skb_frag_size(&shinfo->frags[i]); ++} ++ ++static int hns3_skb_linearize(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ unsigned int bd_num) ++{ ++ /* 'bd_num == UINT_MAX' means the skb' fraglist has a ++ * recursion level of over HNS3_MAX_RECURSION_LEVEL. ++ */ ++ if (bd_num == UINT_MAX) { ++ hns3_ring_stats_update(ring, over_max_recursion); ++ return -ENOMEM; ++ } ++ ++ /* The skb->len has exceeded the hw limitation, linearization ++ * will not help. ++ */ ++ if (skb->len > HNS3_MAX_TSO_SIZE || ++ (!skb_is_gso(skb) && skb->len > HNS3_MAX_NON_TSO_SIZE)) { ++ hns3_ring_stats_update(ring, hw_limitation); ++ return -ENOMEM; ++ } ++ ++ if (__skb_linearize(skb)) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, ++ struct net_device *netdev, ++ struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u8 max_non_tso_bd_num = priv->max_non_tso_bd_num; ++ unsigned int bd_size[HNS3_MAX_TSO_BD_NUM + 1U]; ++ unsigned int bd_num; ++ ++ bd_num = hns3_tx_bd_num(skb, bd_size, max_non_tso_bd_num, 0, 0); ++ if (unlikely(bd_num > max_non_tso_bd_num)) { ++ if (bd_num <= HNS3_MAX_TSO_BD_NUM && skb_is_gso(skb) && ++ !hns3_skb_need_linearized(skb, bd_size, bd_num, ++ max_non_tso_bd_num)) { ++ trace_hns3_over_max_bd(skb); ++ goto out; ++ } ++ ++ if (hns3_skb_linearize(ring, skb, bd_num)) ++ return -ENOMEM; ++ ++ bd_num = hns3_tx_bd_count(skb->len); ++ ++ hns3_ring_stats_update(ring, tx_copy); ++ } ++ ++out: ++ if (likely(ring_space(ring) >= bd_num)) ++ return bd_num; ++ ++ netif_stop_subqueue(netdev, ring->queue_index); ++ smp_mb(); /* Memory barrier before checking ring_space */ ++ ++ /* Start queue in case hns3_clean_tx_ring has just made room ++ * available and has not seen the queue stopped state performed ++ * by netif_stop_subqueue above. 
++ */ ++ if (ring_space(ring) >= bd_num && netif_carrier_ok(netdev) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_start_subqueue(netdev, ring->queue_index); ++ return bd_num; ++ } ++ ++ hns3_ring_stats_update(ring, tx_busy); ++ ++ return -EBUSY; ++} ++ ++static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig) ++{ ++ struct device *dev = ring_to_dev(ring); ++ unsigned int i; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ struct hns3_desc *desc = &ring->desc[ring->next_to_use]; ++ struct hns3_desc_cb *desc_cb; ++ ++ memset(desc, 0, sizeof(*desc)); ++ ++ /* check if this is where we started */ ++ if (ring->next_to_use == next_to_use_orig) ++ break; ++ ++ /* rollback one */ ++ ring_ptr_move_bw(ring, next_to_use); ++ ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ ++ if (!desc_cb->dma) ++ continue; ++ ++ /* unmap the descriptor dma address */ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ else if (desc_cb->type & ++ (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) ++ hns3_tx_spare_rollback(ring, desc_cb->length); ++ else if (desc_cb->length) ++ dma_unmap_page(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ ++ desc_cb->length = 0; ++ desc_cb->dma = 0; ++ desc_cb->type = DESC_TYPE_UNKNOWN; ++ } ++} ++ ++static int hns3_fill_skb_to_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, unsigned int type) ++{ ++ struct sk_buff *frag_skb; ++ int i, ret, bd_num = 0; ++ ++ ret = hns3_map_and_fill_desc(ring, skb, type); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ ret = hns3_map_and_fill_desc(ring, frag, DESC_TYPE_PAGE); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ skb_walk_frags(skb, frag_skb) { ++ ret = hns3_fill_skb_to_desc(ring, frag_skb, ++ DESC_TYPE_FRAGLIST_SKB); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ return bd_num; ++} ++ ++static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num) ++{ ++#define HNS3_BYTES_PER_64BIT 8 ++ ++ struct hns3_desc desc[HNS3_MAX_PUSH_BD_NUM] = {}; ++ int offset = 0; ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ do { ++ int idx = (ring->next_to_use - num + ring->desc_num) % ++ ring->desc_num; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_push++; ++ u64_stats_update_end(&ring->syncp); ++ memcpy(&desc[offset], &ring->desc[idx], ++ sizeof(struct hns3_desc)); ++ offset++; ++ } while (--num); ++ ++ __iowrite64_copy(ring->tqp->mem_base, desc, ++ (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) / ++ HNS3_BYTES_PER_64BIT); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) ++{ ++#define HNS3_MEM_DOORBELL_OFFSET 64 ++ ++ __le64 bd_num = cpu_to_le64((u64)ring->pending_buf); ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET, ++ &bd_num, 1); ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_mem_doorbell += ring->pending_buf; ++ u64_stats_update_end(&ring->syncp); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num, ++ bool doorbell) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv 
*priv = netdev_priv(netdev); ++ ++ /* when tx push is enabled, the packet whose number of BD below ++ * HNS3_MAX_PUSH_BD_NUM can be pushed directly. ++ */ ++ if (test_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state) && num && ++ !ring->pending_buf && num <= HNS3_MAX_PUSH_BD_NUM && doorbell) { ++ hns3_tx_push_bd(ring, num); ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++ return; ++ } ++ ++ ring->pending_buf += num; ++ ++ if (!doorbell) { ++ hns3_ring_stats_update(ring, tx_more); ++ return; ++ } ++ ++ if (ring->tqp->mem_base) ++ hns3_tx_mem_doorbell(ring); ++ else ++ writel(ring->pending_buf, ++ ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG); ++ ++ ring->pending_buf = 0; ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++} ++ ++static void hns3_tsyn(struct net_device *netdev, struct sk_buff *skb, ++ struct hns3_desc *desc) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!(h->ae_algo->ops->set_tx_hwts_info && ++ h->ae_algo->ops->set_tx_hwts_info(h, skb))) ++ return; ++ ++ desc->tx.bdtp_fe_sc_vld_ra_ri |= cpu_to_le16(BIT(HNS3_TXD_TSYN_B)); ++} ++ ++static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ unsigned int type = DESC_TYPE_BOUNCE_HEAD; ++ unsigned int size = skb_headlen(skb); ++ dma_addr_t dma; ++ int bd_num = 0; ++ u32 cb_len; ++ void *buf; ++ int ret; ++ ++ if (skb->len <= ring->tx_copybreak) { ++ size = skb->len; ++ type = DESC_TYPE_BOUNCE_ALL; ++ } ++ ++ /* hns3_can_use_tx_bounce() is called to ensure the below ++ * function can always return the tx buffer. ++ */ ++ buf = hns3_tx_spare_alloc(ring, size, &dma, &cb_len); ++ ++ ret = skb_copy_bits(skb, 0, buf, size); ++ if (unlikely(ret < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, copy_bits_err); ++ return ret; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ bd_num += hns3_fill_desc(ring, dma, size); ++ ++ if (type == DESC_TYPE_BOUNCE_HEAD) { ++ ret = hns3_fill_skb_to_desc(ring, skb, ++ DESC_TYPE_BOUNCE_HEAD); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ dma_sync_single_for_device(ring_to_dev(ring), dma, size, ++ DMA_TO_DEVICE); ++ ++ hns3_ring_stats_update(ring, tx_bounce); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ u32 nfrag = skb_shinfo(skb)->nr_frags + 1; ++ struct sg_table *sgt; ++ int i, bd_num = 0; ++ dma_addr_t dma; ++ u32 cb_len; ++ int nents; ++ ++ if (skb_has_frag_list(skb)) ++ nfrag = HNS3_MAX_TSO_BD_NUM; ++ ++ /* hns3_can_use_tx_sgl() is called to ensure the below ++ * function can always return the tx buffer. 
++ */ ++ sgt = hns3_tx_spare_alloc(ring, HNS3_SGL_SIZE(nfrag), ++ &dma, &cb_len); ++ ++ /* scatterlist follows by the sg table */ ++ sgt->sgl = (struct scatterlist *)(sgt + 1); ++ sg_init_table(sgt->sgl, nfrag); ++ nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); ++ if (unlikely(nents < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, skb2sgl_err); ++ return -ENOMEM; ++ } ++ ++ sgt->orig_nents = nents; ++ sgt->nents = dma_map_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ if (unlikely(!sgt->nents)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, map_sg_err); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = DESC_TYPE_SGL_SKB; ++ ++ for (i = 0; i < sgt->nents; i++) ++ bd_num += hns3_fill_desc(ring, sg_dma_address(sgt->sgl + i), ++ sg_dma_len(sgt->sgl + i)); ++ hns3_ring_stats_update(ring, tx_sgl); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_desc_filling(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ u32 space; ++ ++ if (!ring->tx_spare) ++ goto out; ++ ++ space = hns3_tx_spare_space(ring); ++ ++ if (hns3_can_use_tx_sgl(ring, skb, space)) ++ return hns3_handle_tx_sgl(ring, skb); ++ ++ if (hns3_can_use_tx_bounce(ring, skb, space)) ++ return hns3_handle_tx_bounce(ring, skb); ++ ++out: ++ return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB); ++} ++ ++static int hns3_handle_skb_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_cb *desc_cb, ++ int next_to_use_head) ++{ ++ int ret; ++ ++ ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use], ++ desc_cb); ++ if (unlikely(ret < 0)) ++ goto fill_err; ++ ++ /* 'ret < 0' means filling error, 'ret == 0' means skb->len is ++ * zero, which is unlikely, and 'ret > 0' means how many tx desc ++ * need to be notified to the hw. ++ */ ++ ret = hns3_handle_desc_filling(ring, skb); ++ if (likely(ret > 0)) ++ return ret; ++ ++fill_err: ++ hns3_clear_desc(ring, next_to_use_head); ++ return ret; ++} ++ ++netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_ring *ring = &priv->ring[skb->queue_mapping]; ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ struct netdev_queue *dev_queue; ++ int pre_ntu, ret; ++ bool doorbell; ++ ++ /* Hardware can only handle short frames above 32 bytes */ ++ if (skb_put_padto(skb, HNS3_MIN_TX_LEN)) { ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return NETDEV_TX_OK; ++ } ++ ++ /* Prefetch the data used later */ ++ prefetch(skb->data); ++ ++ ret = hns3_nic_maybe_stop_tx(ring, netdev, skb); ++ if (unlikely(ret <= 0)) { ++ if (ret == -EBUSY) { ++ hns3_tx_doorbell(ring, 0, true); ++ return NETDEV_TX_BUSY; ++ } ++ ++ hns3_rl_err(netdev, "xmit error: %d!\n", ret); ++ goto out_err_tx_ok; ++ } ++ ++ ret = hns3_handle_skb_desc(ring, skb, desc_cb, ring->next_to_use); ++ if (unlikely(ret <= 0)) ++ goto out_err_tx_ok; ++ ++ pre_ntu = ring->next_to_use ? 
(ring->next_to_use - 1) : ++ (ring->desc_num - 1); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) ++ hns3_tsyn(netdev, skb, &ring->desc[pre_ntu]); ++ ++ ring->desc[pre_ntu].tx.bdtp_fe_sc_vld_ra_ri |= ++ cpu_to_le16(BIT(HNS3_TXD_FE_B)); ++ trace_hns3_tx_desc(ring, pre_ntu); ++ ++ skb_tx_timestamp(skb); ++ ++ /* Complete translate all packets */ ++ dev_queue = netdev_get_tx_queue(netdev, ring->queue_index); ++ doorbell = __netdev_tx_sent_queue(dev_queue, desc_cb->send_bytes, ++ netdev_xmit_more()); ++ hns3_tx_doorbell(ring, ret, doorbell); ++ ++ return NETDEV_TX_OK; ++ ++out_err_tx_ok: ++ dev_kfree_skb_any(skb); ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ return NETDEV_TX_OK; ++} ++ ++static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) ++{ ++ char format_mac_addr_perm[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ char format_mac_addr_sa[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct sockaddr *mac_addr = p; ++ int ret; ++ ++ if (!mac_addr || !is_valid_ether_addr((const u8 *)mac_addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (ether_addr_equal(netdev->dev_addr, mac_addr->sa_data)) { ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_info(netdev, "already using mac address %s\n", ++ format_mac_addr_sa); ++ return 0; ++ } ++ ++ /* For VF device, if there is a perm_addr, then the user will not ++ * be allowed to change the address. ++ */ ++ if (!hns3_is_phys_func(h->pdev) && ++ !is_zero_ether_addr(netdev->perm_addr)) { ++ hnae3_format_mac_addr(format_mac_addr_perm, netdev->perm_addr); ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_err(netdev, "has permanent MAC %s, user MAC %s not allow\n", ++ format_mac_addr_perm, format_mac_addr_sa); ++ return -EPERM; ++ } ++ ++ ret = h->ae_algo->ops->set_mac_addr(h, mac_addr->sa_data, false); ++ if (ret) { ++ netdev_err(netdev, "set_mac_address fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ eth_hw_addr_set(netdev, mac_addr->sa_data); ++ ++ return 0; ++} ++ ++static int hns3_nic_do_ioctl(struct net_device *netdev, ++ struct ifreq *ifr, int cmd) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!netif_running(netdev)) ++ return -EINVAL; ++ ++ if (!h->ae_algo->ops->do_ioctl) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->do_ioctl(h, ifr, cmd); ++} ++ ++static int hns3_nic_set_features(struct net_device *netdev, ++ netdev_features_t features) ++{ ++ netdev_features_t changed = netdev->features ^ features; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ bool enable; ++ int ret; ++ ++ if (changed & (NETIF_F_GRO_HW) && h->ae_algo->ops->set_gro_en) { ++ enable = !!(features & NETIF_F_GRO_HW); ++ ret = h->ae_algo->ops->set_gro_en(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_RX) && ++ h->ae_algo->ops->enable_hw_strip_rxvtag) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_RX); ++ ret = h->ae_algo->ops->enable_hw_strip_rxvtag(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_NTUPLE) && h->ae_algo->ops->enable_fd) { ++ enable = !!(features & NETIF_F_NTUPLE); ++ h->ae_algo->ops->enable_fd(h, enable); ++ } ++ ++ if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) && ++ h->ae_algo->ops->cls_flower_active(h)) { ++ netdev_err(netdev, ++ "there are offloaded TC filters active, cannot disable HW TC offload"); ++ return -EINVAL; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_FILTER) && ++ 
h->ae_algo->ops->enable_vlan_filter) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_FILTER); ++ ret = h->ae_algo->ops->enable_vlan_filter(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ netdev->features = features; ++ return 0; ++} ++ ++static netdev_features_t hns3_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++#define HNS3_MAX_HDR_LEN 480U ++#define HNS3_MAX_L4_HDR_LEN 60U ++ ++ size_t len; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ return features; ++ ++ if (skb->encapsulation) ++ len = skb_inner_transport_header(skb) - skb->data; ++ else ++ len = skb_transport_header(skb) - skb->data; ++ ++ /* Assume L4 is 60 byte as TCP is the only protocol with a ++ * a flexible value, and it's max len is 60 bytes. ++ */ ++ len += HNS3_MAX_L4_HDR_LEN; ++ ++ /* Hardware only supports checksum on the skb with a max header ++ * len of 480 bytes. ++ */ ++ if (len > HNS3_MAX_HDR_LEN) ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ ++ return features; ++} ++ ++static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, ++ struct hns3_enet_ring *ring, bool is_tx) ++{ ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&ring->syncp); ++ if (is_tx) { ++ stats->tx_bytes += ring->stats.tx_bytes; ++ stats->tx_packets += ring->stats.tx_pkts; ++ stats->tx_dropped += ring->stats.sw_err_cnt; ++ stats->tx_dropped += ring->stats.tx_vlan_err; ++ stats->tx_dropped += ring->stats.tx_l4_proto_err; ++ stats->tx_dropped += ring->stats.tx_l2l3l4_err; ++ stats->tx_dropped += ring->stats.tx_tso_err; ++ stats->tx_dropped += ring->stats.over_max_recursion; ++ stats->tx_dropped += ring->stats.hw_limitation; ++ stats->tx_dropped += ring->stats.copy_bits_err; ++ stats->tx_dropped += ring->stats.skb2sgl_err; ++ stats->tx_dropped += ring->stats.map_sg_err; ++ stats->tx_errors += ring->stats.sw_err_cnt; ++ stats->tx_errors += ring->stats.tx_vlan_err; ++ stats->tx_errors += ring->stats.tx_l4_proto_err; ++ stats->tx_errors += ring->stats.tx_l2l3l4_err; ++ stats->tx_errors += ring->stats.tx_tso_err; ++ stats->tx_errors += ring->stats.over_max_recursion; ++ stats->tx_errors += ring->stats.hw_limitation; ++ stats->tx_errors += ring->stats.copy_bits_err; ++ stats->tx_errors += ring->stats.skb2sgl_err; ++ stats->tx_errors += ring->stats.map_sg_err; ++ } else { ++ stats->rx_bytes += ring->stats.rx_bytes; ++ stats->rx_packets += ring->stats.rx_pkts; ++ stats->rx_dropped += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l3l4_csum_err; ++ stats->rx_crc_errors += ring->stats.l2_err; ++ stats->multicast += ring->stats.rx_multicast; ++ stats->rx_length_errors += ring->stats.err_pkt_len; ++ } ++ } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); ++} ++ ++static void hns3_nic_get_stats64(struct net_device *netdev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hnae3_handle *handle = priv->ae_handle; ++ struct rtnl_link_stats64 ring_total_stats; ++ struct hns3_enet_ring *ring; ++ unsigned int idx; ++ ++ if (test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return; ++ ++ handle->ae_algo->ops->update_stats(handle, &netdev->stats); ++ ++ memset(&ring_total_stats, 0, sizeof(ring_total_stats)); ++ for (idx = 0; idx < queue_num; idx++) { ++ /* fetch the tx stats */ ++ ring = &priv->ring[idx]; ++ hns3_fetch_stats(&ring_total_stats, ring, true); ++ ++ /* fetch the rx stats */ ++ ring = &priv->ring[idx + 
queue_num]; ++ hns3_fetch_stats(&ring_total_stats, ring, false); ++ } ++ ++ stats->tx_bytes = ring_total_stats.tx_bytes; ++ stats->tx_packets = ring_total_stats.tx_packets; ++ stats->rx_bytes = ring_total_stats.rx_bytes; ++ stats->rx_packets = ring_total_stats.rx_packets; ++ ++ stats->rx_errors = ring_total_stats.rx_errors; ++ stats->multicast = ring_total_stats.multicast; ++ stats->rx_length_errors = ring_total_stats.rx_length_errors; ++ stats->rx_crc_errors = ring_total_stats.rx_crc_errors; ++ stats->rx_missed_errors = netdev->stats.rx_missed_errors; ++ ++ stats->tx_errors = ring_total_stats.tx_errors; ++ stats->rx_dropped = ring_total_stats.rx_dropped; ++ stats->tx_dropped = ring_total_stats.tx_dropped; ++ stats->collisions = netdev->stats.collisions; ++ stats->rx_over_errors = netdev->stats.rx_over_errors; ++ stats->rx_frame_errors = netdev->stats.rx_frame_errors; ++ stats->rx_fifo_errors = netdev->stats.rx_fifo_errors; ++ stats->tx_aborted_errors = netdev->stats.tx_aborted_errors; ++ stats->tx_carrier_errors = netdev->stats.tx_carrier_errors; ++ stats->tx_fifo_errors = netdev->stats.tx_fifo_errors; ++ stats->tx_heartbeat_errors = netdev->stats.tx_heartbeat_errors; ++ stats->tx_window_errors = netdev->stats.tx_window_errors; ++ stats->rx_compressed = netdev->stats.rx_compressed; ++ stats->tx_compressed = netdev->stats.tx_compressed; ++} ++ ++static int hns3_setup_tc(struct net_device *netdev, void *type_data) ++{ ++ struct tc_mqprio_qopt_offload *mqprio_qopt = type_data; ++ struct hnae3_knic_private_info *kinfo; ++ u8 tc = mqprio_qopt->qopt.num_tc; ++ u16 mode = mqprio_qopt->mode; ++ u8 hw = mqprio_qopt->qopt.hw; ++ struct hnae3_handle *h; ++ ++ if (!((hw == TC_MQPRIO_HW_OFFLOAD_TCS && ++ mode == TC_MQPRIO_MODE_CHANNEL) || (!hw && tc == 0))) ++ return -EOPNOTSUPP; ++ ++ if (tc > HNAE3_MAX_TC) ++ return -EINVAL; ++ ++ if (!netdev) ++ return -EINVAL; ++ ++ h = hns3_get_handle(netdev); ++ kinfo = &h->kinfo; ++ ++ netif_dbg(h, drv, netdev, "setup tc: num_tc=%u\n", tc); ++ ++ return (kinfo->dcb_ops && kinfo->dcb_ops->setup_tc) ? 
++ kinfo->dcb_ops->setup_tc(h, mqprio_qopt) : -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_cls_flower(struct hns3_nic_priv *priv, ++ struct flow_cls_offload *flow) ++{ ++ int tc = tc_classid_to_hwtc(priv->netdev, flow->classid); ++ struct hnae3_handle *h = hns3_get_handle(priv->netdev); ++ ++ switch (flow->command) { ++ case FLOW_CLS_REPLACE: ++ if (h->ae_algo->ops->add_cls_flower) ++ return h->ae_algo->ops->add_cls_flower(h, flow, tc); ++ break; ++ case FLOW_CLS_DESTROY: ++ if (h->ae_algo->ops->del_cls_flower) ++ return h->ae_algo->ops->del_cls_flower(h, flow); ++ break; ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_block_cb(enum tc_setup_type type, void *type_data, ++ void *cb_priv) ++{ ++ struct hns3_nic_priv *priv = cb_priv; ++ ++ if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) ++ return -EOPNOTSUPP; ++ ++ switch (type) { ++ case TC_SETUP_CLSFLOWER: ++ return hns3_setup_tc_cls_flower(priv, type_data); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static LIST_HEAD(hns3_block_cb_list); ++ ++static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, ++ void *type_data) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(dev); ++ int ret; ++ ++ switch (type) { ++ case TC_SETUP_QDISC_MQPRIO: ++ ret = hns3_setup_tc(dev, type_data); ++ break; ++ case TC_SETUP_BLOCK: ++ ret = flow_block_cb_setup_simple(type_data, ++ &hns3_block_cb_list, ++ hns3_setup_tc_block_cb, ++ priv, priv, true); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_add_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, false); ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_kill_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, true); ++ ++ return ret; ++} ++ ++static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, ++ u8 qos, __be16 vlan_proto) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ netif_dbg(h, drv, netdev, ++ "set vf vlan: vf=%d, vlan=%u, qos=%u, vlan_proto=0x%x\n", ++ vf, vlan, qos, ntohs(vlan_proto)); ++ ++ if (h->ae_algo->ops->set_vf_vlan_filter) ++ ret = h->ae_algo->ops->set_vf_vlan_filter(h, vf, vlan, ++ qos, vlan_proto); ++ ++ return ret; ++} ++ ++static int hns3_set_vf_spoofchk(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!handle->ae_algo->ops->set_vf_spoofchk) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_spoofchk(handle, vf, enable); ++} ++ ++static int hns3_set_vf_trust(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (!handle->ae_algo->ops->set_vf_trust) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_trust(handle, vf, enable); ++} ++ ++static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!h->ae_algo->ops->set_mtu) ++ return -EOPNOTSUPP; ++ ++ netif_dbg(h, drv, netdev, ++ "change 
mtu from %u to %d\n", netdev->mtu, new_mtu); ++ ++ ret = h->ae_algo->ops->set_mtu(h, new_mtu); ++ if (ret) ++ netdev_err(netdev, "failed to change MTU in hardware %d\n", ++ ret); ++ else ++ netdev->mtu = new_mtu; ++ ++ return ret; ++} ++ ++static int hns3_get_timeout_queue(struct net_device *ndev) ++{ ++ int i; ++ ++ /* Find the stopped queue the same way the stack does */ ++ for (i = 0; i < ndev->num_tx_queues; i++) { ++ struct netdev_queue *q; ++ unsigned long trans_start; ++ ++ q = netdev_get_tx_queue(ndev, i); ++ trans_start = READ_ONCE(q->trans_start); ++ if (netif_xmit_stopped(q) && ++ time_after(jiffies, ++ (trans_start + ndev->watchdog_timeo))) { ++#ifdef CONFIG_BQL ++ struct dql *dql = &q->dql; ++ ++ netdev_info(ndev, "DQL info last_cnt: %u, queued: %u, adj_limit: %u, completed: %u\n", ++ dql->last_obj_cnt, dql->num_queued, ++ dql->adj_limit, dql->num_completed); ++#endif ++ netdev_info(ndev, "queue state: 0x%lx, delta msecs: %u\n", ++ q->state, ++ jiffies_to_msecs(jiffies - trans_start)); ++ break; ++ } ++ } ++ ++ return i; ++} ++ ++static void hns3_dump_queue_stats(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring, ++ int timeout_queue) ++{ ++ struct napi_struct *napi = &tx_ring->tqp_vector->napi; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ netdev_info(ndev, ++ "tx_timeout count: %llu, queue id: %d, SW_NTU: 0x%x, SW_NTC: 0x%x, napi state: %lu\n", ++ priv->tx_timeout_count, timeout_queue, tx_ring->next_to_use, ++ tx_ring->next_to_clean, napi->state); ++ ++ netdev_info(ndev, ++ "tx_pkts: %llu, tx_bytes: %llu, sw_err_cnt: %llu, tx_pending: %d\n", ++ tx_ring->stats.tx_pkts, tx_ring->stats.tx_bytes, ++ tx_ring->stats.sw_err_cnt, tx_ring->pending_buf); ++ ++ netdev_info(ndev, ++ "seg_pkt_cnt: %llu, tx_more: %llu, restart_queue: %llu, tx_busy: %llu\n", ++ tx_ring->stats.seg_pkt_cnt, tx_ring->stats.tx_more, ++ tx_ring->stats.restart_queue, tx_ring->stats.tx_busy); ++ ++ netdev_info(ndev, "tx_push: %llu, tx_mem_doorbell: %llu\n", ++ tx_ring->stats.tx_push, tx_ring->stats.tx_mem_doorbell); ++} ++ ++static void hns3_dump_queue_reg(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring) ++{ ++ netdev_info(ndev, ++ "BD_NUM: 0x%x HW_HEAD: 0x%x, HW_TAIL: 0x%x, BD_ERR: 0x%x, INT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_NUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_HEAD_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TAIL_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_ERR_REG), ++ readl(tx_ring->tqp_vector->mask_addr)); ++ netdev_info(ndev, ++ "RING_EN: 0x%x, TC: 0x%x, FBD_NUM: 0x%x FBD_OFT: 0x%x, EBD_NUM: 0x%x, EBD_OFT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_EN_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TC_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_FBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_OFFSET_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_EBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, ++ HNS3_RING_TX_RING_EBD_OFFSET_REG)); ++} ++ ++static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ struct hns3_enet_ring *tx_ring; ++ int timeout_queue; ++ ++ timeout_queue = hns3_get_timeout_queue(ndev); ++ if (timeout_queue >= ndev->num_tx_queues) { ++ netdev_info(ndev, ++ "no netdev TX timeout queue found, timeout count: %llu\n", ++ priv->tx_timeout_count); ++ return false; ++ } ++ ++ priv->tx_timeout_count++; ++ ++ tx_ring = &priv->ring[timeout_queue]; ++ 
hns3_dump_queue_stats(ndev, tx_ring, timeout_queue); ++ ++ /* When mac received many pause frames continuous, it's unable to send ++ * packets, which may cause tx timeout ++ */ ++ if (h->ae_algo->ops->get_mac_stats) { ++ struct hns3_mac_stats mac_stats; ++ ++ h->ae_algo->ops->get_mac_stats(h, &mac_stats); ++ netdev_info(ndev, "tx_pause_cnt: %llu, rx_pause_cnt: %llu\n", ++ mac_stats.tx_pause_cnt, mac_stats.rx_pause_cnt); ++ } ++ ++ hns3_dump_queue_reg(ndev, tx_ring); ++ ++ return true; ++} ++ ++static void hns3_nic_net_timeout(struct net_device *ndev, unsigned int txqueue) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ if (!hns3_get_tx_timeo_queue_info(ndev)) ++ return; ++ ++ /* request the reset, and let the hclge to determine ++ * which reset level should be done ++ */ ++ if (h->ae_algo->ops->reset_event) ++ h->ae_algo->ops->reset_event(h->pdev, h); ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++static int hns3_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, ++ u16 rxq_index, u32 flow_id) ++{ ++ struct hnae3_handle *h = hns3_get_handle(dev); ++ struct flow_keys fkeys; ++ ++ if (!h->ae_algo->ops->add_arfs_entry) ++ return -EOPNOTSUPP; ++ ++ if (skb->encapsulation) ++ return -EPROTONOSUPPORT; ++ ++ if (!skb_flow_dissect_flow_keys(skb, &fkeys, 0)) ++ return -EPROTONOSUPPORT; ++ ++ if ((fkeys.basic.n_proto != htons(ETH_P_IP) && ++ fkeys.basic.n_proto != htons(ETH_P_IPV6)) || ++ (fkeys.basic.ip_proto != IPPROTO_TCP && ++ fkeys.basic.ip_proto != IPPROTO_UDP)) ++ return -EPROTONOSUPPORT; ++ ++ return h->ae_algo->ops->add_arfs_entry(h, rxq_index, flow_id, &fkeys); ++} ++#endif ++ ++static int hns3_nic_get_vf_config(struct net_device *ndev, int vf, ++ struct ifla_vf_info *ivf) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->get_vf_config) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->get_vf_config(h, vf, ivf); ++} ++ ++static int hns3_nic_set_vf_link_state(struct net_device *ndev, int vf, ++ int link_state) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_link_state) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_link_state(h, vf, link_state); ++} ++ ++static int hns3_nic_set_vf_rate(struct net_device *ndev, int vf, ++ int min_tx_rate, int max_tx_rate) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_rate) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_rate(h, vf, min_tx_rate, max_tx_rate, ++ false); ++} ++ ++static int hns3_nic_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ if (!h->ae_algo->ops->set_vf_mac) ++ return -EOPNOTSUPP; ++ ++ if (is_multicast_ether_addr(mac)) { ++ hnae3_format_mac_addr(format_mac_addr, mac); ++ netdev_err(netdev, ++ "Invalid MAC:%s specified. 
Could not set MAC\n", ++ format_mac_addr); ++ return -EINVAL; ++ } ++ ++ return h->ae_algo->ops->set_vf_mac(h, vf_id, mac); ++} ++ ++static const struct net_device_ops hns3_nic_netdev_ops = { ++ .ndo_open = hns3_nic_net_open, ++ .ndo_stop = hns3_nic_net_stop, ++ .ndo_start_xmit = hns3_nic_net_xmit, ++ .ndo_tx_timeout = hns3_nic_net_timeout, ++ .ndo_set_mac_address = hns3_nic_net_set_mac_address, ++ .ndo_eth_ioctl = hns3_nic_do_ioctl, ++ .ndo_change_mtu = hns3_nic_change_mtu, ++ .ndo_set_features = hns3_nic_set_features, ++ .ndo_features_check = hns3_features_check, ++ .ndo_get_stats64 = hns3_nic_get_stats64, ++ .ndo_setup_tc = hns3_nic_setup_tc, ++ .ndo_set_rx_mode = hns3_nic_set_rx_mode, ++ .ndo_vlan_rx_add_vid = hns3_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = hns3_vlan_rx_kill_vid, ++ .ndo_set_vf_vlan = hns3_ndo_set_vf_vlan, ++ .ndo_set_vf_spoofchk = hns3_set_vf_spoofchk, ++ .ndo_set_vf_trust = hns3_set_vf_trust, ++#ifdef CONFIG_RFS_ACCEL ++ .ndo_rx_flow_steer = hns3_rx_flow_steer, ++#endif ++ .ndo_get_vf_config = hns3_nic_get_vf_config, ++ .ndo_set_vf_link_state = hns3_nic_set_vf_link_state, ++ .ndo_set_vf_rate = hns3_nic_set_vf_rate, ++ .ndo_set_vf_mac = hns3_nic_set_vf_mac, ++}; ++ ++bool hns3_is_phys_func(struct pci_dev *pdev) ++{ ++ u32 dev_id = pdev->device; ++ ++ switch (dev_id) { ++ case HNAE3_DEV_ID_GE: ++ case HNAE3_DEV_ID_25GE: ++ case HNAE3_DEV_ID_25GE_RDMA: ++ case HNAE3_DEV_ID_25GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_50GE_RDMA: ++ case HNAE3_DEV_ID_50GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_100G_RDMA_MACSEC: ++ case HNAE3_DEV_ID_200G_RDMA: ++ return true; ++ case HNAE3_DEV_ID_VF: ++ case HNAE3_DEV_ID_RDMA_DCB_PFC_VF: ++ return false; ++ default: ++ dev_warn(&pdev->dev, "un-recognized pci device-id %u", ++ dev_id); ++ } ++ ++ return false; ++} ++ ++static void hns3_disable_sriov(struct pci_dev *pdev) ++{ ++ /* If our VFs are assigned we cannot shut down SR-IOV ++ * without causing issues, so just leave the hardware ++ * available but disabled ++ */ ++ if (pci_vfs_assigned(pdev)) { ++ dev_warn(&pdev->dev, ++ "disabling driver while VFs are assigned\n"); ++ return; ++ } ++ ++ pci_disable_sriov(pdev); ++} ++ ++/* hns3_probe - Device initialization routine ++ * @pdev: PCI device information struct ++ * @ent: entry in hns3_pci_tbl ++ * ++ * hns3_probe initializes a PF identified by a pci_dev structure. ++ * The OS initialization, configuring of the PF private structure, ++ * and a hardware reset occur. 
++ * ++ * Returns 0 on success, negative on failure ++ */ ++static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ++{ ++ struct hnae3_ae_dev *ae_dev; ++ int ret; ++ ++ ae_dev = devm_kzalloc(&pdev->dev, sizeof(*ae_dev), GFP_KERNEL); ++ if (!ae_dev) ++ return -ENOMEM; ++ ++ ae_dev->pdev = pdev; ++ ae_dev->flag = ent->driver_data; ++ pci_set_drvdata(pdev, ae_dev); ++ ++ ret = hnae3_register_ae_dev(ae_dev); ++ if (ret) ++ pci_set_drvdata(pdev, NULL); ++ ++ return ret; ++} ++ ++/** ++ * hns3_clean_vf_config ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs allocated ++ * ++ * Clean residual vf config after disable sriov ++ **/ ++static void hns3_clean_vf_config(struct pci_dev *pdev, int num_vfs) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (ae_dev->ops->clean_vf_config) ++ ae_dev->ops->clean_vf_config(ae_dev, num_vfs); ++} ++ ++/* hns3_remove - Device removal routine ++ * @pdev: PCI device information struct ++ */ ++static void hns3_remove(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV)) ++ hns3_disable_sriov(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++} ++ ++/** ++ * hns3_pci_sriov_configure ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs to allocate ++ * ++ * Enable or change the number of VFs. Called when the user updates the number ++ * of VFs in sysfs. ++ **/ ++static int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) ++{ ++ int ret; ++ ++ if (!(hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))) { ++ dev_warn(&pdev->dev, "Can not config SRIOV\n"); ++ return -EINVAL; ++ } ++ ++ if (num_vfs) { ++ ret = pci_enable_sriov(pdev, num_vfs); ++ if (ret) ++ dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret); ++ else ++ return num_vfs; ++ } else if (!pci_vfs_assigned(pdev)) { ++ int num_vfs_pre = pci_num_vf(pdev); ++ ++ pci_disable_sriov(pdev); ++ hns3_clean_vf_config(pdev, num_vfs_pre); ++ } else { ++ dev_warn(&pdev->dev, ++ "Unable to free VFs because some are assigned to VMs.\n"); ++ } ++ ++ return 0; ++} ++ ++static void hns3_shutdown(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++ ++ if (system_state == SYSTEM_POWER_OFF) ++ pci_set_power_state(pdev, PCI_D3hot); ++} ++ ++static int __maybe_unused hns3_suspend(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to suspend.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FUNC_RESET); ++ } ++ ++ return 0; ++} ++ ++static int __maybe_unused hns3_resume(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to resume.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++ } ++ ++ return 0; ++} ++ ++static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev, ++ pci_channel_state_t state) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ pci_ers_result_t ret; ++ ++ dev_info(&pdev->dev, "PCI error detected, state(=%u)!!\n", state); ++ ++ if (state == pci_channel_io_perm_failure) ++ return PCI_ERS_RESULT_DISCONNECT; ++ ++ if (!ae_dev || !ae_dev->ops) { ++ dev_err(&pdev->dev, ++ "Can't recover - 
error happened before device initialized\n"); ++ return PCI_ERS_RESULT_NONE; ++ } ++ ++ if (ae_dev->ops->handle_hw_ras_error) ++ ret = ae_dev->ops->handle_hw_ras_error(ae_dev); ++ else ++ return PCI_ERS_RESULT_NONE; ++ ++ return ret; ++} ++ ++static pci_ers_result_t hns3_slot_reset(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ const struct hnae3_ae_ops *ops; ++ enum hnae3_reset_type reset_type; ++ struct device *dev = &pdev->dev; ++ ++ if (!ae_dev || !ae_dev->ops) ++ return PCI_ERS_RESULT_NONE; ++ ++ ops = ae_dev->ops; ++ /* request the reset */ ++ if (ops->reset_event && ops->get_reset_level && ++ ops->set_default_reset_request) { ++ if (ae_dev->hw_err_reset_req) { ++ reset_type = ops->get_reset_level(ae_dev, ++ &ae_dev->hw_err_reset_req); ++ ops->set_default_reset_request(ae_dev, reset_type); ++ dev_info(dev, "requesting reset due to PCI error\n"); ++ ops->reset_event(pdev, NULL); ++ } ++ ++ return PCI_ERS_RESULT_RECOVERED; ++ } ++ ++ return PCI_ERS_RESULT_DISCONNECT; ++} ++ ++static void hns3_reset_prepare(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR prepare\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FLR_RESET); ++} ++ ++static void hns3_reset_done(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR done\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++} ++ ++static const struct pci_error_handlers hns3_err_handler = { ++ .error_detected = hns3_error_detected, ++ .slot_reset = hns3_slot_reset, ++ .reset_prepare = hns3_reset_prepare, ++ .reset_done = hns3_reset_done, ++}; ++ ++static SIMPLE_DEV_PM_OPS(hns3_pm_ops, hns3_suspend, hns3_resume); ++ ++static struct pci_driver hns3_driver = { ++ .name = hns3_driver_name, ++ .id_table = hns3_pci_tbl, ++ .probe = hns3_probe, ++ .remove = hns3_remove, ++ .shutdown = hns3_shutdown, ++ .driver.pm = &hns3_pm_ops, ++ .sriov_configure = hns3_pci_sriov_configure, ++ .err_handler = &hns3_err_handler, ++}; ++ ++/* set default feature to hns3 */ ++static void hns3_set_default_feature(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct pci_dev *pdev = h->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ ++ netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM; ++ ++ netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | ++ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | ++ NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | ++ NETIF_F_GRO | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_GRE | ++ NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_UDP_TUNNEL | ++ NETIF_F_SCTP_CRC | NETIF_F_FRAGLIST; ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V2) { ++ netdev->features |= NETIF_F_GRO_HW; ++ ++ if (!(h->flags & HNAE3_SUPPORT_VF)) ++ netdev->features |= NETIF_F_NTUPLE; ++ } ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_GSO_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_L4; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_CSUM; ++ else ++ netdev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_FD_FORWARD_TC_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_TC; ++ ++ netdev->hw_features |= 
netdev->features; ++ if (!test_bit(HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, ae_dev->caps)) ++ netdev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; ++ ++ netdev->vlan_features |= netdev->features & ++ ~(NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_GRO_HW | NETIF_F_NTUPLE | ++ NETIF_F_HW_TC); ++ ++ netdev->hw_enc_features |= netdev->vlan_features | NETIF_F_TSO_MANGLEID; ++} ++ ++static int hns3_alloc_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ unsigned int order = hns3_page_order(ring); ++ struct page *p; ++ ++ if (ring->page_pool) { ++ p = page_pool_dev_alloc_frag(ring->page_pool, ++ &cb->page_offset, ++ hns3_buf_size(ring)); ++ if (unlikely(!p)) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->buf = page_address(p); ++ cb->dma = page_pool_get_dma_addr(p); ++ cb->type = DESC_TYPE_PP_FRAG; ++ cb->reuse_flag = 0; ++ return 0; ++ } ++ ++ p = dev_alloc_pages(order); ++ if (!p) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->page_offset = 0; ++ cb->reuse_flag = 0; ++ cb->buf = page_address(p); ++ cb->length = hns3_page_size(ring); ++ cb->type = DESC_TYPE_PAGE; ++ page_ref_add(p, USHRT_MAX - 1); ++ cb->pagecnt_bias = USHRT_MAX; ++ ++ return 0; ++} ++ ++static void hns3_free_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb, int budget) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) ++ napi_consume_skb(cb->priv, budget); ++ else if (!HNAE3_IS_TX_RING(ring)) { ++ if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias) ++ __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); ++ else if (cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, cb->priv, ++ false); ++ } ++ memset(cb, 0, sizeof(*cb)); ++} ++ ++static int hns3_map_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb) ++{ ++ cb->dma = dma_map_page(ring_to_dev(ring), cb->priv, 0, ++ cb->length, ring_to_dma_dir(ring)); ++ ++ if (unlikely(dma_mapping_error(ring_to_dev(ring), cb->dma))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static void hns3_unmap_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if ((cb->type & DESC_TYPE_PAGE) && cb->length) ++ dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) ++ hns3_tx_spare_reclaim_cb(ring, cb); ++} ++ ++static void hns3_buffer_detach(struct hns3_enet_ring *ring, int i) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc[i].addr = 0; ++ ring->desc_cb[i].refill = 0; ++} ++ ++static void hns3_free_buffer_detach(struct hns3_enet_ring *ring, int i, ++ int budget) ++{ ++ struct hns3_desc_cb *cb = &ring->desc_cb[i]; ++ ++ if (!ring->desc_cb[i].dma) ++ return; ++ ++ hns3_buffer_detach(ring, i); ++ hns3_free_buffer(ring, cb, budget); ++} ++ ++static void hns3_free_buffers(struct hns3_enet_ring *ring) ++{ ++ int i; ++ ++ for (i = 0; i < ring->desc_num; i++) ++ hns3_free_buffer_detach(ring, i, 0); ++} ++ ++/* free desc along with its attached buffer */ ++static void hns3_free_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ hns3_free_buffers(ring); ++ ++ if (ring->desc) { ++ dma_free_coherent(ring_to_dev(ring), size, ++ ring->desc, ring->desc_dma_addr); ++ ring->desc = NULL; ++ } ++} ++ ++static int 
hns3_alloc_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ ring->desc = dma_alloc_coherent(ring_to_dev(ring), size, ++ &ring->desc_dma_addr, GFP_KERNEL); ++ if (!ring->desc) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ int ret; ++ ++ ret = hns3_alloc_buffer(ring, cb); ++ if (ret || ring->page_pool) ++ goto out; ++ ++ ret = hns3_map_buffer(ring, cb); ++ if (ret) ++ goto out_with_buf; ++ ++ return 0; ++ ++out_with_buf: ++ hns3_free_buffer(ring, cb, 0); ++out: ++ return ret; ++} ++ ++static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ int ret = hns3_alloc_and_map_buffer(ring, &ring->desc_cb[i]); ++ ++ if (ret) ++ return ret; ++ ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc_cb[i].refill = 1; ++ ++ return 0; ++} ++ ++/* Allocate memory for raw pkg, and map with dma */ ++static int hns3_alloc_ring_buffers(struct hns3_enet_ring *ring) ++{ ++ int i, j, ret; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ ret = hns3_alloc_and_attach_buffer(ring, i); ++ if (ret) ++ goto out_buffer_fail; ++ } ++ ++ return 0; ++ ++out_buffer_fail: ++ for (j = i - 1; j >= 0; j--) ++ hns3_free_buffer_detach(ring, j, 0); ++ return ret; ++} ++ ++/* detach a in-used buffer and replace with a reserved one */ ++static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, ++ struct hns3_desc_cb *res_cb) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc_cb[i] = *res_cb; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++} ++ ++static void hns3_reuse_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ ring->desc_cb[i].reuse_flag = 0; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++ ++ dma_sync_single_for_device(ring_to_dev(ring), ++ ring->desc_cb[i].dma + ring->desc_cb[i].page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++} ++ ++static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring, ++ int *bytes, int *pkts, int budget) ++{ ++ /* pair with ring->last_to_use update in hns3_tx_doorbell(), ++ * smp_store_release() is not used in hns3_tx_doorbell() because ++ * the doorbell operation already have the needed barrier operation. ++ */ ++ int ltu = smp_load_acquire(&ring->last_to_use); ++ int ntc = ring->next_to_clean; ++ struct hns3_desc_cb *desc_cb; ++ bool reclaimed = false; ++ struct hns3_desc *desc; ++ ++ while (ltu != ntc) { ++ desc = &ring->desc[ntc]; ++ ++ if (le16_to_cpu(desc->tx.bdtp_fe_sc_vld_ra_ri) & ++ BIT(HNS3_TXD_VLD_B)) ++ break; ++ ++ desc_cb = &ring->desc_cb[ntc]; ++ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_ALL | ++ DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) { ++ (*pkts)++; ++ (*bytes) += desc_cb->send_bytes; ++ } ++ ++ /* desc_cb will be cleaned, after hnae3_free_buffer_detach */ ++ hns3_free_buffer_detach(ring, ntc, budget); ++ ++ if (++ntc == ring->desc_num) ++ ntc = 0; ++ ++ /* Issue prefetch for next Tx descriptor */ ++ prefetch(&ring->desc_cb[ntc]); ++ reclaimed = true; ++ } ++ ++ if (unlikely(!reclaimed)) ++ return false; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * ring_space called by hns3_nic_net_xmit. 
++ */ ++ smp_store_release(&ring->next_to_clean, ntc); ++ ++ hns3_tx_spare_update(ring); ++ ++ return true; ++} ++ ++void hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct netdev_queue *dev_queue; ++ int bytes, pkts; ++ ++ bytes = 0; ++ pkts = 0; ++ ++ if (unlikely(!hns3_nic_reclaim_desc(ring, &bytes, &pkts, budget))) ++ return; ++ ++ ring->tqp_vector->tx_group.total_bytes += bytes; ++ ring->tqp_vector->tx_group.total_packets += pkts; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_bytes += bytes; ++ ring->stats.tx_pkts += pkts; ++ u64_stats_update_end(&ring->syncp); ++ ++ dev_queue = netdev_get_tx_queue(netdev, ring->tqp->tqp_index); ++ netdev_tx_completed_queue(dev_queue, pkts, bytes); ++ ++ if (unlikely(netif_carrier_ok(netdev) && ++ ring_space(ring) > HNS3_MAX_TSO_BD_NUM)) { ++ /* Make sure that anybody stopping the queue after this ++ * sees the new next_to_clean. ++ */ ++ smp_mb(); ++ if (netif_tx_queue_stopped(dev_queue) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_tx_wake_queue(dev_queue); ++ ring->stats.restart_queue++; ++ } ++ } ++} ++ ++static int hns3_desc_unused(struct hns3_enet_ring *ring) ++{ ++ int ntc = ring->next_to_clean; ++ int ntu = ring->next_to_use; ++ ++ if (unlikely(ntc == ntu && !ring->desc_cb[ntc].refill)) ++ return ring->desc_num; ++ ++ return ((ntc >= ntu) ? 0 : ring->desc_num) + ntc - ntu; ++} ++ ++/* Return true if there is any allocation failure */ ++static bool hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, ++ int cleand_count) ++{ ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc_cb res_cbs; ++ int i, ret; ++ ++ for (i = 0; i < cleand_count; i++) { ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ if (desc_cb->reuse_flag) { ++ hns3_ring_stats_update(ring, reuse_pg_cnt); ++ ++ hns3_reuse_buffer(ring, ring->next_to_use); ++ } else { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx buffer failed: %d\n", ++ ret); ++ ++ writel(i, ring->tqp->io_base + ++ HNS3_RING_RX_RING_HEAD_REG); ++ return true; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ ++ hns3_ring_stats_update(ring, non_reuse_pg); ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ writel(i, ring->tqp->io_base + HNS3_RING_RX_RING_HEAD_REG); ++ return false; ++} ++ ++static bool hns3_can_reuse_page(struct hns3_desc_cb *cb) ++{ ++ return page_count(cb->priv) == cb->pagecnt_bias; ++} ++ ++static int hns3_handle_rx_copybreak(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, ++ int pull_len, ++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 frag_size = size - pull_len; ++ void *frag = napi_alloc_frag(frag_size); ++ ++ if (unlikely(!frag)) { ++ hns3_ring_stats_update(ring, frag_alloc_err); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "failed to allocate rx frag\n"); ++ return -ENOMEM; ++ } ++ ++ desc_cb->reuse_flag = 1; ++ memcpy(frag, desc_cb->buf + frag_offset, frag_size); ++ skb_add_rx_frag(skb, i, virt_to_page(frag), ++ offset_in_page(frag), frag_size, frag_size); ++ ++ hns3_ring_stats_update(ring, frag_alloc); ++ return 0; ++} ++ ++static void hns3_nic_reuse_page(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, int pull_len, 
++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 truesize = hns3_buf_size(ring); ++ u32 frag_size = size - pull_len; ++ int ret = 0; ++ bool reused; ++ ++ if (ring->page_pool) { ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ return; ++ } ++ ++ /* Avoid re-using remote or pfmem page */ ++ if (unlikely(!dev_page_is_reusable(desc_cb->priv))) ++ goto out; ++ ++ reused = hns3_can_reuse_page(desc_cb); ++ ++ /* Rx page can be reused when: ++ * 1. Rx page is only owned by the driver when page_offset ++ * is zero, which means 0 @ truesize will be used by ++ * stack after skb_add_rx_frag() is called, and the rest ++ * of rx page can be reused by driver. ++ * Or ++ * 2. Rx page is only owned by the driver when page_offset ++ * is non-zero, which means page_offset @ truesize will ++ * be used by stack after skb_add_rx_frag() is called, ++ * and 0 @ truesize can be reused by driver. ++ */ ++ if ((!desc_cb->page_offset && reused) || ++ ((desc_cb->page_offset + truesize + truesize) <= ++ hns3_page_size(ring) && desc_cb->page_offset)) { ++ desc_cb->page_offset += truesize; ++ desc_cb->reuse_flag = 1; ++ } else if (desc_cb->page_offset && reused) { ++ desc_cb->page_offset = 0; ++ desc_cb->reuse_flag = 1; ++ } else if (frag_size <= ring->rx_copybreak) { ++ ret = hns3_handle_rx_copybreak(skb, i, ring, pull_len, desc_cb); ++ if (!ret) ++ return; ++ } ++ ++out: ++ desc_cb->pagecnt_bias--; ++ ++ if (unlikely(!desc_cb->pagecnt_bias)) { ++ page_ref_add(desc_cb->priv, USHRT_MAX); ++ desc_cb->pagecnt_bias = USHRT_MAX; ++ } ++ ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ ++ if (unlikely(!desc_cb->reuse_flag)) ++ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); ++} ++ ++static int hns3_gro_complete(struct sk_buff *skb, u32 l234info) ++{ ++ __be16 type = skb->protocol; ++ struct tcphdr *th; ++ int depth = 0; ++ ++ while (eth_type_vlan(type)) { ++ struct vlan_hdr *vh; ++ ++ if ((depth + VLAN_HLEN) > skb_headlen(skb)) ++ return -EFAULT; ++ ++ vh = (struct vlan_hdr *)(skb->data + depth); ++ type = vh->h_vlan_encapsulated_proto; ++ depth += VLAN_HLEN; ++ } ++ ++ skb_set_network_header(skb, depth); ++ ++ if (type == htons(ETH_P_IP)) { ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ depth += sizeof(struct iphdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v4_check(skb->len - depth, iph->saddr, ++ iph->daddr, 0); ++ } else if (type == htons(ETH_P_IPV6)) { ++ const struct ipv6hdr *iph = ipv6_hdr(skb); ++ ++ depth += sizeof(struct ipv6hdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v6_check(skb->len - depth, &iph->saddr, ++ &iph->daddr, 0); ++ } else { ++ hns3_rl_err(skb->dev, ++ "Error: FW GRO supports only IPv4/IPv6, not 0x%04x, depth: %d\n", ++ be16_to_cpu(type), depth); ++ return -EFAULT; ++ } ++ ++ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; ++ if (th->cwr) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; ++ ++ if (l234info & BIT(HNS3_RXD_GRO_FIXID_B)) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; ++ ++ skb->csum_start = (unsigned char *)th - skb->head; ++ skb->csum_offset = offsetof(struct tcphdr, check); ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ trace_hns3_gro(skb); ++ ++ return 0; ++} ++ ++static bool hns3_checksum_complete(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 
ptype, u16 csum) ++{ ++ if (ptype == HNS3_INVALID_PTYPE || ++ hns3_rx_ptype_tbl[ptype].ip_summed != CHECKSUM_COMPLETE) ++ return false; ++ ++ hns3_ring_stats_update(ring, csum_complete); ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum = csum_unfold((__force __sum16)csum); ++ ++ return true; ++} ++ ++static void hns3_rx_handle_csum(struct sk_buff *skb, u32 l234info, ++ u32 ol_info, u32 ptype) ++{ ++ int l3_type, l4_type; ++ int ol4_type; ++ ++ if (ptype != HNS3_INVALID_PTYPE) { ++ skb->csum_level = hns3_rx_ptype_tbl[ptype].csum_level; ++ skb->ip_summed = hns3_rx_ptype_tbl[ptype].ip_summed; ++ ++ return; ++ } ++ ++ ol4_type = hnae3_get_field(ol_info, HNS3_RXD_OL4ID_M, ++ HNS3_RXD_OL4ID_S); ++ switch (ol4_type) { ++ case HNS3_OL4_TYPE_MAC_IN_UDP: ++ case HNS3_OL4_TYPE_NVGRE: ++ skb->csum_level = 1; ++ fallthrough; ++ case HNS3_OL4_TYPE_NO_TUN: ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ /* Can checksum ipv4 or ipv6 + UDP/TCP/SCTP packets */ ++ if ((l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) && ++ (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP)) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ break; ++ default: ++ break; ++ } ++} ++ ++static void hns3_rx_checksum(struct hns3_enet_ring *ring, struct sk_buff *skb, ++ u32 l234info, u32 bd_base_info, u32 ol_info, ++ u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 ptype = HNS3_INVALID_PTYPE; ++ ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ skb_checksum_none_assert(skb); ++ ++ if (!(netdev->features & NETIF_F_RXCSUM)) ++ return; ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) ++ ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ if (hns3_checksum_complete(ring, skb, ptype, csum)) ++ return; ++ ++ /* check if hardware has done checksum */ ++ if (!(bd_base_info & BIT(HNS3_RXD_L3L4P_B))) ++ return; ++ ++ if (unlikely(l234info & (BIT(HNS3_RXD_L3E_B) | BIT(HNS3_RXD_L4E_B) | ++ BIT(HNS3_RXD_OL3E_B) | ++ BIT(HNS3_RXD_OL4E_B)))) { ++ hns3_ring_stats_update(ring, l3l4_csum_err); ++ ++ return; ++ } ++ ++ hns3_rx_handle_csum(skb, l234info, ol_info, ptype); ++} ++ ++static void hns3_rx_skb(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ if (skb_has_frag_list(skb)) ++ napi_gro_flush(&ring->tqp_vector->napi, false); ++ ++ napi_gro_receive(&ring->tqp_vector->napi, skb); ++} ++ ++static bool hns3_parse_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, u32 l234info, ++ u16 *vlan_tag) ++{ ++ struct hnae3_handle *handle = ring->tqp->handle; ++ struct pci_dev *pdev = ring->tqp->handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (unlikely(ae_dev->dev_version < HNAE3_DEVICE_VERSION_V2)) { ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ if (!(*vlan_tag & VLAN_VID_MASK)) ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return (*vlan_tag != 0); ++ } ++ ++#define HNS3_STRP_OUTER_VLAN 0x1 ++#define HNS3_STRP_INNER_VLAN 0x2 ++#define HNS3_STRP_BOTH 0x3 ++ ++ /* Hardware always insert VLAN tag into RX descriptor when ++ * remove the tag from packet, driver needs to determine ++ * reporting which tag to stack. 
++ */ ++ switch (hnae3_get_field(l234info, HNS3_RXD_STRP_TAGP_M, ++ HNS3_RXD_STRP_TAGP_S)) { ++ case HNS3_STRP_OUTER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ return true; ++ case HNS3_STRP_INNER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ return true; ++ case HNS3_STRP_BOTH: ++ if (handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ else ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static void hns3_rx_ring_move_fw(struct hns3_enet_ring *ring) ++{ ++ ring->desc[ring->next_to_clean].rx.bd_base_info &= ++ cpu_to_le32(~BIT(HNS3_RXD_VLD_B)); ++ ring->desc_cb[ring->next_to_clean].refill = 0; ++ ring->next_to_clean += 1; ++ ++ if (unlikely(ring->next_to_clean == ring->desc_num)) ++ ring->next_to_clean = 0; ++} ++ ++static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, ++ unsigned char *va) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct sk_buff *skb; ++ ++ ring->skb = napi_alloc_skb(&ring->tqp_vector->napi, HNS3_RX_HEAD_SIZE); ++ skb = ring->skb; ++ if (unlikely(!skb)) { ++ hns3_rl_err(netdev, "alloc rx skb fail\n"); ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return -ENOMEM; ++ } ++ ++ trace_hns3_rx_desc(ring); ++ prefetchw(skb->data); ++ ++ ring->pending_buf = 1; ++ ring->frag_num = 0; ++ ring->tail_skb = NULL; ++ if (length <= HNS3_RX_HEAD_SIZE) { ++ memcpy(__skb_put(skb, length), va, ALIGN(length, sizeof(long))); ++ ++ /* We can reuse buffer as-is, just make sure it is reusable */ ++ if (dev_page_is_reusable(desc_cb->priv)) ++ desc_cb->reuse_flag = 1; ++ else if (desc_cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, desc_cb->priv, ++ false); ++ else /* This page cannot be reused so discard it */ ++ __page_frag_cache_drain(desc_cb->priv, ++ desc_cb->pagecnt_bias); ++ ++ hns3_rx_ring_move_fw(ring); ++ return 0; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(skb); ++ ++ hns3_ring_stats_update(ring, seg_pkt_cnt); ++ ++ ring->pull_len = eth_get_headlen(netdev, va, HNS3_RX_HEAD_SIZE); ++ __skb_put(skb, ring->pull_len); ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len, ++ desc_cb); ++ hns3_rx_ring_move_fw(ring); ++ ++ return 0; ++} ++ ++static int hns3_add_frag(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct sk_buff *head_skb = skb; ++ struct sk_buff *new_skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ u32 bd_base_info; ++ ++ do { ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* make sure HW write desc complete */ ++ dma_rmb(); ++ if (!(bd_base_info & BIT(HNS3_RXD_VLD_B))) ++ return -ENXIO; ++ ++ if (unlikely(ring->frag_num >= MAX_SKB_FRAGS)) { ++ new_skb = napi_alloc_skb(&ring->tqp_vector->napi, 0); ++ if (unlikely(!new_skb)) { ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx fraglist skb fail\n"); ++ return -ENXIO; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(new_skb); ++ ++ ring->frag_num = 0; ++ ++ if (ring->tail_skb) { ++ ring->tail_skb->next = new_skb; ++ ring->tail_skb = new_skb; ++ } else { ++ skb_shinfo(skb)->frag_list = 
new_skb; ++ ring->tail_skb = new_skb; ++ } ++ } ++ ++ if (ring->tail_skb) { ++ head_skb->truesize += hns3_buf_size(ring); ++ head_skb->data_len += le16_to_cpu(desc->rx.size); ++ head_skb->len += le16_to_cpu(desc->rx.size); ++ skb = ring->tail_skb; ++ } ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, 0, desc_cb); ++ trace_hns3_rx_desc(ring); ++ hns3_rx_ring_move_fw(ring); ++ ring->pending_buf++; ++ } while (!(bd_base_info & BIT(HNS3_RXD_FE_B))); ++ ++ return 0; ++} ++ ++static int hns3_set_gro_and_checksum(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 l234info, ++ u32 bd_base_info, u32 ol_info, u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 l3_type; ++ ++ skb_shinfo(skb)->gso_size = hnae3_get_field(bd_base_info, ++ HNS3_RXD_GRO_SIZE_M, ++ HNS3_RXD_GRO_SIZE_S); ++ /* if there is no HW GRO, do not set gro params */ ++ if (!skb_shinfo(skb)->gso_size) { ++ hns3_rx_checksum(ring, skb, l234info, bd_base_info, ol_info, ++ csum); ++ return 0; ++ } ++ ++ NAPI_GRO_CB(skb)->count = hnae3_get_field(l234info, ++ HNS3_RXD_GRO_COUNT_M, ++ HNS3_RXD_GRO_COUNT_S); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ l3_type = hns3_rx_ptype_tbl[ptype].l3_type; ++ } else { ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ } ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; ++ else if (l3_type == HNS3_L3_TYPE_IPV6) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; ++ else ++ return -EFAULT; ++ ++ return hns3_gro_complete(skb, l234info); ++} ++ ++static void hns3_set_rx_skb_rss_type(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 rss_hash, ++ u32 l234info, u32 ol_info) ++{ ++ enum pkt_hash_types rss_type = PKT_HASH_TYPE_NONE; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ rss_type = hns3_rx_ptype_tbl[ptype].hash_type; ++ } else { ++ int l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ int l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) { ++ if (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP) ++ rss_type = PKT_HASH_TYPE_L4; ++ else if (l4_type == HNS3_L4_TYPE_IGMP || ++ l4_type == HNS3_L4_TYPE_ICMP) ++ rss_type = PKT_HASH_TYPE_L3; ++ } ++ } ++ ++ skb_set_hash(skb, rss_hash, rss_type); ++} ++ ++static void hns3_handle_rx_ts_info(struct net_device *netdev, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 bd_base_info) ++{ ++ if (unlikely(bd_base_info & BIT(HNS3_RXD_TS_VLD_B))) { ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u32 nsec = le32_to_cpu(desc->ts_nsec); ++ u32 sec = le32_to_cpu(desc->ts_sec); ++ ++ if (h->ae_algo->ops->get_rx_hwts) ++ h->ae_algo->ops->get_rx_hwts(h, skb, nsec, sec); ++ } ++} ++ ++static void hns3_handle_rx_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 l234info) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ ++ /* Based on 
hw strategy, the tag offloaded will be stored at ++ * ot_vlan_tag in two layer tag case, and stored at vlan_tag ++ * in one layer tag case. ++ */ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ u16 vlan_tag; ++ ++ if (hns3_parse_vlan_tag(ring, desc, l234info, &vlan_tag)) ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ vlan_tag); ++ } ++} ++ ++static int hns3_handle_bdinfo(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ enum hns3_pkt_l2t_type l2_frame_type; ++ u32 bd_base_info, l234info, ol_info; ++ struct hns3_desc *desc; ++ unsigned int len; ++ int pre_ntc, ret; ++ u16 csum; ++ ++ /* bdinfo handled below is only valid on the last BD of the ++ * current packet, and ring->next_to_clean indicates the first ++ * descriptor of next packet, so need - 1 below. ++ */ ++ pre_ntc = ring->next_to_clean ? (ring->next_to_clean - 1) : ++ (ring->desc_num - 1); ++ desc = &ring->desc[pre_ntc]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ l234info = le32_to_cpu(desc->rx.l234_info); ++ ol_info = le32_to_cpu(desc->rx.ol_info); ++ csum = le16_to_cpu(desc->csum); ++ ++ hns3_handle_rx_ts_info(netdev, desc, skb, bd_base_info); ++ ++ hns3_handle_rx_vlan_tag(ring, desc, skb, l234info); ++ ++ if (unlikely(!desc->rx.pkt_len || (l234info & (BIT(HNS3_RXD_TRUNCAT_B) | ++ BIT(HNS3_RXD_L2E_B))))) { ++ u64_stats_update_begin(&ring->syncp); ++ if (l234info & BIT(HNS3_RXD_L2E_B)) ++ ring->stats.l2_err++; ++ else ++ ring->stats.err_pkt_len++; ++ u64_stats_update_end(&ring->syncp); ++ ++ return -EFAULT; ++ } ++ ++ len = skb->len; ++ ++ /* Do update ip stack process */ ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ /* This is needed in order to enable forwarding support */ ++ ret = hns3_set_gro_and_checksum(ring, skb, l234info, ++ bd_base_info, ol_info, csum); ++ if (unlikely(ret)) { ++ hns3_ring_stats_update(ring, rx_err_cnt); ++ return ret; ++ } ++ ++ l2_frame_type = hnae3_get_field(l234info, HNS3_RXD_DMAC_M, ++ HNS3_RXD_DMAC_S); ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.rx_pkts++; ++ ring->stats.rx_bytes += len; ++ ++ if (l2_frame_type == HNS3_L2_TYPE_MULTICAST) ++ ring->stats.rx_multicast++; ++ ++ u64_stats_update_end(&ring->syncp); ++ ++ ring->tqp_vector->rx_group.total_bytes += len; ++ ++ hns3_set_rx_skb_rss_type(ring, skb, le32_to_cpu(desc->rx.rss_hash), ++ l234info, ol_info); ++ return 0; ++} ++ ++static int hns3_handle_rx_bd(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ unsigned int length; ++ u32 bd_base_info; ++ int ret; ++ ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ ++ prefetch(desc); ++ ++ if (!skb) { ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* Check valid BD */ ++ if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B)))) ++ return -ENXIO; ++ ++ dma_rmb(); ++ length = le16_to_cpu(desc->rx.size); ++ ++ ring->va = desc_cb->buf + desc_cb->page_offset; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ /* Prefetch first cache line of first page. ++ * Idea is to cache few bytes of the header of the packet. ++ * Our L1 Cache line size is 64B so need to prefetch twice to make ++ * it 128B. But in actual we can have greater size of caches with ++ * 128B Level 1 cache lines. In such a case, single fetch would ++ * suffice to cache in the relevant part of the header. 
++ */ ++ net_prefetch(ring->va); ++ ++ ret = hns3_alloc_skb(ring, length, ring->va); ++ skb = ring->skb; ++ ++ if (ret < 0) /* alloc buffer fail */ ++ return ret; ++ if (!(bd_base_info & BIT(HNS3_RXD_FE_B))) { /* need add frag */ ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ } else { ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ ++ /* As the head data may be changed when GRO enable, copy ++ * the head data in after other data rx completed ++ */ ++ if (skb->len > HNS3_RX_HEAD_SIZE) ++ memcpy(skb->data, ring->va, ++ ALIGN(ring->pull_len, sizeof(long))); ++ ++ ret = hns3_handle_bdinfo(ring, skb); ++ if (unlikely(ret)) { ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ ++ skb_record_rx_queue(skb, ring->tqp->tqp_index); ++ return 0; ++} ++ ++int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, ++ void (*rx_fn)(struct hns3_enet_ring *, struct sk_buff *)) ++{ ++#define RCB_NOF_ALLOC_RX_BUFF_ONCE 16 ++ int unused_count = hns3_desc_unused(ring); ++ bool failure = false; ++ int recv_pkts = 0; ++ int err; ++ ++ unused_count -= ring->pending_buf; ++ ++ while (recv_pkts < budget) { ++ /* Reuse or realloc buffers */ ++ if (unused_count >= RCB_NOF_ALLOC_RX_BUFF_ONCE) { ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ unused_count = 0; ++ } ++ ++ /* Poll one pkt */ ++ err = hns3_handle_rx_bd(ring); ++ /* Do not get FE for the packet or failed to alloc skb */ ++ if (unlikely(!ring->skb || err == -ENXIO)) { ++ goto out; ++ } else if (likely(!err)) { ++ rx_fn(ring, ring->skb); ++ recv_pkts++; ++ } ++ ++ unused_count += ring->pending_buf; ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++out: ++ /* sync head pointer before exiting, since hardware will calculate ++ * FBD number with head pointer ++ */ ++ if (unused_count > 0) ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ ++ return failure ? budget : recv_pkts; ++} ++ ++static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *rx_group = &tqp_vector->rx_group; ++ struct dim_sample sample = {}; ++ ++ if (!rx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, rx_group->total_packets, ++ rx_group->total_bytes, &sample); ++ net_dim(&rx_group->dim, sample); ++} ++ ++static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *tx_group = &tqp_vector->tx_group; ++ struct dim_sample sample = {}; ++ ++ if (!tx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets, ++ tx_group->total_bytes, &sample); ++ net_dim(&tx_group->dim, sample); ++} ++ ++static int hns3_nic_common_poll(struct napi_struct *napi, int budget) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(napi->dev); ++ struct hns3_enet_ring *ring; ++ int rx_pkt_total = 0; ++ ++ struct hns3_enet_tqp_vector *tqp_vector = ++ container_of(napi, struct hns3_enet_tqp_vector, napi); ++ bool clean_complete = true; ++ int rx_budget = budget; ++ ++ if (unlikely(test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ napi_complete(napi); ++ return 0; ++ } ++ ++ /* Since the actual Tx work is minimal, we can give the Tx a larger ++ * budget and be more aggressive about cleaning up the Tx descriptors. 
++ */ ++ hns3_for_each_ring(ring, tqp_vector->tx_group) ++ hns3_clean_tx_ring(ring, budget); ++ ++ /* make sure rx ring budget not smaller than 1 */ ++ if (tqp_vector->num_tqps > 1) ++ rx_budget = max(budget / tqp_vector->num_tqps, 1); ++ ++ hns3_for_each_ring(ring, tqp_vector->rx_group) { ++ int rx_cleaned = hns3_clean_rx_ring(ring, rx_budget, ++ hns3_rx_skb); ++ if (rx_cleaned >= rx_budget) ++ clean_complete = false; ++ ++ rx_pkt_total += rx_cleaned; ++ } ++ ++ tqp_vector->rx_group.total_packets += rx_pkt_total; ++ ++ if (!clean_complete) ++ return budget; ++ ++ if (napi_complete(napi) && ++ likely(!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ hns3_update_rx_int_coalesce(tqp_vector); ++ hns3_update_tx_int_coalesce(tqp_vector); ++ ++ hns3_mask_vector_irq(tqp_vector, 1); ++ } ++ ++ return rx_pkt_total; ++} ++ ++static int hns3_create_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node **head, ++ bool is_tx) ++{ ++ u32 bit_value = is_tx ? HNAE3_RING_TYPE_TX : HNAE3_RING_TYPE_RX; ++ u32 field_value = is_tx ? HNAE3_RING_GL_TX : HNAE3_RING_GL_RX; ++ struct hnae3_ring_chain_node *cur_chain = *head; ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain; ++ struct hns3_enet_ring *ring; ++ ++ ring = is_tx ? tqp_vector->tx_group.ring : tqp_vector->rx_group.ring; ++ ++ if (cur_chain) { ++ while (cur_chain->next) ++ cur_chain = cur_chain->next; ++ } ++ ++ while (ring) { ++ chain = devm_kzalloc(&pdev->dev, sizeof(*chain), GFP_KERNEL); ++ if (!chain) ++ return -ENOMEM; ++ if (cur_chain) ++ cur_chain->next = chain; ++ else ++ *head = chain; ++ chain->tqp_index = ring->tqp->tqp_index; ++ hnae3_set_bit(chain->flag, HNAE3_RING_TYPE_B, ++ bit_value); ++ hnae3_set_field(chain->int_gl_idx, ++ HNAE3_RING_GL_IDX_M, ++ HNAE3_RING_GL_IDX_S, field_value); ++ ++ cur_chain = chain; ++ ++ ring = ring->next; ++ } ++ ++ return 0; ++} ++ ++static struct hnae3_ring_chain_node * ++hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *cur_chain = NULL; ++ struct hnae3_ring_chain_node *chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, true)) ++ goto err_free_chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, false)) ++ goto err_free_chain; ++ ++ return cur_chain; ++ ++err_free_chain: ++ while (cur_chain) { ++ chain = cur_chain->next; ++ devm_kfree(&pdev->dev, cur_chain); ++ cur_chain = chain; ++ } ++ ++ return NULL; ++} ++ ++static void hns3_free_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node *head) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain_tmp, *chain; ++ ++ chain = head; ++ ++ while (chain) { ++ chain_tmp = chain->next; ++ devm_kfree(&pdev->dev, chain); ++ chain = chain_tmp; ++ } ++} ++ ++static void hns3_add_ring_to_group(struct hns3_enet_ring_group *group, ++ struct hns3_enet_ring *ring) ++{ ++ ring->next = group->ring; ++ group->ring = ring; ++ ++ group->count++; ++} ++ ++static void hns3_nic_set_cpumask(struct hns3_nic_priv *priv) ++{ ++ struct pci_dev *pdev = priv->ae_handle->pdev; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int num_vectors = priv->vector_num; ++ int numa_node; ++ int vector_i; ++ ++ numa_node = dev_to_node(&pdev->dev); ++ ++ for (vector_i = 0; vector_i < num_vectors; vector_i++) { ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ cpumask_set_cpu(cpumask_local_spread(vector_i, numa_node), ++ 
&tqp_vector->affinity_mask); ++ } ++} ++ ++static void hns3_rx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_rx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_rx_gl(group->ring->tqp_vector, cur_moder.usec); ++ tqp_vector->rx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->rx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->rx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_tx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_tx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, cur_moder.usec); ++ tqp_vector->tx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->tx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->tx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_nic_init_dim(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ INIT_WORK(&tqp_vector->rx_group.dim.work, hns3_rx_dim_work); ++ INIT_WORK(&tqp_vector->tx_group.dim.work, hns3_tx_dim_work); ++} ++ ++static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int ret; ++ int i; ++ ++ hns3_nic_set_cpumask(priv); ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ hns3_vector_coalesce_init_hw(tqp_vector, priv); ++ tqp_vector->num_tqps = 0; ++ hns3_nic_init_dim(tqp_vector); ++ } ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ u16 vector_i = i % priv->vector_num; ++ u16 tqp_num = h->kinfo.num_tqps; ++ ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ ++ hns3_add_ring_to_group(&tqp_vector->tx_group, ++ &priv->ring[i]); ++ ++ hns3_add_ring_to_group(&tqp_vector->rx_group, ++ &priv->ring[i + tqp_num]); ++ ++ priv->ring[i].tqp_vector = tqp_vector; ++ priv->ring[i + tqp_num].tqp_vector = tqp_vector; ++ tqp_vector->num_tqps++; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ tqp_vector->rx_group.total_bytes = 0; ++ tqp_vector->rx_group.total_packets = 0; ++ tqp_vector->tx_group.total_bytes = 0; ++ tqp_vector->tx_group.total_packets = 0; ++ tqp_vector->handle = h; ++ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) { ++ ret = -ENOMEM; ++ goto map_ring_fail; ++ } ++ ++ ret = h->ae_algo->ops->map_ring_to_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ if (ret) ++ goto map_ring_fail; ++ ++ netif_napi_add(priv->netdev, &tqp_vector->napi, ++ hns3_nic_common_poll, NAPI_POLL_WEIGHT); ++ } ++ ++ return 0; ++ ++map_ring_fail: ++ while (i--) ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ ++ return ret; ++} ++ ++static void 
hns3_nic_init_coal_cfg(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *rx_coal = &priv->rx_coal; ++ ++ /* initialize the configuration for interrupt coalescing. ++ * 1. GL (Interrupt Gap Limiter) ++ * 2. RL (Interrupt Rate Limiter) ++ * 3. QL (Interrupt Quantity Limiter) ++ * ++ * Default: enable interrupt coalescing self-adaptive and GL ++ */ ++ tx_coal->adapt_enable = 1; ++ rx_coal->adapt_enable = 1; ++ ++ tx_coal->int_gl = HNS3_INT_GL_50K; ++ rx_coal->int_gl = HNS3_INT_GL_50K; ++ ++ rx_coal->flow_level = HNS3_FLOW_LOW; ++ tx_coal->flow_level = HNS3_FLOW_LOW; ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ rx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ } ++} ++ ++static int hns3_nic_alloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ struct hnae3_vector_info *vector; ++ struct pci_dev *pdev = h->pdev; ++ u16 tqp_num = h->kinfo.num_tqps; ++ u16 vector_num; ++ int ret = 0; ++ u16 i; ++ ++ /* RSS size, cpu online and vector_num should be the same */ ++ /* Should consider 2p/4p later */ ++ vector_num = min_t(u16, num_online_cpus(), tqp_num); ++ ++ vector = devm_kcalloc(&pdev->dev, vector_num, sizeof(*vector), ++ GFP_KERNEL); ++ if (!vector) ++ return -ENOMEM; ++ ++ /* save the actual available vector number */ ++ vector_num = h->ae_algo->ops->get_vector(h, vector_num, vector); ++ ++ priv->vector_num = vector_num; ++ priv->tqp_vector = (struct hns3_enet_tqp_vector *) ++ devm_kcalloc(&pdev->dev, vector_num, sizeof(*priv->tqp_vector), ++ GFP_KERNEL); ++ if (!priv->tqp_vector) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ tqp_vector->idx = i; ++ tqp_vector->mask_addr = vector[i].io_addr; ++ tqp_vector->vector_irq = vector[i].vector; ++ hns3_vector_coalesce_init(tqp_vector, priv); ++ } ++ ++out: ++ devm_kfree(&pdev->dev, vector); ++ return ret; ++} ++ ++static void hns3_clear_ring_group(struct hns3_enet_ring_group *group) ++{ ++ group->ring = NULL; ++ group->count = 0; ++} ++ ++static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ if (!tqp_vector->rx_group.ring && !tqp_vector->tx_group.ring) ++ continue; ++ ++ /* Since the mapping can be overwritten, when fail to get the ++ * chain between vector and ring, we should go on to deal with ++ * the remaining options. 
++ */ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) ++ dev_warn(priv->dev, "failed to get ring chain\n"); ++ ++ h->ae_algo->ops->unmap_ring_from_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ hns3_clear_ring_group(&tqp_vector->rx_group); ++ hns3_clear_ring_group(&tqp_vector->tx_group); ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ } ++} ++ ++static void hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i, ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = h->ae_algo->ops->put_vector(h, tqp_vector->vector_irq); ++ if (ret) ++ return; ++ } ++ ++ devm_kfree(&pdev->dev, priv->tqp_vector); ++} ++ ++static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, ++ unsigned int ring_type) ++{ ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hns3_enet_ring *ring; ++ int desc_num; ++ ++ if (ring_type == HNAE3_RING_TYPE_TX) { ++ ring = &priv->ring[q->tqp_index]; ++ desc_num = priv->ae_handle->kinfo.num_tx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->tx_copybreak = priv->tx_copybreak; ++ ring->last_to_use = 0; ++ } else { ++ ring = &priv->ring[q->tqp_index + queue_num]; ++ desc_num = priv->ae_handle->kinfo.num_rx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->rx_copybreak = priv->rx_copybreak; ++ } ++ ++ hnae3_set_bit(ring->flag, HNAE3_RING_TYPE_B, ring_type); ++ ++ ring->tqp = q; ++ ring->desc = NULL; ++ ring->desc_cb = NULL; ++ ring->dev = priv->dev; ++ ring->desc_dma_addr = 0; ++ ring->buf_size = q->buf_size; ++ ring->desc_num = desc_num; ++ ring->next_to_use = 0; ++ ring->next_to_clean = 0; ++} ++ ++static void hns3_queue_to_ring(struct hnae3_queue *tqp, ++ struct hns3_nic_priv *priv) ++{ ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_TX); ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_RX); ++} ++ ++static int hns3_get_ring_config(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i; ++ ++ priv->ring = devm_kzalloc(&pdev->dev, ++ array3_size(h->kinfo.num_tqps, ++ sizeof(*priv->ring), 2), ++ GFP_KERNEL); ++ if (!priv->ring) ++ return -ENOMEM; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_queue_to_ring(h->kinfo.tqp[i], priv); ++ ++ return 0; ++} ++ ++static void hns3_put_ring_config(struct hns3_nic_priv *priv) ++{ ++ if (!priv->ring) ++ return; ++ ++ devm_kfree(priv->dev, priv->ring); ++ priv->ring = NULL; ++} ++ ++static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) ++{ ++ struct page_pool_params pp_params = { ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | ++ PP_FLAG_DMA_SYNC_DEV, ++ .order = hns3_page_order(ring), ++ .pool_size = ring->desc_num * hns3_buf_size(ring) / ++ (PAGE_SIZE << hns3_page_order(ring)), ++ .nid = dev_to_node(ring_to_dev(ring)), ++ .dev = ring_to_dev(ring), ++ .dma_dir = DMA_FROM_DEVICE, ++ .offset = 0, ++ .max_len = PAGE_SIZE << hns3_page_order(ring), ++ }; ++ ++ ring->page_pool = page_pool_create(&pp_params); ++ if (IS_ERR(ring->page_pool)) { ++ dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n", ++ PTR_ERR(ring->page_pool)); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) ++{ ++ int ret; ++ ++ if (ring->desc_num <= 0 || ring->buf_size <= 0) ++ return -EINVAL; 
++ ++ ring->desc_cb = devm_kcalloc(ring_to_dev(ring), ring->desc_num, ++ sizeof(ring->desc_cb[0]), GFP_KERNEL); ++ if (!ring->desc_cb) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = hns3_alloc_desc(ring); ++ if (ret) ++ goto out_with_desc_cb; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ if (page_pool_enabled) ++ hns3_alloc_page_pool(ring); ++ ++ ret = hns3_alloc_ring_buffers(ring); ++ if (ret) ++ goto out_with_desc; ++ } else { ++ hns3_init_tx_spare_buffer(ring); ++ } ++ ++ return 0; ++ ++out_with_desc: ++ hns3_free_desc(ring); ++out_with_desc_cb: ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++out: ++ return ret; ++} ++ ++void hns3_fini_ring(struct hns3_enet_ring *ring) ++{ ++ hns3_free_desc(ring); ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++ ring->next_to_clean = 0; ++ ring->next_to_use = 0; ++ ring->last_to_use = 0; ++ ring->pending_buf = 0; ++ if (!HNAE3_IS_TX_RING(ring) && ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ } else if (HNAE3_IS_TX_RING(ring) && ring->tx_spare) { ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ dma_unmap_page(ring_to_dev(ring), tx_spare->dma, tx_spare->len, ++ DMA_TO_DEVICE); ++ free_pages((unsigned long)tx_spare->buf, ++ get_order(tx_spare->len)); ++ devm_kfree(ring_to_dev(ring), tx_spare); ++ ring->tx_spare = NULL; ++ } ++ ++ if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) { ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_buf_size2type(u32 buf_size) ++{ ++ int bd_size_type; ++ ++ switch (buf_size) { ++ case 512: ++ bd_size_type = HNS3_BD_SIZE_512_TYPE; ++ break; ++ case 1024: ++ bd_size_type = HNS3_BD_SIZE_1024_TYPE; ++ break; ++ case 2048: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ break; ++ case 4096: ++ bd_size_type = HNS3_BD_SIZE_4096_TYPE; ++ break; ++ default: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ } ++ ++ return bd_size_type; ++} ++ ++static void hns3_init_ring_hw(struct hns3_enet_ring *ring) ++{ ++ dma_addr_t dma = ring->desc_dma_addr; ++ struct hnae3_queue *q = ring->tqp; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_L_REG, (u32)dma); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_LEN_REG, ++ hns3_buf_size2type(ring->buf_size)); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } else { ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_L_REG, ++ (u32)dma); ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_TX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } ++} ++ ++static void hns3_init_tx_ring_tc(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ int i; ++ ++ for (i = 0; i < tc_info->num_tc; i++) { ++ int j; ++ ++ for (j = 0; j < tc_info->tqp_count[i]; j++) { ++ struct hnae3_queue *q; ++ ++ q = priv->ring[tc_info->tqp_offset[i] + j].tqp; ++ hns3_write_dev(q, HNS3_RING_TX_RING_TC_REG, i); ++ } ++ } ++} ++ ++int hns3_init_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int ring_num = h->kinfo.num_tqps * 2; ++ int i, j; ++ int ret; ++ ++ for (i = 0; i < ring_num; i++) { ++ ret = hns3_alloc_ring_memory(&priv->ring[i]); ++ if (ret) { ++ dev_err(priv->dev, ++ "Alloc ring memory fail! 
ret=%d\n", ret); ++ goto out_when_alloc_ring_memory; ++ } ++ ++ u64_stats_init(&priv->ring[i].syncp); ++ } ++ ++ return 0; ++ ++out_when_alloc_ring_memory: ++ for (j = i - 1; j >= 0; j--) ++ hns3_fini_ring(&priv->ring[j]); ++ ++ return -ENOMEM; ++} ++ ++static void hns3_uninit_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_fini_ring(&priv->ring[i]); ++ hns3_fini_ring(&priv->ring[i + h->kinfo.num_tqps]); ++ } ++} ++ ++/* Set mac addr if it is configured. or leave it to the AE driver */ ++static int hns3_init_mac_addr(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = priv->ae_handle; ++ u8 mac_addr_temp[ETH_ALEN]; ++ int ret = 0; ++ ++ if (h->ae_algo->ops->get_mac_addr) ++ h->ae_algo->ops->get_mac_addr(h, mac_addr_temp); ++ ++ /* Check if the MAC address is valid, if not get a random one */ ++ if (!is_valid_ether_addr(mac_addr_temp)) { ++ eth_hw_addr_random(netdev); ++ hnae3_format_mac_addr(format_mac_addr, netdev->dev_addr); ++ dev_warn(priv->dev, "using random MAC address %s\n", ++ format_mac_addr); ++ } else if (!ether_addr_equal(netdev->dev_addr, mac_addr_temp)) { ++ eth_hw_addr_set(netdev, mac_addr_temp); ++ ether_addr_copy(netdev->perm_addr, mac_addr_temp); ++ } else { ++ return 0; ++ } ++ ++ if (h->ae_algo->ops->set_mac_addr) ++ ret = h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true); ++ ++ return ret; ++} ++ ++static int hns3_init_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = 0; ++ ++ if (h->ae_algo->ops->mac_connect_phy) ++ ret = h->ae_algo->ops->mac_connect_phy(h); ++ ++ return ret; ++} ++ ++static void hns3_uninit_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->mac_disconnect_phy) ++ h->ae_algo->ops->mac_disconnect_phy(h); ++} ++ ++static int hns3_client_start(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_start) ++ return 0; ++ ++ return handle->ae_algo->ops->client_start(handle); ++} ++ ++static void hns3_client_stop(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_stop) ++ return; ++ ++ handle->ae_algo->ops->client_stop(handle); ++} ++ ++static void hns3_info_show(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ hnae3_format_mac_addr(format_mac_addr, priv->netdev->dev_addr); ++ dev_info(priv->dev, "MAC address: %s\n", format_mac_addr); ++ dev_info(priv->dev, "Task queue pairs numbers: %u\n", kinfo->num_tqps); ++ dev_info(priv->dev, "RSS size: %u\n", kinfo->rss_size); ++ dev_info(priv->dev, "Allocated RSS size: %u\n", kinfo->req_rss_size); ++ dev_info(priv->dev, "RX buffer length: %u\n", kinfo->rx_buf_len); ++ dev_info(priv->dev, "Desc num per TX queue: %u\n", kinfo->num_tx_desc); ++ dev_info(priv->dev, "Desc num per RX queue: %u\n", kinfo->num_rx_desc); ++ dev_info(priv->dev, "Total number of enabled TCs: %u\n", ++ kinfo->tc_info.num_tc); ++ dev_info(priv->dev, "Max mtu size: %u\n", priv->netdev->max_mtu); ++} ++ ++static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode mode, bool is_tx) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hnae3_handle *handle = priv->ae_handle; ++ int i; ++ ++ if (is_tx) { ++ 
priv->tx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].tx_group.dim.mode = mode; ++ } else { ++ priv->rx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].rx_group.dim.mode = mode; ++ } ++ ++ if (hnae3_ae_dev_cq_supported(ae_dev)) { ++ u32 new_mode; ++ u64 reg; ++ ++ new_mode = (mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) ? ++ HNS3_CQ_MODE_CQE : HNS3_CQ_MODE_EQE; ++ reg = is_tx ? HNS3_GL1_CQ_MODE_REG : HNS3_GL0_CQ_MODE_REG; ++ ++ writel(new_mode, handle->kinfo.io_base + reg); ++ } ++} ++ ++void hns3_cq_period_mode_init(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode tx_mode, ++ enum dim_cq_period_mode rx_mode) ++{ ++ hns3_set_cq_period_mode(priv, tx_mode, true); ++ hns3_set_cq_period_mode(priv, rx_mode, false); ++} ++ ++static void hns3_state_init(struct hnae3_handle *handle) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state); ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ set_bit(HNAE3_PFLAG_LIMIT_PROMISC, &handle->supported_pflags); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state); ++ ++ if (hnae3_ae_dev_rxd_adv_layout_supported(ae_dev)) ++ set_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state); ++} ++ ++static void hns3_state_uninit(struct hnae3_handle *handle) ++{ ++ struct hns3_nic_priv *priv = handle->priv; ++ ++ clear_bit(HNS3_NIC_STATE_INITED, &priv->state); ++} ++ ++static int hns3_client_init(struct hnae3_handle *handle) ++{ ++ struct pci_dev *pdev = handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ u16 alloc_tqps, max_rss_size; ++ struct hns3_nic_priv *priv; ++ struct net_device *netdev; ++ int ret; ++ ++ handle->ae_algo->ops->get_tqps_and_rss_info(handle, &alloc_tqps, ++ &max_rss_size); ++ netdev = alloc_etherdev_mq(sizeof(struct hns3_nic_priv), alloc_tqps); ++ if (!netdev) ++ return -ENOMEM; ++ ++ priv = netdev_priv(netdev); ++ priv->dev = &pdev->dev; ++ priv->netdev = netdev; ++ priv->ae_handle = handle; ++ priv->tx_timeout_count = 0; ++ priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num; ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL); ++ ++ handle->kinfo.netdev = netdev; ++ handle->priv = (void *)priv; ++ ++ hns3_init_mac_addr(netdev); ++ ++ hns3_set_default_feature(netdev); ++ ++ netdev->watchdog_timeo = HNS3_TX_TIMEOUT; ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ netdev->netdev_ops = &hns3_nic_netdev_ops; ++ SET_NETDEV_DEV(netdev, &pdev->dev); ++ hns3_ethtool_set_ops(netdev); ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_get_ring_cfg; ++ } ++ ++ hns3_nic_init_coal_cfg(priv); ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_alloc_vector_data; ++ } ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_vector_data; ++ } ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_ring; ++ } ++ ++ hns3_cq_period_mode_init(priv, DIM_CQ_PERIOD_MODE_START_FROM_EQE, ++ 
DIM_CQ_PERIOD_MODE_START_FROM_EQE); ++ ++ ret = hns3_init_phy(netdev); ++ if (ret) ++ goto out_init_phy; ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto out_init_irq_fail; ++ } ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); ++ goto out_client_start; ++ } ++ ++ hns3_dcbnl_setup(handle); ++ ++ ret = hns3_dbg_init(handle); ++ if (ret) { ++ dev_err(priv->dev, "failed to init debugfs, ret = %d\n", ++ ret); ++ goto out_client_start; ++ } ++ ++ netdev->max_mtu = HNS3_MAX_MTU(ae_dev->dev_specs.max_frm_size); ++ ++ hns3_state_init(handle); ++ ++ ret = register_netdev(netdev); ++ if (ret) { ++ dev_err(priv->dev, "probe register netdev fail!\n"); ++ goto out_reg_netdev_fail; ++ } ++ ++ if (netif_msg_drv(handle)) ++ hns3_info_show(priv); ++ ++ return ret; ++ ++out_reg_netdev_fail: ++ hns3_state_uninit(handle); ++ hns3_dbg_uninit(handle); ++ hns3_client_stop(handle); ++out_client_start: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++out_init_irq_fail: ++ hns3_uninit_phy(netdev); ++out_init_phy: ++ hns3_uninit_all_ring(priv); ++out_init_ring: ++ hns3_nic_uninit_vector_data(priv); ++out_init_vector_data: ++ hns3_nic_dealloc_vector_data(priv); ++out_alloc_vector_data: ++ priv->ring = NULL; ++out_get_ring_cfg: ++ priv->ae_handle = NULL; ++ free_netdev(netdev); ++ return ret; ++} ++ ++static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (netdev->reg_state != NETREG_UNINITIALIZED) ++ unregister_netdev(netdev); ++ ++ hns3_client_stop(handle); ++ ++ hns3_uninit_phy(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ goto out_netdev_free; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ ++ hns3_nic_uninit_irq(priv); ++ ++ hns3_clear_all_ring(handle, true); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++out_netdev_free: ++ hns3_dbg_uninit(handle); ++ free_netdev(netdev); ++} ++ ++static void hns3_link_status_change(struct hnae3_handle *handle, bool linkup) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ ++ if (!netdev) ++ return; ++ ++ if (linkup) { ++ netif_tx_wake_all_queues(netdev); ++ netif_carrier_on(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link up\n"); ++ } else { ++ netif_carrier_off(netdev); ++ netif_tx_stop_all_queues(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link down\n"); ++ } ++} ++ ++static void hns3_clear_tx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_clean != ring->next_to_use) { ++ ring->desc[ring->next_to_clean].tx.bdtp_fe_sc_vld_ra_ri = 0; ++ hns3_free_buffer_detach(ring, ring->next_to_clean, 0); ++ ring_ptr_move_fw(ring, next_to_clean); ++ } ++ ++ ring->pending_buf = 0; ++} ++ ++static int hns3_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ struct hns3_desc_cb res_cbs; ++ int ret; ++ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will 
be freed by ++ * stack, so we need to replace the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ /* if alloc new buffer fail, exit directly ++ * and reclear in up flow. ++ */ ++ netdev_warn(ring_to_netdev(ring), ++ "reserve buffer map failed, ret = %d\n", ++ ret); ++ return ret; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ } ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ /* Free the pending skb in rx ring */ ++ if (ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++ return 0; ++} ++ ++static void hns3_force_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will be freed by ++ * stack, so only need to unmap the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ hns3_unmap_buffer(ring, ++ &ring->desc_cb[ring->next_to_use]); ++ ring->desc_cb[ring->next_to_use].dma = 0; ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++} ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ struct hns3_enet_ring *ring; ++ ++ ring = &priv->ring[i]; ++ hns3_clear_tx_ring(ring); ++ ++ ring = &priv->ring[i + h->kinfo.num_tqps]; ++ /* Continue to clear other rings even if clearing some ++ * rings failed. ++ */ ++ if (force) ++ hns3_force_clear_rx_ring(ring); ++ else ++ hns3_clear_rx_ring(ring); ++ } ++} ++ ++int hns3_nic_reset_all_ring(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hns3_enet_ring *rx_ring; ++ int i, j; ++ int ret; ++ ++ ret = h->ae_algo->ops->reset_queue(h); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_init_ring_hw(&priv->ring[i]); ++ ++ /* We need to clear tx ring here because self test will ++ * use the ring and will not run down before up ++ */ ++ hns3_clear_tx_ring(&priv->ring[i]); ++ priv->ring[i].next_to_clean = 0; ++ priv->ring[i].next_to_use = 0; ++ priv->ring[i].last_to_use = 0; ++ ++ rx_ring = &priv->ring[i + h->kinfo.num_tqps]; ++ hns3_init_ring_hw(rx_ring); ++ ret = hns3_clear_rx_ring(rx_ring); ++ if (ret) ++ return ret; ++ ++ /* We can not know the hardware head and tail when this ++ * function is called in reset flow, so we reuse all desc. 
++ */ ++ for (j = 0; j < rx_ring->desc_num; j++) ++ hns3_reuse_buffer(rx_ring, j); ++ ++ rx_ring->next_to_clean = 0; ++ rx_ring->next_to_use = 0; ++ } ++ ++ hns3_init_tx_ring_tc(priv); ++ ++ return 0; ++} ++ ++static int hns3_reset_notify_down_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct net_device *ndev = kinfo->netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) ++ return 0; ++ ++ if (!netif_running(ndev)) ++ return 0; ++ ++ return hns3_nic_net_stop(ndev); ++} ++ ++static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); ++ int ret = 0; ++ ++ if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_err(kinfo->netdev, "device is not initialized yet\n"); ++ return -EFAULT; ++ } ++ ++ clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ ++ if (netif_running(kinfo->netdev)) { ++ ret = hns3_nic_net_open(kinfo->netdev); ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ netdev_err(kinfo->netdev, ++ "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ } ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int ret; ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) ++ goto err_put_ring; ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) ++ goto err_dealloc_vector; ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) ++ goto err_uninit_vector; ++ ++ hns3_cq_period_mode_init(priv, priv->tx_cqe_mode, priv->rx_cqe_mode); ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto err_init_irq_fail; ++ } ++ ++ if (!hns3_is_phys_func(handle->pdev)) ++ hns3_init_mac_addr(netdev); ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! 
ret=%d\n", ret); ++ goto err_client_start_fail; ++ } ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ return ret; ++ ++err_client_start_fail: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++err_init_irq_fail: ++ hns3_uninit_all_ring(priv); ++err_uninit_vector: ++ hns3_nic_uninit_vector_data(priv); ++err_dealloc_vector: ++ hns3_nic_dealloc_vector_data(priv); ++err_put_ring: ++ hns3_put_ring_config(priv); ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ return 0; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++ hns3_clear_all_ring(handle, true); ++ hns3_reset_tx_queue(priv->ae_handle); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++ return 0; ++} ++ ++int hns3_reset_notify(struct hnae3_handle *handle, ++ enum hnae3_reset_notify_type type) ++{ ++ int ret = 0; ++ ++ switch (type) { ++ case HNAE3_UP_CLIENT: ++ ret = hns3_reset_notify_up_enet(handle); ++ break; ++ case HNAE3_DOWN_CLIENT: ++ ret = hns3_reset_notify_down_enet(handle); ++ break; ++ case HNAE3_INIT_CLIENT: ++ ret = hns3_reset_notify_init_enet(handle); ++ break; ++ case HNAE3_UNINIT_CLIENT: ++ ret = hns3_reset_notify_uninit_enet(handle); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int hns3_change_channels(struct hnae3_handle *handle, u32 new_tqp_num, ++ bool rxfh_configured) ++{ ++ int ret; ++ ++ ret = handle->ae_algo->ops->set_channels(handle, new_tqp_num, ++ rxfh_configured); ++ if (ret) { ++ dev_err(&handle->pdev->dev, ++ "Change tqp num(%u) fail.\n", new_tqp_num); ++ return ret; ++ } ++ ++ ret = hns3_reset_notify(handle, HNAE3_INIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(handle, HNAE3_UP_CLIENT); ++ if (ret) ++ hns3_reset_notify(handle, HNAE3_UNINIT_CLIENT); ++ ++ return ret; ++} ++ ++int hns3_set_channels(struct net_device *netdev, ++ struct ethtool_channels *ch) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ bool rxfh_configured = netif_is_rxfh_configured(netdev); ++ u32 new_tqp_num = ch->combined_count; ++ u16 org_tqp_num; ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (ch->rx_count || ch->tx_count) ++ return -EINVAL; ++ ++ if (kinfo->tc_info.mqprio_active) { ++ dev_err(&netdev->dev, ++ "it's not allowed to set channels via ethtool when MQPRIO mode is on\n"); ++ return -EINVAL; ++ } ++ ++ if (new_tqp_num > hns3_get_max_available_channels(h) || ++ new_tqp_num < 1) { ++ dev_err(&netdev->dev, ++ "Change tqps fail, the tqp range is from 1 to %u", ++ hns3_get_max_available_channels(h)); ++ return -EINVAL; ++ } ++ ++ if (kinfo->rss_size == new_tqp_num) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, ++ "set channels: tqp_num=%u, rxfh=%d\n", ++ new_tqp_num, rxfh_configured); ++ ++ ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(h, HNAE3_UNINIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ org_tqp_num = h->kinfo.num_tqps; ++ ret = hns3_change_channels(h, new_tqp_num, rxfh_configured); ++ if (ret) { ++ int ret1; ++ ++ netdev_warn(netdev, ++ "Change channels fail, revert to old value\n"); ++ ret1 = 
hns3_change_channels(h, org_tqp_num, rxfh_configured); ++ if (ret1) { ++ netdev_err(netdev, ++ "revert to old channel fail\n"); ++ return ret1; ++ } ++ ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static const struct hns3_hw_error_info hns3_hw_err[] = { ++ { .type = HNAE3_PPU_POISON_ERROR, ++ .msg = "PPU poison" }, ++ { .type = HNAE3_CMDQ_ECC_ERROR, ++ .msg = "IMP CMDQ error" }, ++ { .type = HNAE3_IMP_RD_POISON_ERROR, ++ .msg = "IMP RD poison" }, ++ { .type = HNAE3_ROCEE_AXI_RESP_ERROR, ++ .msg = "ROCEE AXI RESP error" }, ++}; ++ ++static void hns3_process_hw_error(struct hnae3_handle *handle, ++ enum hnae3_hw_error_type type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(hns3_hw_err); i++) { ++ if (hns3_hw_err[i].type == type) { ++ dev_err(&handle->pdev->dev, "Detected %s!\n", ++ hns3_hw_err[i].msg); ++ break; ++ } ++ } ++} ++ ++static const struct hnae3_client_ops client_ops = { ++ .init_instance = hns3_client_init, ++ .uninit_instance = hns3_client_uninit, ++ .link_status_change = hns3_link_status_change, ++ .reset_notify = hns3_reset_notify, ++ .process_hw_error = hns3_process_hw_error, ++}; ++ ++/* hns3_init_module - Driver registration routine ++ * hns3_init_module is the first routine called when the driver is ++ * loaded. All it does is register with the PCI subsystem. ++ */ ++static int __init hns3_init_module(void) ++{ ++ int ret; ++ ++ pr_info("%s: %s - version\n", hns3_driver_name, hns3_driver_string); ++ pr_info("%s: %s\n", hns3_driver_name, hns3_copyright); ++ ++ client.type = HNAE3_CLIENT_KNIC; ++ snprintf(client.name, HNAE3_CLIENT_NAME_LENGTH, "%s", ++ hns3_driver_name); ++ ++ client.ops = &client_ops; ++ ++ INIT_LIST_HEAD(&client.node); ++ ++ hns3_dbg_register_debugfs(hns3_driver_name); ++ ++ ret = hnae3_register_client(&client); ++ if (ret) ++ goto err_reg_client; ++ ++ ret = pci_register_driver(&hns3_driver); ++ if (ret) ++ goto err_reg_driver; ++ ++ return ret; ++ ++err_reg_driver: ++ hnae3_unregister_client(&client); ++err_reg_client: ++ hns3_dbg_unregister_debugfs(); ++ return ret; ++} ++module_init(hns3_init_module); ++ ++/* hns3_exit_module - Driver exit cleanup routine ++ * hns3_exit_module is called just before the driver is removed ++ * from memory. ++ */ ++static void __exit hns3_exit_module(void) ++{ ++ pci_unregister_driver(&hns3_driver); ++ hnae3_unregister_client(&client); ++ hns3_dbg_unregister_debugfs(); ++} ++module_exit(hns3_exit_module); ++ ++MODULE_DESCRIPTION("HNS3: Hisilicon Ethernet Driver"); ++MODULE_AUTHOR("Huawei Tech. 
Co., Ltd."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("pci:hns-nic"); +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rx unsigned int start; do { @@ -1599,11 +7449,10 @@ index e5828a658caf4..a866bea651103 100644 } /** -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -index 3b6c7b5857376..5051cdff2384b 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_txq *txq, struct hinic_txq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_tx unsigned int start; do { @@ -1620,11 +7469,10 @@ index 3b6c7b5857376..5051cdff2384b 100644 } /** -diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -index 2cca9e84e31e1..34ab5ff9823b7 100644 ---- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +--- linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-04 10:40:26.684034126 -0500 +@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1637,7 +7485,7 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1650,11 +7498,10 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -index e9cd0fa6a0d2f..90f2eee78a3ee 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1663,7 +7510,7 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 * Assumes that queue stats are defined in i40e_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct i40e_ring *ring) +@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1683,11 +7530,10 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c -index e3d9804aeb25e..09a9f67d9ebc0 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_main.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c -@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct_tx(struct i40e_ring *ring, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c linux/drivers/net/ethernet/intel/i40e/i40e_main.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct unsigned int start; do { @@ -1700,7 +7546,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev, +@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct if (!ring) continue; do { @@ -1713,7 +7559,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1726,7 +7572,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1739,7 +7585,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 rx_b += bytes; rx_p += packets; rx_buf += p->rx_stats.alloc_buff_failed; -@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1752,11 +7598,10 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -index e535d4c3da49d..fafa3406e0bcc 100644 ---- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1765,7 +7610,7 @@ index e535d4c3da49d..fafa3406e0bcc 100644 * Assumes that queue stats are defined in iavf_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct iavf_ring *ring) +@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1783,11 +7628,10 @@ index e535d4c3da49d..fafa3406e0bcc 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c -index e109cb93886be..b7394c7e5eed2 100644 ---- a/drivers/net/ethernet/intel/ice/ice_main.c -+++ b/drivers/net/ethernet/intel/ice/ice_main.c -@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_stats_sync *syncp, +diff -rupN linux.orig/drivers/net/ethernet/intel/ice/ice_main.c linux/drivers/net/ethernet/intel/ice/ice_main.c +--- linux.orig/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_ unsigned int start; do { @@ -1800,11 +7644,10 @@ index e109cb93886be..b7394c7e5eed2 100644 } /** -diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c -index c14fc871dd417..23c6fcfcb905c 100644 ---- a/drivers/net/ethernet/intel/igb/igb_ethtool.c -+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c -@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c linux/drivers/net/ethernet/intel/igb/igb_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct ring = adapter->tx_ring[j]; do { @@ -1824,7 +7667,7 @@ index c14fc871dd417..23c6fcfcb905c 100644 data[i+2] += restart2; i += IGB_TX_QUEUE_STATS_LEN; -@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1840,11 +7683,10 @@ index c14fc871dd417..23c6fcfcb905c 100644 i += IGB_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c -index 2796e81d27260..98df55dc1e933 100644 ---- a/drivers/net/ethernet/intel/igb/igb_main.c -+++ b/drivers/net/ethernet/intel/igb/igb_main.c -@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_main.c linux/drivers/net/ethernet/intel/igb/igb_main.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter } do { @@ -1857,7 +7699,7 @@ index 2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter *adapter) +@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter for (i = 0; i < adapter->num_tx_queues; i++) { struct igb_ring *ring = adapter->tx_ring[i]; do { @@ -1870,11 +7712,10 @@ index 
2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c -index 8cc077b712add..5a26a7805ef80 100644 ---- a/drivers/net/ethernet/intel/igc/igc_ethtool.c -+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c -@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c linux/drivers/net/ethernet/intel/igc/igc_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct ring = adapter->tx_ring[j]; do { @@ -1894,7 +7735,7 @@ index 8cc077b712add..5a26a7805ef80 100644 data[i + 2] += restart2; i += IGC_TX_QUEUE_STATS_LEN; -@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1910,11 +7751,10 @@ index 8cc077b712add..5a26a7805ef80 100644 i += IGC_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c -index ebff0e04045d6..944299b06cc3d 100644 ---- a/drivers/net/ethernet/intel/igc/igc_main.c -+++ b/drivers/net/ethernet/intel/igc/igc_main.c -@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_main.c linux/drivers/net/ethernet/intel/igc/igc_main.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter } do { @@ -1927,7 +7767,7 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter *adapter) +@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter struct igc_ring *ring = adapter->tx_ring[i]; do { @@ -1940,11 +7780,10 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -index 04f453eabef64..51bcf0df3adcc 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1957,7 +7796,7 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } for (j = 0; j < IXGBE_NUM_RX_QUEUES; j++) { -@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1970,11 +7809,10 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -index d1e430b8c8aa1..01c5548f181d5 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struc if (ring) { do { @@ -1987,7 +7825,7 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net_device *netdev, +@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net if (ring) { do { @@ -2000,11 +7838,10 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -index fed46872af2bf..b4632b67ab143 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2017,7 +7854,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2030,7 +7867,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2043,11 +7880,10 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -index 2f12fbe229c15..1d31b8cff4f10 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(st if (ring) { do { @@ -2060,7 +7896,7 @@ index 2f12fbe229c15..1d31b8cff4f10 100644 stats->tx_bytes += bytes; stats->tx_packets += packets; } -@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net_device *netdev, +@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net for (i = 0; i < adapter->num_rx_queues; i++) { ring = adapter->rx_ring[i]; do { @@ -2073,11 +7909,10 @@ index 2f12fbe229c15..1d31b8cff4f10 
100644 stats->rx_bytes += bytes; stats->rx_packets += packets; } -diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c -index 0caa2df87c044..89ea3ef0ee162 100644 ---- a/drivers/net/ethernet/marvell/mvneta.c -+++ b/drivers/net/ethernet/marvell/mvneta.c -@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvneta.c linux/drivers/net/ethernet/marvell/mvneta.c +--- linux.orig/drivers/net/ethernet/marvell/mvneta.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvneta.c 2022-12-04 10:40:26.692034106 -0500 +@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *de cpu_stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2094,7 +7929,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2103,7 +7938,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 skb_alloc_error = stats->es.skb_alloc_error; refill_error = stats->es.refill_error; xdp_redirect = stats->es.ps.xdp_redirect; -@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct xdp_xmit_err = stats->es.ps.xdp_xmit_err; xdp_tx = stats->es.ps.xdp_tx; xdp_tx_err = stats->es.ps.xdp_tx_err; @@ -2112,11 +7947,10 @@ index 0caa2df87c044..89ea3ef0ee162 100644 es->skb_alloc_error += skb_alloc_error; es->refill_error += refill_error; -diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -index eaa51cd7456b6..9dd8e0315dd4f 100644 ---- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +--- linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2125,7 +7959,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_redirect = cpu_stats->xdp_redirect; xdp_pass = cpu_stats->xdp_pass; xdp_drop = cpu_stats->xdp_drop; -@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p xdp_xmit_err = cpu_stats->xdp_xmit_err; xdp_tx = cpu_stats->xdp_tx; xdp_tx_err = cpu_stats->xdp_tx_err; @@ -2134,7 +7968,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_stats->xdp_redirect += xdp_redirect; xdp_stats->xdp_pass += xdp_pass; -@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2149,11 +7983,10 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c -index bbea5458000bf..c9bb92187719c 100644 ---- a/drivers/net/ethernet/marvell/sky2.c -+++ 
b/drivers/net/ethernet/marvell/sky2.c -@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/sky2.c linux/drivers/net/ethernet/marvell/sky2.c +--- linux.orig/drivers/net/ethernet/marvell/sky2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/sky2.c 2022-12-04 10:40:26.692034106 -0500 +@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_de u64 _bytes, _packets; do { @@ -2177,11 +8010,10 @@ index bbea5458000bf..c9bb92187719c 100644 stats->tx_packets = _packets; stats->tx_bytes = _bytes; -diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -index b344632beaddf..988927f8c5d7d 100644 ---- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c -+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-04 10:40:26.692034106 -0500 +@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_d } do { @@ -2190,7 +8022,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->rx_packets = hw_stats->rx_packets; storage->tx_packets = hw_stats->tx_packets; storage->rx_bytes = hw_stats->rx_bytes; -@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_device *dev, +@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_d storage->rx_crc_errors = hw_stats->rx_fcs_errors; storage->rx_errors = hw_stats->rx_checksum_errors; storage->tx_aborted_errors = hw_stats->tx_skip; @@ -2199,7 +8031,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->tx_errors = dev->stats.tx_errors; storage->rx_dropped = dev->stats.rx_dropped; -@@ -3664,13 +3664,13 @@ static void mtk_get_ethtool_stats(struct net_device *dev, +@@ -3668,13 +3668,13 @@ static void mtk_get_ethtool_stats(struct do { data_dst = data; @@ -2215,11 +8047,4339 @@ index b344632beaddf..988927f8c5d7d 100644 } static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, -diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -index 30c7b0e157218..fa2753318cdf7 100644 ---- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 2022-12-04 10:40:18.136056029 -0500 +@@ -0,0 +1,4325 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * ++ * Copyright (C) 2009-2016 John Crispin ++ * Copyright (C) 2009-2016 Felix Fietkau ++ * Copyright (C) 2013-2016 Michael Lee ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "mtk_eth_soc.h" ++#include "mtk_wed.h" ++ ++static int mtk_msg_level = -1; ++module_param_named(msg_level, mtk_msg_level, int, 0); ++MODULE_PARM_DESC(msg_level, "Message level (-1=defaults,0=none,...,16=all)"); ++ ++#define 
MTK_ETHTOOL_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, x) / sizeof(u64) } ++ ++#define MTK_ETHTOOL_XDP_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, xdp_stats.x) / \ ++ sizeof(u64) } ++ ++static const struct mtk_reg_map mtk_reg_map = { ++ .tx_irq_mask = 0x1a1c, ++ .tx_irq_status = 0x1a18, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x1800, ++ .rx_ptr = 0x1900, ++ .rx_cnt_cfg = 0x1904, ++ .qcrx_ptr = 0x1908, ++ .glo_cfg = 0x1a04, ++ .rst_idx = 0x1a08, ++ .delay_irq = 0x1a0c, ++ .fc_th = 0x1a10, ++ .int_grp = 0x1a20, ++ .hred = 0x1a44, ++ .ctx_ptr = 0x1b00, ++ .dtx_ptr = 0x1b04, ++ .crx_ptr = 0x1b10, ++ .drx_ptr = 0x1b14, ++ .fq_head = 0x1b20, ++ .fq_tail = 0x1b24, ++ .fq_count = 0x1b28, ++ .fq_blen = 0x1b2c, ++ }, ++ .gdm1_cnt = 0x2400, ++}; ++ ++static const struct mtk_reg_map mt7628_reg_map = { ++ .tx_irq_mask = 0x0a28, ++ .tx_irq_status = 0x0a20, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++}; ++ ++static const struct mtk_reg_map mt7986_reg_map = { ++ .tx_irq_mask = 0x461c, ++ .tx_irq_status = 0x4618, ++ .pdma = { ++ .rx_ptr = 0x6100, ++ .rx_cnt_cfg = 0x6104, ++ .pcrx_ptr = 0x6108, ++ .glo_cfg = 0x6204, ++ .rst_idx = 0x6208, ++ .delay_irq = 0x620c, ++ .irq_status = 0x6220, ++ .irq_mask = 0x6228, ++ .int_grp = 0x6250, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x4400, ++ .rx_ptr = 0x4500, ++ .rx_cnt_cfg = 0x4504, ++ .qcrx_ptr = 0x4508, ++ .glo_cfg = 0x4604, ++ .rst_idx = 0x4608, ++ .delay_irq = 0x460c, ++ .fc_th = 0x4610, ++ .int_grp = 0x4620, ++ .hred = 0x4644, ++ .ctx_ptr = 0x4700, ++ .dtx_ptr = 0x4704, ++ .crx_ptr = 0x4710, ++ .drx_ptr = 0x4714, ++ .fq_head = 0x4720, ++ .fq_tail = 0x4724, ++ .fq_count = 0x4728, ++ .fq_blen = 0x472c, ++ }, ++ .gdm1_cnt = 0x1c00, ++}; ++ ++/* strings used by ethtool */ ++static const struct mtk_ethtool_stats { ++ char str[ETH_GSTRING_LEN]; ++ u32 offset; ++} mtk_ethtool_stats[] = { ++ MTK_ETHTOOL_STAT(tx_bytes), ++ MTK_ETHTOOL_STAT(tx_packets), ++ MTK_ETHTOOL_STAT(tx_skip), ++ MTK_ETHTOOL_STAT(tx_collisions), ++ MTK_ETHTOOL_STAT(rx_bytes), ++ MTK_ETHTOOL_STAT(rx_packets), ++ MTK_ETHTOOL_STAT(rx_overflow), ++ MTK_ETHTOOL_STAT(rx_fcs_errors), ++ MTK_ETHTOOL_STAT(rx_short_errors), ++ MTK_ETHTOOL_STAT(rx_long_errors), ++ MTK_ETHTOOL_STAT(rx_checksum_errors), ++ MTK_ETHTOOL_STAT(rx_flow_control_packets), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_redirect), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_pass), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_drop), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx_errors), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit_errors), ++}; ++ ++static const char * const mtk_clks_source_name[] = { ++ "ethif", "sgmiitop", "esw", "gp0", "gp1", "gp2", "fe", "trgpll", ++ "sgmii_tx250m", "sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", ++ "sgmii2_tx250m", "sgmii2_rx250m", "sgmii2_cdr_ref", "sgmii2_cdr_fb", ++ "sgmii_ck", "eth2pll", "wocpu0", "wocpu1", "netsys0", "netsys1" ++}; ++ ++void mtk_w32(struct mtk_eth *eth, u32 val, unsigned reg) ++{ ++ __raw_writel(val, eth->base + reg); ++} ++ ++u32 mtk_r32(struct mtk_eth *eth, unsigned reg) ++{ ++ return __raw_readl(eth->base + reg); ++} ++ ++static u32 mtk_m32(struct mtk_eth *eth, u32 mask, 
u32 set, unsigned reg) ++{ ++ u32 val; ++ ++ val = mtk_r32(eth, reg); ++ val &= ~mask; ++ val |= set; ++ mtk_w32(eth, val, reg); ++ return reg; ++} ++ ++static int mtk_mdio_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned long t_start = jiffies; ++ ++ while (1) { ++ if (!(mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_ACCESS)) ++ return 0; ++ if (time_after(jiffies, t_start + PHY_IAC_TIMEOUT)) ++ break; ++ cond_resched(); ++ } ++ ++ dev_err(eth->dev, "mdio: MDIO timeout\n"); ++ return -ETIMEDOUT; ++} ++ ++static int _mtk_mdio_write(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg, ++ u32 write_data) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static int _mtk_mdio_read(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_READ | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_C22_READ | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_DATA_MASK; ++} ++ ++static int mtk_mdio_write(struct mii_bus *bus, int phy_addr, ++ int phy_reg, u16 val) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_write(eth, phy_addr, phy_reg, val); ++} ++ ++static int mtk_mdio_read(struct mii_bus *bus, int phy_addr, int phy_reg) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_read(eth, phy_addr, phy_reg); ++} ++ ++static int mt7621_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface) ++{ ++ u32 val; ++ ++ /* Check DDR memory type. ++ * Currently TRGMII mode with DDR2 memory is not supported. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG, &val); ++ if (interface == PHY_INTERFACE_MODE_TRGMII && ++ val & SYSCFG_DRAM_TYPE_DDR2) { ++ dev_err(eth->dev, ++ "TRGMII mode with DDR2 memory is not supported!\n"); ++ return -EOPNOTSUPP; ++ } ++ ++ val = (interface == PHY_INTERFACE_MODE_TRGMII) ? 
++ ETHSYS_TRGMII_MT7621_DDR_PLL : 0; ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_MT7621_MASK, val); ++ ++ return 0; ++} ++ ++static void mtk_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface, int speed) ++{ ++ u32 val; ++ int ret; ++ ++ if (interface == PHY_INTERFACE_MODE_TRGMII) { ++ mtk_w32(eth, TRGMII_MODE, INTF_MODE); ++ val = 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ return; ++ } ++ ++ val = (speed == SPEED_1000) ? ++ INTF_MODE_RGMII_1000 : INTF_MODE_RGMII_10_100; ++ mtk_w32(eth, val, INTF_MODE); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_CLK_SEL362_5, ++ ETHSYS_TRGMII_CLK_SEL362_5); ++ ++ val = (speed == SPEED_1000) ? 250000000 : 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ ++ val = (speed == SPEED_1000) ? ++ RCK_CTRL_RGMII_1000 : RCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_RCK_CTRL); ++ ++ val = (speed == SPEED_1000) ? ++ TCK_CTRL_RGMII_1000 : TCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_TCK_CTRL); ++} ++ ++static struct phylink_pcs *mtk_mac_select_pcs(struct phylink_config *config, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ unsigned int sid; ++ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) { ++ sid = (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_SGMII)) ? ++ 0 : mac->id; ++ ++ return mtk_sgmii_select_pcs(eth->sgmii, sid); ++ } ++ ++ return NULL; ++} ++ ++static void mtk_mac_config(struct phylink_config *config, unsigned int mode, ++ const struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ int val, ge_mode, err = 0; ++ u32 i; ++ ++ /* MT76x8 has no hardware settings between for the MAC */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ mac->interface != state->interface) { ++ /* Setup soc pin functions */ ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_TRGMII: ++ if (mac->id) ++ goto err_phy; ++ if (!MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_GMAC1_TRGMII)) ++ goto err_phy; ++ fallthrough; ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_REVMII: ++ case PHY_INTERFACE_MODE_RMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RGMII)) { ++ err = mtk_gmac_rgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_1000BASEX: ++ case PHY_INTERFACE_MODE_2500BASEX: ++ case PHY_INTERFACE_MODE_SGMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ err = mtk_gmac_sgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_GMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_GEPHY)) { ++ err = mtk_gmac_gephy_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ default: ++ goto err_phy; ++ } ++ ++ /* Setup clock for 1st gmac */ ++ if (!mac->id && state->interface != PHY_INTERFACE_MODE_SGMII && ++ !phy_interface_mode_is_8023z(state->interface) && ++ MTK_HAS_CAPS(mac->hw->soc->caps, MTK_GMAC1_TRGMII)) { ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_TRGMII_MT7621_CLK)) { ++ if 
(mt7621_gmac0_rgmii_adjust(mac->hw, ++ state->interface)) ++ goto err_phy; ++ } else { ++ /* FIXME: this is incorrect. Not only does it ++ * use state->speed (which is not guaranteed ++ * to be correct) but it also makes use of it ++ * in a code path that will only be reachable ++ * when the PHY interface mode changes, not ++ * when the speed changes. Consequently, RGMII ++ * is probably broken. ++ */ ++ mtk_gmac0_rgmii_adjust(mac->hw, ++ state->interface, ++ state->speed); ++ ++ /* mt7623_pad_clk_setup */ ++ for (i = 0 ; i < NUM_TRGMII_CTRL; i++) ++ mtk_w32(mac->hw, ++ TD_DM_DRVP(8) | TD_DM_DRVN(8), ++ TRGMII_TD_ODT(i)); ++ ++ /* Assert/release MT7623 RXC reset */ ++ mtk_m32(mac->hw, 0, RXC_RST | RXC_DQSISEL, ++ TRGMII_RCK_CTRL); ++ mtk_m32(mac->hw, RXC_RST, 0, TRGMII_RCK_CTRL); ++ } ++ } ++ ++ ge_mode = 0; ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_GMII: ++ ge_mode = 1; ++ break; ++ case PHY_INTERFACE_MODE_REVMII: ++ ge_mode = 2; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ if (mac->id) ++ goto err_phy; ++ ge_mode = 3; ++ break; ++ default: ++ break; ++ } ++ ++ /* put the gmac into the right mode */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ val &= ~SYSCFG0_GE_MODE(SYSCFG0_GE_MASK, mac->id); ++ val |= SYSCFG0_GE_MODE(ge_mode, mac->id); ++ regmap_write(eth->ethsys, ETHSYS_SYSCFG0, val); ++ ++ mac->interface = state->interface; ++ } ++ ++ /* SGMII */ ++ if (state->interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(state->interface)) { ++ /* The path GMAC to SGMII will be enabled once the SGMIISYS is ++ * being setup done. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, ++ ~(u32)SYSCFG0_SGMII_MASK); ++ ++ /* Save the syscfg0 value for mac_finish */ ++ mac->syscfg0 = val; ++ } else if (phylink_autoneg_inband(mode)) { ++ dev_err(eth->dev, ++ "In-band mode not supported in non SGMII mode!\n"); ++ return; ++ } ++ ++ return; ++ ++err_phy: ++ dev_err(eth->dev, "%s: GMAC%d mode %s not supported!\n", __func__, ++ mac->id, phy_modes(state->interface)); ++ return; ++ ++init_err: ++ dev_err(eth->dev, "%s: GMAC%d mode %s err: %d!\n", __func__, ++ mac->id, phy_modes(state->interface), err); ++} ++ ++static int mtk_mac_finish(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ u32 mcr_cur, mcr_new; ++ ++ /* Enable SGMII */ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, mac->syscfg0); ++ ++ /* Setup gmac */ ++ mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur; ++ mcr_new |= MAC_MCR_IPG_CFG | MAC_MCR_FORCE_MODE | ++ MAC_MCR_BACKOFF_EN | MAC_MCR_BACKPR_EN | MAC_MCR_FORCE_LINK; ++ ++ /* Only update control register when needed! 
*/ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ ++ return 0; ++} ++ ++static void mtk_mac_pcs_get_state(struct phylink_config *config, ++ struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 pmsr = mtk_r32(mac->hw, MTK_MAC_MSR(mac->id)); ++ ++ state->link = (pmsr & MAC_MSR_LINK); ++ state->duplex = (pmsr & MAC_MSR_DPX) >> 1; ++ ++ switch (pmsr & (MAC_MSR_SPEED_1000 | MAC_MSR_SPEED_100)) { ++ case 0: ++ state->speed = SPEED_10; ++ break; ++ case MAC_MSR_SPEED_100: ++ state->speed = SPEED_100; ++ break; ++ case MAC_MSR_SPEED_1000: ++ state->speed = SPEED_1000; ++ break; ++ default: ++ state->speed = SPEED_UNKNOWN; ++ break; ++ } ++ ++ state->pause &= (MLO_PAUSE_RX | MLO_PAUSE_TX); ++ if (pmsr & MAC_MSR_RX_FC) ++ state->pause |= MLO_PAUSE_RX; ++ if (pmsr & MAC_MSR_TX_FC) ++ state->pause |= MLO_PAUSE_TX; ++} ++ ++static void mtk_mac_link_down(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ ++ mcr &= ~(MAC_MCR_TX_EN | MAC_MCR_RX_EN); ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static void mtk_mac_link_up(struct phylink_config *config, ++ struct phy_device *phy, ++ unsigned int mode, phy_interface_t interface, ++ int speed, int duplex, bool tx_pause, bool rx_pause) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr; ++ ++ mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr &= ~(MAC_MCR_SPEED_100 | MAC_MCR_SPEED_1000 | ++ MAC_MCR_FORCE_DPX | MAC_MCR_FORCE_TX_FC | ++ MAC_MCR_FORCE_RX_FC); ++ ++ /* Configure speed */ ++ switch (speed) { ++ case SPEED_2500: ++ case SPEED_1000: ++ mcr |= MAC_MCR_SPEED_1000; ++ break; ++ case SPEED_100: ++ mcr |= MAC_MCR_SPEED_100; ++ break; ++ } ++ ++ /* Configure duplex */ ++ if (duplex == DUPLEX_FULL) ++ mcr |= MAC_MCR_FORCE_DPX; ++ ++ /* Configure pause modes - phylink will avoid these for half duplex */ ++ if (tx_pause) ++ mcr |= MAC_MCR_FORCE_TX_FC; ++ if (rx_pause) ++ mcr |= MAC_MCR_FORCE_RX_FC; ++ ++ mcr |= MAC_MCR_TX_EN | MAC_MCR_RX_EN; ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static const struct phylink_mac_ops mtk_phylink_ops = { ++ .validate = phylink_generic_validate, ++ .mac_select_pcs = mtk_mac_select_pcs, ++ .mac_pcs_get_state = mtk_mac_pcs_get_state, ++ .mac_config = mtk_mac_config, ++ .mac_finish = mtk_mac_finish, ++ .mac_link_down = mtk_mac_link_down, ++ .mac_link_up = mtk_mac_link_up, ++}; ++ ++static int mtk_mdio_init(struct mtk_eth *eth) ++{ ++ struct device_node *mii_np; ++ int ret; ++ ++ mii_np = of_get_child_by_name(eth->dev->of_node, "mdio-bus"); ++ if (!mii_np) { ++ dev_err(eth->dev, "no %s child node found", "mdio-bus"); ++ return -ENODEV; ++ } ++ ++ if (!of_device_is_available(mii_np)) { ++ ret = -ENODEV; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus = devm_mdiobus_alloc(eth->dev); ++ if (!eth->mii_bus) { ++ ret = -ENOMEM; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus->name = "mdio"; ++ eth->mii_bus->read = mtk_mdio_read; ++ eth->mii_bus->write = mtk_mdio_write; ++ eth->mii_bus->probe_capabilities = MDIOBUS_C22_C45; ++ eth->mii_bus->priv = eth; ++ eth->mii_bus->parent = eth->dev; ++ ++ snprintf(eth->mii_bus->id, MII_BUS_ID_SIZE, "%pOFn", mii_np); ++ ret = of_mdiobus_register(eth->mii_bus, mii_np); ++ ++err_put_node: ++ of_node_put(mii_np); ++ return ret; ++} ++ ++static void 
mtk_mdio_cleanup(struct mtk_eth *eth) ++{ ++ if (!eth->mii_bus) ++ return; ++ ++ mdiobus_unregister(eth->mii_bus); ++} ++ ++static inline void mtk_tx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_tx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static int mtk_set_mac_address(struct net_device *dev, void *p) ++{ ++ int ret = eth_mac_addr(dev, p); ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ const char *macaddr = dev->dev_addr; ++ ++ if (ret) ++ return ret; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ spin_lock_bh(&mac->hw->page_lock); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MT7628_SDM_MAC_ADRH); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MT7628_SDM_MAC_ADRL); ++ } else { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MTK_GDMA_MAC_ADRH(mac->id)); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MTK_GDMA_MAC_ADRL(mac->id)); ++ } ++ spin_unlock_bh(&mac->hw->page_lock); ++ ++ return 0; ++} ++ ++void mtk_stats_update_mac(struct mtk_mac *mac) ++{ ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ hw_stats->tx_packets += mtk_r32(mac->hw, MT7628_SDM_TPCNT); ++ hw_stats->tx_bytes += mtk_r32(mac->hw, MT7628_SDM_TBCNT); ++ hw_stats->rx_packets += mtk_r32(mac->hw, MT7628_SDM_RPCNT); ++ hw_stats->rx_bytes += mtk_r32(mac->hw, MT7628_SDM_RBCNT); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, MT7628_SDM_CS_ERR); ++ } else { ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ unsigned int offs = hw_stats->reg_offset; ++ u64 stats; ++ ++ hw_stats->rx_bytes += mtk_r32(mac->hw, reg_map->gdm1_cnt + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x4 + offs); ++ if (stats) ++ hw_stats->rx_bytes += (stats << 32); ++ hw_stats->rx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x8 + offs); ++ hw_stats->rx_overflow += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x10 + offs); ++ hw_stats->rx_fcs_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x14 + offs); ++ hw_stats->rx_short_errors += ++ 
mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x18 + offs); ++ hw_stats->rx_long_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x1c + offs); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x20 + offs); ++ hw_stats->rx_flow_control_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x24 + offs); ++ hw_stats->tx_skip += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x28 + offs); ++ hw_stats->tx_collisions += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x2c + offs); ++ hw_stats->tx_bytes += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x30 + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x34 + offs); ++ if (stats) ++ hw_stats->tx_bytes += (stats << 32); ++ hw_stats->tx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x38 + offs); ++ } ++ ++ u64_stats_update_end(&hw_stats->syncp); ++} ++ ++static void mtk_stats_update(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->mac[i] || !eth->mac[i]->hw_stats) ++ continue; ++ if (spin_trylock(ð->mac[i]->hw_stats->stats_lock)) { ++ mtk_stats_update_mac(eth->mac[i]); ++ spin_unlock(ð->mac[i]->hw_stats->stats_lock); ++ } ++ } ++} ++ ++static void mtk_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ unsigned int start; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hw_stats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hw_stats->stats_lock); ++ } ++ } ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&hw_stats->syncp); ++ storage->rx_packets = hw_stats->rx_packets; ++ storage->tx_packets = hw_stats->tx_packets; ++ storage->rx_bytes = hw_stats->rx_bytes; ++ storage->tx_bytes = hw_stats->tx_bytes; ++ storage->collisions = hw_stats->tx_collisions; ++ storage->rx_length_errors = hw_stats->rx_short_errors + ++ hw_stats->rx_long_errors; ++ storage->rx_over_errors = hw_stats->rx_overflow; ++ storage->rx_crc_errors = hw_stats->rx_fcs_errors; ++ storage->rx_errors = hw_stats->rx_checksum_errors; ++ storage->tx_aborted_errors = hw_stats->tx_skip; ++ } while (u64_stats_fetch_retry_irq(&hw_stats->syncp, start)); ++ ++ storage->tx_errors = dev->stats.tx_errors; ++ storage->rx_dropped = dev->stats.rx_dropped; ++ storage->tx_dropped = dev->stats.tx_dropped; ++} ++ ++static inline int mtk_max_frag_size(int mtu) ++{ ++ /* make sure buf_size will be at least MTK_MAX_RX_LENGTH */ ++ if (mtu + MTK_RX_ETH_HLEN < MTK_MAX_RX_LENGTH_2K) ++ mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return SKB_DATA_ALIGN(MTK_RX_HLEN + mtu) + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++} ++ ++static inline int mtk_max_buf_size(int frag_size) ++{ ++ int buf_size = frag_size - NET_SKB_PAD - NET_IP_ALIGN - ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ WARN_ON(buf_size < MTK_MAX_RX_LENGTH_2K); ++ ++ return buf_size; ++} ++ ++static bool mtk_rx_get_desc(struct mtk_eth *eth, struct mtk_rx_dma_v2 *rxd, ++ struct mtk_rx_dma_v2 *dma_rxd) ++{ ++ rxd->rxd2 = READ_ONCE(dma_rxd->rxd2); ++ if (!(rxd->rxd2 & RX_DMA_DONE)) ++ return false; ++ ++ rxd->rxd1 = READ_ONCE(dma_rxd->rxd1); ++ rxd->rxd3 = READ_ONCE(dma_rxd->rxd3); ++ rxd->rxd4 = READ_ONCE(dma_rxd->rxd4); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = READ_ONCE(dma_rxd->rxd5); ++ rxd->rxd6 = READ_ONCE(dma_rxd->rxd6); ++ } ++ ++ return true; ++} ++ ++static void *mtk_max_lro_buf_alloc(gfp_t gfp_mask) ++{ ++ unsigned int size = mtk_max_frag_size(MTK_MAX_LRO_RX_LENGTH); ++ 
unsigned long data; ++ ++ data = __get_free_pages(gfp_mask | __GFP_COMP | __GFP_NOWARN, ++ get_order(size)); ++ ++ return (void *)data; ++} ++ ++/* the qdma core needs scratch memory to be setup */ ++static int mtk_init_fq_dma(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ dma_addr_t phy_ring_tail; ++ int cnt = MTK_DMA_SIZE; ++ dma_addr_t dma_addr; ++ int i; ++ ++ eth->scratch_ring = dma_alloc_coherent(eth->dma_dev, ++ cnt * soc->txrx.txd_size, ++ ð->phy_scratch_ring, ++ GFP_KERNEL); ++ if (unlikely(!eth->scratch_ring)) ++ return -ENOMEM; ++ ++ eth->scratch_head = kcalloc(cnt, MTK_QDMA_PAGE_SIZE, GFP_KERNEL); ++ if (unlikely(!eth->scratch_head)) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE, ++ DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) ++ return -ENOMEM; ++ ++ phy_ring_tail = eth->phy_scratch_ring + soc->txrx.txd_size * (cnt - 1); ++ ++ for (i = 0; i < cnt; i++) { ++ struct mtk_tx_dma_v2 *txd; ++ ++ txd = eth->scratch_ring + i * soc->txrx.txd_size; ++ txd->txd1 = dma_addr + i * MTK_QDMA_PAGE_SIZE; ++ if (i < cnt - 1) ++ txd->txd2 = eth->phy_scratch_ring + ++ (i + 1) * soc->txrx.txd_size; ++ ++ txd->txd3 = TX_DMA_PLEN0(MTK_QDMA_PAGE_SIZE); ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ mtk_w32(eth, eth->phy_scratch_ring, soc->reg_map->qdma.fq_head); ++ mtk_w32(eth, phy_ring_tail, soc->reg_map->qdma.fq_tail); ++ mtk_w32(eth, (cnt << 16) | cnt, soc->reg_map->qdma.fq_count); ++ mtk_w32(eth, MTK_QDMA_PAGE_SIZE << 16, soc->reg_map->qdma.fq_blen); ++ ++ return 0; ++} ++ ++static void *mtk_qdma_phys_to_virt(struct mtk_tx_ring *ring, u32 desc) ++{ ++ return ring->dma + (desc - ring->phys); ++} ++ ++static struct mtk_tx_buf *mtk_desc_to_tx_buf(struct mtk_tx_ring *ring, ++ void *txd, u32 txd_size) ++{ ++ int idx = (txd - ring->dma) / txd_size; ++ ++ return &ring->buf[idx]; ++} ++ ++static struct mtk_tx_dma *qdma_to_pdma(struct mtk_tx_ring *ring, ++ struct mtk_tx_dma *dma) ++{ ++ return ring->dma_pdma - (struct mtk_tx_dma *)ring->dma + dma; ++} ++ ++static int txd_to_idx(struct mtk_tx_ring *ring, void *dma, u32 txd_size) ++{ ++ return (dma - ring->dma) / txd_size; ++} ++ ++static void mtk_tx_unmap(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct xdp_frame_bulk *bq, bool napi) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) { ++ dma_unmap_single(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ } else { ++ if (dma_unmap_len(tx_buf, dma_len0)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ ++ if (dma_unmap_len(tx_buf, dma_len1)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr1), ++ dma_unmap_len(tx_buf, dma_len1), ++ DMA_TO_DEVICE); ++ } ++ } ++ ++ if (tx_buf->data && tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ if (napi) ++ napi_consume_skb(skb, napi); ++ else ++ dev_kfree_skb_any(skb); ++ } else { ++ struct xdp_frame *xdpf = tx_buf->data; ++ ++ if (napi && tx_buf->type == 
MTK_TYPE_XDP_TX) ++ xdp_return_frame_rx_napi(xdpf); ++ else if (bq) ++ xdp_return_frame_bulk(xdpf, bq); ++ else ++ xdp_return_frame(xdpf); ++ } ++ } ++ tx_buf->flags = 0; ++ tx_buf->data = NULL; ++} ++ ++static void setup_tx_buf(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct mtk_tx_dma *txd, dma_addr_t mapped_addr, ++ size_t size, int idx) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } else { ++ if (idx & 1) { ++ txd->txd3 = mapped_addr; ++ txd->txd2 |= TX_DMA_PLEN1(size); ++ dma_unmap_addr_set(tx_buf, dma_addr1, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len1, size); ++ } else { ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ txd->txd1 = mapped_addr; ++ txd->txd2 = TX_DMA_PLEN0(size); ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } ++ } ++} ++ ++static void mtk_tx_set_dma_desc_v1(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_dma *desc = txd; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_SWC | TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT; /* forward port */ ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM; ++ /* vlan header offload */ ++ if (info->vlan) ++ data |= TX_DMA_INS_VLAN | info->vlan_tci; ++ } ++ WRITE_ONCE(desc->txd4, data); ++} ++ ++static void mtk_tx_set_dma_desc_v2(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma_v2 *desc = txd; ++ struct mtk_eth *eth = mac->hw; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ if (!info->qid && mac->id) ++ info->qid = MTK_QDMA_GMAC2_QID; ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT_V2; /* forward port */ ++ data |= TX_DMA_SWC_V2 | QID_BITS_V2(info->qid); ++ WRITE_ONCE(desc->txd4, data); ++ ++ data = 0; ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO_V2; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM_V2; ++ } ++ WRITE_ONCE(desc->txd5, data); ++ ++ data = 0; ++ if (info->first && info->vlan) ++ data |= TX_DMA_INS_VLAN_V2 | info->vlan_tci; ++ WRITE_ONCE(desc->txd6, data); ++ ++ WRITE_ONCE(desc->txd7, 0); ++ WRITE_ONCE(desc->txd8, 0); ++} ++ ++static void mtk_tx_set_dma_desc(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mtk_tx_set_dma_desc_v2(dev, txd, info); ++ else ++ mtk_tx_set_dma_desc_v1(dev, txd, info); ++} ++ ++static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev, ++ int tx_num, struct mtk_tx_ring *ring, bool gso) ++{ ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = skb_headlen(skb), ++ .gso = gso, ++ .csum = skb->ip_summed == CHECKSUM_PARTIAL, ++ .vlan = skb_vlan_tag_present(skb), ++ .qid = skb->mark & MTK_QDMA_TX_MASK, ++ .vlan_tci = skb_vlan_tag_get(skb), ++ .first = true, ++ .last = !skb_is_nonlinear(skb), ++ }; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth 
= mac->hw; ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_dma *itxd, *txd; ++ struct mtk_tx_dma *itxd_pdma, *txd_pdma; ++ struct mtk_tx_buf *itx_buf, *tx_buf; ++ int i, n_desc = 1; ++ int k = 0; ++ ++ itxd = ring->next_free; ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ if (itxd == ring->last_free) ++ return -ENOMEM; ++ ++ itx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ memset(itx_buf, 0, sizeof(*itx_buf)); ++ ++ txd_info.addr = dma_map_single(eth->dma_dev, skb->data, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ return -ENOMEM; ++ ++ mtk_tx_set_dma_desc(dev, itxd, &txd_info); ++ ++ itx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ itx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ setup_tx_buf(eth, itx_buf, itxd_pdma, txd_info.addr, txd_info.size, ++ k++); ++ ++ /* TX SG offload */ ++ txd = itxd; ++ txd_pdma = qdma_to_pdma(ring, txd); ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ unsigned int offset = 0; ++ int frag_size = skb_frag_size(frag); ++ ++ while (frag_size) { ++ bool new_desc = true; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || ++ (i & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto err_dma; ++ ++ n_desc++; ++ } else { ++ new_desc = false; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = min_t(unsigned int, frag_size, ++ soc->txrx.dma_max_len); ++ txd_info.qid = skb->mark & MTK_QDMA_TX_MASK; ++ txd_info.last = i == skb_shinfo(skb)->nr_frags - 1 && ++ !(frag_size - txd_info.size); ++ txd_info.addr = skb_frag_dma_map(eth->dma_dev, frag, ++ offset, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ goto err_dma; ++ ++ mtk_tx_set_dma_desc(dev, txd, &txd_info); ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ if (new_desc) ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ tx_buf->flags |= MTK_TX_FLAGS_PAGE0; ++ tx_buf->flags |= (!mac->id) ? 
MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info.addr, ++ txd_info.size, k++); ++ ++ frag_size -= txd_info.size; ++ offset += txd_info.size; ++ } ++ } ++ ++ /* store skb to cleanup */ ++ itx_buf->type = MTK_TYPE_SKB; ++ itx_buf->data = skb; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (k & 0x1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ netdev_sent_queue(dev, skb->len); ++ skb_tx_timestamp(skb); ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) || ++ !netdev_xmit_more()) ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int next_idx; ++ ++ next_idx = NEXT_DESP_IDX(txd_to_idx(ring, txd, soc->txrx.txd_size), ++ ring->dma_size); ++ mtk_w32(eth, next_idx, MT7628_TX_CTX_IDX0); ++ } ++ ++ return 0; ++ ++err_dma: ++ do { ++ tx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ ++ /* unmap dma */ ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ itxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ itxd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ itxd = mtk_qdma_phys_to_virt(ring, itxd->txd2); ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ } while (itxd != txd); ++ ++ return -ENOMEM; ++} ++ ++static int mtk_cal_txd_req(struct mtk_eth *eth, struct sk_buff *skb) ++{ ++ int i, nfrags = 1; ++ skb_frag_t *frag; ++ ++ if (skb_is_gso(skb)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ frag = &skb_shinfo(skb)->frags[i]; ++ nfrags += DIV_ROUND_UP(skb_frag_size(frag), ++ eth->soc->txrx.dma_max_len); ++ } ++ } else { ++ nfrags += skb_shinfo(skb)->nr_frags; ++ } ++ ++ return nfrags; ++} ++ ++static int mtk_queue_stopped(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ if (netif_queue_stopped(eth->netdev[i])) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static void mtk_wake_queue(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ netif_wake_queue(eth->netdev[i]); ++ } ++} ++ ++static netdev_tx_t mtk_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct net_device_stats *stats = &dev->stats; ++ bool gso = false; ++ int tx_num; ++ ++ /* normally we can rely on the stack not calling this more than once, ++ * however we have 2 queues running on the same ring so we need to lock ++ * the ring access ++ */ ++ spin_lock(ð->page_lock); ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto drop; ++ ++ tx_num = mtk_cal_txd_req(eth, skb); ++ if (unlikely(atomic_read(&ring->free_count) <= tx_num)) { ++ netif_stop_queue(dev); ++ netif_err(eth, tx_queued, dev, ++ "Tx Ring full when queue awake!\n"); ++ spin_unlock(ð->page_lock); ++ return NETDEV_TX_BUSY; ++ } ++ ++ /* TSO: fill MSS info in tcp checksum field */ ++ if (skb_is_gso(skb)) { ++ if (skb_cow_head(skb, 0)) { ++ netif_warn(eth, tx_err, dev, ++ "GSO expand head fail.\n"); ++ goto drop; ++ } ++ ++ if (skb_shinfo(skb)->gso_type & ++ (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) { ++ gso = true; ++ tcp_hdr(skb)->check = htons(skb_shinfo(skb)->gso_size); ++ } ++ } ++ ++ if (mtk_tx_map(skb, 
dev, tx_num, ring, gso) < 0) ++ goto drop; ++ ++ if (unlikely(atomic_read(&ring->free_count) <= ring->thresh)) ++ netif_stop_queue(dev); ++ ++ spin_unlock(ð->page_lock); ++ ++ return NETDEV_TX_OK; ++ ++drop: ++ spin_unlock(ð->page_lock); ++ stats->tx_dropped++; ++ dev_kfree_skb_any(skb); ++ return NETDEV_TX_OK; ++} ++ ++static struct mtk_rx_ring *mtk_get_rx_ring(struct mtk_eth *eth) ++{ ++ int i; ++ struct mtk_rx_ring *ring; ++ int idx; ++ ++ if (!eth->hwlro) ++ return ð->rx_ring[0]; ++ ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ ring = ð->rx_ring[i]; ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ if (rxd->rxd2 & RX_DMA_DONE) { ++ ring->calc_idx_update = true; ++ return ring; ++ } ++ } ++ ++ return NULL; ++} ++ ++static void mtk_update_rx_cpu_idx(struct mtk_eth *eth) ++{ ++ struct mtk_rx_ring *ring; ++ int i; ++ ++ if (!eth->hwlro) { ++ ring = ð->rx_ring[0]; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } else { ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ ring = ð->rx_ring[i]; ++ if (ring->calc_idx_update) { ++ ring->calc_idx_update = false; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } ++ } ++ } ++} ++ ++static bool mtk_page_pool_enabled(struct mtk_eth *eth) ++{ ++ return MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2); ++} ++ ++static struct page_pool *mtk_create_page_pool(struct mtk_eth *eth, ++ struct xdp_rxq_info *xdp_q, ++ int id, int size) ++{ ++ struct page_pool_params pp_params = { ++ .order = 0, ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, ++ .pool_size = size, ++ .nid = NUMA_NO_NODE, ++ .dev = eth->dma_dev, ++ .offset = MTK_PP_HEADROOM, ++ .max_len = MTK_PP_MAX_BUF_SIZE, ++ }; ++ struct page_pool *pp; ++ int err; ++ ++ pp_params.dma_dir = rcu_access_pointer(eth->prog) ? 
DMA_BIDIRECTIONAL ++ : DMA_FROM_DEVICE; ++ pp = page_pool_create(&pp_params); ++ if (IS_ERR(pp)) ++ return pp; ++ ++ err = __xdp_rxq_info_reg(xdp_q, ð->dummy_dev, eth->rx_napi.napi_id, ++ id, PAGE_SIZE); ++ if (err < 0) ++ goto err_free_pp; ++ ++ err = xdp_rxq_info_reg_mem_model(xdp_q, MEM_TYPE_PAGE_POOL, pp); ++ if (err) ++ goto err_unregister_rxq; ++ ++ return pp; ++ ++err_unregister_rxq: ++ xdp_rxq_info_unreg(xdp_q); ++err_free_pp: ++ page_pool_destroy(pp); ++ ++ return ERR_PTR(err); ++} ++ ++static void *mtk_page_pool_get_buff(struct page_pool *pp, dma_addr_t *dma_addr, ++ gfp_t gfp_mask) ++{ ++ struct page *page; ++ ++ page = page_pool_alloc_pages(pp, gfp_mask | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ *dma_addr = page_pool_get_dma_addr(page) + MTK_PP_HEADROOM; ++ return page_address(page); ++} ++ ++static void mtk_rx_put_buff(struct mtk_rx_ring *ring, void *data, bool napi) ++{ ++ if (ring->page_pool) ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(data), napi); ++ else ++ skb_free_frag(data); ++} ++ ++static int mtk_xdp_frame_map(struct mtk_eth *eth, struct net_device *dev, ++ struct mtk_tx_dma_desc_info *txd_info, ++ struct mtk_tx_dma *txd, struct mtk_tx_buf *tx_buf, ++ void *data, u16 headroom, int index, bool dma_map) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma *txd_pdma; ++ ++ if (dma_map) { /* ndo_xdp_xmit */ ++ txd_info->addr = dma_map_single(eth->dma_dev, data, ++ txd_info->size, DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info->addr))) ++ return -ENOMEM; ++ ++ tx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ } else { ++ struct page *page = virt_to_head_page(data); ++ ++ txd_info->addr = page_pool_get_dma_addr(page) + ++ sizeof(struct xdp_frame) + headroom; ++ dma_sync_single_for_device(eth->dma_dev, txd_info->addr, ++ txd_info->size, DMA_BIDIRECTIONAL); ++ } ++ mtk_tx_set_dma_desc(dev, txd, txd_info); ++ ++ tx_buf->flags |= !mac->id ? MTK_TX_FLAGS_FPORT0 : MTK_TX_FLAGS_FPORT1; ++ tx_buf->type = dma_map ? MTK_TYPE_XDP_NDO : MTK_TYPE_XDP_TX; ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ ++ txd_pdma = qdma_to_pdma(ring, txd); ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info->addr, txd_info->size, ++ index); ++ ++ return 0; ++} ++ ++static int mtk_xdp_submit_frame(struct mtk_eth *eth, struct xdp_frame *xdpf, ++ struct net_device *dev, bool dma_map) ++{ ++ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = xdpf->len, ++ .first = true, ++ .last = !xdp_frame_has_frags(xdpf), ++ }; ++ int err, index = 0, n_desc = 1, nr_frags; ++ struct mtk_tx_dma *htxd, *txd, *txd_pdma; ++ struct mtk_tx_buf *htx_buf, *tx_buf; ++ void *data = xdpf->data; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ return -EBUSY; ++ ++ nr_frags = unlikely(xdp_frame_has_frags(xdpf)) ? 
sinfo->nr_frags : 0; ++ if (unlikely(atomic_read(&ring->free_count) <= 1 + nr_frags)) ++ return -EBUSY; ++ ++ spin_lock(ð->page_lock); ++ ++ txd = ring->next_free; ++ if (txd == ring->last_free) { ++ spin_unlock(ð->page_lock); ++ return -ENOMEM; ++ } ++ htxd = txd; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ htx_buf = tx_buf; ++ ++ for (;;) { ++ err = mtk_xdp_frame_map(eth, dev, &txd_info, txd, tx_buf, ++ data, xdpf->headroom, index, dma_map); ++ if (err < 0) ++ goto unmap; ++ ++ if (txd_info.last) ++ break; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || (index & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto unmap; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ n_desc++; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = skb_frag_size(&sinfo->frags[index]); ++ txd_info.last = index + 1 == nr_frags; ++ data = skb_frag_address(&sinfo->frags[index]); ++ ++ index++; ++ } ++ /* store xdpf for cleanup */ ++ htx_buf->data = xdpf; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (index & 1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int idx; ++ ++ idx = txd_to_idx(ring, txd, soc->txrx.txd_size); ++ mtk_w32(eth, NEXT_DESP_IDX(idx, ring->dma_size), ++ MT7628_TX_CTX_IDX0); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return 0; ++ ++unmap: ++ while (htxd != txd) { ++ txd_pdma = qdma_to_pdma(ring, htxd); ++ tx_buf = mtk_desc_to_tx_buf(ring, htxd, soc->txrx.txd_size); ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ htxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ txd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ htxd = mtk_qdma_phys_to_virt(ring, htxd->txd2); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return err; ++} ++ ++static int mtk_xdp_xmit(struct net_device *dev, int num_frame, ++ struct xdp_frame **frames, u32 flags) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ int i, nxmit = 0; ++ ++ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) ++ return -EINVAL; ++ ++ for (i = 0; i < num_frame; i++) { ++ if (mtk_xdp_submit_frame(eth, frames[i], dev, true)) ++ break; ++ nxmit++; ++ } ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ hw_stats->xdp_stats.tx_xdp_xmit += nxmit; ++ hw_stats->xdp_stats.tx_xdp_xmit_errors += num_frame - nxmit; ++ u64_stats_update_end(&hw_stats->syncp); ++ ++ return nxmit; ++} ++ ++static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring, ++ struct xdp_buff *xdp, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ u64 *count = &hw_stats->xdp_stats.rx_xdp_drop; ++ struct bpf_prog *prog; ++ u32 act = XDP_PASS; ++ ++ rcu_read_lock(); ++ ++ prog = rcu_dereference(eth->prog); ++ if (!prog) ++ goto out; ++ ++ act = bpf_prog_run_xdp(prog, xdp); ++ switch (act) { ++ case XDP_PASS: ++ count = &hw_stats->xdp_stats.rx_xdp_pass; ++ goto 
update_stats; ++ case XDP_REDIRECT: ++ if (unlikely(xdp_do_redirect(dev, xdp, prog))) { ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_redirect; ++ goto update_stats; ++ case XDP_TX: { ++ struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); ++ ++ if (!xdpf || mtk_xdp_submit_frame(eth, xdpf, dev, false)) { ++ count = &hw_stats->xdp_stats.rx_xdp_tx_errors; ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_tx; ++ goto update_stats; ++ } ++ default: ++ bpf_warn_invalid_xdp_action(dev, prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(dev, prog, act); ++ fallthrough; ++ case XDP_DROP: ++ break; ++ } ++ ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(xdp->data), true); ++ ++update_stats: ++ u64_stats_update_begin(&hw_stats->syncp); ++ *count = *count + 1; ++ u64_stats_update_end(&hw_stats->syncp); ++out: ++ rcu_read_unlock(); ++ ++ return act; ++} ++ ++static int mtk_poll_rx(struct napi_struct *napi, int budget, ++ struct mtk_eth *eth) ++{ ++ struct dim_sample dim_sample = {}; ++ struct mtk_rx_ring *ring; ++ bool xdp_flush = false; ++ int idx; ++ struct sk_buff *skb; ++ u8 *data, *new_data; ++ struct mtk_rx_dma_v2 *rxd, trxd; ++ int done = 0, bytes = 0; ++ ++ while (done < budget) { ++ unsigned int pktlen, *rxdcsum; ++ struct net_device *netdev; ++ dma_addr_t dma_addr; ++ u32 hash, reason; ++ int mac = 0; ++ ++ ring = mtk_get_rx_ring(eth); ++ if (unlikely(!ring)) ++ goto rx_done; ++ ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ data = ring->data[idx]; ++ ++ if (!mtk_rx_get_desc(eth, &trxd, rxd)) ++ break; ++ ++ /* find out which mac the packet come from. values start at 1 */ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mac = RX_DMA_GET_SPORT_V2(trxd.rxd5) - 1; ++ else if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ !(trxd.rxd4 & RX_DMA_SPECIAL_TAG)) ++ mac = RX_DMA_GET_SPORT(trxd.rxd4) - 1; ++ ++ if (unlikely(mac < 0 || mac >= MTK_MAC_COUNT || ++ !eth->netdev[mac])) ++ goto release_desc; ++ ++ netdev = eth->netdev[mac]; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto release_desc; ++ ++ pktlen = RX_DMA_GET_PLEN0(trxd.rxd2); ++ ++ /* alloc new buffer */ ++ if (ring->page_pool) { ++ struct page *page = virt_to_head_page(data); ++ struct xdp_buff xdp; ++ u32 ret; ++ ++ new_data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, ++ GFP_ATOMIC); ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_sync_single_for_cpu(eth->dma_dev, ++ page_pool_get_dma_addr(page) + MTK_PP_HEADROOM, ++ pktlen, page_pool_get_dma_dir(ring->page_pool)); ++ ++ xdp_init_buff(&xdp, PAGE_SIZE, &ring->xdp_q); ++ xdp_prepare_buff(&xdp, data, MTK_PP_HEADROOM, pktlen, ++ false); ++ xdp_buff_clear_frags_flag(&xdp); ++ ++ ret = mtk_xdp_run(eth, ring, &xdp, netdev); ++ if (ret == XDP_REDIRECT) ++ xdp_flush = true; ++ ++ if (ret != XDP_PASS) ++ goto skip_rx; ++ ++ skb = build_skb(data, PAGE_SIZE); ++ if (unlikely(!skb)) { ++ page_pool_put_full_page(ring->page_pool, ++ page, true); ++ netdev->stats.rx_dropped++; ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, xdp.data - xdp.data_hard_start); ++ skb_put(skb, xdp.data_end - xdp.data); ++ skb_mark_for_recycle(skb); ++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ new_data = napi_alloc_frag(ring->frag_size); ++ else ++ new_data = mtk_max_lro_buf_alloc(GFP_ATOMIC); ++ ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto 
release_desc; ++ } ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ new_data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(new_data); ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_unmap_single(eth->dma_dev, trxd.rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ ++ skb = build_skb(data, ring->frag_size); ++ if (unlikely(!skb)) { ++ netdev->stats.rx_dropped++; ++ skb_free_frag(data); ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb_put(skb, pktlen); ++ } ++ ++ skb->dev = netdev; ++ bytes += skb->len; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ hash = trxd.rxd5 & MTK_RXD5_FOE_ENTRY; ++ if (hash != MTK_RXD5_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd3; ++ } else { ++ hash = trxd.rxd4 & MTK_RXD4_FOE_ENTRY; ++ if (hash != MTK_RXD4_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd4; ++ } ++ ++ if (*rxdcsum & eth->soc->txrx.rx_dma_l4_valid) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ reason = FIELD_GET(MTK_RXD4_PPE_CPU_REASON, trxd.rxd4); ++ if (reason == MTK_PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) ++ mtk_ppe_check_skb(eth->ppe, skb, hash); ++ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ if (trxd.rxd3 & RX_DMA_VTAG_V2) ++ __vlan_hwaccel_put_tag(skb, ++ htons(RX_DMA_VPID(trxd.rxd4)), ++ RX_DMA_VID(trxd.rxd4)); ++ } else if (trxd.rxd2 & RX_DMA_VTAG) { ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ RX_DMA_VID(trxd.rxd3)); ++ } ++ ++ /* If the device is attached to a dsa switch, the special ++ * tag inserted in VLAN field by hw switch can * be offloaded ++ * by RX HW VLAN offload. Clear vlan info. 
++ */ ++ if (netdev_uses_dsa(netdev)) ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ skb_record_rx_queue(skb, 0); ++ napi_gro_receive(napi, skb); ++ ++skip_rx: ++ ring->data[idx] = new_data; ++ rxd->rxd1 = (unsigned int)dma_addr; ++release_desc: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ ring->calc_idx = idx; ++ done++; ++ } ++ ++rx_done: ++ if (done) { ++ /* make sure that all changes to the dma ring are flushed before ++ * we continue ++ */ ++ wmb(); ++ mtk_update_rx_cpu_idx(eth); ++ } ++ ++ eth->rx_packets += done; ++ eth->rx_bytes += bytes; ++ dim_update_sample(eth->rx_events, eth->rx_packets, eth->rx_bytes, ++ &dim_sample); ++ net_dim(ð->rx_dim, dim_sample); ++ ++ if (xdp_flush) ++ xdp_do_flush_map(); ++ ++ return done; ++} ++ ++static int mtk_poll_tx_qdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->last_free_ptr; ++ dma = mtk_r32(eth, reg_map->qdma.drx_ptr); ++ ++ desc = mtk_qdma_phys_to_virt(ring, cpu); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ u32 next_cpu = desc->txd2; ++ int mac = 0; ++ ++ desc = mtk_qdma_phys_to_virt(ring, desc->txd2); ++ if ((desc->txd3 & TX_DMA_OWNER_CPU) == 0) ++ break; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, desc, ++ eth->soc->txrx.txd_size); ++ if (tx_buf->flags & MTK_TX_FLAGS_FPORT1) ++ mac = 1; ++ ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[mac] += skb->len; ++ done[mac]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = next_cpu; ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->last_free_ptr = cpu; ++ mtk_w32(eth, cpu, reg_map->qdma.crx_ptr); ++ ++ return budget; ++} ++ ++static int mtk_poll_tx_pdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->cpu_idx; ++ dma = mtk_r32(eth, MT7628_TX_DTX_IDX0); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ tx_buf = &ring->buf[cpu]; ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[0] += skb->len; ++ done[0]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ desc = ring->dma + cpu * eth->soc->txrx.txd_size; ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = NEXT_DESP_IDX(cpu, ring->dma_size); ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->cpu_idx = cpu; ++ ++ return budget; ++} ++ ++static int mtk_poll_tx(struct mtk_eth *eth, int budget) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct dim_sample dim_sample = {}; ++ unsigned int done[MTK_MAX_DEVS]; ++ unsigned int bytes[MTK_MAX_DEVS]; ++ int total = 0, i; ++ ++ memset(done, 0, sizeof(done)); ++ memset(bytes, 0, sizeof(bytes)); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ budget = mtk_poll_tx_qdma(eth, budget, done, bytes); ++ else ++ budget = mtk_poll_tx_pdma(eth, 
budget, done, bytes); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i] || !done[i]) ++ continue; ++ netdev_completed_queue(eth->netdev[i], done[i], bytes[i]); ++ total += done[i]; ++ eth->tx_packets += done[i]; ++ eth->tx_bytes += bytes[i]; ++ } ++ ++ dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes, ++ &dim_sample); ++ net_dim(ð->tx_dim, dim_sample); ++ ++ if (mtk_queue_stopped(eth) && ++ (atomic_read(&ring->free_count) > ring->thresh)) ++ mtk_wake_queue(eth); ++ ++ return total; ++} ++ ++static void mtk_handle_status_irq(struct mtk_eth *eth) ++{ ++ u32 status2 = mtk_r32(eth, MTK_INT_STATUS2); ++ ++ if (unlikely(status2 & (MTK_GDM1_AF | MTK_GDM2_AF))) { ++ mtk_stats_update(eth); ++ mtk_w32(eth, (MTK_GDM1_AF | MTK_GDM2_AF), ++ MTK_INT_STATUS2); ++ } ++} ++ ++static int mtk_napi_tx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, tx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int tx_done = 0; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_handle_status_irq(eth); ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->tx_irq_status); ++ tx_done = mtk_poll_tx(eth, budget); ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done tx %d, intr 0x%08x/0x%x\n", tx_done, ++ mtk_r32(eth, reg_map->tx_irq_status), ++ mtk_r32(eth, reg_map->tx_irq_mask)); ++ } ++ ++ if (tx_done == budget) ++ return budget; ++ ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ return budget; ++ ++ if (napi_complete_done(napi, tx_done)) ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ ++ return tx_done; ++} ++ ++static int mtk_napi_rx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, rx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int rx_done_total = 0; ++ ++ mtk_handle_status_irq(eth); ++ ++ do { ++ int rx_done; ++ ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, ++ reg_map->pdma.irq_status); ++ rx_done = mtk_poll_rx(napi, budget - rx_done_total, eth); ++ rx_done_total += rx_done; ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done rx %d, intr 0x%08x/0x%x\n", rx_done, ++ mtk_r32(eth, reg_map->pdma.irq_status), ++ mtk_r32(eth, reg_map->pdma.irq_mask)); ++ } ++ ++ if (rx_done_total == budget) ++ return budget; ++ ++ } while (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask); ++ ++ if (napi_complete_done(napi, rx_done_total)) ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ ++ return rx_done_total; ++} ++ ++static int mtk_tx_alloc(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i, sz = soc->txrx.txd_size; ++ struct mtk_tx_dma_v2 *txd; ++ ++ ring->buf = kcalloc(MTK_DMA_SIZE, sizeof(*ring->buf), ++ GFP_KERNEL); ++ if (!ring->buf) ++ goto no_tx_mem; ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ int next = (i + 1) % MTK_DMA_SIZE; ++ u32 next_ptr = ring->phys + next * sz; ++ ++ txd = ring->dma + i * sz; ++ txd->txd2 = next_ptr; ++ txd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ /* On MT7688 (PDMA only) this driver uses the ring->dma structs ++ * only as the framework. 
The real HW descriptors are the PDMA ++ * descriptors in ring->dma_pdma. ++ */ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys_pdma, GFP_KERNEL); ++ if (!ring->dma_pdma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ ring->dma_pdma[i].txd2 = TX_DMA_DESP2_DEF; ++ ring->dma_pdma[i].txd4 = 0; ++ } ++ } ++ ++ ring->dma_size = MTK_DMA_SIZE; ++ atomic_set(&ring->free_count, MTK_DMA_SIZE - 2); ++ ring->next_free = ring->dma; ++ ring->last_free = (void *)txd; ++ ring->last_free_ptr = (u32)(ring->phys + ((MTK_DMA_SIZE - 1) * sz)); ++ ring->thresh = MAX_SKB_FRAGS; ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.ctx_ptr); ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.dtx_ptr); ++ mtk_w32(eth, ++ ring->phys + ((MTK_DMA_SIZE - 1) * sz), ++ soc->reg_map->qdma.crx_ptr); ++ mtk_w32(eth, ring->last_free_ptr, soc->reg_map->qdma.drx_ptr); ++ mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES, ++ soc->reg_map->qdma.qtx_cfg); ++ } else { ++ mtk_w32(eth, ring->phys_pdma, MT7628_TX_BASE_PTR0); ++ mtk_w32(eth, MTK_DMA_SIZE, MT7628_TX_MAX_CNT0); ++ mtk_w32(eth, 0, MT7628_TX_CTX_IDX0); ++ mtk_w32(eth, MT7628_PST_DTX_IDX0, soc->reg_map->pdma.rst_idx); ++ } ++ ++ return 0; ++ ++no_tx_mem: ++ return -ENOMEM; ++} ++ ++static void mtk_tx_clean(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i; ++ ++ if (ring->buf) { ++ for (i = 0; i < MTK_DMA_SIZE; i++) ++ mtk_tx_unmap(eth, &ring->buf[i], NULL, false); ++ kfree(ring->buf); ++ ring->buf = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->dma_pdma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma_pdma, ring->phys_pdma); ++ ring->dma_pdma = NULL; ++ } ++} ++ ++static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_rx_ring *ring; ++ int rx_data_len, rx_dma_size; ++ int i; ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ if (ring_no) ++ return -EINVAL; ++ ring = ð->rx_ring_qdma; ++ } else { ++ ring = ð->rx_ring[ring_no]; ++ } ++ ++ if (rx_flag == MTK_RX_FLAGS_HWLRO) { ++ rx_data_len = MTK_MAX_LRO_RX_LENGTH; ++ rx_dma_size = MTK_HW_LRO_DMA_SIZE; ++ } else { ++ rx_data_len = ETH_DATA_LEN; ++ rx_dma_size = MTK_DMA_SIZE; ++ } ++ ++ ring->frag_size = mtk_max_frag_size(rx_data_len); ++ ring->buf_size = mtk_max_buf_size(ring->frag_size); ++ ring->data = kcalloc(rx_dma_size, sizeof(*ring->data), ++ GFP_KERNEL); ++ if (!ring->data) ++ return -ENOMEM; ++ ++ if (mtk_page_pool_enabled(eth)) { ++ struct page_pool *pp; ++ ++ pp = mtk_create_page_pool(eth, &ring->xdp_q, ring_no, ++ rx_dma_size); ++ if (IS_ERR(pp)) ++ return PTR_ERR(pp); ++ ++ ring->page_pool = pp; ++ } ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, ++ rx_dma_size * eth->soc->txrx.rxd_size, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ return -ENOMEM; ++ ++ for (i = 0; i < rx_dma_size; i++) { ++ struct mtk_rx_dma_v2 *rxd; ++ dma_addr_t dma_addr; ++ void *data; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (ring->page_pool) { ++ data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, GFP_KERNEL); ++ if (!data) ++ return -ENOMEM; 
++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ data = netdev_alloc_frag(ring->frag_size); ++ else ++ data = mtk_max_lro_buf_alloc(GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(data); ++ return -ENOMEM; ++ } ++ } ++ rxd->rxd1 = (unsigned int)dma_addr; ++ ring->data[i] = data; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ rxd->rxd3 = 0; ++ rxd->rxd4 = 0; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = 0; ++ rxd->rxd6 = 0; ++ rxd->rxd7 = 0; ++ rxd->rxd8 = 0; ++ } ++ } ++ ++ ring->dma_size = rx_dma_size; ++ ring->calc_idx_update = false; ++ ring->calc_idx = rx_dma_size - 1; ++ if (rx_flag == MTK_RX_FLAGS_QDMA) ++ ring->crx_idx_reg = reg_map->qdma.qcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ else ++ ring->crx_idx_reg = reg_map->pdma.pcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ mtk_w32(eth, ring->phys, ++ reg_map->qdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->qdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->qdma.rst_idx); ++ } else { ++ mtk_w32(eth, ring->phys, ++ reg_map->pdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->pdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->pdma.rst_idx); ++ } ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ ++ return 0; ++} ++ ++static void mtk_rx_clean(struct mtk_eth *eth, struct mtk_rx_ring *ring) ++{ ++ int i; ++ ++ if (ring->data && ring->dma) { ++ for (i = 0; i < ring->dma_size; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ if (!ring->data[i]) ++ continue; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (!rxd->rxd1) ++ continue; ++ ++ dma_unmap_single(eth->dma_dev, rxd->rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ mtk_rx_put_buff(ring, ring->data[i], false); ++ } ++ kfree(ring->data); ++ ring->data = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ ring->dma_size * eth->soc->txrx.rxd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->page_pool) { ++ if (xdp_rxq_info_is_reg(&ring->xdp_q)) ++ xdp_rxq_info_unreg(&ring->xdp_q); ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int mtk_hwlro_rx_init(struct mtk_eth *eth) ++{ ++ int i; ++ u32 ring_ctrl_dw1 = 0, ring_ctrl_dw2 = 0, ring_ctrl_dw3 = 0; ++ u32 lro_ctrl_dw0 = 0, lro_ctrl_dw3 = 0; ++ ++ /* set LRO rings to auto-learn modes */ ++ ring_ctrl_dw2 |= MTK_RING_AUTO_LERAN_MODE; ++ ++ /* validate LRO ring */ ++ ring_ctrl_dw2 |= MTK_RING_VLD; ++ ++ /* set AGE timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_AGE_TIME_H; ++ ring_ctrl_dw1 |= MTK_RING_AGE_TIME_L; ++ ++ /* set max AGG timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_TIME; ++ ++ /* set max LRO AGG count */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_CNT_L; ++ ring_ctrl_dw3 |= MTK_RING_MAX_AGG_CNT_H; ++ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ mtk_w32(eth, ring_ctrl_dw1, MTK_LRO_CTRL_DW1_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw2, MTK_LRO_CTRL_DW2_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw3, MTK_LRO_CTRL_DW3_CFG(i)); ++ 
} ++ ++ /* IPv4 checksum update enable */ ++ lro_ctrl_dw0 |= MTK_L3_CKS_UPD_EN; ++ ++ /* switch priority comparison to packet count mode */ ++ lro_ctrl_dw0 |= MTK_LRO_ALT_PKT_CNT_MODE; ++ ++ /* bandwidth threshold setting */ ++ mtk_w32(eth, MTK_HW_LRO_BW_THRE, MTK_PDMA_LRO_CTRL_DW2); ++ ++ /* auto-learn score delta setting */ ++ mtk_w32(eth, MTK_HW_LRO_REPLACE_DELTA, MTK_PDMA_LRO_ALT_SCORE_DELTA); ++ ++ /* set refresh timer for altering flows to 1 sec. (unit: 20us) */ ++ mtk_w32(eth, (MTK_HW_LRO_TIMER_UNIT << 16) | MTK_HW_LRO_REFRESH_TIME, ++ MTK_PDMA_LRO_ALT_REFRESH_TIMER); ++ ++ /* set HW LRO mode & the max aggregation count for rx packets */ ++ lro_ctrl_dw3 |= MTK_ADMA_MODE | (MTK_HW_LRO_MAX_AGG_CNT & 0xff); ++ ++ /* the minimal remaining room of SDL0 in RXD for lro aggregation */ ++ lro_ctrl_dw3 |= MTK_LRO_MIN_RXD_SDL; ++ ++ /* enable HW LRO */ ++ lro_ctrl_dw0 |= MTK_LRO_EN; ++ ++ mtk_w32(eth, lro_ctrl_dw3, MTK_PDMA_LRO_CTRL_DW3); ++ mtk_w32(eth, lro_ctrl_dw0, MTK_PDMA_LRO_CTRL_DW0); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_rx_uninit(struct mtk_eth *eth) ++{ ++ int i; ++ u32 val; ++ ++ /* relinquish lro rings, flush aggregated packets */ ++ mtk_w32(eth, MTK_LRO_RING_RELINQUISH_REQ, MTK_PDMA_LRO_CTRL_DW0); ++ ++ /* wait for relinquishments done */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, MTK_PDMA_LRO_CTRL_DW0); ++ if (val & MTK_LRO_RING_RELINQUISH_DONE) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++ ++ /* invalidate lro rings */ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_w32(eth, 0, MTK_LRO_CTRL_DW2_CFG(i)); ++ ++ /* disable HW LRO */ ++ mtk_w32(eth, 0, MTK_PDMA_LRO_CTRL_DW0); ++} ++ ++static void mtk_hwlro_val_ipaddr(struct mtk_eth *eth, int idx, __be32 ip) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, ip, MTK_LRO_DIP_DW0_CFG(idx)); ++ ++ /* validate the IP setting */ ++ mtk_w32(eth, (reg_val | MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++} ++ ++static void mtk_hwlro_inval_ipaddr(struct mtk_eth *eth, int idx) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, 0, MTK_LRO_DIP_DW0_CFG(idx)); ++} ++ ++static int mtk_hwlro_get_ip_cnt(struct mtk_mac *mac) ++{ ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) ++ cnt++; ++ } ++ ++ return cnt; ++} ++ ++static int mtk_hwlro_add_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if ((fsp->flow_type != TCP_V4_FLOW) || ++ (!fsp->h_u.tcp_ip4_spec.ip4dst) || ++ (fsp->location > 1)) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = htonl(fsp->h_u.tcp_ip4_spec.ip4dst); ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_val_ipaddr(eth, hwlro_idx, mac->hwlro_ip[fsp->location]); ++ ++ return 0; ++} ++ ++static int mtk_hwlro_del_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if 
(fsp->location > 1) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_netdev_disable(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int i, hwlro_idx; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ mac->hwlro_ip[i] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + i; ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ } ++ ++ mac->hwlro_ip_cnt = 0; ++} ++ ++static int mtk_hwlro_get_fdir_entry(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ ++ if (fsp->location >= ARRAY_SIZE(mac->hwlro_ip)) ++ return -EINVAL; ++ ++ /* only tcp dst ipv4 is meaningful, others are meaningless */ ++ fsp->flow_type = TCP_V4_FLOW; ++ fsp->h_u.tcp_ip4_spec.ip4dst = ntohl(mac->hwlro_ip[fsp->location]); ++ fsp->m_u.tcp_ip4_spec.ip4dst = 0; ++ ++ fsp->h_u.tcp_ip4_spec.ip4src = 0; ++ fsp->m_u.tcp_ip4_spec.ip4src = 0xffffffff; ++ fsp->h_u.tcp_ip4_spec.psrc = 0; ++ fsp->m_u.tcp_ip4_spec.psrc = 0xffff; ++ fsp->h_u.tcp_ip4_spec.pdst = 0; ++ fsp->m_u.tcp_ip4_spec.pdst = 0xffff; ++ fsp->h_u.tcp_ip4_spec.tos = 0; ++ fsp->m_u.tcp_ip4_spec.tos = 0xff; ++ ++ return 0; ++} ++ ++static int mtk_hwlro_get_fdir_all(struct net_device *dev, ++ struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) { ++ rule_locs[cnt] = i; ++ cnt++; ++ } ++ } ++ ++ cmd->rule_cnt = cnt; ++ ++ return 0; ++} ++ ++static netdev_features_t mtk_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ if (!(features & NETIF_F_LRO)) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ int ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ if (ip_cnt) { ++ netdev_info(dev, "RX flow is programmed, LRO should keep on\n"); ++ ++ features |= NETIF_F_LRO; ++ } ++ } ++ ++ return features; ++} ++ ++static int mtk_set_features(struct net_device *dev, netdev_features_t features) ++{ ++ int err = 0; ++ ++ if (!((dev->features ^ features) & NETIF_F_LRO)) ++ return 0; ++ ++ if (!(features & NETIF_F_LRO)) ++ mtk_hwlro_netdev_disable(dev); ++ ++ return err; ++} ++ ++/* wait for DMA to finish whatever it is doing before we start using it again */ ++static int mtk_dma_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned int reg; ++ int ret; ++ u32 val; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ reg = eth->soc->reg_map->qdma.glo_cfg; ++ else ++ reg = eth->soc->reg_map->pdma.glo_cfg; ++ ++ ret = readx_poll_timeout_atomic(__raw_readl, eth->base + reg, val, ++ !(val & (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)), ++ 5, MTK_DMA_BUSY_TIMEOUT_US); ++ if (ret) ++ dev_err(eth->dev, "DMA init timeout\n"); ++ ++ return ret; ++} ++ ++static int mtk_dma_init(struct mtk_eth *eth) ++{ ++ int err; ++ u32 i; ++ ++ if (mtk_dma_busy_wait(eth)) ++ return -EBUSY; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* QDMA needs scratch memory for internal reordering of the ++ * descriptors ++ */ ++ err = mtk_init_fq_dma(eth); ++ if (err) ++ return err; ++ } ++ ++ err = mtk_tx_alloc(eth); ++ if (err) ++ return err; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_QDMA); ++ if (err) ++ return err; ++ } 
++ ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_NORMAL); ++ if (err) ++ return err; ++ ++ if (eth->hwlro) { ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ err = mtk_rx_alloc(eth, i, MTK_RX_FLAGS_HWLRO); ++ if (err) ++ return err; ++ } ++ err = mtk_hwlro_rx_init(eth); ++ if (err) ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* Enable random early drop and set drop threshold ++ * automatically ++ */ ++ mtk_w32(eth, FC_THRES_DROP_MODE | FC_THRES_DROP_EN | ++ FC_THRES_MIN, eth->soc->reg_map->qdma.fc_th); ++ mtk_w32(eth, 0x0, eth->soc->reg_map->qdma.hred); ++ } ++ ++ return 0; ++} ++ ++static void mtk_dma_free(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ if (eth->netdev[i]) ++ netdev_reset_queue(eth->netdev[i]); ++ if (eth->scratch_ring) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ eth->scratch_ring, eth->phy_scratch_ring); ++ eth->scratch_ring = NULL; ++ eth->phy_scratch_ring = 0; ++ } ++ mtk_tx_clean(eth); ++ mtk_rx_clean(eth, ð->rx_ring[0]); ++ mtk_rx_clean(eth, ð->rx_ring_qdma); ++ ++ if (eth->hwlro) { ++ mtk_hwlro_rx_uninit(eth); ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_rx_clean(eth, ð->rx_ring[i]); ++ } ++ ++ kfree(eth->scratch_head); ++} ++ ++static void mtk_tx_timeout(struct net_device *dev, unsigned int txqueue) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ eth->netdev[mac->id]->stats.tx_errors++; ++ netif_err(eth, tx_err, dev, ++ "transmit timed out\n"); ++ schedule_work(ð->pending_work); ++} ++ ++static irqreturn_t mtk_handle_irq_rx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->rx_events++; ++ if (likely(napi_schedule_prep(ð->rx_napi))) { ++ __napi_schedule(ð->rx_napi); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq_tx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->tx_events++; ++ if (likely(napi_schedule_prep(ð->tx_napi))) { ++ __napi_schedule(ð->tx_napi); ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ ++ if (mtk_r32(eth, reg_map->pdma.irq_mask) & ++ eth->soc->txrx.rx_irq_done_mask) { ++ if (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask) ++ mtk_handle_irq_rx(irq, _eth); ++ } ++ if (mtk_r32(eth, reg_map->tx_irq_mask) & MTK_TX_DONE_INT) { ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ mtk_handle_irq_tx(irq, _eth); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++#ifdef CONFIG_NET_POLL_CONTROLLER ++static void mtk_poll_controller(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ mtk_handle_irq_rx(eth->irq[2], dev); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++} ++#endif ++ ++static int mtk_start_dma(struct mtk_eth *eth) ++{ ++ u32 val, rx_2b_offset = (NET_IP_ALIGN == 2) ? 
MTK_RX_2B_OFFSET : 0; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int err; ++ ++ err = mtk_dma_init(eth); ++ if (err) { ++ mtk_dma_free(eth); ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ val = mtk_r32(eth, reg_map->qdma.glo_cfg); ++ val |= MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_TX_BT_32DWORDS | MTK_NDP_CO_PRO | ++ MTK_RX_2B_OFFSET | MTK_TX_WB_DDONE; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ val |= MTK_MUTLI_CNT | MTK_RESV_BUF | ++ MTK_WCOMP_EN | MTK_DMAD_WR_WDONE | ++ MTK_CHK_DDONE_EN; ++ else ++ val |= MTK_RX_BT_32DWORDS; ++ mtk_w32(eth, val, reg_map->qdma.glo_cfg); ++ ++ mtk_w32(eth, ++ MTK_RX_DMA_EN | rx_2b_offset | ++ MTK_RX_BT_32DWORDS | MTK_MULTI_EN, ++ reg_map->pdma.glo_cfg); ++ } else { ++ mtk_w32(eth, MTK_TX_WB_DDONE | MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_MULTI_EN | MTK_PDMA_SIZE_8DWORDS, ++ reg_map->pdma.glo_cfg); ++ } ++ ++ return 0; ++} ++ ++static void mtk_gdm_config(struct mtk_eth *eth, u32 config) ++{ ++ int i; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ return; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ u32 val = mtk_r32(eth, MTK_GDMA_FWD_CFG(i)); ++ ++ /* default setup the forward port to send frame to PDMA */ ++ val &= ~0xffff; ++ ++ /* Enable RX checksum */ ++ val |= MTK_GDMA_ICS_EN | MTK_GDMA_TCS_EN | MTK_GDMA_UCS_EN; ++ ++ val |= config; ++ ++ if (!i && eth->netdev[0] && netdev_uses_dsa(eth->netdev[0])) ++ val |= MTK_GDMA_SPECIAL_TAG; ++ ++ mtk_w32(eth, val, MTK_GDMA_FWD_CFG(i)); ++ } ++ /* Reset and enable PSE */ ++ mtk_w32(eth, RST_GL_PSE, MTK_RST_GL); ++ mtk_w32(eth, 0, MTK_RST_GL); ++} ++ ++static int mtk_open(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int err; ++ ++ err = phylink_of_phy_connect(mac->phylink, mac->of_node, 0); ++ if (err) { ++ netdev_err(dev, "%s: could not attach PHY: %d\n", __func__, ++ err); ++ return err; ++ } ++ ++ /* we run 2 netdevs on the same dma ring so we only bring it up once */ ++ if (!refcount_read(ð->dma_refcnt)) { ++ u32 gdm_config = MTK_GDMA_TO_PDMA; ++ ++ err = mtk_start_dma(eth); ++ if (err) { ++ phylink_disconnect_phy(mac->phylink); ++ return err; ++ } ++ ++ if (eth->soc->offload_version && mtk_ppe_start(eth->ppe) == 0) ++ gdm_config = MTK_GDMA_TO_PPE; ++ ++ mtk_gdm_config(eth, gdm_config); ++ ++ napi_enable(ð->tx_napi); ++ napi_enable(ð->rx_napi); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ refcount_set(ð->dma_refcnt, 1); ++ } ++ else ++ refcount_inc(ð->dma_refcnt); ++ ++ phylink_start(mac->phylink); ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static void mtk_stop_dma(struct mtk_eth *eth, u32 glo_cfg) ++{ ++ u32 val; ++ int i; ++ ++ /* stop the dma engine */ ++ spin_lock_bh(ð->page_lock); ++ val = mtk_r32(eth, glo_cfg); ++ mtk_w32(eth, val & ~(MTK_TX_WB_DDONE | MTK_RX_DMA_EN | MTK_TX_DMA_EN), ++ glo_cfg); ++ spin_unlock_bh(ð->page_lock); ++ ++ /* wait for dma stop */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, glo_cfg); ++ if (val & (MTK_TX_DMA_BUSY | MTK_RX_DMA_BUSY)) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++} ++ ++static int mtk_stop(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ phylink_stop(mac->phylink); ++ ++ netif_tx_disable(dev); ++ ++ phylink_disconnect_phy(mac->phylink); ++ ++ /* only shutdown DMA if this is the last user */ ++ if (!refcount_dec_and_test(ð->dma_refcnt)) ++ return 0; ++ ++ mtk_gdm_config(eth, 
MTK_GDMA_DROP_ALL); ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ napi_disable(ð->tx_napi); ++ napi_disable(ð->rx_napi); ++ ++ cancel_work_sync(ð->rx_dim.work); ++ cancel_work_sync(ð->tx_dim.work); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_stop_dma(eth, eth->soc->reg_map->qdma.glo_cfg); ++ mtk_stop_dma(eth, eth->soc->reg_map->pdma.glo_cfg); ++ ++ mtk_dma_free(eth); ++ ++ if (eth->soc->offload_version) ++ mtk_ppe_stop(eth->ppe); ++ ++ return 0; ++} ++ ++static int mtk_xdp_setup(struct net_device *dev, struct bpf_prog *prog, ++ struct netlink_ext_ack *extack) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct bpf_prog *old_prog; ++ bool need_update; ++ ++ if (eth->hwlro) { ++ NL_SET_ERR_MSG_MOD(extack, "XDP not supported with HWLRO"); ++ return -EOPNOTSUPP; ++ } ++ ++ if (dev->mtu > MTK_PP_MAX_BUF_SIZE) { ++ NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP"); ++ return -EOPNOTSUPP; ++ } ++ ++ need_update = !!eth->prog != !!prog; ++ if (netif_running(dev) && need_update) ++ mtk_stop(dev); ++ ++ old_prog = rcu_replace_pointer(eth->prog, prog, lockdep_rtnl_is_held()); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ ++ if (netif_running(dev) && need_update) ++ return mtk_open(dev); ++ ++ return 0; ++} ++ ++static int mtk_xdp(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ return mtk_xdp_setup(dev, xdp->prog, xdp->extack); ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void ethsys_reset(struct mtk_eth *eth, u32 reset_bits) ++{ ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ reset_bits); ++ ++ usleep_range(1000, 1100); ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ ~reset_bits); ++ mdelay(10); ++} ++ ++static void mtk_clk_disable(struct mtk_eth *eth) ++{ ++ int clk; ++ ++ for (clk = MTK_CLK_MAX - 1; clk >= 0; clk--) ++ clk_disable_unprepare(eth->clks[clk]); ++} ++ ++static int mtk_clk_enable(struct mtk_eth *eth) ++{ ++ int clk, ret; ++ ++ for (clk = 0; clk < MTK_CLK_MAX ; clk++) { ++ ret = clk_prepare_enable(eth->clks[clk]); ++ if (ret) ++ goto err_disable_clks; ++ } ++ ++ return 0; ++ ++err_disable_clks: ++ while (--clk >= 0) ++ clk_disable_unprepare(eth->clks[clk]); ++ ++ return ret; ++} ++ ++static void mtk_dim_rx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, rx_dim); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_rx_moderation(eth->rx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_TX_MASK; ++ val |= MTK_PDMA_DELAY_RX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void mtk_dim_tx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, tx_dim); ++ const struct 
mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_tx_moderation(eth->tx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_RX_MASK; ++ val |= MTK_PDMA_DELAY_TX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static int mtk_hw_init(struct mtk_eth *eth) ++{ ++ u32 dma_mask = ETHSYS_DMA_AG_MAP_PDMA | ETHSYS_DMA_AG_MAP_QDMA | ++ ETHSYS_DMA_AG_MAP_PPE; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int i, val, ret; ++ ++ if (test_and_set_bit(MTK_HW_INIT, ð->state)) ++ return 0; ++ ++ pm_runtime_enable(eth->dev); ++ pm_runtime_get_sync(eth->dev); ++ ++ ret = mtk_clk_enable(eth); ++ if (ret) ++ goto err_disable_pm; ++ ++ if (eth->ethsys) ++ regmap_update_bits(eth->ethsys, ETHSYS_DMA_AG_MAP, dma_mask, ++ of_dma_is_coherent(eth->dma_dev->of_node) * dma_mask); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ ret = device_reset(eth->dev); ++ if (ret) { ++ dev_err(eth->dev, "MAC reset failed!\n"); ++ goto err_disable_pm; ++ } ++ ++ /* set interrupt delays based on current Net DIM sample */ ++ mtk_dim_rx(ð->rx_dim.work); ++ mtk_dim_tx(ð->tx_dim.work); ++ ++ /* disable delay and normal interrupt */ ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++ ++ return 0; ++ } ++ ++ val = RSTCTRL_FE | RSTCTRL_PPE; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, 0); ++ ++ val |= RSTCTRL_ETH; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RSTCTRL_PPE1)) ++ val |= RSTCTRL_PPE1; ++ } ++ ++ ethsys_reset(eth, val); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, ++ 0x3ffffff); ++ ++ /* Set FE to PDMAv2 if necessary */ ++ val = mtk_r32(eth, MTK_FE_GLO_MISC); ++ mtk_w32(eth, val | BIT(4), MTK_FE_GLO_MISC); ++ } ++ ++ if (eth->pctl) { ++ /* Set GE2 driving and slew rate */ ++ regmap_write(eth->pctl, GPIO_DRV_SEL10, 0xa00); ++ ++ /* set GE2 TDSEL */ ++ regmap_write(eth->pctl, GPIO_OD33_CTRL8, 0x5); ++ ++ /* set GE2 TUNE */ ++ regmap_write(eth->pctl, GPIO_BIAS_CTRL, 0x0); ++ } ++ ++ /* Set linkdown as the default for each GMAC. Its own MCR would be set ++ * up with the more appropriate value when mtk_mac_config call is being ++ * invoked. ++ */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ mtk_w32(eth, MAC_MCR_FORCE_LINK_DOWN, MTK_MAC_MCR(i)); ++ ++ /* Indicates CDM to parse the MTK special tag from CPU ++ * which also is working out for untag packets. 
++ */ ++ val = mtk_r32(eth, MTK_CDMQ_IG_CTRL); ++ mtk_w32(eth, val | MTK_CDMQ_STAG_EN, MTK_CDMQ_IG_CTRL); ++ ++ /* Enable RX VLan Offloading */ ++ mtk_w32(eth, 1, MTK_CDMP_EG_CTRL); ++ ++ /* set interrupt delays based on current Net DIM sample */ ++ mtk_dim_rx(ð->rx_dim.work); ++ mtk_dim_tx(ð->tx_dim.work); ++ ++ /* disable delay and normal interrupt */ ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++ ++ /* FE int grouping */ ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->pdma.int_grp); ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->pdma.int_grp + 4); ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->qdma.int_grp); ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->qdma.int_grp + 4); ++ mtk_w32(eth, 0x21021000, MTK_FE_INT_GRP); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ /* PSE should not drop port8 and port9 packets */ ++ mtk_w32(eth, 0x00000300, PSE_DROP_CFG); ++ ++ /* PSE Free Queue Flow Control */ ++ mtk_w32(eth, 0x01fa01f4, PSE_FQFC_CFG2); ++ ++ /* PSE config input queue threshold */ ++ mtk_w32(eth, 0x001a000e, PSE_IQ_REV(1)); ++ mtk_w32(eth, 0x01ff001a, PSE_IQ_REV(2)); ++ mtk_w32(eth, 0x000e01ff, PSE_IQ_REV(3)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(4)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(5)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(6)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(7)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(8)); ++ ++ /* PSE config output queue threshold */ ++ mtk_w32(eth, 0x000f000a, PSE_OQ_TH(1)); ++ mtk_w32(eth, 0x001a000f, PSE_OQ_TH(2)); ++ mtk_w32(eth, 0x000f001a, PSE_OQ_TH(3)); ++ mtk_w32(eth, 0x01ff000f, PSE_OQ_TH(4)); ++ mtk_w32(eth, 0x000f000f, PSE_OQ_TH(5)); ++ mtk_w32(eth, 0x0006000f, PSE_OQ_TH(6)); ++ mtk_w32(eth, 0x00060006, PSE_OQ_TH(7)); ++ mtk_w32(eth, 0x00060006, PSE_OQ_TH(8)); ++ ++ /* GDM and CDM Threshold */ ++ mtk_w32(eth, 0x00000004, MTK_GDM2_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDMW0_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDMW1_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDME0_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDME1_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDMM_THRES); ++ } ++ ++ return 0; ++ ++err_disable_pm: ++ pm_runtime_put_sync(eth->dev); ++ pm_runtime_disable(eth->dev); ++ ++ return ret; ++} ++ ++static int mtk_hw_deinit(struct mtk_eth *eth) ++{ ++ if (!test_and_clear_bit(MTK_HW_INIT, ð->state)) ++ return 0; ++ ++ mtk_clk_disable(eth); ++ ++ pm_runtime_put_sync(eth->dev); ++ pm_runtime_disable(eth->dev); ++ ++ return 0; ++} ++ ++static int __init mtk_init(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int ret; ++ ++ ret = of_get_ethdev_address(mac->of_node, dev); ++ if (ret) { ++ /* If the mac address is invalid, use random mac address */ ++ eth_hw_addr_random(dev); ++ dev_err(eth->dev, "generated random MAC address %pM\n", ++ dev->dev_addr); ++ } ++ ++ return 0; ++} ++ ++static void mtk_uninit(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ phylink_disconnect_phy(mac->phylink); ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++} ++ ++static int mtk_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ int length = new_mtu + MTK_RX_ETH_HLEN; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ u32 mcr_cur, mcr_new; ++ ++ if (rcu_access_pointer(eth->prog) && ++ length > MTK_PP_MAX_BUF_SIZE) { ++ netdev_err(dev, "Invalid MTU for XDP mode\n"); ++ return -EINVAL; ++ } ++ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ 
mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur & ~MAC_MCR_MAX_RX_MASK; ++ ++ if (length <= 1518) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1518); ++ else if (length <= 1536) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1536); ++ else if (length <= 1552) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1552); ++ else ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_2048); ++ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ } ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static int mtk_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ switch (cmd) { ++ case SIOCGMIIPHY: ++ case SIOCGMIIREG: ++ case SIOCSMIIREG: ++ return phylink_mii_ioctl(mac->phylink, ifr, cmd); ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static void mtk_pending_work(struct work_struct *work) ++{ ++ struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work); ++ int err, i; ++ unsigned long restart = 0; ++ ++ rtnl_lock(); ++ ++ dev_dbg(eth->dev, "[%s][%d] reset\n", __func__, __LINE__); ++ ++ while (test_and_set_bit_lock(MTK_RESETTING, ð->state)) ++ cpu_relax(); ++ ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop starts\n", __func__, __LINE__); ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ __set_bit(i, &restart); ++ } ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop ends\n", __func__, __LINE__); ++ ++ /* restart underlying hardware such as power, clock, pin mux ++ * and the connected phy ++ */ ++ mtk_hw_deinit(eth); ++ ++ if (eth->dev->pins) ++ pinctrl_select_state(eth->dev->pins->p, ++ eth->dev->pins->default_state); ++ mtk_hw_init(eth); ++ ++ /* restart DMA and enable IRQs */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!test_bit(i, &restart)) ++ continue; ++ err = mtk_open(eth->netdev[i]); ++ if (err) { ++ netif_alert(eth, ifup, eth->netdev[i], ++ "Driver up/down cycle failed, closing device.\n"); ++ dev_close(eth->netdev[i]); ++ } ++ } ++ ++ dev_dbg(eth->dev, "[%s][%d] reset done\n", __func__, __LINE__); ++ ++ clear_bit_unlock(MTK_RESETTING, ð->state); ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_free_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ free_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_unreg_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ unregister_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_cleanup(struct mtk_eth *eth) ++{ ++ mtk_unreg_dev(eth); ++ mtk_free_dev(eth); ++ cancel_work_sync(ð->pending_work); ++ ++ return 0; ++} ++ ++static int mtk_get_link_ksettings(struct net_device *ndev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_get(mac->phylink, cmd); ++} ++ ++static int mtk_set_link_ksettings(struct net_device *ndev, ++ const struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_set(mac->phylink, cmd); ++} ++ ++static void mtk_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ 
++ strlcpy(info->driver, mac->hw->dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(mac->hw->dev), sizeof(info->bus_info)); ++ info->n_stats = ARRAY_SIZE(mtk_ethtool_stats); ++} ++ ++static u32 mtk_get_msglevel(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ return mac->hw->msg_enable; ++} ++ ++static void mtk_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ mac->hw->msg_enable = value; ++} ++ ++static int mtk_nway_reset(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ if (!mac->phylink) ++ return -ENOTSUPP; ++ ++ return phylink_ethtool_nway_reset(mac->phylink); ++} ++ ++static void mtk_get_strings(struct net_device *dev, u32 stringset, u8 *data) ++{ ++ int i; ++ ++ switch (stringset) { ++ case ETH_SS_STATS: { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) { ++ memcpy(data, mtk_ethtool_stats[i].str, ETH_GSTRING_LEN); ++ data += ETH_GSTRING_LEN; ++ } ++ if (mtk_page_pool_enabled(mac->hw)) ++ page_pool_ethtool_stats_get_strings(data); ++ break; ++ } ++ default: ++ break; ++ } ++} ++ ++static int mtk_get_sset_count(struct net_device *dev, int sset) ++{ ++ switch (sset) { ++ case ETH_SS_STATS: { ++ int count = ARRAY_SIZE(mtk_ethtool_stats); ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (mtk_page_pool_enabled(mac->hw)) ++ count += page_pool_ethtool_stats_get_count(); ++ return count; ++ } ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static void mtk_ethtool_pp_stats(struct mtk_eth *eth, u64 *data) ++{ ++ struct page_pool_stats stats = {}; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(eth->rx_ring); i++) { ++ struct mtk_rx_ring *ring = ð->rx_ring[i]; ++ ++ if (!ring->page_pool) ++ continue; ++ ++ page_pool_get_stats(ring->page_pool, &stats); ++ } ++ page_pool_ethtool_stats_get(data, &stats); ++} ++ ++static void mtk_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, u64 *data) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hwstats = mac->hw_stats; ++ u64 *data_src, *data_dst; ++ unsigned int start; ++ int i; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hwstats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hwstats->stats_lock); ++ } ++ } ++ ++ data_src = (u64 *)hwstats; ++ ++ do { ++ data_dst = data; ++ start = u64_stats_fetch_begin_irq(&hwstats->syncp); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) ++ *data_dst++ = *(data_src + mtk_ethtool_stats[i].offset); ++ if (mtk_page_pool_enabled(mac->hw)) ++ mtk_ethtool_pp_stats(mac->hw, data_dst); ++ } while (u64_stats_fetch_retry_irq(&hwstats->syncp, start)); ++} ++ ++static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_GRXRINGS: ++ if (dev->hw_features & NETIF_F_LRO) { ++ cmd->data = MTK_MAX_RX_RING_NUM; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRLCNT: ++ if (dev->hw_features & NETIF_F_LRO) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ cmd->rule_cnt = mac->hwlro_ip_cnt; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRULE: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_entry(dev, cmd); ++ break; ++ case ETHTOOL_GRXCLSRLALL: ++ if (dev->hw_features & 
NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_all(dev, cmd, ++ rule_locs); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int mtk_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_SRXCLSRLINS: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_add_ipaddr(dev, cmd); ++ break; ++ case ETHTOOL_SRXCLSRLDEL: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_del_ipaddr(dev, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static const struct ethtool_ops mtk_ethtool_ops = { ++ .get_link_ksettings = mtk_get_link_ksettings, ++ .set_link_ksettings = mtk_set_link_ksettings, ++ .get_drvinfo = mtk_get_drvinfo, ++ .get_msglevel = mtk_get_msglevel, ++ .set_msglevel = mtk_set_msglevel, ++ .nway_reset = mtk_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_strings = mtk_get_strings, ++ .get_sset_count = mtk_get_sset_count, ++ .get_ethtool_stats = mtk_get_ethtool_stats, ++ .get_rxnfc = mtk_get_rxnfc, ++ .set_rxnfc = mtk_set_rxnfc, ++}; ++ ++static const struct net_device_ops mtk_netdev_ops = { ++ .ndo_init = mtk_init, ++ .ndo_uninit = mtk_uninit, ++ .ndo_open = mtk_open, ++ .ndo_stop = mtk_stop, ++ .ndo_start_xmit = mtk_start_xmit, ++ .ndo_set_mac_address = mtk_set_mac_address, ++ .ndo_validate_addr = eth_validate_addr, ++ .ndo_eth_ioctl = mtk_do_ioctl, ++ .ndo_change_mtu = mtk_change_mtu, ++ .ndo_tx_timeout = mtk_tx_timeout, ++ .ndo_get_stats64 = mtk_get_stats64, ++ .ndo_fix_features = mtk_fix_features, ++ .ndo_set_features = mtk_set_features, ++#ifdef CONFIG_NET_POLL_CONTROLLER ++ .ndo_poll_controller = mtk_poll_controller, ++#endif ++ .ndo_setup_tc = mtk_eth_setup_tc, ++ .ndo_bpf = mtk_xdp, ++ .ndo_xdp_xmit = mtk_xdp_xmit, ++}; ++ ++static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) ++{ ++ const __be32 *_id = of_get_property(np, "reg", NULL); ++ phy_interface_t phy_mode; ++ struct phylink *phylink; ++ struct mtk_mac *mac; ++ int id, err; ++ ++ if (!_id) { ++ dev_err(eth->dev, "missing mac id\n"); ++ return -EINVAL; ++ } ++ ++ id = be32_to_cpup(_id); ++ if (id >= MTK_MAC_COUNT) { ++ dev_err(eth->dev, "%d is not a valid mac id\n", id); ++ return -EINVAL; ++ } ++ ++ if (eth->netdev[id]) { ++ dev_err(eth->dev, "duplicate mac id found: %d\n", id); ++ return -EINVAL; ++ } ++ ++ eth->netdev[id] = alloc_etherdev(sizeof(*mac)); ++ if (!eth->netdev[id]) { ++ dev_err(eth->dev, "alloc_etherdev failed\n"); ++ return -ENOMEM; ++ } ++ mac = netdev_priv(eth->netdev[id]); ++ eth->mac[id] = mac; ++ mac->id = id; ++ mac->hw = eth; ++ mac->of_node = np; ++ ++ memset(mac->hwlro_ip, 0, sizeof(mac->hwlro_ip)); ++ mac->hwlro_ip_cnt = 0; ++ ++ mac->hw_stats = devm_kzalloc(eth->dev, ++ sizeof(*mac->hw_stats), ++ GFP_KERNEL); ++ if (!mac->hw_stats) { ++ dev_err(eth->dev, "failed to allocate counter memory\n"); ++ err = -ENOMEM; ++ goto free_netdev; ++ } ++ spin_lock_init(&mac->hw_stats->stats_lock); ++ u64_stats_init(&mac->hw_stats->syncp); ++ mac->hw_stats->reg_offset = id * MTK_STAT_OFFSET; ++ ++ /* phylink create */ ++ err = of_get_phy_mode(np, &phy_mode); ++ if (err) { ++ dev_err(eth->dev, "incorrect phy-mode\n"); ++ goto free_netdev; ++ } ++ ++ /* mac config is not set */ ++ mac->interface = PHY_INTERFACE_MODE_NA; ++ mac->speed = SPEED_UNKNOWN; ++ ++ mac->phylink_config.dev = ð->netdev[id]->dev; ++ mac->phylink_config.type = PHYLINK_NETDEV; ++ /* This driver makes use of state->speed in mac_config */ ++ mac->phylink_config.legacy_pre_march2020 = true; ++ 
mac->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | ++ MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; ++ ++ __set_bit(PHY_INTERFACE_MODE_MII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_GMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_RGMII)) ++ phy_interface_set_rgmii(mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_TRGMII) && !mac->id) ++ __set_bit(PHY_INTERFACE_MODE_TRGMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_SGMII)) { ++ __set_bit(PHY_INTERFACE_MODE_SGMII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_1000BASEX, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_2500BASEX, ++ mac->phylink_config.supported_interfaces); ++ } ++ ++ phylink = phylink_create(&mac->phylink_config, ++ of_fwnode_handle(mac->of_node), ++ phy_mode, &mtk_phylink_ops); ++ if (IS_ERR(phylink)) { ++ err = PTR_ERR(phylink); ++ goto free_netdev; ++ } ++ ++ mac->phylink = phylink; ++ ++ SET_NETDEV_DEV(eth->netdev[id], eth->dev); ++ eth->netdev[id]->watchdog_timeo = 5 * HZ; ++ eth->netdev[id]->netdev_ops = &mtk_netdev_ops; ++ eth->netdev[id]->base_addr = (unsigned long)eth->base; ++ ++ eth->netdev[id]->hw_features = eth->soc->hw_features; ++ if (eth->hwlro) ++ eth->netdev[id]->hw_features |= NETIF_F_LRO; ++ ++ eth->netdev[id]->vlan_features = eth->soc->hw_features & ++ ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX); ++ eth->netdev[id]->features |= eth->soc->hw_features; ++ eth->netdev[id]->ethtool_ops = &mtk_ethtool_ops; ++ ++ eth->netdev[id]->irq = eth->irq[0]; ++ eth->netdev[id]->dev.of_node = np; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH - MTK_RX_ETH_HLEN; ++ else ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return 0; ++ ++free_netdev: ++ free_netdev(eth->netdev[id]); ++ return err; ++} ++ ++void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(dev_list); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ dev = eth->netdev[i]; ++ ++ if (!dev || !(dev->flags & IFF_UP)) ++ continue; ++ ++ list_add_tail(&dev->close_list, &dev_list); ++ } ++ ++ dev_close_many(&dev_list, false); ++ ++ eth->dma_dev = dma_dev; ++ ++ list_for_each_entry_safe(dev, tmp, &dev_list, close_list) { ++ list_del_init(&dev->close_list); ++ dev_open(dev, NULL); ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_probe(struct platform_device *pdev) ++{ ++ struct device_node *mac_np; ++ struct mtk_eth *eth; ++ int err, i; ++ ++ eth = devm_kzalloc(&pdev->dev, sizeof(*eth), GFP_KERNEL); ++ if (!eth) ++ return -ENOMEM; ++ ++ eth->soc = of_device_get_match_data(&pdev->dev); ++ ++ eth->dev = &pdev->dev; ++ eth->dma_dev = &pdev->dev; ++ eth->base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(eth->base)) ++ return PTR_ERR(eth->base); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->ip_align = NET_IP_ALIGN; ++ ++ spin_lock_init(ð->page_lock); ++ spin_lock_init(ð->tx_irq_lock); ++ spin_lock_init(ð->rx_irq_lock); ++ spin_lock_init(ð->dim_lock); ++ ++ eth->rx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->rx_dim.work, mtk_dim_rx); ++ ++ eth->tx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->tx_dim.work, mtk_dim_tx); ++ ++ if (!MTK_HAS_CAPS(eth->soc->caps, 
MTK_SOC_MT7628)) { ++ eth->ethsys = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,ethsys"); ++ if (IS_ERR(eth->ethsys)) { ++ dev_err(&pdev->dev, "no ethsys regmap found\n"); ++ return PTR_ERR(eth->ethsys); ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_INFRA)) { ++ eth->infra = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,infracfg"); ++ if (IS_ERR(eth->infra)) { ++ dev_err(&pdev->dev, "no infracfg regmap found\n"); ++ return PTR_ERR(eth->infra); ++ } ++ } ++ ++ if (of_dma_is_coherent(pdev->dev.of_node)) { ++ struct regmap *cci; ++ ++ cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "cci-control-port"); ++ /* enable CPU/bus coherency */ ++ if (!IS_ERR(cci)) ++ regmap_write(cci, 0, 3); ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ eth->sgmii = devm_kzalloc(eth->dev, sizeof(*eth->sgmii), ++ GFP_KERNEL); ++ if (!eth->sgmii) ++ return -ENOMEM; ++ ++ err = mtk_sgmii_init(eth->sgmii, pdev->dev.of_node, ++ eth->soc->ana_rgc3); ++ ++ if (err) ++ return err; ++ } ++ ++ if (eth->soc->required_pctl) { ++ eth->pctl = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,pctl"); ++ if (IS_ERR(eth->pctl)) { ++ dev_err(&pdev->dev, "no pctl regmap found\n"); ++ return PTR_ERR(eth->pctl); ++ } ++ } ++ ++ for (i = 0;; i++) { ++ struct device_node *np = of_parse_phandle(pdev->dev.of_node, ++ "mediatek,wed", i); ++ static const u32 wdma_regs[] = { ++ MTK_WDMA0_BASE, ++ MTK_WDMA1_BASE ++ }; ++ void __iomem *wdma; ++ ++ if (!np || i >= ARRAY_SIZE(wdma_regs)) ++ break; ++ ++ wdma = eth->base + wdma_regs[i]; ++ mtk_wed_add_hw(np, eth, wdma, i); ++ } ++ ++ for (i = 0; i < 3; i++) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT) && i > 0) ++ eth->irq[i] = eth->irq[0]; ++ else ++ eth->irq[i] = platform_get_irq(pdev, i); ++ if (eth->irq[i] < 0) { ++ dev_err(&pdev->dev, "no IRQ%d resource found\n", i); ++ err = -ENXIO; ++ goto err_wed_exit; ++ } ++ } ++ for (i = 0; i < ARRAY_SIZE(eth->clks); i++) { ++ eth->clks[i] = devm_clk_get(eth->dev, ++ mtk_clks_source_name[i]); ++ if (IS_ERR(eth->clks[i])) { ++ if (PTR_ERR(eth->clks[i]) == -EPROBE_DEFER) { ++ err = -EPROBE_DEFER; ++ goto err_wed_exit; ++ } ++ if (eth->soc->required_clks & BIT(i)) { ++ dev_err(&pdev->dev, "clock %s not found\n", ++ mtk_clks_source_name[i]); ++ err = -EINVAL; ++ goto err_wed_exit; ++ } ++ eth->clks[i] = NULL; ++ } ++ } ++ ++ eth->msg_enable = netif_msg_init(mtk_msg_level, MTK_DEFAULT_MSG_ENABLE); ++ INIT_WORK(ð->pending_work, mtk_pending_work); ++ ++ err = mtk_hw_init(eth); ++ if (err) ++ goto err_wed_exit; ++ ++ eth->hwlro = MTK_HAS_CAPS(eth->soc->caps, MTK_HWLRO); ++ ++ for_each_child_of_node(pdev->dev.of_node, mac_np) { ++ if (!of_device_is_compatible(mac_np, ++ "mediatek,eth-mac")) ++ continue; ++ ++ if (!of_device_is_available(mac_np)) ++ continue; ++ ++ err = mtk_add_mac(eth, mac_np); ++ if (err) { ++ of_node_put(mac_np); ++ goto err_deinit_hw; ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT)) { ++ err = devm_request_irq(eth->dev, eth->irq[0], ++ mtk_handle_irq, 0, ++ dev_name(eth->dev), eth); ++ } else { ++ err = devm_request_irq(eth->dev, eth->irq[1], ++ mtk_handle_irq_tx, 0, ++ dev_name(eth->dev), eth); ++ if (err) ++ goto err_free_dev; ++ ++ err = devm_request_irq(eth->dev, eth->irq[2], ++ mtk_handle_irq_rx, 0, ++ dev_name(eth->dev), eth); ++ } ++ if (err) ++ goto err_free_dev; ++ ++ /* No MT7628/88 support yet */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ err = mtk_mdio_init(eth); ++ if (err) ++ goto err_free_dev; ++ } ++ 
++ if (eth->soc->offload_version) { ++ eth->ppe = mtk_ppe_init(eth, eth->base + MTK_ETH_PPE_BASE, 2); ++ if (!eth->ppe) { ++ err = -ENOMEM; ++ goto err_deinit_mdio; ++ } ++ ++ err = mtk_eth_offload_init(eth); ++ if (err) ++ goto err_deinit_mdio; ++ } ++ ++ for (i = 0; i < MTK_MAX_DEVS; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ ++ err = register_netdev(eth->netdev[i]); ++ if (err) { ++ dev_err(eth->dev, "error bringing up device\n"); ++ goto err_deinit_mdio; ++ } else ++ netif_info(eth, probe, eth->netdev[i], ++ "mediatek frame engine at 0x%08lx, irq %d\n", ++ eth->netdev[i]->base_addr, eth->irq[0]); ++ } ++ ++ /* we run 2 devices on the same DMA ring so we need a dummy device ++ * for NAPI to work ++ */ ++ init_dummy_netdev(ð->dummy_dev); ++ netif_napi_add(ð->dummy_dev, ð->tx_napi, mtk_napi_tx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add(ð->dummy_dev, ð->rx_napi, mtk_napi_rx, ++ NAPI_POLL_WEIGHT); ++ ++ platform_set_drvdata(pdev, eth); ++ ++ return 0; ++ ++err_deinit_mdio: ++ mtk_mdio_cleanup(eth); ++err_free_dev: ++ mtk_free_dev(eth); ++err_deinit_hw: ++ mtk_hw_deinit(eth); ++err_wed_exit: ++ mtk_wed_exit(); ++ ++ return err; ++} ++ ++static int mtk_remove(struct platform_device *pdev) ++{ ++ struct mtk_eth *eth = platform_get_drvdata(pdev); ++ struct mtk_mac *mac; ++ int i; ++ ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ mac = netdev_priv(eth->netdev[i]); ++ phylink_disconnect_phy(mac->phylink); ++ } ++ ++ mtk_wed_exit(); ++ mtk_hw_deinit(eth); ++ ++ netif_napi_del(ð->tx_napi); ++ netif_napi_del(ð->rx_napi); ++ mtk_cleanup(eth); ++ mtk_mdio_cleanup(eth); ++ ++ return 0; ++} ++ ++static const struct mtk_soc_data mt2701_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7621_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7621_CAPS, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7621_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7622_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x2028, ++ .caps = MT7622_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7622_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7623_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ 
.rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7629_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7629_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7629_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7986_data = { ++ .reg_map = &mt7986_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7986_CAPS, ++ .required_clks = MT7986_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma_v2), ++ .rxd_size = sizeof(struct mtk_rx_dma_v2), ++ .rx_irq_done_mask = MTK_RX_DONE_INT_V2, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_V2, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, ++ .dma_len_offset = 8, ++ }, ++}; ++ ++static const struct mtk_soc_data rt5350_data = { ++ .reg_map = &mt7628_reg_map, ++ .caps = MT7628_CAPS, ++ .hw_features = MTK_HW_FEATURES_MT7628, ++ .required_clks = MT7628_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_PDMA, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++const struct of_device_id of_mtk_match[] = { ++ { .compatible = "mediatek,mt2701-eth", .data = &mt2701_data}, ++ { .compatible = "mediatek,mt7621-eth", .data = &mt7621_data}, ++ { .compatible = "mediatek,mt7622-eth", .data = &mt7622_data}, ++ { .compatible = "mediatek,mt7623-eth", .data = &mt7623_data}, ++ { .compatible = "mediatek,mt7629-eth", .data = &mt7629_data}, ++ { .compatible = "mediatek,mt7986-eth", .data = &mt7986_data}, ++ { .compatible = "ralink,rt5350-eth", .data = &rt5350_data}, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, of_mtk_match); ++ ++static struct platform_driver mtk_driver = { ++ .probe = mtk_probe, ++ .remove = mtk_remove, ++ .driver = { ++ .name = "mtk_soc_eth", ++ .of_match_table = of_mtk_match, ++ }, ++}; ++ ++module_platform_driver(mtk_driver); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("John Crispin "); ++MODULE_DESCRIPTION("Ethernet driver for MediaTek SoC"); +diff -rupN linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +--- linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-04 10:40:26.692034106 -0500 +@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struc for_each_possible_cpu(i) { p = per_cpu_ptr(mlxsw_sp_port->pcpu_stats, i); do { @@ -2234,11 +12394,10 @@ index 30c7b0e157218..fa2753318cdf7 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c -index 9259a74eca40b..318dbbb482797 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_en.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c -@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c linux/drivers/net/ethernet/microsoft/mana/mana_en.c +--- 
linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-04 10:40:26.692034106 -0500 +@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_ rx_stats = &apc->rxqs[q]->stats; do { @@ -2251,7 +12410,7 @@ index 9259a74eca40b..318dbbb482797 100644 st->rx_packets += packets; st->rx_bytes += bytes; -@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_device *ndev, +@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_ tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2264,11 +12423,10 @@ index 9259a74eca40b..318dbbb482797 100644 st->tx_packets += packets; st->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -index c530db76880f0..96d55c91c9698 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +--- linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-04 10:40:26.692034106 -0500 +@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struc rx_stats = &apc->rxqs[q]->stats; do { @@ -2284,7 +12442,7 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struc tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2298,11 +12456,10 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -index 349a2b1a19a24..cf4d6f1129fa2 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_de unsigned int start; do { @@ -2328,11 +12485,10 @@ index 349a2b1a19a24..cf4d6f1129fa2 100644 stats->tx_packets += data[0]; stats->tx_bytes += data[1]; stats->tx_errors += data[2]; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -index b1b1b648e40cb..eeb1455a4e5db 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-04 10:40:26.692034106 -0500 
+@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct unsigned int start; do { @@ -2341,7 +12497,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[0] = nn->r_vecs[i].rx_pkts; tmp[0] = nn->r_vecs[i].hw_csum_rx_ok; tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok; -@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[3] = nn->r_vecs[i].hw_csum_rx_error; tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail; tmp[5] = nn->r_vecs[i].hw_tls_rx; @@ -2354,7 +12510,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[1] = nn->r_vecs[i].tx_pkts; data[2] = nn->r_vecs[i].tx_busy; tmp[6] = nn->r_vecs[i].hw_csum_tx; -@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[10] = nn->r_vecs[i].hw_tls_tx; tmp[11] = nn->r_vecs[i].tls_tx_fallback; tmp[12] = nn->r_vecs[i].tls_tx_no_fallback; @@ -2363,11 +12519,10 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data += NN_RVEC_PER_Q_STATS; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -index 8b77582bdfa01..a6b6ca1fd55ee 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-04 10:40:26.692034106 -0500 +@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct n repr_stats = per_cpu_ptr(repr->stats, i); do { @@ -2383,11 +12538,10 @@ index 8b77582bdfa01..a6b6ca1fd55ee 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; -diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c -index 5116badaf0919..50ebbd7e91c48 100644 ---- a/drivers/net/ethernet/nvidia/forcedeth.c -+++ b/drivers/net/ethernet/nvidia/forcedeth.c -@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +diff -rupN linux.orig/drivers/net/ethernet/nvidia/forcedeth.c linux/drivers/net/ethernet/nvidia/forcedeth.c +--- linux.orig/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct u64 tx_packets, tx_bytes, tx_dropped; do { @@ -2402,7 +12556,7 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->rx_packets += rx_packets; storage->rx_bytes += rx_bytes; -@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct storage->rx_missed_errors += rx_missed_errors; do { @@ -2416,11 +12570,10 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->tx_packets += tx_packets; storage->tx_bytes += tx_bytes; -diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -index 1b2119b1d48aa..3f5e6572d20e7 100644 ---- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +--- linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-04 10:40:26.692034106 -0500 +@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu); do { @@ -2432,11 +12585,10 @@ index 1b2119b1d48aa..3f5e6572d20e7 100644 total_stats.rx_pkts += snapshot.rx_pkts; total_stats.rx_bytes += snapshot.rx_bytes; -diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c -index 15b40fd93cd2e..82bd0eb614634 100644 ---- a/drivers/net/ethernet/realtek/8139too.c -+++ b/drivers/net/ethernet/realtek/8139too.c -@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/realtek/8139too.c linux/drivers/net/ethernet/realtek/8139too.c +--- linux.orig/drivers/net/ethernet/realtek/8139too.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/realtek/8139too.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2457,11 +12609,10 @@ index 15b40fd93cd2e..82bd0eb614634 100644 } /* Set or clear the multicast filter for this adaptor. -diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c -index f0c8de2c60755..d4f7238333bb7 100644 ---- a/drivers/net/ethernet/socionext/sni_ave.c -+++ b/drivers/net/ethernet/socionext/sni_ave.c -@@ -1506,16 +1506,16 @@ static void ave_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c linux/drivers/net/ethernet/socionext/sni_ave.c +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1508,16 +1508,16 @@ static void ave_get_stats64(struct net_d unsigned int start; do { @@ -2482,11 +12633,2010 @@ index f0c8de2c60755..d4f7238333bb7 100644 stats->rx_errors = priv->stats_rx.errors; stats->tx_errors = priv->stats_tx.errors; -diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -index f4a6b590a1e39..1b62400c19049 100644 ---- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c -+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig linux/drivers/net/ethernet/socionext/sni_ave.c.orig +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c.orig 2022-12-04 10:40:18.168055947 -0500 +@@ -0,0 +1,1996 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * sni_ave.c - Socionext UniPhier AVE ethernet driver ++ * Copyright 2014 Panasonic Corporation ++ * Copyright 2015-2017 Socionext Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* General Register Group */ ++#define AVE_IDR 0x000 /* ID */ ++#define AVE_VR 0x004 /* Version */ ++#define AVE_GRR 0x008 /* Global Reset */ ++#define AVE_CFGR 0x00c /* Configuration */ ++ ++/* Interrupt Register Group */ ++#define AVE_GIMR 0x100 /* Global Interrupt Mask */ ++#define AVE_GISR 0x104 /* Global Interrupt Status */ ++ ++/* MAC Register Group */ ++#define AVE_TXCR 0x200 /* TX Setup */ ++#define AVE_RXCR 0x204 /* RX Setup */ ++#define AVE_RXMAC1R 0x208 /* MAC address (lower) */ ++#define AVE_RXMAC2R 0x20c /* MAC address (upper) */ ++#define AVE_MDIOCTR 0x214 /* MDIO Control */ ++#define AVE_MDIOAR 0x218 /* MDIO Address */ ++#define AVE_MDIOWDR 0x21c /* MDIO Data */ ++#define AVE_MDIOSR 0x220 /* MDIO Status */ ++#define AVE_MDIORDR 0x224 /* MDIO Rd Data */ ++ ++/* Descriptor Control Register Group */ ++#define AVE_DESCC 0x300 /* Descriptor Control */ ++#define AVE_TXDC 0x304 /* TX Descriptor Configuration */ ++#define AVE_RXDC0 0x308 /* RX Descriptor Ring0 Configuration */ ++#define AVE_IIRQC 0x34c /* Interval IRQ Control */ ++ ++/* Packet Filter Register Group */ ++#define AVE_PKTF_BASE 0x800 /* PF Base Address */ ++#define AVE_PFMBYTE_BASE 0xd00 /* PF Mask Byte Base Address */ ++#define AVE_PFMBIT_BASE 0xe00 /* PF Mask Bit Base Address */ ++#define AVE_PFSEL_BASE 0xf00 /* PF Selector Base Address */ ++#define AVE_PFEN 0xffc /* Packet Filter Enable */ ++#define AVE_PKTF(ent) (AVE_PKTF_BASE + (ent) * 0x40) ++#define AVE_PFMBYTE(ent) (AVE_PFMBYTE_BASE + (ent) * 8) ++#define AVE_PFMBIT(ent) (AVE_PFMBIT_BASE + (ent) * 4) ++#define AVE_PFSEL(ent) (AVE_PFSEL_BASE + (ent) * 4) ++ ++/* 64bit descriptor memory */ ++#define AVE_DESC_SIZE_64 12 /* Descriptor Size */ ++ ++#define AVE_TXDM_64 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_64 0x1c00 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_64 0x0ba0 /* Tx Descriptor Memory Size 3KB */ ++#define AVE_RXDM_SIZE_64 0x6000 /* Rx Descriptor Memory Size 24KB */ ++ ++/* 32bit descriptor memory */ ++#define AVE_DESC_SIZE_32 8 /* Descriptor Size */ ++ ++#define AVE_TXDM_32 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_32 0x1800 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_32 0x07c0 /* Tx Descriptor Memory Size 2KB */ ++#define AVE_RXDM_SIZE_32 0x4000 /* Rx Descriptor Memory Size 16KB */ ++ ++/* RMII Bridge Register Group */ ++#define AVE_RSTCTRL 0x8028 /* Reset control */ ++#define AVE_RSTCTRL_RMIIRST BIT(16) ++#define AVE_LINKSEL 0x8034 /* Link speed setting */ ++#define AVE_LINKSEL_100M BIT(0) ++ ++/* AVE_GRR */ ++#define AVE_GRR_RXFFR BIT(5) /* Reset RxFIFO */ ++#define AVE_GRR_PHYRST BIT(4) /* Reset external PHY */ ++#define AVE_GRR_GRST BIT(0) /* Reset all MAC */ ++ ++/* AVE_CFGR */ ++#define AVE_CFGR_FLE BIT(31) /* Filter Function */ ++#define AVE_CFGR_CHE BIT(30) /* Checksum Function */ ++#define AVE_CFGR_MII BIT(27) /* Func mode (1:MII/RMII, 0:RGMII) */ ++#define AVE_CFGR_IPFCEN BIT(24) /* IP fragment sum Enable */ ++ ++/* AVE_GISR (common with GIMR) */ ++#define AVE_GI_PHY BIT(24) /* PHY interrupt */ ++#define AVE_GI_TX BIT(16) /* Tx complete */ ++#define AVE_GI_RXERR BIT(8) /* Receive frame more than max size */ ++#define AVE_GI_RXOVF BIT(7) /* Overflow at the RxFIFO */ ++#define AVE_GI_RXDROP BIT(6) /* Drop packet */ ++#define AVE_GI_RXIINT BIT(5) /* Interval interrupt */ ++ ++/* AVE_TXCR */ 
++#define AVE_TXCR_FLOCTR BIT(18) /* Flow control */ ++#define AVE_TXCR_TXSPD_1G BIT(17) ++#define AVE_TXCR_TXSPD_100 BIT(16) ++ ++/* AVE_RXCR */ ++#define AVE_RXCR_RXEN BIT(30) /* Rx enable */ ++#define AVE_RXCR_FDUPEN BIT(22) /* Interface mode */ ++#define AVE_RXCR_FLOCTR BIT(21) /* Flow control */ ++#define AVE_RXCR_AFEN BIT(19) /* MAC address filter */ ++#define AVE_RXCR_DRPEN BIT(18) /* Drop pause frame */ ++#define AVE_RXCR_MPSIZ_MASK GENMASK(10, 0) ++ ++/* AVE_MDIOCTR */ ++#define AVE_MDIOCTR_RREQ BIT(3) /* Read request */ ++#define AVE_MDIOCTR_WREQ BIT(2) /* Write request */ ++ ++/* AVE_MDIOSR */ ++#define AVE_MDIOSR_STS BIT(0) /* access status */ ++ ++/* AVE_DESCC */ ++#define AVE_DESCC_STATUS_MASK GENMASK(31, 16) ++#define AVE_DESCC_RD0 BIT(8) /* Enable Rx descriptor Ring0 */ ++#define AVE_DESCC_RDSTP BIT(4) /* Pause Rx descriptor */ ++#define AVE_DESCC_TD BIT(0) /* Enable Tx descriptor */ ++ ++/* AVE_TXDC */ ++#define AVE_TXDC_SIZE GENMASK(27, 16) /* Size of Tx descriptor */ ++#define AVE_TXDC_ADDR GENMASK(11, 0) /* Start address */ ++#define AVE_TXDC_ADDR_START 0 ++ ++/* AVE_RXDC0 */ ++#define AVE_RXDC0_SIZE GENMASK(30, 16) /* Size of Rx descriptor */ ++#define AVE_RXDC0_ADDR GENMASK(14, 0) /* Start address */ ++#define AVE_RXDC0_ADDR_START 0 ++ ++/* AVE_IIRQC */ ++#define AVE_IIRQC_EN0 BIT(27) /* Enable interval interrupt Ring0 */ ++#define AVE_IIRQC_BSCK GENMASK(15, 0) /* Interval count unit */ ++ ++/* Command status for descriptor */ ++#define AVE_STS_OWN BIT(31) /* Descriptor ownership */ ++#define AVE_STS_INTR BIT(29) /* Request for interrupt */ ++#define AVE_STS_OK BIT(27) /* Normal transmit */ ++/* TX */ ++#define AVE_STS_NOCSUM BIT(28) /* No use HW checksum */ ++#define AVE_STS_1ST BIT(26) /* Head of buffer chain */ ++#define AVE_STS_LAST BIT(25) /* Tail of buffer chain */ ++#define AVE_STS_OWC BIT(21) /* Out of window,Late Collision */ ++#define AVE_STS_EC BIT(20) /* Excess collision occurred */ ++#define AVE_STS_PKTLEN_TX_MASK GENMASK(15, 0) ++/* RX */ ++#define AVE_STS_CSSV BIT(21) /* Checksum check performed */ ++#define AVE_STS_CSER BIT(20) /* Checksum error detected */ ++#define AVE_STS_PKTLEN_RX_MASK GENMASK(10, 0) ++ ++/* Packet filter */ ++#define AVE_PFMBYTE_MASK0 (GENMASK(31, 8) | GENMASK(5, 0)) ++#define AVE_PFMBYTE_MASK1 GENMASK(25, 0) ++#define AVE_PFMBIT_MASK GENMASK(15, 0) ++ ++#define AVE_PF_SIZE 17 /* Number of all packet filter */ ++#define AVE_PF_MULTICAST_SIZE 7 /* Number of multicast filter */ ++ ++#define AVE_PFNUM_FILTER 0 /* No.0 */ ++#define AVE_PFNUM_UNICAST 1 /* No.1 */ ++#define AVE_PFNUM_BROADCAST 2 /* No.2 */ ++#define AVE_PFNUM_MULTICAST 11 /* No.11-17 */ ++ ++/* NETIF Message control */ ++#define AVE_DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | \ ++ NETIF_MSG_PROBE | \ ++ NETIF_MSG_LINK | \ ++ NETIF_MSG_TIMER | \ ++ NETIF_MSG_IFDOWN | \ ++ NETIF_MSG_IFUP | \ ++ NETIF_MSG_RX_ERR | \ ++ NETIF_MSG_TX_ERR) ++ ++/* Parameter for descriptor */ ++#define AVE_NR_TXDESC 64 /* Tx descriptor */ ++#define AVE_NR_RXDESC 256 /* Rx descriptor */ ++ ++#define AVE_DESC_OFS_CMDSTS 0 ++#define AVE_DESC_OFS_ADDRL 4 ++#define AVE_DESC_OFS_ADDRU 8 ++ ++/* Parameter for ethernet frame */ ++#define AVE_MAX_ETHFRAME 1518 ++#define AVE_FRAME_HEADROOM 2 ++ ++/* Parameter for interrupt */ ++#define AVE_INTM_COUNT 20 ++#define AVE_FORCE_TXINTCNT 1 ++ ++/* SG */ ++#define SG_ETPINMODE 0x540 ++#define SG_ETPINMODE_EXTPHY BIT(1) /* for LD11 */ ++#define SG_ETPINMODE_RMII(ins) BIT(ins) ++ ++#define IS_DESC_64BIT(p) ((p)->data->is_desc_64bit) ++ ++#define AVE_MAX_CLKS 4 ++#define 
AVE_MAX_RSTS 2 ++ ++enum desc_id { ++ AVE_DESCID_RX, ++ AVE_DESCID_TX, ++}; ++ ++enum desc_state { ++ AVE_DESC_RX_PERMIT, ++ AVE_DESC_RX_SUSPEND, ++ AVE_DESC_START, ++ AVE_DESC_STOP, ++}; ++ ++struct ave_desc { ++ struct sk_buff *skbs; ++ dma_addr_t skbs_dma; ++ size_t skbs_dmalen; ++}; ++ ++struct ave_desc_info { ++ u32 ndesc; /* number of descriptor */ ++ u32 daddr; /* start address of descriptor */ ++ u32 proc_idx; /* index of processing packet */ ++ u32 done_idx; /* index of processed packet */ ++ struct ave_desc *desc; /* skb info related descriptor */ ++}; ++ ++struct ave_stats { ++ struct u64_stats_sync syncp; ++ u64 packets; ++ u64 bytes; ++ u64 errors; ++ u64 dropped; ++ u64 collisions; ++ u64 fifo_errors; ++}; ++ ++struct ave_private { ++ void __iomem *base; ++ int irq; ++ int phy_id; ++ unsigned int desc_size; ++ u32 msg_enable; ++ int nclks; ++ struct clk *clk[AVE_MAX_CLKS]; ++ int nrsts; ++ struct reset_control *rst[AVE_MAX_RSTS]; ++ phy_interface_t phy_mode; ++ struct phy_device *phydev; ++ struct mii_bus *mdio; ++ struct regmap *regmap; ++ unsigned int pinmode_mask; ++ unsigned int pinmode_val; ++ u32 wolopts; ++ ++ /* stats */ ++ struct ave_stats stats_rx; ++ struct ave_stats stats_tx; ++ ++ /* NAPI support */ ++ struct net_device *ndev; ++ struct napi_struct napi_rx; ++ struct napi_struct napi_tx; ++ ++ /* descriptor */ ++ struct ave_desc_info rx; ++ struct ave_desc_info tx; ++ ++ /* flow control */ ++ int pause_auto; ++ int pause_rx; ++ int pause_tx; ++ ++ const struct ave_soc_data *data; ++}; ++ ++struct ave_soc_data { ++ bool is_desc_64bit; ++ const char *clock_names[AVE_MAX_CLKS]; ++ const char *reset_names[AVE_MAX_RSTS]; ++ int (*get_pinmode)(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg); ++}; ++ ++static u32 ave_desc_read(struct net_device *ndev, enum desc_id id, int entry, ++ int offset) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ return readl(priv->base + addr); ++} ++ ++static u32 ave_desc_read_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry) ++{ ++ return ave_desc_read(ndev, id, entry, AVE_DESC_OFS_CMDSTS); ++} ++ ++static void ave_desc_write(struct net_device *ndev, enum desc_id id, ++ int entry, int offset, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? 
priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ writel(val, priv->base + addr); ++} ++ ++static void ave_desc_write_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry, u32 val) ++{ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_CMDSTS, val); ++} ++ ++static void ave_desc_write_addr(struct net_device *ndev, enum desc_id id, ++ int entry, dma_addr_t paddr) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_ADDRL, ++ lower_32_bits(paddr)); ++ if (IS_DESC_64BIT(priv)) ++ ave_desc_write(ndev, id, ++ entry, AVE_DESC_OFS_ADDRU, ++ upper_32_bits(paddr)); ++} ++ ++static u32 ave_irq_disable_all(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 ret; ++ ++ ret = readl(priv->base + AVE_GIMR); ++ writel(0, priv->base + AVE_GIMR); ++ ++ return ret; ++} ++ ++static void ave_irq_restore(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(val, priv->base + AVE_GIMR); ++} ++ ++static void ave_irq_enable(struct net_device *ndev, u32 bitflag) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(readl(priv->base + AVE_GIMR) | bitflag, priv->base + AVE_GIMR); ++ writel(bitflag, priv->base + AVE_GISR); ++} ++ ++static void ave_hw_write_macaddr(struct net_device *ndev, ++ const unsigned char *mac_addr, ++ int reg1, int reg2) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(mac_addr[0] | mac_addr[1] << 8 | ++ mac_addr[2] << 16 | mac_addr[3] << 24, priv->base + reg1); ++ writel(mac_addr[4] | mac_addr[5] << 8, priv->base + reg2); ++} ++ ++static void ave_hw_read_version(struct net_device *ndev, char *buf, int len) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 major, minor, vr; ++ ++ vr = readl(priv->base + AVE_VR); ++ major = (vr & GENMASK(15, 8)) >> 8; ++ minor = (vr & GENMASK(7, 0)); ++ snprintf(buf, len, "v%u.%u", major, minor); ++} ++ ++static void ave_ethtool_get_drvinfo(struct net_device *ndev, ++ struct ethtool_drvinfo *info) ++{ ++ struct device *dev = ndev->dev.parent; ++ ++ strlcpy(info->driver, dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(dev), sizeof(info->bus_info)); ++ ave_hw_read_version(ndev, info->fw_version, sizeof(info->fw_version)); ++} ++ ++static u32 ave_ethtool_get_msglevel(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ return priv->msg_enable; ++} ++ ++static void ave_ethtool_set_msglevel(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ priv->msg_enable = val; ++} ++ ++static void ave_ethtool_get_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ wol->supported = 0; ++ wol->wolopts = 0; ++ ++ if (ndev->phydev) ++ phy_ethtool_get_wol(ndev->phydev, wol); ++} ++ ++static int __ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ if (!ndev->phydev || ++ (wol->wolopts & (WAKE_ARP | WAKE_MAGICSECURE))) ++ return -EOPNOTSUPP; ++ ++ return phy_ethtool_set_wol(ndev->phydev, wol); ++} ++ ++static int ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ int ret; ++ ++ ret = __ave_ethtool_set_wol(ndev, wol); ++ if (!ret) ++ device_set_wakeup_enable(&ndev->dev, !!wol->wolopts); ++ ++ return ret; ++} ++ ++static void ave_ethtool_get_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ pause->autoneg = priv->pause_auto; ++ 
pause->rx_pause = priv->pause_rx; ++ pause->tx_pause = priv->pause_tx; ++} ++ ++static int ave_ethtool_set_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ ++ if (!phydev) ++ return -EINVAL; ++ ++ priv->pause_auto = pause->autoneg; ++ priv->pause_rx = pause->rx_pause; ++ priv->pause_tx = pause->tx_pause; ++ ++ phy_set_asym_pause(phydev, pause->rx_pause, pause->tx_pause); ++ ++ return 0; ++} ++ ++static const struct ethtool_ops ave_ethtool_ops = { ++ .get_link_ksettings = phy_ethtool_get_link_ksettings, ++ .set_link_ksettings = phy_ethtool_set_link_ksettings, ++ .get_drvinfo = ave_ethtool_get_drvinfo, ++ .nway_reset = phy_ethtool_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_msglevel = ave_ethtool_get_msglevel, ++ .set_msglevel = ave_ethtool_set_msglevel, ++ .get_wol = ave_ethtool_get_wol, ++ .set_wol = ave_ethtool_set_wol, ++ .get_pauseparam = ave_ethtool_get_pauseparam, ++ .set_pauseparam = ave_ethtool_set_pauseparam, ++}; ++ ++static int ave_mdiobus_read(struct mii_bus *bus, int phyid, int regnum) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* read request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_RREQ) & ~AVE_MDIOCTR_WREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) { ++ netdev_err(ndev, "failed to read (phy:%d reg:%x)\n", ++ phyid, regnum); ++ return ret; ++ } ++ ++ return readl(priv->base + AVE_MDIORDR) & GENMASK(15, 0); ++} ++ ++static int ave_mdiobus_write(struct mii_bus *bus, int phyid, int regnum, ++ u16 val) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* write data */ ++ writel(val, priv->base + AVE_MDIOWDR); ++ ++ /* write request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_WREQ) & ~AVE_MDIOCTR_RREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) ++ netdev_err(ndev, "failed to write (phy:%d reg:%x)\n", ++ phyid, regnum); ++ ++ return ret; ++} ++ ++static int ave_dma_map(struct net_device *ndev, struct ave_desc *desc, ++ void *ptr, size_t len, enum dma_data_direction dir, ++ dma_addr_t *paddr) ++{ ++ dma_addr_t map_addr; ++ ++ map_addr = dma_map_single(ndev->dev.parent, ptr, len, dir); ++ if (unlikely(dma_mapping_error(ndev->dev.parent, map_addr))) ++ return -ENOMEM; ++ ++ desc->skbs_dma = map_addr; ++ desc->skbs_dmalen = len; ++ *paddr = map_addr; ++ ++ return 0; ++} ++ ++static void ave_dma_unmap(struct net_device *ndev, struct ave_desc *desc, ++ enum dma_data_direction dir) ++{ ++ if (!desc->skbs_dma) ++ return; ++ ++ dma_unmap_single(ndev->dev.parent, ++ desc->skbs_dma, desc->skbs_dmalen, dir); ++ desc->skbs_dma = 0; ++} ++ ++/* Prepare Rx descriptor and memory */ ++static int ave_rxdesc_prepare(struct net_device *ndev, int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct sk_buff *skb; ++ dma_addr_t paddr; ++ int ret; ++ ++ skb = priv->rx.desc[entry].skbs; ++ if (!skb) { ++ 
skb = netdev_alloc_skb(ndev, AVE_MAX_ETHFRAME); ++ if (!skb) { ++ netdev_err(ndev, "can't allocate skb for Rx\n"); ++ return -ENOMEM; ++ } ++ skb->data += AVE_FRAME_HEADROOM; ++ skb->tail += AVE_FRAME_HEADROOM; ++ } ++ ++ /* set disable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_STS_OWN); ++ ++ /* map Rx buffer ++ * Rx buffer set to the Rx descriptor has two restrictions: ++ * - Rx buffer address is 4 byte aligned. ++ * - Rx buffer begins with 2 byte headroom, and data will be put from ++ * (buffer + 2). ++ * To satisfy this, specify the address to put back the buffer ++ * pointer advanced by AVE_FRAME_HEADROOM, and expand the map size ++ * by AVE_FRAME_HEADROOM. ++ */ ++ ret = ave_dma_map(ndev, &priv->rx.desc[entry], ++ skb->data - AVE_FRAME_HEADROOM, ++ AVE_MAX_ETHFRAME + AVE_FRAME_HEADROOM, ++ DMA_FROM_DEVICE, &paddr); ++ if (ret) { ++ netdev_err(ndev, "can't map skb for Rx\n"); ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ priv->rx.desc[entry].skbs = skb; ++ ++ /* set buffer pointer */ ++ ave_desc_write_addr(ndev, AVE_DESCID_RX, entry, paddr); ++ ++ /* set enable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_MAX_ETHFRAME); ++ ++ return ret; ++} ++ ++/* Switch state of descriptor */ ++static int ave_desc_switch(struct net_device *ndev, enum desc_state state) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ u32 val; ++ ++ switch (state) { ++ case AVE_DESC_START: ++ writel(AVE_DESCC_TD | AVE_DESCC_RD0, priv->base + AVE_DESCC); ++ break; ++ ++ case AVE_DESC_STOP: ++ writel(0, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, !val, ++ 150, 15000)) { ++ netdev_err(ndev, "can't stop descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_SUSPEND: ++ val = readl(priv->base + AVE_DESCC); ++ val |= AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, ++ val & (AVE_DESCC_RDSTP << 16), ++ 150, 150000)) { ++ netdev_err(ndev, "can't suspend descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_PERMIT: ++ val = readl(priv->base + AVE_DESCC); ++ val &= ~AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int ave_tx_complete(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ unsigned int nr_freebuf = 0; ++ unsigned int tx_packets = 0; ++ unsigned int tx_bytes = 0; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ ++ /* free pre-stored skb from done_idx to proc_idx */ ++ while (proc_idx != done_idx) { ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_TX, done_idx); ++ ++ /* do nothing if owner is HW (==1 for Tx) */ ++ if (cmdsts & AVE_STS_OWN) ++ break; ++ ++ /* check Tx status and updates statistics */ ++ if (cmdsts & AVE_STS_OK) { ++ tx_bytes += cmdsts & AVE_STS_PKTLEN_TX_MASK; ++ /* success */ ++ if (cmdsts & AVE_STS_LAST) ++ tx_packets++; ++ } else { ++ /* error */ ++ if (cmdsts & AVE_STS_LAST) { ++ priv->stats_tx.errors++; ++ if (cmdsts & (AVE_STS_OWC | AVE_STS_EC)) ++ priv->stats_tx.collisions++; ++ } ++ } ++ ++ /* release skb */ ++ if (priv->tx.desc[done_idx].skbs) { ++ ave_dma_unmap(ndev, &priv->tx.desc[done_idx], ++ DMA_TO_DEVICE); ++ 
dev_consume_skb_any(priv->tx.desc[done_idx].skbs); ++ priv->tx.desc[done_idx].skbs = NULL; ++ nr_freebuf++; ++ } ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->tx.done_idx = done_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_tx.syncp); ++ priv->stats_tx.packets += tx_packets; ++ priv->stats_tx.bytes += tx_bytes; ++ u64_stats_update_end(&priv->stats_tx.syncp); ++ ++ /* wake queue for freeing buffer */ ++ if (unlikely(netif_queue_stopped(ndev)) && nr_freebuf) ++ netif_wake_queue(ndev); ++ ++ return nr_freebuf; ++} ++ ++static int ave_rx_receive(struct net_device *ndev, int num) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int rx_packets = 0; ++ unsigned int rx_bytes = 0; ++ u32 proc_idx, done_idx; ++ struct sk_buff *skb; ++ unsigned int pktlen; ++ int restpkt, npkts; ++ u32 ndesc, cmdsts; ++ ++ proc_idx = priv->rx.proc_idx; ++ done_idx = priv->rx.done_idx; ++ ndesc = priv->rx.ndesc; ++ restpkt = ((proc_idx + ndesc - 1) - done_idx) % ndesc; ++ ++ for (npkts = 0; npkts < num; npkts++) { ++ /* we can't receive more packet, so fill desc quickly */ ++ if (--restpkt < 0) ++ break; ++ ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_RX, proc_idx); ++ ++ /* do nothing if owner is HW (==0 for Rx) */ ++ if (!(cmdsts & AVE_STS_OWN)) ++ break; ++ ++ if (!(cmdsts & AVE_STS_OK)) { ++ priv->stats_rx.errors++; ++ proc_idx = (proc_idx + 1) % ndesc; ++ continue; ++ } ++ ++ pktlen = cmdsts & AVE_STS_PKTLEN_RX_MASK; ++ ++ /* get skbuff for rx */ ++ skb = priv->rx.desc[proc_idx].skbs; ++ priv->rx.desc[proc_idx].skbs = NULL; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[proc_idx], DMA_FROM_DEVICE); ++ ++ skb->dev = ndev; ++ skb_put(skb, pktlen); ++ skb->protocol = eth_type_trans(skb, ndev); ++ ++ if ((cmdsts & AVE_STS_CSSV) && (!(cmdsts & AVE_STS_CSER))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++ rx_packets++; ++ rx_bytes += pktlen; ++ ++ netif_receive_skb(skb); ++ ++ proc_idx = (proc_idx + 1) % ndesc; ++ } ++ ++ priv->rx.proc_idx = proc_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_rx.syncp); ++ priv->stats_rx.packets += rx_packets; ++ priv->stats_rx.bytes += rx_bytes; ++ u64_stats_update_end(&priv->stats_rx.syncp); ++ ++ /* refill the Rx buffers */ ++ while (proc_idx != done_idx) { ++ if (ave_rxdesc_prepare(ndev, done_idx)) ++ break; ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->rx.done_idx = done_idx; ++ ++ return npkts; ++} ++ ++static int ave_napi_poll_rx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_rx); ++ ndev = priv->ndev; ++ ++ num = ave_rx_receive(ndev, budget); ++ if (num < budget) { ++ napi_complete_done(napi, num); ++ ++ /* enable Rx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_RXIINT); ++ } ++ ++ return num; ++} ++ ++static int ave_napi_poll_tx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_tx); ++ ndev = priv->ndev; ++ ++ num = ave_tx_complete(ndev); ++ napi_complete(napi); ++ ++ /* enable Tx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_TX); ++ ++ return num; ++} ++ ++static void ave_global_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ /* set config register */ ++ val = AVE_CFGR_FLE | AVE_CFGR_IPFCEN | AVE_CFGR_CHE; ++ if (!phy_interface_mode_is_rgmii(priv->phy_mode)) ++ val |= 
AVE_CFGR_MII; ++ writel(val, priv->base + AVE_CFGR); ++ ++ /* reset RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val &= ~AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ /* assert reset */ ++ writel(AVE_GRR_GRST | AVE_GRR_PHYRST, priv->base + AVE_GRR); ++ msleep(20); ++ ++ /* 1st, negate PHY reset only */ ++ writel(AVE_GRR_GRST, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val |= AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ ave_irq_disable_all(ndev); ++} ++ ++static void ave_rxfifo_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 rxcr_org; ++ ++ /* save and disable MAC receive op */ ++ rxcr_org = readl(priv->base + AVE_RXCR); ++ writel(rxcr_org & (~AVE_RXCR_RXEN), priv->base + AVE_RXCR); ++ ++ /* suspend Rx descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_SUSPEND); ++ ++ /* receive all packets before descriptor starts */ ++ ave_rx_receive(ndev, priv->rx.ndesc); ++ ++ /* assert reset */ ++ writel(AVE_GRR_RXFFR, priv->base + AVE_GRR); ++ udelay(50); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ udelay(20); ++ ++ /* negate interrupt status */ ++ writel(AVE_GI_RXOVF, priv->base + AVE_GISR); ++ ++ /* permit descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_PERMIT); ++ ++ /* restore MAC reccieve op */ ++ writel(rxcr_org, priv->base + AVE_RXCR); ++} ++ ++static irqreturn_t ave_irq_handler(int irq, void *netdev) ++{ ++ struct net_device *ndev = (struct net_device *)netdev; ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 gimr_val, gisr_val; ++ ++ gimr_val = ave_irq_disable_all(ndev); ++ ++ /* get interrupt status */ ++ gisr_val = readl(priv->base + AVE_GISR); ++ ++ /* PHY */ ++ if (gisr_val & AVE_GI_PHY) ++ writel(AVE_GI_PHY, priv->base + AVE_GISR); ++ ++ /* check exceeding packet */ ++ if (gisr_val & AVE_GI_RXERR) { ++ writel(AVE_GI_RXERR, priv->base + AVE_GISR); ++ netdev_err(ndev, "receive a packet exceeding frame buffer\n"); ++ } ++ ++ gisr_val &= gimr_val; ++ if (!gisr_val) ++ goto exit_isr; ++ ++ /* RxFIFO overflow */ ++ if (gisr_val & AVE_GI_RXOVF) { ++ priv->stats_rx.fifo_errors++; ++ ave_rxfifo_reset(ndev); ++ goto exit_isr; ++ } ++ ++ /* Rx drop */ ++ if (gisr_val & AVE_GI_RXDROP) { ++ priv->stats_rx.dropped++; ++ writel(AVE_GI_RXDROP, priv->base + AVE_GISR); ++ } ++ ++ /* Rx interval */ ++ if (gisr_val & AVE_GI_RXIINT) { ++ napi_schedule(&priv->napi_rx); ++ /* still force to disable Rx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_RXIINT; ++ } ++ ++ /* Tx completed */ ++ if (gisr_val & AVE_GI_TX) { ++ napi_schedule(&priv->napi_tx); ++ /* still force to disable Tx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_TX; ++ } ++ ++exit_isr: ++ ave_irq_restore(ndev, gimr_val); ++ ++ return IRQ_HANDLED; ++} ++ ++static int ave_pfsel_start(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val | BIT(entry), priv->base + AVE_PFEN); ++ ++ return 0; ++} ++ ++static int ave_pfsel_stop(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val & ~BIT(entry), priv->base + AVE_PFEN); 
++ ++ return 0; ++} ++ ++static int ave_pfsel_set_macaddr(struct net_device *ndev, ++ unsigned int entry, ++ const unsigned char *mac_addr, ++ unsigned int set_size) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ if (WARN_ON(set_size > 6)) ++ return -EINVAL; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set MAC address for the filter */ ++ ave_hw_write_macaddr(ndev, mac_addr, ++ AVE_PKTF(entry), AVE_PKTF(entry) + 4); ++ ++ /* set byte mask */ ++ writel(GENMASK(31, set_size) & AVE_PFMBYTE_MASK0, ++ priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to ring 0 */ ++ writel(0, priv->base + AVE_PFSEL(entry)); ++ ++ /* restart filter */ ++ ave_pfsel_start(ndev, entry); ++ ++ return 0; ++} ++ ++static void ave_pfsel_set_promisc(struct net_device *ndev, ++ unsigned int entry, u32 rxring) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set byte mask */ ++ writel(AVE_PFMBYTE_MASK0, priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to rxring */ ++ writel(rxring, priv->base + AVE_PFSEL(entry)); ++ ++ ave_pfsel_start(ndev, entry); ++} ++ ++static void ave_pfsel_init(struct net_device *ndev) ++{ ++ unsigned char bcast_mac[ETH_ALEN]; ++ int i; ++ ++ eth_broadcast_addr(bcast_mac); ++ ++ for (i = 0; i < AVE_PF_SIZE; i++) ++ ave_pfsel_stop(ndev, i); ++ ++ /* promiscious entry, select ring 0 */ ++ ave_pfsel_set_promisc(ndev, AVE_PFNUM_FILTER, 0); ++ ++ /* unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++ ++ /* broadcast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_BROADCAST, bcast_mac, 6); ++} ++ ++static void ave_phy_adjust_link(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ u32 val, txcr, rxcr, rxcr_org; ++ u16 rmt_adv = 0, lcl_adv = 0; ++ u8 cap; ++ ++ /* set RGMII speed */ ++ val = readl(priv->base + AVE_TXCR); ++ val &= ~(AVE_TXCR_TXSPD_100 | AVE_TXCR_TXSPD_1G); ++ ++ if (phy_interface_is_rgmii(phydev) && phydev->speed == SPEED_1000) ++ val |= AVE_TXCR_TXSPD_1G; ++ else if (phydev->speed == SPEED_100) ++ val |= AVE_TXCR_TXSPD_100; ++ ++ writel(val, priv->base + AVE_TXCR); ++ ++ /* set RMII speed (100M/10M only) */ ++ if (!phy_interface_is_rgmii(phydev)) { ++ val = readl(priv->base + AVE_LINKSEL); ++ if (phydev->speed == SPEED_10) ++ val &= ~AVE_LINKSEL_100M; ++ else ++ val |= AVE_LINKSEL_100M; ++ writel(val, priv->base + AVE_LINKSEL); ++ } ++ ++ /* check current RXCR/TXCR */ ++ rxcr = readl(priv->base + AVE_RXCR); ++ txcr = readl(priv->base + AVE_TXCR); ++ rxcr_org = rxcr; ++ ++ if (phydev->duplex) { ++ rxcr |= AVE_RXCR_FDUPEN; ++ ++ if (phydev->pause) ++ rmt_adv |= LPA_PAUSE_CAP; ++ if (phydev->asym_pause) ++ rmt_adv |= LPA_PAUSE_ASYM; ++ ++ lcl_adv = linkmode_adv_to_lcl_adv_t(phydev->advertising); ++ cap = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv); ++ if (cap & FLOW_CTRL_TX) ++ txcr |= AVE_TXCR_FLOCTR; ++ else ++ txcr &= ~AVE_TXCR_FLOCTR; ++ if (cap & FLOW_CTRL_RX) ++ rxcr |= AVE_RXCR_FLOCTR; ++ else ++ rxcr &= ~AVE_RXCR_FLOCTR; ++ } else { ++ rxcr &= ~AVE_RXCR_FDUPEN; ++ rxcr &= 
~AVE_RXCR_FLOCTR; ++ txcr &= ~AVE_TXCR_FLOCTR; ++ } ++ ++ if (rxcr_org != rxcr) { ++ /* disable Rx mac */ ++ writel(rxcr & ~AVE_RXCR_RXEN, priv->base + AVE_RXCR); ++ /* change and enable TX/Rx mac */ ++ writel(txcr, priv->base + AVE_TXCR); ++ writel(rxcr, priv->base + AVE_RXCR); ++ } ++ ++ phy_print_status(phydev); ++} ++ ++static void ave_macaddr_init(struct net_device *ndev) ++{ ++ ave_hw_write_macaddr(ndev, ndev->dev_addr, AVE_RXMAC1R, AVE_RXMAC2R); ++ ++ /* pfsel unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++} ++ ++static int ave_init(struct net_device *ndev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct ave_private *priv = netdev_priv(ndev); ++ struct device *dev = ndev->dev.parent; ++ struct device_node *np = dev->of_node; ++ struct device_node *mdio_np; ++ struct phy_device *phydev; ++ int nc, nr, ret; ++ ++ /* enable clk because of hw access until ndo_open */ ++ for (nc = 0; nc < priv->nclks; nc++) { ++ ret = clk_prepare_enable(priv->clk[nc]); ++ if (ret) { ++ dev_err(dev, "can't enable clock\n"); ++ goto out_clk_disable; ++ } ++ } ++ ++ for (nr = 0; nr < priv->nrsts; nr++) { ++ ret = reset_control_deassert(priv->rst[nr]); ++ if (ret) { ++ dev_err(dev, "can't deassert reset\n"); ++ goto out_reset_assert; ++ } ++ } ++ ++ ret = regmap_update_bits(priv->regmap, SG_ETPINMODE, ++ priv->pinmode_mask, priv->pinmode_val); ++ if (ret) ++ goto out_reset_assert; ++ ++ ave_global_reset(ndev); ++ ++ mdio_np = of_get_child_by_name(np, "mdio"); ++ if (!mdio_np) { ++ dev_err(dev, "mdio node not found\n"); ++ ret = -EINVAL; ++ goto out_reset_assert; ++ } ++ ret = of_mdiobus_register(priv->mdio, mdio_np); ++ of_node_put(mdio_np); ++ if (ret) { ++ dev_err(dev, "failed to register mdiobus\n"); ++ goto out_reset_assert; ++ } ++ ++ phydev = of_phy_get_and_connect(ndev, np, ave_phy_adjust_link); ++ if (!phydev) { ++ dev_err(dev, "could not attach to PHY\n"); ++ ret = -ENODEV; ++ goto out_mdio_unregister; ++ } ++ ++ priv->phydev = phydev; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ device_set_wakeup_capable(&ndev->dev, !!wol.supported); ++ ++ /* set wol initial state disabled */ ++ wol.wolopts = 0; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (!phy_interface_is_rgmii(phydev)) ++ phy_set_max_speed(phydev, SPEED_100); ++ ++ phy_support_asym_pause(phydev); ++ ++ phydev->mac_managed_pm = true; ++ ++ phy_attached_info(phydev); ++ ++ return 0; ++ ++out_mdio_unregister: ++ mdiobus_unregister(priv->mdio); ++out_reset_assert: ++ while (--nr >= 0) ++ reset_control_assert(priv->rst[nr]); ++out_clk_disable: ++ while (--nc >= 0) ++ clk_disable_unprepare(priv->clk[nc]); ++ ++ return ret; ++} ++ ++static void ave_uninit(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int i; ++ ++ phy_disconnect(priv->phydev); ++ mdiobus_unregister(priv->mdio); ++ ++ /* disable clk because of hw access after ndo_stop */ ++ for (i = 0; i < priv->nrsts; i++) ++ reset_control_assert(priv->rst[i]); ++ for (i = 0; i < priv->nclks; i++) ++ clk_disable_unprepare(priv->clk[i]); ++} ++ ++static int ave_open(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ int ret; ++ u32 val; ++ ++ ret = request_irq(priv->irq, ave_irq_handler, IRQF_SHARED, ndev->name, ++ ndev); ++ if (ret) ++ return ret; ++ ++ priv->tx.desc = kcalloc(priv->tx.ndesc, sizeof(*priv->tx.desc), ++ GFP_KERNEL); ++ if (!priv->tx.desc) { ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ priv->rx.desc = kcalloc(priv->rx.ndesc, 
sizeof(*priv->rx.desc), ++ GFP_KERNEL); ++ if (!priv->rx.desc) { ++ kfree(priv->tx.desc); ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ /* initialize Tx work and descriptor */ ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, entry, 0); ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, entry, 0); ++ } ++ writel(AVE_TXDC_ADDR_START | ++ (((priv->tx.ndesc * priv->desc_size) << 16) & AVE_TXDC_SIZE), ++ priv->base + AVE_TXDC); ++ ++ /* initialize Rx work and descriptor */ ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (ave_rxdesc_prepare(ndev, entry)) ++ break; ++ } ++ writel(AVE_RXDC0_ADDR_START | ++ (((priv->rx.ndesc * priv->desc_size) << 16) & AVE_RXDC0_SIZE), ++ priv->base + AVE_RXDC0); ++ ++ ave_desc_switch(ndev, AVE_DESC_START); ++ ++ ave_pfsel_init(ndev); ++ ave_macaddr_init(ndev); ++ ++ /* set Rx configuration */ ++ /* full duplex, enable pause drop, enalbe flow control */ ++ val = AVE_RXCR_RXEN | AVE_RXCR_FDUPEN | AVE_RXCR_DRPEN | ++ AVE_RXCR_FLOCTR | (AVE_MAX_ETHFRAME & AVE_RXCR_MPSIZ_MASK); ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set Tx configuration */ ++ /* enable flow control, disable loopback */ ++ writel(AVE_TXCR_FLOCTR, priv->base + AVE_TXCR); ++ ++ /* enable timer, clear EN,INTM, and mask interval unit(BSCK) */ ++ val = readl(priv->base + AVE_IIRQC) & AVE_IIRQC_BSCK; ++ val |= AVE_IIRQC_EN0 | (AVE_INTM_COUNT << 16); ++ writel(val, priv->base + AVE_IIRQC); ++ ++ val = AVE_GI_RXIINT | AVE_GI_RXOVF | AVE_GI_TX | AVE_GI_RXDROP; ++ ave_irq_restore(ndev, val); ++ ++ napi_enable(&priv->napi_rx); ++ napi_enable(&priv->napi_tx); ++ ++ phy_start(ndev->phydev); ++ phy_start_aneg(ndev->phydev); ++ netif_start_queue(ndev); ++ ++ return 0; ++ ++out_free_irq: ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ return ret; ++} ++ ++static int ave_stop(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ ++ ave_irq_disable_all(ndev); ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ netif_tx_disable(ndev); ++ phy_stop(ndev->phydev); ++ napi_disable(&priv->napi_tx); ++ napi_disable(&priv->napi_rx); ++ ++ ave_desc_switch(ndev, AVE_DESC_STOP); ++ ++ /* free Tx buffer */ ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ if (!priv->tx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->tx.desc[entry], DMA_TO_DEVICE); ++ dev_kfree_skb_any(priv->tx.desc[entry].skbs); ++ priv->tx.desc[entry].skbs = NULL; ++ } ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ ++ /* free Rx buffer */ ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (!priv->rx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[entry], DMA_FROM_DEVICE); ++ dev_kfree_skb_any(priv->rx.desc[entry].skbs); ++ priv->rx.desc[entry].skbs = NULL; ++ } ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ ++ kfree(priv->tx.desc); ++ kfree(priv->rx.desc); ++ ++ return 0; ++} ++ ++static netdev_tx_t ave_start_xmit(struct sk_buff *skb, struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ int ret, freepkt; ++ dma_addr_t paddr; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ freepkt = ((done_idx + ndesc - 1) - proc_idx) % ndesc; ++ ++ /* stop queue when not enough entry */ ++ if (unlikely(freepkt < 1)) { ++ netif_stop_queue(ndev); ++ return 
NETDEV_TX_BUSY; ++ } ++ ++ /* add padding for short packet */ ++ if (skb_put_padto(skb, ETH_ZLEN)) { ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ /* map Tx buffer ++ * Tx buffer set to the Tx descriptor doesn't have any restriction. ++ */ ++ ret = ave_dma_map(ndev, &priv->tx.desc[proc_idx], ++ skb->data, skb->len, DMA_TO_DEVICE, &paddr); ++ if (ret) { ++ dev_kfree_skb_any(skb); ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ priv->tx.desc[proc_idx].skbs = skb; ++ ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, proc_idx, paddr); ++ ++ cmdsts = AVE_STS_OWN | AVE_STS_1ST | AVE_STS_LAST | ++ (skb->len & AVE_STS_PKTLEN_TX_MASK); ++ ++ /* set interrupt per AVE_FORCE_TXINTCNT or when queue is stopped */ ++ if (!(proc_idx % AVE_FORCE_TXINTCNT) || netif_queue_stopped(ndev)) ++ cmdsts |= AVE_STS_INTR; ++ ++ /* disable checksum calculation when skb doesn't calurate checksum */ ++ if (skb->ip_summed == CHECKSUM_NONE || ++ skb->ip_summed == CHECKSUM_UNNECESSARY) ++ cmdsts |= AVE_STS_NOCSUM; ++ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, proc_idx, cmdsts); ++ ++ priv->tx.proc_idx = (proc_idx + 1) % ndesc; ++ ++ return NETDEV_TX_OK; ++} ++ ++static int ave_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) ++{ ++ return phy_mii_ioctl(ndev->phydev, ifr, cmd); ++} ++ ++static const u8 v4multi_macadr[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++static const u8 v6multi_macadr[] = { 0x33, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++ ++static void ave_set_rx_mode(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct netdev_hw_addr *hw_adr; ++ int count, mc_cnt; ++ u32 val; ++ ++ /* MAC addr filter enable for promiscious mode */ ++ mc_cnt = netdev_mc_count(ndev); ++ val = readl(priv->base + AVE_RXCR); ++ if (ndev->flags & IFF_PROMISC || !mc_cnt) ++ val &= ~AVE_RXCR_AFEN; ++ else ++ val |= AVE_RXCR_AFEN; ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set all multicast address */ ++ if ((ndev->flags & IFF_ALLMULTI) || mc_cnt > AVE_PF_MULTICAST_SIZE) { ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST, ++ v4multi_macadr, 1); ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + 1, ++ v6multi_macadr, 1); ++ } else { ++ /* stop all multicast filter */ ++ for (count = 0; count < AVE_PF_MULTICAST_SIZE; count++) ++ ave_pfsel_stop(ndev, AVE_PFNUM_MULTICAST + count); ++ ++ /* set multicast addresses */ ++ count = 0; ++ netdev_for_each_mc_addr(hw_adr, ndev) { ++ if (count == mc_cnt) ++ break; ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + count, ++ hw_adr->addr, 6); ++ count++; ++ } ++ } ++} ++ ++static void ave_get_stats64(struct net_device *ndev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_rx.syncp); ++ stats->rx_packets = priv->stats_rx.packets; ++ stats->rx_bytes = priv->stats_rx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_rx.syncp, start)); ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_tx.syncp); ++ stats->tx_packets = priv->stats_tx.packets; ++ stats->tx_bytes = priv->stats_tx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_tx.syncp, start)); ++ ++ stats->rx_errors = priv->stats_rx.errors; ++ stats->tx_errors = priv->stats_tx.errors; ++ stats->rx_dropped = priv->stats_rx.dropped; ++ stats->tx_dropped = priv->stats_tx.dropped; ++ stats->rx_fifo_errors = priv->stats_rx.fifo_errors; ++ stats->collisions = priv->stats_tx.collisions; ++} ++ ++static int ave_set_mac_address(struct 
net_device *ndev, void *p) ++{ ++ int ret = eth_mac_addr(ndev, p); ++ ++ if (ret) ++ return ret; ++ ++ ave_macaddr_init(ndev); ++ ++ return 0; ++} ++ ++static const struct net_device_ops ave_netdev_ops = { ++ .ndo_init = ave_init, ++ .ndo_uninit = ave_uninit, ++ .ndo_open = ave_open, ++ .ndo_stop = ave_stop, ++ .ndo_start_xmit = ave_start_xmit, ++ .ndo_eth_ioctl = ave_ioctl, ++ .ndo_set_rx_mode = ave_set_rx_mode, ++ .ndo_get_stats64 = ave_get_stats64, ++ .ndo_set_mac_address = ave_set_mac_address, ++}; ++ ++static int ave_probe(struct platform_device *pdev) ++{ ++ const struct ave_soc_data *data; ++ struct device *dev = &pdev->dev; ++ char buf[ETHTOOL_FWVERS_LEN]; ++ struct of_phandle_args args; ++ phy_interface_t phy_mode; ++ struct ave_private *priv; ++ struct net_device *ndev; ++ struct device_node *np; ++ void __iomem *base; ++ const char *name; ++ int i, irq, ret; ++ u64 dma_mask; ++ u32 ave_id; ++ ++ data = of_device_get_match_data(dev); ++ if (WARN_ON(!data)) ++ return -EINVAL; ++ ++ np = dev->of_node; ++ ret = of_get_phy_mode(np, &phy_mode); ++ if (ret) { ++ dev_err(dev, "phy-mode not found\n"); ++ return ret; ++ } ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(base)) ++ return PTR_ERR(base); ++ ++ ndev = devm_alloc_etherdev(dev, sizeof(struct ave_private)); ++ if (!ndev) { ++ dev_err(dev, "can't allocate ethernet device\n"); ++ return -ENOMEM; ++ } ++ ++ ndev->netdev_ops = &ave_netdev_ops; ++ ndev->ethtool_ops = &ave_ethtool_ops; ++ SET_NETDEV_DEV(ndev, dev); ++ ++ ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ndev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ++ ndev->max_mtu = AVE_MAX_ETHFRAME - (ETH_HLEN + ETH_FCS_LEN); ++ ++ ret = of_get_ethdev_address(np, ndev); ++ if (ret) { ++ /* if the mac address is invalid, use random mac address */ ++ eth_hw_addr_random(ndev); ++ dev_warn(dev, "Using random MAC address: %pM\n", ++ ndev->dev_addr); ++ } ++ ++ priv = netdev_priv(ndev); ++ priv->base = base; ++ priv->irq = irq; ++ priv->ndev = ndev; ++ priv->msg_enable = netif_msg_init(-1, AVE_DEFAULT_MSG_ENABLE); ++ priv->phy_mode = phy_mode; ++ priv->data = data; ++ ++ if (IS_DESC_64BIT(priv)) { ++ priv->desc_size = AVE_DESC_SIZE_64; ++ priv->tx.daddr = AVE_TXDM_64; ++ priv->rx.daddr = AVE_RXDM_64; ++ dma_mask = DMA_BIT_MASK(64); ++ } else { ++ priv->desc_size = AVE_DESC_SIZE_32; ++ priv->tx.daddr = AVE_TXDM_32; ++ priv->rx.daddr = AVE_RXDM_32; ++ dma_mask = DMA_BIT_MASK(32); ++ } ++ ret = dma_set_mask(dev, dma_mask); ++ if (ret) ++ return ret; ++ ++ priv->tx.ndesc = AVE_NR_TXDESC; ++ priv->rx.ndesc = AVE_NR_RXDESC; ++ ++ u64_stats_init(&priv->stats_tx.syncp); ++ u64_stats_init(&priv->stats_rx.syncp); ++ ++ for (i = 0; i < AVE_MAX_CLKS; i++) { ++ name = priv->data->clock_names[i]; ++ if (!name) ++ break; ++ priv->clk[i] = devm_clk_get(dev, name); ++ if (IS_ERR(priv->clk[i])) ++ return PTR_ERR(priv->clk[i]); ++ priv->nclks++; ++ } ++ ++ for (i = 0; i < AVE_MAX_RSTS; i++) { ++ name = priv->data->reset_names[i]; ++ if (!name) ++ break; ++ priv->rst[i] = devm_reset_control_get_shared(dev, name); ++ if (IS_ERR(priv->rst[i])) ++ return PTR_ERR(priv->rst[i]); ++ priv->nrsts++; ++ } ++ ++ ret = of_parse_phandle_with_fixed_args(np, ++ "socionext,syscon-phy-mode", ++ 1, 0, &args); ++ if (ret) { ++ dev_err(dev, "can't get syscon-phy-mode property\n"); ++ return ret; ++ } ++ priv->regmap = syscon_node_to_regmap(args.np); ++ of_node_put(args.np); ++ if (IS_ERR(priv->regmap)) { ++ 
dev_err(dev, "can't map syscon-phy-mode\n"); ++ return PTR_ERR(priv->regmap); ++ } ++ ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]); ++ if (ret) { ++ dev_err(dev, "invalid phy-mode setting\n"); ++ return ret; ++ } ++ ++ priv->mdio = devm_mdiobus_alloc(dev); ++ if (!priv->mdio) ++ return -ENOMEM; ++ priv->mdio->priv = ndev; ++ priv->mdio->parent = dev; ++ priv->mdio->read = ave_mdiobus_read; ++ priv->mdio->write = ave_mdiobus_write; ++ priv->mdio->name = "uniphier-mdio"; ++ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "%s-%x", ++ pdev->name, pdev->id); ++ ++ /* Register as a NAPI supported driver */ ++ netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add_tx(ndev, &priv->napi_tx, ave_napi_poll_tx); ++ ++ platform_set_drvdata(pdev, ndev); ++ ++ ret = register_netdev(ndev); ++ if (ret) { ++ dev_err(dev, "failed to register netdevice\n"); ++ goto out_del_napi; ++ } ++ ++ /* get ID and version */ ++ ave_id = readl(priv->base + AVE_IDR); ++ ave_hw_read_version(ndev, buf, sizeof(buf)); ++ ++ dev_info(dev, "Socionext %c%c%c%c Ethernet IP %s (irq=%d, phy=%s)\n", ++ (ave_id >> 24) & 0xff, (ave_id >> 16) & 0xff, ++ (ave_id >> 8) & 0xff, (ave_id >> 0) & 0xff, ++ buf, priv->irq, phy_modes(phy_mode)); ++ ++ return 0; ++ ++out_del_napi: ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return ret; ++} ++ ++static int ave_remove(struct platform_device *pdev) ++{ ++ struct net_device *ndev = platform_get_drvdata(pdev); ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ unregister_netdev(ndev); ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int ave_suspend(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ if (netif_running(ndev)) { ++ ret = ave_stop(ndev); ++ netif_device_detach(ndev); ++ } ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ priv->wolopts = wol.wolopts; ++ ++ return ret; ++} ++ ++static int ave_resume(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ ave_global_reset(ndev); ++ ++ ret = phy_init_hw(ndev->phydev); ++ if (ret) ++ return ret; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ wol.wolopts = priv->wolopts; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (ndev->phydev) { ++ ret = phy_resume(ndev->phydev); ++ if (ret) ++ return ret; ++ } ++ ++ if (netif_running(ndev)) { ++ ret = ave_open(ndev); ++ netif_device_attach(ndev); ++ } ++ ++ return ret; ++} ++ ++static SIMPLE_DEV_PM_OPS(ave_pm_ops, ave_suspend, ave_resume); ++#define AVE_PM_OPS (&ave_pm_ops) ++#else ++#define AVE_PM_OPS NULL ++#endif ++ ++static int ave_pro4_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld11_get_pinmode(struct ave_private *priv, ++ phy_interface_t 
phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_INTERNAL: ++ priv->pinmode_val = 0; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld20_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_pxs3_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 1) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(arg); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(arg); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct ave_soc_data ave_pro4_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "gio", "ether", "ether-gb", "ether-phy", ++ }, ++ .reset_names = { ++ "gio", "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs2_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld11_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld11_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld20_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld20_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs3_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_nx1_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct of_device_id of_ave_match[] = { ++ { ++ .compatible = "socionext,uniphier-pro4-ave4", ++ .data = &ave_pro4_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs2-ave4", ++ .data = &ave_pxs2_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld11-ave4", ++ .data = &ave_ld11_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld20-ave4", ++ .data = &ave_ld20_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs3-ave4", ++ .data = &ave_pxs3_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-nx1-ave4", ++ .data = &ave_nx1_data, ++ }, ++ { /* Sentinel */ } ++}; ++MODULE_DEVICE_TABLE(of, of_ave_match); ++ ++static struct platform_driver ave_driver = { ++ .probe = ave_probe, ++ .remove = ave_remove, ++ .driver = { ++ .name = "ave", ++ .pm = AVE_PM_OPS, ++ 
.of_match_table = of_ave_match, ++ }, ++}; ++module_platform_driver(ave_driver); ++ ++MODULE_AUTHOR("Kunihiko Hayashi "); ++MODULE_DESCRIPTION("Socionext UniPhier AVE ethernet driver"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c +--- linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats cpu_stats = per_cpu_ptr(ndev_priv->stats, cpu); do { @@ -2501,11 +14651,10 @@ index f4a6b590a1e39..1b62400c19049 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c -index b15d44261e766..68c7b2c05aab3 100644 ---- a/drivers/net/ethernet/ti/netcp_core.c -+++ b/drivers/net/ethernet/ti/netcp_core.c -@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/ti/netcp_core.c linux/drivers/net/ethernet/ti/netcp_core.c +--- linux.orig/drivers/net/ethernet/ti/netcp_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/netcp_core.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, unsigned int start; do { @@ -2526,11 +14675,10 @@ index b15d44261e766..68c7b2c05aab3 100644 stats->rx_packets = rxpackets; stats->rx_bytes = rxbytes; -diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c -index 509c5e9b29dfa..5301c907b5ae3 100644 ---- a/drivers/net/ethernet/via/via-rhine.c -+++ b/drivers/net/ethernet/via/via-rhine.c -@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/via/via-rhine.c linux/drivers/net/ethernet/via/via-rhine.c +--- linux.orig/drivers/net/ethernet/via/via-rhine.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/via/via-rhine.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2551,11 +14699,10 @@ index 509c5e9b29dfa..5301c907b5ae3 100644 } static void rhine_set_rx_mode(struct net_device *dev) -diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -index 9262988d26a32..2c233b59e7d93 100644 ---- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +--- linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2576,11 +14723,10 @@ index 9262988d26a32..2c233b59e7d93 100644 } static const struct net_device_ops axienet_netdev_ops = { -diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c -index 8113ac17ab70a..2fd8b9c51e839 100644 ---- a/drivers/net/hyperv/netvsc_drv.c -+++ b/drivers/net/hyperv/netvsc_drv.c -@@ 
-1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct net_device *net, +diff -rupN linux.orig/drivers/net/hyperv/netvsc_drv.c linux/drivers/net/hyperv/netvsc_drv.c +--- linux.orig/drivers/net/hyperv/netvsc_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/hyperv/netvsc_drv.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct n unsigned int start; do { @@ -2595,7 +14741,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct unsigned int start; do { @@ -2610,7 +14756,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_packets = this_tot->vf_rx_packets; this_tot->tx_packets = this_tot->vf_tx_packets; this_tot->rx_bytes = this_tot->vf_rx_bytes; -@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct tx_stats = &nvchan->tx_stats; do { @@ -2635,7 +14781,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_bytes += bytes; this_tot->rx_packets += packets; -@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct net_device *net, +@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct ne tx_stats = &nvchan->tx_stats; do { @@ -2661,7 +14807,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 t->rx_bytes += bytes; t->rx_packets += packets; -@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, +@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(str tx_stats = &nvdev->chan_table[j].tx_stats; do { @@ -2690,11 +14836,10 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 data[i++] = packets; data[i++] = bytes; data[i++] = xdp_drop; -diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c -index 1c64d5347b8e0..78253ad57b2ef 100644 ---- a/drivers/net/ifb.c -+++ b/drivers/net/ifb.c -@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ifb.c linux/drivers/net/ifb.c +--- linux.orig/drivers/net/ifb.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ifb.c 2022-12-04 10:40:26.696034096 -0500 +@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_devic for (i = 0; i < dev->num_tx_queues; i++,txp++) { do { @@ -2717,7 +14862,7 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **data, +@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **da int j; do { @@ -2732,11 +14877,10 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 *data += IFB_Q_STATS_LEN; } -diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c -index 49ba8a50dfb1e..8a58d74638cd8 100644 ---- a/drivers/net/ipvlan/ipvlan_main.c -+++ b/drivers/net/ipvlan/ipvlan_main.c -@@ -299,13 +299,13 @@ static void ipvlan_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c linux/drivers/net/ipvlan/ipvlan_main.c +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -301,13 +301,13 @@ static void ipvlan_get_stats64(struct ne for_each_possible_cpu(idx) { pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); do { @@ -2752,11 +14896,1096 @@ index 49ba8a50dfb1e..8a58d74638cd8 100644 strt)); s->rx_packets += rx_pkts; -diff 
--git a/drivers/net/loopback.c b/drivers/net/loopback.c -index 14e8d04cb4347..c4ad98d39ea60 100644 ---- a/drivers/net/loopback.c -+++ b/drivers/net/loopback.c -@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes) +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig linux/drivers/net/ipvlan/ipvlan_main.c.orig +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,1082 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* Copyright (c) 2014 Mahesh Bandewar ++ */ ++ ++#include ++ ++#include "ipvlan.h" ++ ++static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan; ++ unsigned int flags; ++ int err; ++ ++ ASSERT_RTNL(); ++ if (port->mode != nval) { ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { ++ err = dev_change_flags(ipvlan->dev, ++ flags | IFF_NOARP, ++ extack); ++ } else { ++ err = dev_change_flags(ipvlan->dev, ++ flags & ~IFF_NOARP, ++ extack); ++ } ++ if (unlikely(err)) ++ goto fail; ++ } ++ if (nval == IPVLAN_MODE_L3S) { ++ /* New mode is L3S */ ++ err = ipvlan_l3s_register(port); ++ if (err) ++ goto fail; ++ } else if (port->mode == IPVLAN_MODE_L3S) { ++ /* Old mode was L3S */ ++ ipvlan_l3s_unregister(port); ++ } ++ port->mode = nval; ++ } ++ return 0; ++ ++fail: ++ /* Undo the flags changes that have been done so far. */ ++ list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (port->mode == IPVLAN_MODE_L3 || ++ port->mode == IPVLAN_MODE_L3S) ++ dev_change_flags(ipvlan->dev, flags | IFF_NOARP, ++ NULL); ++ else ++ dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, ++ NULL); ++ } ++ ++ return err; ++} ++ ++static int ipvlan_port_create(struct net_device *dev) ++{ ++ struct ipvl_port *port; ++ int err, idx; ++ ++ port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); ++ if (!port) ++ return -ENOMEM; ++ ++ write_pnet(&port->pnet, dev_net(dev)); ++ port->dev = dev; ++ port->mode = IPVLAN_MODE_L3; ++ INIT_LIST_HEAD(&port->ipvlans); ++ for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) ++ INIT_HLIST_HEAD(&port->hlhead[idx]); ++ ++ skb_queue_head_init(&port->backlog); ++ INIT_WORK(&port->wq, ipvlan_process_multicast); ++ ida_init(&port->ida); ++ port->dev_id_start = 1; ++ ++ err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); ++ if (err) ++ goto err; ++ ++ netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); ++ return 0; ++ ++err: ++ kfree(port); ++ return err; ++} ++ ++static void ipvlan_port_destroy(struct net_device *dev) ++{ ++ struct ipvl_port *port = ipvlan_port_get_rtnl(dev); ++ struct sk_buff *skb; ++ ++ netdev_put(dev, &port->dev_tracker); ++ if (port->mode == IPVLAN_MODE_L3S) ++ ipvlan_l3s_unregister(port); ++ netdev_rx_handler_unregister(dev); ++ cancel_work_sync(&port->wq); ++ while ((skb = __skb_dequeue(&port->backlog)) != NULL) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ } ++ ida_destroy(&port->ida); ++ kfree(port); ++} ++ ++#define IPVLAN_ALWAYS_ON_OFLOADS \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | \ ++ NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) ++ ++#define IPVLAN_ALWAYS_ON \ ++ (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED) ++ ++#define IPVLAN_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ ++ 
NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ ++ NETIF_F_GRO | NETIF_F_RXCSUM | \ ++ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) ++ ++ /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */ ++ ++#define IPVLAN_STATE_MASK \ ++ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) ++ ++static int ipvlan_init(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ int err; ++ ++ dev->state = (dev->state & ~IPVLAN_STATE_MASK) | ++ (phy_dev->state & IPVLAN_STATE_MASK); ++ dev->features = phy_dev->features & IPVLAN_FEATURES; ++ dev->features |= IPVLAN_ALWAYS_ON; ++ dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; ++ dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; ++ dev->hw_enc_features |= dev->features; ++ netif_inherit_tso_max(dev, phy_dev); ++ dev->hard_header_len = phy_dev->hard_header_len; ++ ++ netdev_lockdep_set_classes(dev); ++ ++ ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); ++ if (!ipvlan->pcpu_stats) ++ return -ENOMEM; ++ ++ if (!netif_is_ipvlan_port(phy_dev)) { ++ err = ipvlan_port_create(phy_dev); ++ if (err < 0) { ++ free_percpu(ipvlan->pcpu_stats); ++ return err; ++ } ++ } ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count += 1; ++ return 0; ++} ++ ++static void ipvlan_uninit(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ ++ free_percpu(ipvlan->pcpu_stats); ++ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count -= 1; ++ if (!port->count) ++ ipvlan_port_destroy(port->dev); ++} ++ ++static int ipvlan_open(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr; ++ ++ if (ipvlan->port->mode == IPVLAN_MODE_L3 || ++ ipvlan->port->mode == IPVLAN_MODE_L3S) ++ dev->flags |= IFF_NOARP; ++ else ++ dev->flags &= ~IFF_NOARP; ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static int ipvlan_stop(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_addr *addr; ++ ++ dev_uc_unsync(phy_dev, dev); ++ dev_mc_unsync(phy_dev, dev); ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_del(addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ int skblen = skb->len; ++ int ret; ++ ++ ret = ipvlan_queue_xmit(skb, dev); ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct ipvl_pcpu_stats *pcptr; ++ ++ pcptr = this_cpu_ptr(ipvlan->pcpu_stats); ++ ++ u64_stats_update_begin(&pcptr->syncp); ++ u64_stats_inc(&pcptr->tx_pkts); ++ u64_stats_add(&pcptr->tx_bytes, skblen); ++ u64_stats_update_end(&pcptr->syncp); ++ } else { ++ this_cpu_inc(ipvlan->pcpu_stats->tx_drps); ++ } ++ return ret; ++} ++ ++static netdev_features_t ipvlan_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ features |= NETIF_F_ALL_FOR_ALL; ++ features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES); ++ features = netdev_increment_features(ipvlan->phy_dev->features, ++ features, features); ++ features |= IPVLAN_ALWAYS_ON; ++ features &= 
(IPVLAN_FEATURES | IPVLAN_ALWAYS_ON); ++ ++ return features; ++} ++ ++static void ipvlan_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); ++} ++ ++static void ipvlan_set_multicast_mac_filter(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { ++ bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); ++ } else { ++ struct netdev_hw_addr *ha; ++ DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ ++ bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ netdev_for_each_mc_addr(ha, dev) ++ __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); ++ ++ /* Turn-on broadcast bit irrespective of address family, ++ * since broadcast is deferred to a work-queue, hence no ++ * impact on fast-path processing. ++ */ ++ __set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters); ++ ++ bitmap_copy(ipvlan->mac_filters, mc_filters, ++ IPVLAN_MAC_FILTER_SIZE); ++ } ++ dev_uc_sync(ipvlan->phy_dev, dev); ++ dev_mc_sync(ipvlan->phy_dev, dev); ++} ++ ++static void ipvlan_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (ipvlan->pcpu_stats) { ++ struct ipvl_pcpu_stats *pcptr; ++ u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; ++ u32 rx_errs = 0, tx_drps = 0; ++ u32 strt; ++ int idx; ++ ++ for_each_possible_cpu(idx) { ++ pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); ++ do { ++ strt= u64_stats_fetch_begin_irq(&pcptr->syncp); ++ rx_pkts = u64_stats_read(&pcptr->rx_pkts); ++ rx_bytes = u64_stats_read(&pcptr->rx_bytes); ++ rx_mcast = u64_stats_read(&pcptr->rx_mcast); ++ tx_pkts = u64_stats_read(&pcptr->tx_pkts); ++ tx_bytes = u64_stats_read(&pcptr->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&pcptr->syncp, ++ strt)); ++ ++ s->rx_packets += rx_pkts; ++ s->rx_bytes += rx_bytes; ++ s->multicast += rx_mcast; ++ s->tx_packets += tx_pkts; ++ s->tx_bytes += tx_bytes; ++ ++ /* u32 values are updated without syncp protection. 
*/ ++ rx_errs += READ_ONCE(pcptr->rx_errs); ++ tx_drps += READ_ONCE(pcptr->tx_drps); ++ } ++ s->rx_errors = rx_errs; ++ s->rx_dropped = rx_errs; ++ s->tx_dropped = tx_drps; ++ } ++} ++ ++static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ return vlan_vid_add(phy_dev, proto, vid); ++} ++ ++static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, ++ u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ vlan_vid_del(phy_dev, proto, vid); ++ return 0; ++} ++ ++static int ipvlan_get_iflink(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->phy_dev->ifindex; ++} ++ ++static const struct net_device_ops ipvlan_netdev_ops = { ++ .ndo_init = ipvlan_init, ++ .ndo_uninit = ipvlan_uninit, ++ .ndo_open = ipvlan_open, ++ .ndo_stop = ipvlan_stop, ++ .ndo_start_xmit = ipvlan_start_xmit, ++ .ndo_fix_features = ipvlan_fix_features, ++ .ndo_change_rx_flags = ipvlan_change_rx_flags, ++ .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, ++ .ndo_get_stats64 = ipvlan_get_stats64, ++ .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, ++ .ndo_get_iflink = ipvlan_get_iflink, ++}; ++ ++static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, ++ unsigned short type, const void *daddr, ++ const void *saddr, unsigned len) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ /* TODO Probably use a different field than dev_addr so that the ++ * mac-address on the virtual device is portable and can be carried ++ * while the packets use the mac-addr on the physical device. ++ */ ++ return dev_hard_header(skb, phy_dev, type, daddr, ++ saddr ? 
: phy_dev->dev_addr, len); ++} ++ ++static const struct header_ops ipvlan_header_ops = { ++ .create = ipvlan_hard_header, ++ .parse = eth_header_parse, ++ .cache = eth_header_cache, ++ .cache_update = eth_header_cache_update, ++}; ++ ++static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) ++{ ++ ipvlan->dev->mtu = dev->mtu; ++} ++ ++static bool netif_is_ipvlan(const struct net_device *dev) ++{ ++ /* both ipvlan and ipvtap devices use the same netdev_ops */ ++ return dev->netdev_ops == &ipvlan_netdev_ops; ++} ++ ++static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd); ++} ++ ++static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *drvinfo) ++{ ++ strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); ++ strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); ++} ++ ++static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->msg_enable; ++} ++ ++static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ ipvlan->msg_enable = value; ++} ++ ++static const struct ethtool_ops ipvlan_ethtool_ops = { ++ .get_link = ethtool_op_get_link, ++ .get_link_ksettings = ipvlan_ethtool_get_link_ksettings, ++ .get_drvinfo = ipvlan_ethtool_get_drvinfo, ++ .get_msglevel = ipvlan_ethtool_get_msglevel, ++ .set_msglevel = ipvlan_ethtool_set_msglevel, ++}; ++ ++static int ipvlan_nl_changelink(struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int err = 0; ++ ++ if (!data) ++ return 0; ++ if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, nmode, extack); ++ } ++ ++ if (!err && data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (flags & IPVLAN_F_PRIVATE) ++ ipvlan_mark_private(port); ++ else ++ ipvlan_clear_private(port); ++ ++ if (flags & IPVLAN_F_VEPA) ++ ipvlan_mark_vepa(port); ++ else ++ ipvlan_clear_vepa(port); ++ } ++ ++ return err; ++} ++ ++static size_t ipvlan_nl_getsize(const struct net_device *dev) ++{ ++ return (0 ++ + nla_total_size(2) /* IFLA_IPVLAN_MODE */ ++ + nla_total_size(2) /* IFLA_IPVLAN_FLAGS */ ++ ); ++} ++ ++static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ if (mode >= IPVLAN_MODE_MAX) ++ return -EINVAL; ++ } ++ if (data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ /* Only two bits are used at this moment. */ ++ if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ /* Also both flags can't be active at the same time. 
*/ ++ if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) == ++ (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ipvlan_nl_fillinfo(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int ret = -EINVAL; ++ ++ if (!port) ++ goto err; ++ ++ ret = -EMSGSIZE; ++ if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) ++ goto err; ++ if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags)) ++ goto err; ++ ++ return 0; ++ ++err: ++ return ret; ++} ++ ++int ipvlan_link_new(struct net *src_net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port; ++ struct net_device *phy_dev; ++ int err; ++ u16 mode = IPVLAN_MODE_L3; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ ++ phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); ++ if (!phy_dev) ++ return -ENODEV; ++ ++ if (netif_is_ipvlan(phy_dev)) { ++ struct ipvl_dev *tmp = netdev_priv(phy_dev); ++ ++ phy_dev = tmp->phy_dev; ++ if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ } else if (!netif_is_ipvlan_port(phy_dev)) { ++ /* Exit early if the underlying link is invalid or busy */ ++ if (phy_dev->type != ARPHRD_ETHER || ++ phy_dev->flags & IFF_LOOPBACK) { ++ netdev_err(phy_dev, ++ "Master is either lo or non-ether device\n"); ++ return -EINVAL; ++ } ++ ++ if (netdev_is_rx_handler_busy(phy_dev)) { ++ netdev_err(phy_dev, "Device is already in use.\n"); ++ return -EBUSY; ++ } ++ } ++ ++ ipvlan->phy_dev = phy_dev; ++ ipvlan->dev = dev; ++ ipvlan->sfeatures = IPVLAN_FEATURES; ++ if (!tb[IFLA_MTU]) ++ ipvlan_adjust_mtu(ipvlan, phy_dev); ++ INIT_LIST_HEAD(&ipvlan->addrs); ++ spin_lock_init(&ipvlan->addrs_lock); ++ ++ /* TODO Probably put random address here to be presented to the ++ * world but keep using the physical-dev address for the outgoing ++ * packets. ++ */ ++ eth_hw_addr_set(dev, phy_dev->dev_addr); ++ ++ dev->priv_flags |= IFF_NO_RX_HANDLER; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ /* ipvlan_init() would have created the port, if required */ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ ipvlan->port = port; ++ ++ /* If the port-id base is at the MAX value, then wrap it around and ++ * begin from 0x1 again. This may be due to a busy system where lots ++ * of slaves are getting created and deleted. ++ */ ++ if (port->dev_id_start == 0xFFFE) ++ port->dev_id_start = 0x1; ++ ++ /* Since L2 address is shared among all IPvlan slaves including ++ * master, use unique 16 bit dev-ids to diffentiate among them. ++ * Assign IDs between 0x1 and 0xFFFE (used by the master) to each ++ * slave link [see addrconf_ifid_eui48()]. ++ */ ++ err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE, ++ GFP_KERNEL); ++ if (err < 0) ++ err = ida_simple_get(&port->ida, 0x1, port->dev_id_start, ++ GFP_KERNEL); ++ if (err < 0) ++ goto unregister_netdev; ++ dev->dev_id = err; ++ ++ /* Increment id-base to the next slot for the future assignment */ ++ port->dev_id_start = err + 1; ++ ++ err = netdev_upper_dev_link(phy_dev, dev, extack); ++ if (err) ++ goto remove_ida; ++ ++ /* Flags are per port and latest update overrides. User has ++ * to be consistent in setting it just like the mode attribute. 
++ */ ++ if (data && data[IFLA_IPVLAN_FLAGS]) ++ port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (data && data[IFLA_IPVLAN_MODE]) ++ mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, mode, extack); ++ if (err) ++ goto unlink_netdev; ++ ++ list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); ++ netif_stacked_transfer_operstate(phy_dev, dev); ++ return 0; ++ ++unlink_netdev: ++ netdev_upper_dev_unlink(phy_dev, dev); ++remove_ida: ++ ida_simple_remove(&port->ida, dev->dev_id); ++unregister_netdev: ++ unregister_netdevice(dev); ++ return err; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_new); ++ ++void ipvlan_link_delete(struct net_device *dev, struct list_head *head) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr, *next; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ kfree_rcu(addr, rcu); ++ } ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ ++ ida_simple_remove(&ipvlan->port->ida, dev->dev_id); ++ list_del_rcu(&ipvlan->pnode); ++ unregister_netdevice_queue(dev, head); ++ netdev_upper_dev_unlink(ipvlan->phy_dev, dev); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_delete); ++ ++void ipvlan_link_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); ++ dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; ++ dev->netdev_ops = &ipvlan_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->header_ops = &ipvlan_header_ops; ++ dev->ethtool_ops = &ipvlan_ethtool_ops; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_setup); ++ ++static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = ++{ ++ [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, ++ [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, ++}; ++ ++static struct net *ipvlan_get_link_net(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return dev_net(ipvlan->phy_dev); ++} ++ ++static struct rtnl_link_ops ipvlan_link_ops = { ++ .kind = "ipvlan", ++ .priv_size = sizeof(struct ipvl_dev), ++ ++ .setup = ipvlan_link_setup, ++ .newlink = ipvlan_link_new, ++ .dellink = ipvlan_link_delete, ++ .get_link_net = ipvlan_get_link_net, ++}; ++ ++int ipvlan_link_register(struct rtnl_link_ops *ops) ++{ ++ ops->get_size = ipvlan_nl_getsize; ++ ops->policy = ipvlan_nl_policy; ++ ops->validate = ipvlan_nl_validate; ++ ops->fill_info = ipvlan_nl_fillinfo; ++ ops->changelink = ipvlan_nl_changelink; ++ ops->maxtype = IFLA_IPVLAN_MAX; ++ return rtnl_link_register(ops); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_register); ++ ++static int ipvlan_device_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); ++ struct netdev_notifier_pre_changeaddr_info *prechaddr_info; ++ struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ struct ipvl_dev *ipvlan, *next; ++ struct ipvl_port *port; ++ LIST_HEAD(lst_kill); ++ int err; ++ ++ if (!netif_is_ipvlan_port(dev)) ++ return NOTIFY_DONE; ++ ++ port = ipvlan_port_get_rtnl(dev); ++ ++ switch (event) { ++ case NETDEV_UP: ++ case NETDEV_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ netif_stacked_transfer_operstate(ipvlan->phy_dev, ++ ipvlan->dev); ++ break; ++ ++ case NETDEV_REGISTER: { ++ struct net *oldnet, *newnet = dev_net(dev); ++ ++ oldnet = read_pnet(&port->pnet); ++ if (net_eq(newnet, oldnet)) ++ break; ++ ++ 
write_pnet(&port->pnet, newnet); ++ ++ ipvlan_migrate_l3s_hook(oldnet, newnet); ++ break; ++ } ++ case NETDEV_UNREGISTER: ++ if (dev->reg_state != NETREG_UNREGISTERING) ++ break; ++ ++ list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode) ++ ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, ++ &lst_kill); ++ unregister_netdevice_many(&lst_kill); ++ break; ++ ++ case NETDEV_FEAT_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ netif_inherit_tso_max(ipvlan->dev, dev); ++ netdev_update_features(ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_CHANGEMTU: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ ipvlan_adjust_mtu(ipvlan, dev); ++ break; ++ ++ case NETDEV_PRE_CHANGEADDR: ++ prechaddr_info = ptr; ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ err = dev_pre_changeaddr_notify(ipvlan->dev, ++ prechaddr_info->dev_addr, ++ extack); ++ if (err) ++ return notifier_from_errno(err); ++ } ++ break; ++ ++ case NETDEV_CHANGEADDR: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ eth_hw_addr_set(ipvlan->dev, dev->dev_addr); ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_PRE_TYPE_CHANGE: ++ /* Forbid underlying device to change its type. */ ++ return NOTIFY_BAD; ++ } ++ return NOTIFY_DONE; ++} ++ ++/* the caller must held the addrs lock */ ++static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); ++ if (!addr) ++ return -ENOMEM; ++ ++ addr->master = ipvlan; ++ if (!is_v6) { ++ memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); ++ addr->atype = IPVL_IPV4; ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); ++ addr->atype = IPVL_IPV6; ++#endif ++ } ++ ++ list_add_tail_rcu(&addr->anode, &ipvlan->addrs); ++ ++ /* If the interface is not up, the address will be added to the hash ++ * list by ipvlan_open. 
++ */ ++ if (netif_running(ipvlan->dev)) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ ++ return 0; ++} ++ ++static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); ++ if (!addr) { ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return; ++ } ++ ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ kfree_rcu(addr, rcu); ++} ++ ++static bool ipvlan_is_valid_dev(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!netif_is_ipvlan(dev)) ++ return false; ++ ++ if (!ipvlan || !ipvlan->port) ++ return false; ++ ++ return true; ++} ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv6=%pI6c addr for %s intf\n", ++ ip6_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip6_addr, true); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip6_addr, true); ++} ++ ++static int ipvlan_addr6_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if6->idev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_add_addr6(ipvlan, &if6->addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ipvlan_del_addr6(ipvlan, &if6->addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr6_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { ++ NL_SET_ERR_MSG(i6vi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++#endif ++ ++static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv4=%pI4 on %s intf.\n", ++ ip4_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip4_addr, false); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip4_addr, false); ++} ++ ++static int ipvlan_addr4_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct in_addr ip4_addr; ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch 
(event) { ++ case NETDEV_UP: ++ ip4_addr.s_addr = if4->ifa_address; ++ if (ipvlan_add_addr4(ipvlan, &ip4_addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ip4_addr.s_addr = if4->ifa_address; ++ ipvlan_del_addr4(ipvlan, &ip4_addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr4_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_validator_info *ivi = (struct in_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { ++ NL_SET_ERR_MSG(ivi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_event, ++}; ++ ++static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_validator_event, ++}; ++ ++static struct notifier_block ipvlan_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_device_event, ++}; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_event, ++}; ++ ++static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_validator_event, ++}; ++#endif ++ ++static int __init ipvlan_init_module(void) ++{ ++ int err; ++ ++ ipvlan_init_secret(); ++ register_netdevice_notifier(&ipvlan_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ register_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ register_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ register_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block); ++ ++ err = ipvlan_l3s_init(); ++ if (err < 0) ++ goto error; ++ ++ err = ipvlan_link_register(&ipvlan_link_ops); ++ if (err < 0) { ++ ipvlan_l3s_cleanup(); ++ goto error; ++ } ++ ++ return 0; ++error: ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ return err; ++} ++ ++static void __exit ipvlan_cleanup_module(void) ++{ ++ rtnl_link_unregister(&ipvlan_link_ops); ++ ipvlan_l3s_cleanup(); ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++} ++ ++module_init(ipvlan_init_module); ++module_exit(ipvlan_cleanup_module); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Mahesh Bandewar "); ++MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); ++MODULE_ALIAS_RTNL_LINK("ipvlan"); +diff -rupN linux.orig/drivers/net/loopback.c linux/drivers/net/loopback.c +--- 
linux.orig/drivers/net/loopback.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/loopback.c 2022-12-04 10:40:26.696034096 -0500 +@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device * lb_stats = per_cpu_ptr(dev->lstats, i); do { @@ -2769,11 +15998,10 @@ index 14e8d04cb4347..c4ad98d39ea60 100644 *bytes += tbytes; *packets += tpackets; } -diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c -index c6d271e5687e9..5056f3cd5699a 100644 ---- a/drivers/net/macsec.c -+++ b/drivers/net/macsec.c -@@ -2823,9 +2823,9 @@ static void get_rx_sc_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c linux/drivers/net/macsec.c +--- linux.orig/drivers/net/macsec.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macsec.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2821,9 +2821,9 @@ static void get_rx_sc_stats(struct net_d stats = per_cpu_ptr(rx_sc->stats, cpu); do { @@ -2785,7 +16013,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->InOctetsValidated += tmp.InOctetsValidated; sum->InOctetsDecrypted += tmp.InOctetsDecrypted; -@@ -2904,9 +2904,9 @@ static void get_tx_sc_stats(struct net_device *dev, +@@ -2902,9 +2902,9 @@ static void get_tx_sc_stats(struct net_d stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); do { @@ -2797,7 +16025,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsProtected += tmp.OutPktsProtected; sum->OutPktsEncrypted += tmp.OutPktsEncrypted; -@@ -2960,9 +2960,9 @@ static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) +@@ -2958,9 +2958,9 @@ static void get_secy_stats(struct net_de stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); do { @@ -2809,11 +16037,4431 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsUntagged += tmp.OutPktsUntagged; sum->InPktsUntagged += tmp.InPktsUntagged; -diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c -index 1080d6ebff63b..a1c7823f0ba66 100644 ---- a/drivers/net/macvlan.c -+++ b/drivers/net/macvlan.c -@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c.orig linux/drivers/net/macsec.c.orig +--- linux.orig/drivers/net/macsec.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/macsec.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,4417 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * drivers/net/macsec.c - MACsec device ++ * ++ * Copyright (c) 2015 Sabrina Dubroca ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define MACSEC_SCI_LEN 8 ++ ++/* SecTAG length = macsec_eth_header without the optional SCI */ ++#define MACSEC_TAG_LEN 6 ++ ++struct macsec_eth_header { ++ struct ethhdr eth; ++ /* SecTAG */ ++ u8 tci_an; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ u8 short_length:6, ++ unused:2; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ u8 unused:2, ++ short_length:6; ++#else ++#error "Please fix " ++#endif ++ __be32 packet_number; ++ u8 secure_channel_id[8]; /* optional */ ++} __packed; ++ ++#define MACSEC_TCI_VERSION 0x80 ++#define MACSEC_TCI_ES 0x40 /* end station */ ++#define MACSEC_TCI_SC 0x20 /* SCI present */ ++#define MACSEC_TCI_SCB 0x10 /* epon */ ++#define MACSEC_TCI_E 0x08 /* encryption */ ++#define MACSEC_TCI_C 0x04 /* changed text */ ++#define MACSEC_AN_MASK 0x03 /* association number */ ++#define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) ++ ++/* minimum secure data length deemed 
"not short", see IEEE 802.1AE-2006 9.7 */ ++#define MIN_NON_SHORT_LEN 48 ++ ++#define GCM_AES_IV_LEN 12 ++#define DEFAULT_ICV_LEN 16 ++ ++#define for_each_rxsc(secy, sc) \ ++ for (sc = rcu_dereference_bh(secy->rx_sc); \ ++ sc; \ ++ sc = rcu_dereference_bh(sc->next)) ++#define for_each_rxsc_rtnl(secy, sc) \ ++ for (sc = rtnl_dereference(secy->rx_sc); \ ++ sc; \ ++ sc = rtnl_dereference(sc->next)) ++ ++#define pn_same_half(pn1, pn2) (!(((pn1) >> 31) ^ ((pn2) >> 31))) ++ ++struct gcm_iv_xpn { ++ union { ++ u8 short_secure_channel_id[4]; ++ ssci_t ssci; ++ }; ++ __be64 pn; ++} __packed; ++ ++struct gcm_iv { ++ union { ++ u8 secure_channel_id[8]; ++ sci_t sci; ++ }; ++ __be32 pn; ++}; ++ ++#define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT ++ ++struct pcpu_secy_stats { ++ struct macsec_dev_stats stats; ++ struct u64_stats_sync syncp; ++}; ++ ++/** ++ * struct macsec_dev - private data ++ * @secy: SecY config ++ * @real_dev: pointer to underlying netdevice ++ * @dev_tracker: refcount tracker for @real_dev reference ++ * @stats: MACsec device stats ++ * @secys: linked list of SecY's on the underlying device ++ * @gro_cells: pointer to the Generic Receive Offload cell ++ * @offload: status of offloading on the MACsec device ++ */ ++struct macsec_dev { ++ struct macsec_secy secy; ++ struct net_device *real_dev; ++ netdevice_tracker dev_tracker; ++ struct pcpu_secy_stats __percpu *stats; ++ struct list_head secys; ++ struct gro_cells gro_cells; ++ enum macsec_offload offload; ++}; ++ ++/** ++ * struct macsec_rxh_data - rx_handler private argument ++ * @secys: linked list of SecY's on this underlying device ++ */ ++struct macsec_rxh_data { ++ struct list_head secys; ++}; ++ ++static struct macsec_dev *macsec_priv(const struct net_device *dev) ++{ ++ return (struct macsec_dev *)netdev_priv(dev); ++} ++ ++static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev) ++{ ++ return rcu_dereference_bh(dev->rx_handler_data); ++} ++ ++static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev) ++{ ++ return rtnl_dereference(dev->rx_handler_data); ++} ++ ++struct macsec_cb { ++ struct aead_request *req; ++ union { ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_rx_sa *rx_sa; ++ }; ++ u8 assoc_num; ++ bool valid; ++ bool has_sci; ++}; ++ ++static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) ++{ ++ struct macsec_rx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static struct macsec_rx_sa *macsec_active_rxsa_get(struct macsec_rx_sc *rx_sc) ++{ ++ struct macsec_rx_sa *sa = NULL; ++ int an; ++ ++ for (an = 0; an < MACSEC_NUM_AN; an++) { ++ sa = macsec_rxsa_get(rx_sc->sa[an]); ++ if (sa) ++ break; ++ } ++ return sa; ++} ++ ++static void free_rx_sc_rcu(struct rcu_head *head) ++{ ++ struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); ++ ++ free_percpu(rx_sc->stats); ++ kfree(rx_sc); ++} ++ ++static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc) ++{ ++ return refcount_inc_not_zero(&sc->refcnt) ? 
sc : NULL; ++} ++ ++static void macsec_rxsc_put(struct macsec_rx_sc *sc) ++{ ++ if (refcount_dec_and_test(&sc->refcnt)) ++ call_rcu(&sc->rcu_head, free_rx_sc_rcu); ++} ++ ++static void free_rxsa(struct rcu_head *head) ++{ ++ struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_rxsa_put(struct macsec_rx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_rxsa); ++} ++ ++static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) ++{ ++ struct macsec_tx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static void free_txsa(struct rcu_head *head) ++{ ++ struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_txsa_put(struct macsec_tx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_txsa); ++} ++ ++static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) ++{ ++ BUILD_BUG_ON(sizeof(struct macsec_cb) > sizeof(skb->cb)); ++ return (struct macsec_cb *)skb->cb; ++} ++ ++#define MACSEC_PORT_ES (htons(0x0001)) ++#define MACSEC_PORT_SCB (0x0000) ++#define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) ++#define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff) ++ ++#define MACSEC_GCM_AES_128_SAK_LEN 16 ++#define MACSEC_GCM_AES_256_SAK_LEN 32 ++ ++#define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN ++#define DEFAULT_XPN false ++#define DEFAULT_SEND_SCI true ++#define DEFAULT_ENCRYPT false ++#define DEFAULT_ENCODING_SA 0 ++#define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) ++ ++static bool send_sci(const struct macsec_secy *secy) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ return tx_sc->send_sci || ++ (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); ++} ++ ++static sci_t make_sci(const u8 *addr, __be16 port) ++{ ++ sci_t sci; ++ ++ memcpy(&sci, addr, ETH_ALEN); ++ memcpy(((char *)&sci) + ETH_ALEN, &port, sizeof(port)); ++ ++ return sci; ++} ++ ++static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present) ++{ ++ sci_t sci; ++ ++ if (sci_present) ++ memcpy(&sci, hdr->secure_channel_id, ++ sizeof(hdr->secure_channel_id)); ++ else ++ sci = make_sci(hdr->eth.h_source, MACSEC_PORT_ES); ++ ++ return sci; ++} ++ ++static unsigned int macsec_sectag_len(bool sci_present) ++{ ++ return MACSEC_TAG_LEN + (sci_present ? 
MACSEC_SCI_LEN : 0); ++} ++ ++static unsigned int macsec_hdr_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + ETH_HLEN; ++} ++ ++static unsigned int macsec_extra_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + sizeof(__be16); ++} ++ ++/* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ ++static void macsec_fill_sectag(struct macsec_eth_header *h, ++ const struct macsec_secy *secy, u32 pn, ++ bool sci_present) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); ++ h->eth.h_proto = htons(ETH_P_MACSEC); ++ ++ if (sci_present) { ++ h->tci_an |= MACSEC_TCI_SC; ++ memcpy(&h->secure_channel_id, &secy->sci, ++ sizeof(h->secure_channel_id)); ++ } else { ++ if (tx_sc->end_station) ++ h->tci_an |= MACSEC_TCI_ES; ++ if (tx_sc->scb) ++ h->tci_an |= MACSEC_TCI_SCB; ++ } ++ ++ h->packet_number = htonl(pn); ++ ++ /* with GCM, C/E clear for !encrypt, both set for encrypt */ ++ if (tx_sc->encrypt) ++ h->tci_an |= MACSEC_TCI_CONFID; ++ else if (secy->icv_len != DEFAULT_ICV_LEN) ++ h->tci_an |= MACSEC_TCI_C; ++ ++ h->tci_an |= tx_sc->encoding_sa; ++} ++ ++static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) ++{ ++ if (data_len < MIN_NON_SHORT_LEN) ++ h->short_length = data_len; ++} ++ ++/* Checks if a MACsec interface is being offloaded to an hardware engine */ ++static bool macsec_is_offloaded(struct macsec_dev *macsec) ++{ ++ if (macsec->offload == MACSEC_OFFLOAD_MAC || ++ macsec->offload == MACSEC_OFFLOAD_PHY) ++ return true; ++ ++ return false; ++} ++ ++/* Checks if underlying layers implement MACsec offloading functions. */ ++static bool macsec_check_offload(enum macsec_offload offload, ++ struct macsec_dev *macsec) ++{ ++ if (!macsec || !macsec->real_dev) ++ return false; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev && ++ macsec->real_dev->phydev->macsec_ops; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ return macsec->real_dev->features & NETIF_F_HW_MACSEC && ++ macsec->real_dev->macsec_ops; ++ ++ return false; ++} ++ ++static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, ++ struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (ctx) { ++ memset(ctx, 0, sizeof(*ctx)); ++ ctx->offload = offload; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ ctx->phydev = macsec->real_dev->phydev; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ ctx->netdev = macsec->real_dev; ++ } ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev->macsec_ops; ++ else ++ return macsec->real_dev->macsec_ops; ++} ++ ++/* Returns a pointer to the MACsec ops struct if any and updates the MACsec ++ * context device reference if provided. 
++ */ ++static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (!macsec_check_offload(macsec->offload, macsec)) ++ return NULL; ++ ++ return __macsec_get_ops(macsec->offload, macsec, ctx); ++} ++ ++/* validate MACsec packet according to IEEE 802.1AE-2018 9.12 */ ++static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn) ++{ ++ struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data; ++ int len = skb->len - 2 * ETH_ALEN; ++ int extra_len = macsec_extra_len(!!(h->tci_an & MACSEC_TCI_SC)) + icv_len; ++ ++ /* a) It comprises at least 17 octets */ ++ if (skb->len <= 16) ++ return false; ++ ++ /* b) MACsec EtherType: already checked */ ++ ++ /* c) V bit is clear */ ++ if (h->tci_an & MACSEC_TCI_VERSION) ++ return false; ++ ++ /* d) ES or SCB => !SC */ ++ if ((h->tci_an & MACSEC_TCI_ES || h->tci_an & MACSEC_TCI_SCB) && ++ (h->tci_an & MACSEC_TCI_SC)) ++ return false; ++ ++ /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */ ++ if (h->unused) ++ return false; ++ ++ /* rx.pn != 0 if not XPN (figure 10-5 with 802.11AEbw-2013 amendment) */ ++ if (!h->packet_number && !xpn) ++ return false; ++ ++ /* length check, f) g) h) i) */ ++ if (h->short_length) ++ return len == extra_len + h->short_length; ++ return len >= extra_len + MIN_NON_SHORT_LEN; ++} ++ ++#define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true)) ++#define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN ++ ++static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn, ++ salt_t salt) ++{ ++ struct gcm_iv_xpn *gcm_iv = (struct gcm_iv_xpn *)iv; ++ ++ gcm_iv->ssci = ssci ^ salt.ssci; ++ gcm_iv->pn = cpu_to_be64(pn) ^ salt.pn; ++} ++ ++static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn) ++{ ++ struct gcm_iv *gcm_iv = (struct gcm_iv *)iv; ++ ++ gcm_iv->sci = sci; ++ gcm_iv->pn = htonl(pn); ++} ++ ++static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) ++{ ++ return (struct macsec_eth_header *)skb_mac_header(skb); ++} ++ ++static void __macsec_pn_wrapped(struct macsec_secy *secy, ++ struct macsec_tx_sa *tx_sa) ++{ ++ pr_debug("PN wrapped, transitioning to !oper\n"); ++ tx_sa->active = false; ++ if (secy->protect_frames) ++ secy->operational = false; ++} ++ ++void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) ++{ ++ spin_lock_bh(&tx_sa->lock); ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++} ++EXPORT_SYMBOL_GPL(macsec_pn_wrapped); ++ ++static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa, ++ struct macsec_secy *secy) ++{ ++ pn_t pn; ++ ++ spin_lock_bh(&tx_sa->lock); ++ ++ pn = tx_sa->next_pn_halves; ++ if (secy->xpn) ++ tx_sa->next_pn++; ++ else ++ tx_sa->next_pn_halves.lower++; ++ ++ if (tx_sa->next_pn == 0) ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ return pn; ++} ++ ++static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ ++ skb->dev = macsec->real_dev; ++ skb_reset_mac_header(skb); ++ skb->protocol = eth_hdr(skb)->h_proto; ++} ++ ++static unsigned int macsec_msdu_len(struct sk_buff *skb) ++{ ++ struct macsec_dev *macsec = macsec_priv(skb->dev); ++ struct macsec_secy *secy = &macsec->secy; ++ bool sci_present = macsec_skb_cb(skb)->has_sci; ++ ++ return skb->len - macsec_hdr_len(sci_present) - secy->icv_len; ++} ++ ++static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc, ++ struct macsec_tx_sa *tx_sa) ++{ ++ unsigned int msdu_len = 
macsec_msdu_len(skb); ++ struct pcpu_tx_sc_stats *txsc_stats = this_cpu_ptr(tx_sc->stats); ++ ++ u64_stats_update_begin(&txsc_stats->syncp); ++ if (tx_sc->encrypt) { ++ txsc_stats->stats.OutOctetsEncrypted += msdu_len; ++ txsc_stats->stats.OutPktsEncrypted++; ++ this_cpu_inc(tx_sa->stats->OutPktsEncrypted); ++ } else { ++ txsc_stats->stats.OutOctetsProtected += msdu_len; ++ txsc_stats->stats.OutPktsProtected++; ++ this_cpu_inc(tx_sa->stats->OutPktsProtected); ++ } ++ u64_stats_update_end(&txsc_stats->syncp); ++} ++ ++static void count_tx(struct net_device *dev, int ret, int len) ++{ ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_add(&stats->tx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++ } ++} ++ ++static void macsec_encrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; ++ int len, ret; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ rcu_read_lock_bh(); ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ /* packet is encrypted/protected so tx_bytes must be calculated */ ++ len = macsec_msdu_len(skb) + 2 * ETH_ALEN; ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ rcu_read_unlock_bh(); ++ ++ macsec_txsa_put(sa); ++ dev_put(dev); ++} ++ ++static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm, ++ unsigned char **iv, ++ struct scatterlist **sg, ++ int num_frags) ++{ ++ size_t size, iv_offset, sg_offset; ++ struct aead_request *req; ++ void *tmp; ++ ++ size = sizeof(struct aead_request) + crypto_aead_reqsize(tfm); ++ iv_offset = size; ++ size += GCM_AES_IV_LEN; ++ ++ size = ALIGN(size, __alignof__(struct scatterlist)); ++ sg_offset = size; ++ size += sizeof(struct scatterlist) * num_frags; ++ ++ tmp = kmalloc(size, GFP_ATOMIC); ++ if (!tmp) ++ return NULL; ++ ++ *iv = (unsigned char *)(tmp + iv_offset); ++ *sg = (struct scatterlist *)(tmp + sg_offset); ++ req = tmp; ++ ++ aead_request_set_tfm(req, tfm); ++ ++ return req; ++} ++ ++static struct sk_buff *macsec_encrypt(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; ++ unsigned char *iv; ++ struct ethhdr *eth; ++ struct macsec_eth_header *hh; ++ size_t unprotected_len; ++ struct aead_request *req; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ bool sci_present; ++ pn_t pn; ++ ++ secy = &macsec->secy; ++ tx_sc = &secy->tx_sc; ++ ++ /* 10.5.1 TX SA assignment */ ++ tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]); ++ if (!tx_sa) { ++ secy->operational = false; ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || ++ skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { ++ struct sk_buff *nskb = skb_copy_expand(skb, ++ MACSEC_NEEDED_HEADROOM, ++ MACSEC_NEEDED_TAILROOM, ++ GFP_ATOMIC); ++ if (likely(nskb)) { ++ consume_skb(skb); ++ skb = nskb; ++ } else { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-ENOMEM); ++ } ++ } ++ 
++ unprotected_len = skb->len; ++ eth = eth_hdr(skb); ++ sci_present = send_sci(secy); ++ hh = skb_push(skb, macsec_extra_len(sci_present)); ++ memmove(hh, eth, 2 * ETH_ALEN); ++ ++ pn = tx_sa_update_pn(tx_sa, secy); ++ if (pn.full64 == 0) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOLINK); ++ } ++ macsec_fill_sectag(hh, secy, pn.lower, sci_present); ++ macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); ++ ++ skb_put(skb, secy->icv_len); ++ ++ if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) { ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsTooLong++; ++ u64_stats_update_end(&secy_stats->syncp); ++ ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (secy->xpn) ++ macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt); ++ else ++ macsec_fill_iv(iv, secy->sci, pn.lower); ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (tx_sc->encrypt) { ++ int len = skb->len - macsec_hdr_len(sci_present) - ++ secy->icv_len; ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(sci_present)); ++ } else { ++ aead_request_set_crypt(req, sg, sg, 0, iv); ++ aead_request_set_ad(req, skb->len - secy->icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ macsec_skb_cb(skb)->tx_sa = tx_sa; ++ macsec_skb_cb(skb)->has_sci = sci_present; ++ aead_request_set_callback(req, 0, macsec_encrypt_done, skb); ++ ++ dev_hold(skb->dev); ++ ret = crypto_aead_encrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ dev_put(skb->dev); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ ++ return skb; ++} ++ ++static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn) ++{ ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats); ++ struct macsec_eth_header *hdr = macsec_ethhdr(skb); ++ u32 lowest_pn = 0; ++ ++ spin_lock(&rx_sa->lock); ++ if (rx_sa->next_pn_halves.lower >= secy->replay_window) ++ lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window; ++ ++ /* Now perform replay protection check again ++ * (see IEEE 802.1AE-2006 figure 10-5) ++ */ ++ if (secy->replay_protect && pn < lowest_pn && ++ (!secy->xpn || pn_same_half(pn, lowest_pn))) { ++ spin_unlock(&rx_sa->lock); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_dropped++; ++ return false; ++ } ++ ++ if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) { ++ unsigned int msdu_len = macsec_msdu_len(skb); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (hdr->tci_an & MACSEC_TCI_E) ++ rxsc_stats->stats.InOctetsDecrypted += msdu_len; ++ else ++ rxsc_stats->stats.InOctetsValidated += msdu_len; ++ 
u64_stats_update_end(&rxsc_stats->syncp); ++ } ++ ++ if (!macsec_skb_cb(skb)->valid) { ++ spin_unlock(&rx_sa->lock); ++ ++ /* 10.6.5 */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotValid++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ this_cpu_inc(rx_sa->stats->InPktsNotValid); ++ secy->netdev->stats.rx_errors++; ++ return false; ++ } ++ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (secy->validate_frames == MACSEC_VALIDATE_CHECK) { ++ rxsc_stats->stats.InPktsInvalid++; ++ this_cpu_inc(rx_sa->stats->InPktsInvalid); ++ } else if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsUnchecked++; ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ } else { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsOK++; ++ this_cpu_inc(rx_sa->stats->InPktsOK); ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ ++ // Instead of "pn >=" - to support pn overflow in xpn ++ if (pn + 1 > rx_sa->next_pn_halves.lower) { ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } else if (secy->xpn && ++ !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { ++ rx_sa->next_pn_halves.upper++; ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } ++ ++ spin_unlock(&rx_sa->lock); ++ } ++ ++ return true; ++} ++ ++static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, dev); ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++} ++ ++static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) ++{ ++ skb->ip_summed = CHECKSUM_NONE; ++ memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); ++ skb_pull(skb, hdr_len); ++ pskb_trim_unique(skb, skb->len - icv_len); ++} ++ ++static void count_rx(struct net_device *dev, int len) ++{ ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_add(&stats->rx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++} ++ ++static void macsec_decrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct macsec_rx_sc *rx_sc = rx_sa->sc; ++ int len; ++ u32 pn; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ if (!err) ++ macsec_skb_cb(skb)->valid = true; ++ ++ rcu_read_lock_bh(); ++ pn = ntohl(macsec_ethhdr(skb)->packet_number); ++ if (!macsec_post_decrypt(skb, &macsec->secy, pn)) { ++ rcu_read_unlock_bh(); ++ kfree_skb(skb); ++ goto out; ++ } ++ ++ macsec_finalize_skb(skb, macsec->secy.icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, macsec->secy.netdev); ++ ++ if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ ++ rcu_read_unlock_bh(); ++ ++out: ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ dev_put(dev); ++} ++ ++static struct sk_buff *macsec_decrypt(struct sk_buff *skb, ++ struct net_device *dev, ++ struct macsec_rx_sa *rx_sa, ++ sci_t sci, ++ struct macsec_secy *secy) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; 
++ unsigned char *iv; ++ struct aead_request *req; ++ struct macsec_eth_header *hdr; ++ u32 hdr_pn; ++ u16 icv_len = secy->icv_len; ++ ++ macsec_skb_cb(skb)->valid = false; ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (!skb) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ hdr = (struct macsec_eth_header *)skb->data; ++ hdr_pn = ntohl(hdr->packet_number); ++ ++ if (secy->xpn) { ++ pn_t recovered_pn = rx_sa->next_pn_halves; ++ ++ recovered_pn.lower = hdr_pn; ++ if (hdr_pn < rx_sa->next_pn_halves.lower && ++ !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower)) ++ recovered_pn.upper++; ++ ++ macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64, ++ rx_sa->key.salt); ++ } else { ++ macsec_fill_iv(iv, sci, hdr_pn); ++ } ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (hdr->tci_an & MACSEC_TCI_E) { ++ /* confidentiality: ethernet + macsec header ++ * authenticated, encrypted payload ++ */ ++ int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci); ++ ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ aead_request_free(req); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ /* integrity only: all headers + data authenticated */ ++ aead_request_set_crypt(req, sg, sg, icv_len, iv); ++ aead_request_set_ad(req, skb->len - icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ skb->dev = dev; ++ aead_request_set_callback(req, 0, macsec_decrypt_done, skb); ++ ++ dev_hold(dev); ++ ret = crypto_aead_decrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ /* decryption/authentication failed ++ * 10.6 if validateFrames is disabled, deliver anyway ++ */ ++ if (ret != -EBADMSG) { ++ kfree_skb(skb); ++ skb = ERR_PTR(ret); ++ } ++ } else { ++ macsec_skb_cb(skb)->valid = true; ++ } ++ dev_put(dev); ++ ++ aead_request_free(req); ++ ++ return skb; ++} ++ ++static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) ++{ ++ /* Deliver to the uncontrolled port by default */ ++ enum rx_handler_result ret = RX_HANDLER_PASS; ++ struct ethhdr *hdr = eth_hdr(skb); ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ struct net_device *ndev = macsec->secy.netdev; ++ ++ /* If h/w offloading is enabled, HW decodes frames and strips ++ * the SecTAG, so we have to deduce which port to deliver to. 
++ */ ++ if (macsec_is_offloaded(macsec) && netif_running(ndev)) { ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->dev_addr)) { ++ /* exact match, divert skb to this port */ ++ skb->dev = ndev; ++ skb->pkt_type = PACKET_HOST; ++ ret = RX_HANDLER_ANOTHER; ++ goto out; ++ } else if (is_multicast_ether_addr_64bits( ++ hdr->h_dest)) { ++ /* multicast frame, deliver on this port too */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->broadcast)) ++ nskb->pkt_type = PACKET_BROADCAST; ++ else ++ nskb->pkt_type = PACKET_MULTICAST; ++ ++ __netif_rx(nskb); ++ } ++ continue; ++ } ++ ++ /* 10.6 If the management control validateFrames is not ++ * Strict, frames without a SecTAG are received, counted, and ++ * delivered to the Controlled Port ++ */ ++ if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ continue; ++ } ++ ++ /* deliver on this port */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ ++ if (__netif_rx(nskb) == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } ++ } ++ ++out: ++ rcu_read_unlock(); ++ return ret; ++} ++ ++static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) ++{ ++ struct sk_buff *skb = *pskb; ++ struct net_device *dev = skb->dev; ++ struct macsec_eth_header *hdr; ++ struct macsec_secy *secy = NULL; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ unsigned int len; ++ sci_t sci; ++ u32 hdr_pn; ++ bool cbit; ++ struct pcpu_rx_sc_stats *rxsc_stats; ++ struct pcpu_secy_stats *secy_stats; ++ bool pulled_sci; ++ int ret; ++ ++ if (skb_headroom(skb) < ETH_HLEN) ++ goto drop_direct; ++ ++ hdr = macsec_ethhdr(skb); ++ if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) ++ return handle_not_macsec(skb); ++ ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ *pskb = skb; ++ if (!skb) ++ return RX_HANDLER_CONSUMED; ++ ++ pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); ++ if (!pulled_sci) { ++ if (!pskb_may_pull(skb, macsec_extra_len(false))) ++ goto drop_direct; ++ } ++ ++ hdr = macsec_ethhdr(skb); ++ ++ /* Frames with a SecTAG that has the TCI E bit set but the C ++ * bit clear are discarded, as this reserved encoding is used ++ * to identify frames with a SecTAG that are not to be ++ * delivered to the Controlled Port. ++ */ ++ if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E) ++ return RX_HANDLER_PASS; ++ ++ /* now, pull the extra length */ ++ if (hdr->tci_an & MACSEC_TCI_SC) { ++ if (!pulled_sci) ++ goto drop_direct; ++ } ++ ++ /* ethernet header is part of crypto processing */ ++ skb_push(skb, ETH_HLEN); ++ ++ macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC); ++ macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK; ++ sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci); ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci); ++ ++ sc = sc ? 
macsec_rxsc_get(sc) : NULL; ++ ++ if (sc) { ++ secy = &macsec->secy; ++ rx_sc = sc; ++ break; ++ } ++ } ++ ++ if (!secy) ++ goto nosci; ++ ++ dev = secy->netdev; ++ macsec = macsec_priv(dev); ++ secy_stats = this_cpu_ptr(macsec->stats); ++ rxsc_stats = this_cpu_ptr(rx_sc->stats); ++ ++ if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsBadTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ goto drop_nosa; ++ } ++ ++ rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]); ++ if (!rx_sa) { ++ /* 10.6.1 if the SA is not in use */ ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ struct macsec_rx_sa *active_rx_sa = macsec_active_rxsa_get(rx_sc); ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotUsingSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA); ++ goto drop_nosa; ++ } ++ ++ /* not Strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsUnusedSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsUnusedSA); ++ goto deliver; ++ } ++ ++ /* First, PN check to avoid decrypting obviously wrong packets */ ++ hdr_pn = ntohl(hdr->packet_number); ++ if (secy->replay_protect) { ++ bool late; ++ ++ spin_lock(&rx_sa->lock); ++ late = rx_sa->next_pn_halves.lower >= secy->replay_window && ++ hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window); ++ ++ if (secy->xpn) ++ late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn); ++ spin_unlock(&rx_sa->lock); ++ ++ if (late) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ goto drop; ++ } ++ } ++ ++ macsec_skb_cb(skb)->rx_sa = rx_sa; ++ ++ /* Disabled && !changed text => skip validation */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames != MACSEC_VALIDATE_DISABLED) ++ skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); ++ ++ if (IS_ERR(skb)) { ++ /* the decrypt callback needs the reference */ ++ if (PTR_ERR(skb) != -EINPROGRESS) { ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ } ++ rcu_read_unlock(); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ } ++ ++ if (!macsec_post_decrypt(skb, secy, hdr_pn)) ++ goto drop; ++ ++deliver: ++ macsec_finalize_skb(skb, secy->icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, secy->netdev); ++ ++ if (rx_sa) ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ ++ skb_orphan(skb); ++ ret = gro_cells_receive(&macsec->gro_cells, skb); ++ if (ret == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ else ++ macsec->secy.netdev->stats.rx_dropped++; ++ ++ rcu_read_unlock(); ++ ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++drop: ++ macsec_rxsa_put(rx_sa); ++drop_nosa: ++ macsec_rxsc_put(rx_sc); ++ rcu_read_unlock(); ++drop_direct: ++ kfree_skb(skb); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++nosci: ++ /* 10.6.1 if the SC is not found */ ++ cbit = !!(hdr->tci_an & MACSEC_TCI_C); ++ if (!cbit) ++ macsec_finalize_skb(skb, 
DEFAULT_ICV_LEN, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ ++ secy_stats = this_cpu_ptr(macsec->stats); ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ if (cbit || ++ macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_errors++; ++ continue; ++ } ++ ++ /* not strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ macsec_reset_skb(nskb, macsec->secy.netdev); ++ ++ ret = __netif_rx(nskb); ++ if (ret == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUnknownSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } else { ++ macsec->secy.netdev->stats.rx_dropped++; ++ } ++ } ++ ++ rcu_read_unlock(); ++ *pskb = skb; ++ return RX_HANDLER_PASS; ++} ++ ++static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) ++{ ++ struct crypto_aead *tfm; ++ int ret; ++ ++ /* Pick a sync gcm(aes) cipher to ensure order is preserved. */ ++ tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); ++ ++ if (IS_ERR(tfm)) ++ return tfm; ++ ++ ret = crypto_aead_setkey(tfm, key, key_len); ++ if (ret < 0) ++ goto fail; ++ ++ ret = crypto_aead_setauthsize(tfm, icv_len); ++ if (ret < 0) ++ goto fail; ++ ++ return tfm; ++fail: ++ crypto_free_aead(tfm); ++ return ERR_PTR(ret); ++} ++ ++static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats); ++ if (!rx_sa->stats) ++ return -ENOMEM; ++ ++ rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(rx_sa->key.tfm)) { ++ free_percpu(rx_sa->stats); ++ return PTR_ERR(rx_sa->key.tfm); ++ } ++ ++ rx_sa->ssci = MACSEC_UNDEF_SSCI; ++ rx_sa->active = false; ++ rx_sa->next_pn = 1; ++ refcount_set(&rx_sa->refcnt, 1); ++ spin_lock_init(&rx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_rx_sa(struct macsec_rx_sa *rx_sa) ++{ ++ rx_sa->active = false; ++ ++ macsec_rxsa_put(rx_sa); ++} ++ ++static void free_rx_sc(struct macsec_rx_sc *rx_sc) ++{ ++ int i; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *sa = rtnl_dereference(rx_sc->sa[i]); ++ ++ RCU_INIT_POINTER(rx_sc->sa[i], NULL); ++ if (sa) ++ clear_rx_sa(sa); ++ } ++ ++ macsec_rxsc_put(rx_sc); ++} ++ ++static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc, __rcu **rx_scp; ++ ++ for (rx_scp = &secy->rx_sc, rx_sc = rtnl_dereference(*rx_scp); ++ rx_sc; ++ rx_scp = &rx_sc->next, rx_sc = rtnl_dereference(*rx_scp)) { ++ if (rx_sc->sci == sci) { ++ if (rx_sc->active) ++ secy->n_rx_sc--; ++ rcu_assign_pointer(*rx_scp, rx_sc->next); ++ return rx_sc; ++ } ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci, ++ bool active) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_dev *macsec; ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ struct macsec_secy *secy; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (find_rx_sc_rtnl(&macsec->secy, sci)) ++ return ERR_PTR(-EEXIST); ++ } ++ ++ rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); ++ if 
(!rx_sc) ++ return ERR_PTR(-ENOMEM); ++ ++ rx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats); ++ if (!rx_sc->stats) { ++ kfree(rx_sc); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ rx_sc->sci = sci; ++ rx_sc->active = active; ++ refcount_set(&rx_sc->refcnt, 1); ++ ++ secy = &macsec_priv(dev)->secy; ++ rcu_assign_pointer(rx_sc->next, secy->rx_sc); ++ rcu_assign_pointer(secy->rx_sc, rx_sc); ++ ++ if (rx_sc->active) ++ secy->n_rx_sc++; ++ ++ return rx_sc; ++} ++ ++static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats); ++ if (!tx_sa->stats) ++ return -ENOMEM; ++ ++ tx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(tx_sa->key.tfm)) { ++ free_percpu(tx_sa->stats); ++ return PTR_ERR(tx_sa->key.tfm); ++ } ++ ++ tx_sa->ssci = MACSEC_UNDEF_SSCI; ++ tx_sa->active = false; ++ refcount_set(&tx_sa->refcnt, 1); ++ spin_lock_init(&tx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_tx_sa(struct macsec_tx_sa *tx_sa) ++{ ++ tx_sa->active = false; ++ ++ macsec_txsa_put(tx_sa); ++} ++ ++static struct genl_family macsec_fam; ++ ++static struct net_device *get_dev_from_nl(struct net *net, ++ struct nlattr **attrs) ++{ ++ int ifindex = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]); ++ struct net_device *dev; ++ ++ dev = __dev_get_by_index(net, ifindex); ++ if (!dev) ++ return ERR_PTR(-ENODEV); ++ ++ if (!netif_is_macsec(dev)) ++ return ERR_PTR(-ENODEV); ++ ++ return dev; ++} ++ ++static enum macsec_offload nla_get_offload(const struct nlattr *nla) ++{ ++ return (__force enum macsec_offload)nla_get_u8(nla); ++} ++ ++static sci_t nla_get_sci(const struct nlattr *nla) ++{ ++ return (__force sci_t)nla_get_u64(nla); ++} ++ ++static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value, ++ int padattr) ++{ ++ return nla_put_u64_64bit(skb, attrtype, (__force u64)value, padattr); ++} ++ ++static ssci_t nla_get_ssci(const struct nlattr *nla) ++{ ++ return (__force ssci_t)nla_get_u32(nla); ++} ++ ++static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value) ++{ ++ return nla_put_u32(skb, attrtype, (__force u64)value); ++} ++ ++static struct macsec_tx_sa *get_txsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_tx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[*assoc_num]); ++ if (!tx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *devp = dev; ++ *scp = tx_sc; ++ *secyp = secy; ++ return tx_sa; ++} ++ ++static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct net_device **devp, ++ struct macsec_secy **secyp) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ secy = &macsec_priv(dev)->secy; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return ERR_PTR(-EINVAL); ++ ++ sci = 
nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ rx_sc = find_rx_sc_rtnl(secy, sci); ++ if (!rx_sc) ++ return ERR_PTR(-ENODEV); ++ ++ *secyp = secy; ++ *devp = dev; ++ ++ return rx_sc; ++} ++ ++static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_rx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ rx_sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp); ++ if (IS_ERR(rx_sc)) ++ return ERR_CAST(rx_sc); ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[*assoc_num]); ++ if (!rx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *scp = rx_sc; ++ return rx_sa; ++} ++ ++static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { ++ [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, ++ [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, ++}; ++ ++static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { ++ [MACSEC_RXSC_ATTR_SCI] = { .type = NLA_U64 }, ++ [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 }, ++}; ++ ++static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { ++ [MACSEC_SA_ATTR_AN] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_PN] = NLA_POLICY_MIN_LEN(4), ++ [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, ++ .len = MACSEC_KEYID_LEN, }, ++ [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, ++ .len = MACSEC_MAX_KEY_LEN, }, ++ [MACSEC_SA_ATTR_SSCI] = { .type = NLA_U32 }, ++ [MACSEC_SA_ATTR_SALT] = { .type = NLA_BINARY, ++ .len = MACSEC_SALT_LEN, }, ++}; ++ ++static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { ++ [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, ++}; ++ ++/* Offloads an operation to a device driver */ ++static int macsec_offload(int (* const func)(struct macsec_context *), ++ struct macsec_context *ctx) ++{ ++ int ret; ++ ++ if (unlikely(!func)) ++ return 0; ++ ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_lock(&ctx->phydev->lock); ++ ++ /* Phase I: prepare. The drive should fail here if there are going to be ++ * issues in the commit phase. ++ */ ++ ctx->prepare = true; ++ ret = (*func)(ctx); ++ if (ret) ++ goto phy_unlock; ++ ++ /* Phase II: commit. This step cannot fail. 
*/ ++ ctx->prepare = false; ++ ret = (*func)(ctx); ++ /* This should never happen: commit is not allowed to fail */ ++ if (unlikely(ret)) ++ WARN(1, "MACsec offloading commit failed (%d)\n", ret); ++ ++phy_unlock: ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_unlock(&ctx->phydev->lock); ++ ++ return ret; ++} ++ ++static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) ++{ ++ if (!attrs[MACSEC_ATTR_SA_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, attrs[MACSEC_ATTR_SA_CONFIG], macsec_genl_sa_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc) ++{ ++ if (!attrs[MACSEC_ATTR_RXSC_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, attrs[MACSEC_ATTR_RXSC_CONFIG], macsec_genl_rxsc_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static bool validate_add_rxsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && ++ nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (tb_sa[MACSEC_SA_ATTR_PN] && ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]); ++ if (rx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL); ++ if (!rx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(rx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ rx_sa->sc = rx_sc; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_rxsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ macsec_rxsa_put(rx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static bool validate_add_rxsc(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_RXSC_ATTR_SCI]) ++ return false; ++ ++ if (attrs[MACSEC_RXSC_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_RXSC_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ sci_t sci = MACSEC_UNDEF_SCI; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct macsec_secy *secy; ++ bool active = true; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) ++ active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ rx_sc = create_rx_sc(dev, sci, active); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return 
PTR_ERR(rx_sc); ++ } ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_add_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ del_rx_sc(secy, sci); ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_add_txsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_PN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_add_txsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]); ++ if (tx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL); ++ if (!tx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(tx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa && tx_sa->active) ++ secy->operational = true; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_txsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->operational = was_operational; ++ macsec_txsa_put(tx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (rx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ 
ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); ++ clear_rx_sa(rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), info->attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ rx_sc = del_rx_sc(secy, sci); ++ if (!rx_sc) { ++ rtnl_unlock(); ++ return -ENODEV; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ret = macsec_offload(ops->mdo_del_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); ++ clear_tx_sa(tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_upd_sa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ attrs[MACSEC_SA_ATTR_KEY] || ++ attrs[MACSEC_SA_ATTR_KEYID] || ++ attrs[MACSEC_SA_ATTR_SSCI] || ++ attrs[MACSEC_SA_ATTR_SALT]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int 
macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational, was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ prev_pn = tx_sa->next_pn_halves; ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ ++ was_active = tx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa) ++ secy->operational = tx_sa->active; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ tx_sa->active = was_active; ++ secy->operational = was_operational; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&rx_sa->lock); ++ prev_pn = rx_sa->next_pn_halves; ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ was_active = rx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ rx_sa->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ unsigned int prev_n_rx_sc; ++ bool was_active; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ was_active = rx_sc->active; ++ prev_n_rx_sc = secy->n_rx_sc; ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { ++ bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ if (rx_sc->active != new) ++ secy->n_rx_sc += new ? 
1 : -1; ++ ++ rx_sc->active = new; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->n_rx_sc = prev_n_rx_sc; ++ rx_sc->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool macsec_is_configured(struct macsec_dev *macsec) ++{ ++ struct macsec_secy *secy = &macsec->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ int i; ++ ++ if (secy->rx_sc) ++ return true; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) ++ if (tx_sc->sa[i]) ++ return true; ++ ++ return false; ++} ++ ++static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; ++ enum macsec_offload offload, prev_offload; ++ int (*func)(struct macsec_context *ctx); ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ struct macsec_dev *macsec; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (!attrs[MACSEC_ATTR_OFFLOAD]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, ++ attrs[MACSEC_ATTR_OFFLOAD], ++ macsec_genl_offload_policy, NULL)) ++ return -EINVAL; ++ ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) ++ return PTR_ERR(dev); ++ macsec = macsec_priv(dev); ++ ++ if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) ++ return -EINVAL; ++ ++ offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); ++ if (macsec->offload == offload) ++ return 0; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* Check if the net device is busy. */ ++ if (netif_running(dev)) ++ return -EBUSY; ++ ++ rtnl_lock(); ++ ++ prev_offload = macsec->offload; ++ macsec->offload = offload; ++ ++ /* Check if the device already has rules configured: we do not support ++ * rules migration. ++ */ ++ if (macsec_is_configured(macsec)) { ++ ret = -EBUSY; ++ goto rollback; ++ } ++ ++ ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? 
prev_offload : offload, ++ macsec, &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto rollback; ++ } ++ ++ if (prev_offload == MACSEC_OFFLOAD_OFF) ++ func = ops->mdo_add_secy; ++ else ++ func = ops->mdo_del_secy; ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(func, &ctx); ++ if (ret) ++ goto rollback; ++ ++ rtnl_unlock(); ++ return 0; ++ ++rollback: ++ macsec->offload = prev_offload; ++ ++ rtnl_unlock(); ++ return ret; ++} ++ ++static void get_tx_sa_stats(struct net_device *dev, int an, ++ struct macsec_tx_sa *tx_sa, ++ struct macsec_tx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.stats.tx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_tx_sa_stats *stats = ++ per_cpu_ptr(tx_sa->stats, cpu); ++ ++ sum->OutPktsProtected += stats->OutPktsProtected; ++ sum->OutPktsEncrypted += stats->OutPktsEncrypted; ++ } ++} ++ ++static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sa_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, int an, ++ struct macsec_rx_sa *rx_sa, ++ struct macsec_rx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.stats.rx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_rx_sa_stats *stats = ++ per_cpu_ptr(rx_sa->stats, cpu); ++ ++ sum->InPktsOK += stats->InPktsOK; ++ sum->InPktsInvalid += stats->InPktsInvalid; ++ sum->InPktsNotValid += stats->InPktsNotValid; ++ sum->InPktsNotUsingSA += stats->InPktsNotUsingSA; ++ sum->InPktsUnusedSA += stats->InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sa_stats(struct sk_buff *skb, ++ struct macsec_rx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sc_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, ++ struct macsec_rx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = 
macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.rx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_rx_sc_stats *stats; ++ struct macsec_rx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(rx_sc->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->InOctetsValidated += tmp.InOctetsValidated; ++ sum->InOctetsDecrypted += tmp.InOctetsDecrypted; ++ sum->InPktsUnchecked += tmp.InPktsUnchecked; ++ sum->InPktsDelayed += tmp.InPktsDelayed; ++ sum->InPktsOK += tmp.InPktsOK; ++ sum->InPktsInvalid += tmp.InPktsInvalid; ++ sum->InPktsLate += tmp.InPktsLate; ++ sum->InPktsNotValid += tmp.InPktsNotValid; ++ sum->InPktsNotUsingSA += tmp.InPktsNotUsingSA; ++ sum->InPktsUnusedSA += tmp.InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, ++ sum->InOctetsValidated, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, ++ sum->InOctetsDecrypted, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, ++ sum->InPktsUnchecked, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, ++ sum->InPktsDelayed, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, ++ sum->InPktsOK, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, ++ sum->InPktsLate, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA, ++ MACSEC_RXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_tx_sc_stats(struct net_device *dev, ++ struct macsec_tx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.tx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_tx_sc_stats *stats; ++ struct macsec_tx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsProtected += tmp.OutPktsProtected; ++ sum->OutPktsEncrypted += tmp.OutPktsEncrypted; ++ sum->OutOctetsProtected += tmp.OutOctetsProtected; ++ sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted; ++ } ++} ++ ++static int 
copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, ++ sum->OutOctetsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, ++ sum->OutOctetsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.dev_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_dev_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_secy_stats *stats; ++ struct macsec_dev_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsUntagged += tmp.OutPktsUntagged; ++ sum->InPktsUntagged += tmp.InPktsUntagged; ++ sum->OutPktsTooLong += tmp.OutPktsTooLong; ++ sum->InPktsNoTag += tmp.InPktsNoTag; ++ sum->InPktsBadTag += tmp.InPktsBadTag; ++ sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI; ++ sum->InPktsNoSCI += tmp.InPktsNoSCI; ++ sum->InPktsOverrun += tmp.InPktsOverrun; ++ } ++} ++ ++static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, ++ sum->OutPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, ++ sum->InPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, ++ sum->OutPktsTooLong, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, ++ sum->InPktsNoTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, ++ sum->InPktsBadTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, ++ sum->InPktsUnknownSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, ++ sum->InPktsNoSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, ++ sum->InPktsOverrun, ++ MACSEC_SECY_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) ++{ ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *secy_nest = nla_nest_start_noflag(skb, ++ MACSEC_ATTR_SECY); ++ u64 csid; ++ ++ if (!secy_nest) ++ return 1; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? 
MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto cancel; ++ } ++ ++ if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, ++ MACSEC_SECY_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, ++ csid, MACSEC_SECY_ATTR_PAD) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa)) ++ goto cancel; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window)) ++ goto cancel; ++ } ++ ++ nla_nest_end(skb, secy_nest); ++ return 0; ++ ++cancel: ++ nla_nest_cancel(skb, secy_nest); ++ return 1; ++} ++ ++static noinline_for_stack int ++dump_secy(struct macsec_secy *secy, struct net_device *dev, ++ struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct macsec_tx_sc_stats tx_sc_stats = {0, }; ++ struct macsec_tx_sa_stats tx_sa_stats = {0, }; ++ struct macsec_rx_sc_stats rx_sc_stats = {0, }; ++ struct macsec_rx_sa_stats rx_sa_stats = {0, }; ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_dev_stats dev_stats = {0, }; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *txsa_list, *rxsc_list; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *attr; ++ void *hdr; ++ int i, j; ++ ++ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); ++ if (!hdr) ++ return -EMSGSIZE; ++ ++ genl_dump_check_consistent(cb, hdr); ++ ++ if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); ++ if (!attr) ++ goto nla_put_failure; ++ if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) ++ goto nla_put_failure; ++ nla_nest_end(skb, attr); ++ ++ if (nla_put_secy(secy, skb)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ ++ get_tx_sc_stats(dev, &tx_sc_stats); ++ if (copy_tx_sc_stats(skb, &tx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ get_secy_stats(dev, &dev_stats); ++ if (copy_secy_stats(skb, &dev_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST); ++ if (!txsa_list) ++ goto nla_put_failure; ++ for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]); ++ struct nlattr *txsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!tx_sa) ++ continue; ++ ++ txsa_nest = nla_nest_start_noflag(skb, j++); ++ if (!txsa_nest) { ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto 
nla_put_failure; ++ } ++ memset(&tx_sa_stats, 0, sizeof(tx_sa_stats)); ++ get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats); ++ if (copy_tx_sa_stats(skb, &tx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = tx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = tx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ nla_nest_end(skb, txsa_nest); ++ } ++ nla_nest_end(skb, txsa_list); ++ ++ rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST); ++ if (!rxsc_list) ++ goto nla_put_failure; ++ ++ j = 1; ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ int k; ++ struct nlattr *rxsa_list; ++ struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++); ++ ++ if (!rxsc_nest) { ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) || ++ nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci, ++ MACSEC_RXSC_ATTR_PAD)) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sc_stats, 0, sizeof(rx_sc_stats)); ++ get_rx_sc_stats(dev, rx_sc, &rx_sc_stats); ++ if (copy_rx_sc_stats(skb, &rx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ rxsa_list = nla_nest_start_noflag(skb, ++ MACSEC_RXSC_ATTR_SA_LIST); ++ if (!rxsa_list) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]); ++ struct nlattr *rxsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!rx_sa) ++ continue; ++ ++ rxsa_nest = nla_nest_start_noflag(skb, k++); ++ if (!rxsa_nest) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, ++ MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sa_stats, 0, sizeof(rx_sa_stats)); ++ get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats); ++ if (copy_rx_sa_stats(skb, &rx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = rx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = rx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, 
MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) { ++ nla_nest_cancel(skb, rxsa_nest); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, rxsa_nest); ++ } ++ ++ nla_nest_end(skb, rxsa_list); ++ nla_nest_end(skb, rxsc_nest); ++ } ++ ++ nla_nest_end(skb, rxsc_list); ++ ++ genlmsg_end(skb, hdr); ++ ++ return 0; ++ ++nla_put_failure: ++ genlmsg_cancel(skb, hdr); ++ return -EMSGSIZE; ++} ++ ++static int macsec_generation = 1; /* protected by RTNL */ ++ ++static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct net_device *dev; ++ int dev_idx, d; ++ ++ dev_idx = cb->args[0]; ++ ++ d = 0; ++ rtnl_lock(); ++ ++ cb->seq = macsec_generation; ++ ++ for_each_netdev(net, dev) { ++ struct macsec_secy *secy; ++ ++ if (d < dev_idx) ++ goto next; ++ ++ if (!netif_is_macsec(dev)) ++ goto next; ++ ++ secy = &macsec_priv(dev)->secy; ++ if (dump_secy(secy, dev, skb, cb) < 0) ++ goto done; ++next: ++ d++; ++ } ++ ++done: ++ rtnl_unlock(); ++ cb->args[0] = d; ++ return skb->len; ++} ++ ++static const struct genl_small_ops macsec_genl_ops[] = { ++ { ++ .cmd = MACSEC_CMD_GET_TXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .dumpit = macsec_dump_txsc, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_OFFLOAD, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_offload, ++ .flags = GENL_ADMIN_PERM, ++ }, ++}; ++ ++static struct genl_family macsec_fam __ro_after_init = { ++ .name = MACSEC_GENL_NAME, ++ .hdrsize = 0, ++ .version = MACSEC_GENL_VERSION, ++ .maxattr = MACSEC_ATTR_MAX, ++ .policy = macsec_genl_policy, ++ .netnsok = true, ++ .module = THIS_MODULE, ++ .small_ops = macsec_genl_ops, ++ .n_small_ops = ARRAY_SIZE(macsec_genl_ops), ++}; ++ ++static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, ++ 
struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ struct pcpu_secy_stats *secy_stats; ++ int ret, len; ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ skb->dev = macsec->real_dev; ++ return dev_queue_xmit(skb); ++ } ++ ++ /* 10.5 */ ++ if (!secy->protect_frames) { ++ secy_stats = this_cpu_ptr(macsec->stats); ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ skb->dev = macsec->real_dev; ++ len = skb->len; ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++ } ++ ++ if (!secy->operational) { ++ kfree_skb(skb); ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ len = skb->len; ++ skb = macsec_encrypt(skb, dev); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) != -EINPROGRESS) ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++} ++ ++#define MACSEC_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST) ++ ++static int macsec_dev_init(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!dev->tstats) ++ return -ENOMEM; ++ ++ err = gro_cells_init(&macsec->gro_cells, dev); ++ if (err) { ++ free_percpu(dev->tstats); ++ return err; ++ } ++ ++ dev->features = real_dev->features & MACSEC_FEATURES; ++ dev->features |= NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE; ++ ++ dev->needed_headroom = real_dev->needed_headroom + ++ MACSEC_NEEDED_HEADROOM; ++ dev->needed_tailroom = real_dev->needed_tailroom + ++ MACSEC_NEEDED_TAILROOM; ++ ++ if (is_zero_ether_addr(dev->dev_addr)) ++ eth_hw_addr_inherit(dev, real_dev); ++ if (is_zero_ether_addr(dev->broadcast)) ++ memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); ++ ++ /* Get macsec's reference to real_dev */ ++ netdev_hold(real_dev, &macsec->dev_tracker, GFP_KERNEL); ++ ++ return 0; ++} ++ ++static void macsec_dev_uninit(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ gro_cells_destroy(&macsec->gro_cells); ++ free_percpu(dev->tstats); ++} ++ ++static netdev_features_t macsec_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ features &= (real_dev->features & MACSEC_FEATURES) | ++ NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; ++ features |= NETIF_F_LLTX; ++ ++ return features; ++} ++ ++static int macsec_dev_open(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ err = dev_uc_add(real_dev, dev->dev_addr); ++ if (err < 0) ++ return err; ++ ++ if (dev->flags & IFF_ALLMULTI) { ++ err = dev_set_allmulti(real_dev, 1); ++ if (err < 0) ++ goto del_unicast; ++ } ++ ++ if (dev->flags & IFF_PROMISC) { ++ err = dev_set_promiscuity(real_dev, 1); ++ if (err < 0) ++ goto clear_allmulti; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto clear_allmulti; ++ } 
++ ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_dev_open, &ctx); ++ if (err) ++ goto clear_allmulti; ++ } ++ ++ if (netif_carrier_ok(real_dev)) ++ netif_carrier_on(dev); ++ ++ return 0; ++clear_allmulti: ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++del_unicast: ++ dev_uc_del(real_dev, dev->dev_addr); ++ netif_carrier_off(dev); ++ return err; ++} ++ ++static int macsec_dev_stop(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ netif_carrier_off(dev); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_dev_stop, &ctx); ++ } ++ } ++ ++ dev_mc_unsync(real_dev, dev); ++ dev_uc_unsync(real_dev, dev); ++ ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++ ++ if (dev->flags & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, -1); ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++ return 0; ++} ++ ++static void macsec_dev_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ if (!(dev->flags & IFF_UP)) ++ return; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); ++ ++ if (change & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, ++ dev->flags & IFF_PROMISC ? 1 : -1); ++} ++ ++static void macsec_dev_set_rx_mode(struct net_device *dev) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ dev_mc_sync(real_dev, dev); ++ dev_uc_sync(real_dev, dev); ++} ++ ++static int macsec_set_mac_address(struct net_device *dev, void *p) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct sockaddr *addr = p; ++ int err; ++ ++ if (!is_valid_ether_addr(addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (!(dev->flags & IFF_UP)) ++ goto out; ++ ++ err = dev_uc_add(real_dev, addr->sa_data); ++ if (err < 0) ++ return err; ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++out: ++ eth_hw_addr_set(dev, addr->sa_data); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_upd_secy, &ctx); ++ } ++ } ++ ++ return 0; ++} ++ ++static int macsec_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true); ++ ++ if (macsec->real_dev->mtu - extra < new_mtu) ++ return -ERANGE; ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static void macsec_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ if (!dev->tstats) ++ return; ++ ++ dev_fetch_sw_netstats(s, dev->tstats); ++ ++ s->rx_dropped = dev->stats.rx_dropped; ++ s->tx_dropped = dev->stats.tx_dropped; ++ s->rx_errors = dev->stats.rx_errors; ++} ++ ++static int macsec_get_iflink(const struct net_device *dev) ++{ ++ return macsec_priv(dev)->real_dev->ifindex; ++} ++ ++static const struct net_device_ops macsec_netdev_ops = { ++ .ndo_init = macsec_dev_init, ++ .ndo_uninit = macsec_dev_uninit, ++ .ndo_open = macsec_dev_open, ++ .ndo_stop = macsec_dev_stop, ++ .ndo_fix_features = 
macsec_fix_features, ++ .ndo_change_mtu = macsec_change_mtu, ++ .ndo_set_rx_mode = macsec_dev_set_rx_mode, ++ .ndo_change_rx_flags = macsec_dev_change_rx_flags, ++ .ndo_set_mac_address = macsec_set_mac_address, ++ .ndo_start_xmit = macsec_start_xmit, ++ .ndo_get_stats64 = macsec_get_stats64, ++ .ndo_get_iflink = macsec_get_iflink, ++}; ++ ++static const struct device_type macsec_type = { ++ .name = "macsec", ++}; ++ ++static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { ++ [IFLA_MACSEC_SCI] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_PORT] = { .type = NLA_U16 }, ++ [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 }, ++ [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ES] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, ++}; ++ ++static void macsec_free_netdev(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ free_percpu(macsec->stats); ++ free_percpu(macsec->secy.tx_sc.stats); ++ ++ /* Get rid of the macsec's reference to real_dev */ ++ netdev_put(macsec->real_dev, &macsec->dev_tracker); ++} ++ ++static void macsec_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ dev->min_mtu = 0; ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->netdev_ops = &macsec_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->priv_destructor = macsec_free_netdev; ++ SET_NETDEV_DEVTYPE(dev, &macsec_type); ++ ++ eth_zero_addr(dev->broadcast); ++} ++ ++static int macsec_changelink_common(struct net_device *dev, ++ struct nlattr *data[]) ++{ ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ struct macsec_tx_sa *tx_sa; ++ ++ tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]); ++ tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]); ++ ++ secy->operational = tx_sa && tx_sa->active; ++ } ++ ++ if (data[IFLA_MACSEC_ENCRYPT]) ++ tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); ++ ++ if (data[IFLA_MACSEC_PROTECT]) ++ secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]); ++ ++ if (data[IFLA_MACSEC_INC_SCI]) ++ tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (data[IFLA_MACSEC_ES]) ++ tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]); ++ ++ if (data[IFLA_MACSEC_SCB]) ++ tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]); ++ ++ if (data[IFLA_MACSEC_REPLAY_PROTECT]) ++ secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]); ++ ++ if (data[IFLA_MACSEC_VALIDATION]) ++ secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) { ++ switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = true; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ secy->key_len = 
MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = true; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (data[IFLA_MACSEC_WINDOW]) { ++ secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); ++ ++ /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window ++ * for XPN cipher suites */ ++ if (secy->xpn && ++ secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], ++ struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sc tx_sc; ++ struct macsec_secy secy; ++ int ret; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE] || ++ data[IFLA_MACSEC_ICV_LEN] || ++ data[IFLA_MACSEC_SCI] || ++ data[IFLA_MACSEC_PORT]) ++ return -EINVAL; ++ ++ /* Keep a copy of unmodified secy and tx_sc, in case the offload ++ * propagation fails, to revert macsec_changelink_common. ++ */ ++ memcpy(&secy, &macsec->secy, sizeof(secy)); ++ memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); ++ ++ ret = macsec_changelink_common(dev, data); ++ if (ret) ++ goto cleanup; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(ops->mdo_upd_secy, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ return 0; ++ ++cleanup: ++ memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); ++ memcpy(&macsec->secy, &secy, sizeof(secy)); ++ ++ return ret; ++} ++ ++static void macsec_del_dev(struct macsec_dev *macsec) ++{ ++ int i; ++ ++ while (macsec->secy.rx_sc) { ++ struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc); ++ ++ rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next); ++ free_rx_sc(rx_sc); ++ } ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]); ++ ++ if (sa) { ++ RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL); ++ clear_tx_sa(sa); ++ } ++ } ++} ++ ++static void macsec_common_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_del_secy, &ctx); ++ } ++ } ++ ++ unregister_netdevice_queue(dev, head); ++ list_del_rcu(&macsec->secys); ++ macsec_del_dev(macsec); ++ netdev_upper_dev_unlink(real_dev, dev); ++ ++ macsec_generation++; ++} ++ ++static void macsec_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ macsec_common_dellink(dev, head); ++ ++ if (list_empty(&rxd->secys)) { ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ } ++} ++ ++static int register_macsec_dev(struct net_device *real_dev, ++ struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ if (!rxd) { ++ int err; ++ ++ rxd = kmalloc(sizeof(*rxd), 
GFP_KERNEL); ++ if (!rxd) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&rxd->secys); ++ ++ err = netdev_rx_handler_register(real_dev, macsec_handle_frame, ++ rxd); ++ if (err < 0) { ++ kfree(rxd); ++ return err; ++ } ++ } ++ ++ list_add_tail_rcu(&macsec->secys, &rxd->secys); ++ return 0; ++} ++ ++static bool sci_exists(struct net_device *dev, sci_t sci) ++{ ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(dev); ++ struct macsec_dev *macsec; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (macsec->secy.sci == sci) ++ return true; ++ } ++ ++ return false; ++} ++ ++static sci_t dev_to_sci(struct net_device *dev, __be16 port) ++{ ++ return make_sci(dev->dev_addr, port); ++} ++ ++static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ ++ macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats); ++ if (!macsec->stats) ++ return -ENOMEM; ++ ++ secy->tx_sc.stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats); ++ if (!secy->tx_sc.stats) { ++ free_percpu(macsec->stats); ++ return -ENOMEM; ++ } ++ ++ if (sci == MACSEC_UNDEF_SCI) ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ secy->netdev = dev; ++ secy->operational = true; ++ secy->key_len = DEFAULT_SAK_LEN; ++ secy->icv_len = icv_len; ++ secy->validate_frames = MACSEC_VALIDATE_DEFAULT; ++ secy->protect_frames = true; ++ secy->replay_protect = false; ++ secy->xpn = DEFAULT_XPN; ++ ++ secy->sci = sci; ++ secy->tx_sc.active = true; ++ secy->tx_sc.encoding_sa = DEFAULT_ENCODING_SA; ++ secy->tx_sc.encrypt = DEFAULT_ENCRYPT; ++ secy->tx_sc.send_sci = DEFAULT_SEND_SCI; ++ secy->tx_sc.end_station = false; ++ secy->tx_sc.scb = false; ++ ++ return 0; ++} ++ ++static struct lock_class_key macsec_netdev_addr_lock_key; ++ ++static int macsec_newlink(struct net *net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ rx_handler_func_t *rx_handler; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ struct net_device *real_dev; ++ int err, mtu; ++ sci_t sci; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK])); ++ if (!real_dev) ++ return -ENODEV; ++ if (real_dev->type != ARPHRD_ETHER) ++ return -EINVAL; ++ ++ dev->priv_flags |= IFF_MACSEC; ++ ++ macsec->real_dev = real_dev; ++ ++ if (data && data[IFLA_MACSEC_OFFLOAD]) ++ macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); ++ else ++ /* MACsec offloading is off by default */ ++ macsec->offload = MACSEC_OFFLOAD_OFF; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (macsec->offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(macsec->offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* send_sci must be set to true when transmit sci explicitly is set */ ++ if ((data && data[IFLA_MACSEC_SCI]) && ++ (data && data[IFLA_MACSEC_INC_SCI])) { ++ u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (!send_sci) ++ return -EINVAL; ++ } ++ ++ if (data && data[IFLA_MACSEC_ICV_LEN]) ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ mtu = real_dev->mtu - icv_len - macsec_extra_len(true); ++ if (mtu < 0) ++ dev->mtu = 0; ++ else ++ dev->mtu = mtu; ++ ++ rx_handler = rtnl_dereference(real_dev->rx_handler); ++ if (rx_handler && rx_handler != macsec_handle_frame) ++ return -EBUSY; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ 
netdev_lockdep_set_classes(dev); ++ lockdep_set_class(&dev->addr_list_lock, ++ &macsec_netdev_addr_lock_key); ++ ++ err = netdev_upper_dev_link(real_dev, dev, extack); ++ if (err < 0) ++ goto unregister; ++ ++ /* need to be already registered so that ->init has run and ++ * the MAC addr is set ++ */ ++ if (data && data[IFLA_MACSEC_SCI]) ++ sci = nla_get_sci(data[IFLA_MACSEC_SCI]); ++ else if (data && data[IFLA_MACSEC_PORT]) ++ sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT])); ++ else ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ if (rx_handler && sci_exists(real_dev, sci)) { ++ err = -EBUSY; ++ goto unlink; ++ } ++ ++ err = macsec_add_dev(dev, sci, icv_len); ++ if (err) ++ goto unlink; ++ ++ if (data) { ++ err = macsec_changelink_common(dev, data); ++ if (err) ++ goto del_dev; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_add_secy, &ctx); ++ if (err) ++ goto del_dev; ++ } ++ } ++ ++ err = register_macsec_dev(real_dev, dev); ++ if (err < 0) ++ goto del_dev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ linkwatch_fire_event(dev); ++ ++ macsec_generation++; ++ ++ return 0; ++ ++del_dev: ++ macsec_del_dev(macsec); ++unlink: ++ netdev_upper_dev_unlink(real_dev, dev); ++unregister: ++ unregister_netdevice(dev); ++ return err; ++} ++ ++static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ u64 csid = MACSEC_DEFAULT_CIPHER_ID; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ int flag; ++ bool es, scb, sci; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) ++ csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]); ++ ++ if (data[IFLA_MACSEC_ICV_LEN]) { ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ if (icv_len != DEFAULT_ICV_LEN) { ++ char dummy_key[DEFAULT_SAK_LEN] = { 0 }; ++ struct crypto_aead *dummy_tfm; ++ ++ dummy_tfm = macsec_alloc_tfm(dummy_key, ++ DEFAULT_SAK_LEN, ++ icv_len); ++ if (IS_ERR(dummy_tfm)) ++ return PTR_ERR(dummy_tfm); ++ crypto_free_aead(dummy_tfm); ++ } ++ } ++ ++ switch (csid) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ if (icv_len < MACSEC_MIN_ICV_LEN || ++ icv_len > MACSEC_STD_ICV_LEN) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ if (nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN) ++ return -EINVAL; ++ } ++ ++ for (flag = IFLA_MACSEC_ENCODING_SA + 1; ++ flag < IFLA_MACSEC_VALIDATION; ++ flag++) { ++ if (data[flag]) { ++ if (nla_get_u8(data[flag]) > 1) ++ return -EINVAL; ++ } ++ } ++ ++ es = data[IFLA_MACSEC_ES] ? nla_get_u8(data[IFLA_MACSEC_ES]) : false; ++ sci = data[IFLA_MACSEC_INC_SCI] ? nla_get_u8(data[IFLA_MACSEC_INC_SCI]) : false; ++ scb = data[IFLA_MACSEC_SCB] ? 
nla_get_u8(data[IFLA_MACSEC_SCB]) : false; ++ ++ if ((sci && (scb || es)) || (scb && es)) ++ return -EINVAL; ++ ++ if (data[IFLA_MACSEC_VALIDATION] && ++ nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX) ++ return -EINVAL; ++ ++ if ((data[IFLA_MACSEC_REPLAY_PROTECT] && ++ nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT])) && ++ !data[IFLA_MACSEC_WINDOW]) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static struct net *macsec_get_link_net(const struct net_device *dev) ++{ ++ return dev_net(macsec_priv(dev)->real_dev); ++} ++ ++static size_t macsec_get_size(const struct net_device *dev) ++{ ++ return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ ++ nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ ++ nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ ++ nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ES */ ++ nla_total_size(1) + /* IFLA_MACSEC_SCB */ ++ nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ ++ 0; ++} ++ ++static int macsec_fill_info(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct macsec_secy *secy = &macsec_priv(dev)->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ u64 csid; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, ++ IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || ++ nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, ++ csid, IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) || ++ nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) || ++ nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) || ++ nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) || ++ 0) ++ goto nla_put_failure; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window)) ++ goto nla_put_failure; ++ } ++ ++ return 0; ++ ++nla_put_failure: ++ return -EMSGSIZE; ++} ++ ++static struct rtnl_link_ops macsec_link_ops __read_mostly = { ++ .kind = "macsec", ++ .priv_size = sizeof(struct macsec_dev), ++ .maxtype = IFLA_MACSEC_MAX, ++ .policy = macsec_rtnl_policy, ++ .setup = macsec_setup, ++ .validate = macsec_validate_attr, ++ .newlink = macsec_newlink, ++ .changelink = macsec_changelink, ++ .dellink = macsec_dellink, ++ .get_size = macsec_get_size, ++ .fill_info = macsec_fill_info, ++ .get_link_net = macsec_get_link_net, ++}; ++ ++static bool is_macsec_master(struct net_device *dev) ++{ ++ return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame; ++} ++ ++static int macsec_notify(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++ struct net_device *real_dev = netdev_notifier_info_to_dev(ptr); ++ LIST_HEAD(head); ++ ++ if (!is_macsec_master(real_dev)) ++ return NOTIFY_DONE; 
++ ++ switch (event) { ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ case NETDEV_CHANGE: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ } ++ break; ++ } ++ case NETDEV_UNREGISTER: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ macsec_common_dellink(m->secy.netdev, &head); ++ } ++ ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ ++ unregister_netdevice_many(&head); ++ break; ++ } ++ case NETDEV_CHANGEMTU: { ++ struct macsec_dev *m; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry(m, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ unsigned int mtu = real_dev->mtu - (m->secy.icv_len + ++ macsec_extra_len(true)); ++ ++ if (dev->mtu > mtu) ++ dev_set_mtu(dev, mtu); ++ } ++ } ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block macsec_notifier = { ++ .notifier_call = macsec_notify, ++}; ++ ++static int __init macsec_init(void) ++{ ++ int err; ++ ++ pr_info("MACsec IEEE 802.1AE\n"); ++ err = register_netdevice_notifier(&macsec_notifier); ++ if (err) ++ return err; ++ ++ err = rtnl_link_register(&macsec_link_ops); ++ if (err) ++ goto notifier; ++ ++ err = genl_register_family(&macsec_fam); ++ if (err) ++ goto rtnl; ++ ++ return 0; ++ ++rtnl: ++ rtnl_link_unregister(&macsec_link_ops); ++notifier: ++ unregister_netdevice_notifier(&macsec_notifier); ++ return err; ++} ++ ++static void __exit macsec_exit(void) ++{ ++ genl_unregister_family(&macsec_fam); ++ rtnl_link_unregister(&macsec_link_ops); ++ unregister_netdevice_notifier(&macsec_notifier); ++ rcu_barrier(); ++} ++ ++module_init(macsec_init); ++module_exit(macsec_exit); ++ ++MODULE_ALIAS_RTNL_LINK("macsec"); ++MODULE_ALIAS_GENL_FAMILY("macsec"); ++ ++MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/macvlan.c linux/drivers/net/macvlan.c +--- linux.orig/drivers/net/macvlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macvlan.c 2022-12-04 10:40:26.696034096 -0500 +@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(stru for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { @@ -2829,11 +20477,10 @@ index 1080d6ebff63b..a1c7823f0ba66 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c -index 0b1b6f650104b..ff302144029de 100644 ---- a/drivers/net/mhi_net.c -+++ b/drivers/net/mhi_net.c -@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/mhi_net.c linux/drivers/net/mhi_net.c +--- linux.orig/drivers/net/mhi_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/mhi_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct n unsigned int start; do { @@ -2857,11 +20504,10 @@ index 0b1b6f650104b..ff302144029de 100644 } static const struct net_device_ops mhi_netdev_ops = { -diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c -index 9a1a5b2036240..e470e3398abc2 100644 ---- a/drivers/net/netdevsim/netdev.c -+++ b/drivers/net/netdevsim/netdev.c -@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) 
+diff -rupN linux.orig/drivers/net/netdevsim/netdev.c linux/drivers/net/netdevsim/netdev.c +--- linux.orig/drivers/net/netdevsim/netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/netdevsim/netdev.c 2022-12-04 10:40:26.696034096 -0500 +@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, unsigned int start; do { @@ -2874,11 +20520,10 @@ index 9a1a5b2036240..e470e3398abc2 100644 } static int -diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c -index 154a3c0a6dfd8..3de937141c168 100644 ---- a/drivers/net/team/team.c -+++ b/drivers/net/team/team.c -@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/team/team.c linux/drivers/net/team/team.c +--- linux.orig/drivers/net/team/team.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { @@ -2894,11 +20539,10 @@ index 154a3c0a6dfd8..3de937141c168 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c -index b095a4b4957bb..18d99fda997cf 100644 ---- a/drivers/net/team/team_mode_loadbalance.c -+++ b/drivers/net/team/team_mode_loadbalance.c -@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struct lb_stats *acc_stats, +diff -rupN linux.orig/drivers/net/team/team_mode_loadbalance.c linux/drivers/net/team/team_mode_loadbalance.c +--- linux.orig/drivers/net/team/team_mode_loadbalance.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team_mode_loadbalance.c 2022-12-04 10:40:26.696034096 -0500 +@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struc struct lb_stats tmp; do { @@ -2910,11 +20554,10 @@ index b095a4b4957bb..18d99fda997cf 100644 acc_stats->tx_bytes += tmp.tx_bytes; } -diff --git a/drivers/net/veth.c b/drivers/net/veth.c -index 466da01ba2e3e..2da7cfcfe1c31 100644 ---- a/drivers/net/veth.c -+++ b/drivers/net/veth.c -@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/veth.c linux/drivers/net/veth.c +--- linux.orig/drivers/net/veth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/veth.c 2022-12-04 10:40:26.696034096 -0500 +@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struc size_t offset; do { @@ -2929,7 +20572,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 idx += VETH_RQ_STATS_LEN; } -@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struc tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { @@ -2944,7 +20587,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 } } -@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) +@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_st unsigned int start; do { @@ -2960,11 +20603,10 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; -diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c -index 9cce7dec7366d..a94d9d8f67fd0 100644 ---- a/drivers/net/virtio_net.c -+++ b/drivers/net/virtio_net.c -@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/virtio_net.c 
linux/drivers/net/virtio_net.c +--- linux.orig/drivers/net/virtio_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/virtio_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_dev struct send_queue *sq = &vi->sq[i]; do { @@ -2987,7 +20629,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 tot->rx_packets += rpackets; tot->tx_packets += tpackets; -@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&rq->stats; do { @@ -3002,7 +20644,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_RQ_STATS_LEN; } -@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&sq->stats; do { @@ -3017,11 +20659,10 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_SQ_STATS_LEN; } } -diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c -index 5df7a0abc39d5..191ebc482f0c1 100644 ---- a/drivers/net/vrf.c -+++ b/drivers/net/vrf.c -@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/vrf.c linux/drivers/net/vrf.c +--- linux.orig/drivers/net/vrf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vrf.c 2022-12-04 10:40:26.696034096 -0500 +@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_d dstats = per_cpu_ptr(dev->dstats, i); do { @@ -3037,11 +20678,10 @@ index 5df7a0abc39d5..191ebc482f0c1 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; stats->tx_dropped += tdrops; -diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c -index 3e04af4c5daa1..a3de081cda5ee 100644 ---- a/drivers/net/vxlan/vxlan_vnifilter.c -+++ b/drivers/net/vxlan/vxlan_vnifilter.c -@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, +diff -rupN linux.orig/drivers/net/vxlan/vxlan_vnifilter.c linux/drivers/net/vxlan/vxlan_vnifilter.c +--- linux.orig/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-04 10:40:26.696034096 -0500 +@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(co pstats = per_cpu_ptr(vninode->stats, i); do { @@ -3053,11 +20693,10 @@ index 3e04af4c5daa1..a3de081cda5ee 100644 dest->rx_packets += temp.rx_packets; dest->rx_bytes += temp.rx_bytes; -diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c -index 6872782e8dd89..22b5939a42bb3 100644 ---- a/drivers/net/wwan/mhi_wwan_mbim.c -+++ b/drivers/net/wwan/mhi_wwan_mbim.c -@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/wwan/mhi_wwan_mbim.c linux/drivers/net/wwan/mhi_wwan_mbim.c +--- linux.orig/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-04 10:40:26.696034096 -0500 +@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(str unsigned int start; do { @@ -3081,11 +20720,10 @@ index 6872782e8dd89..22b5939a42bb3 100644 } static void mhi_mbim_ul_callback(struct mhi_device *mhi_dev, -diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c -index 27a11cc08c61e..df4dc02638a00 100644 ---- a/drivers/net/xen-netfront.c -+++ b/drivers/net/xen-netfront.c -@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/xen-netfront.c linux/drivers/net/xen-netfront.c +--- linux.orig/drivers/net/xen-netfront.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/xen-netfront.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct ne unsigned int start; do { @@ -3106,11 +20744,10 @@ index 27a11cc08c61e..df4dc02638a00 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c -index 2a4b3efb7e12b..9f6ed09538cd0 100644 ---- a/drivers/pinctrl/pinctrl-amd.c -+++ b/drivers/pinctrl/pinctrl-amd.c -@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/pinctrl/pinctrl-amd.c linux/drivers/pinctrl/pinctrl-amd.c +--- linux.orig/drivers/pinctrl/pinctrl-amd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/pinctrl/pinctrl-amd.c 2022-12-04 10:40:26.696034096 -0500 +@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int if (!(regval & PIN_IRQ_PENDING) || !(regval & BIT(INTERRUPT_MASK_OFF))) continue; @@ -3119,11 +20756,10 @@ index 2a4b3efb7e12b..9f6ed09538cd0 100644 /* Clear interrupt. * We must read the pin register again, in case the -diff --git a/drivers/platform/x86/intel/int0002_vgpio.c b/drivers/platform/x86/intel/int0002_vgpio.c -index 617dbf98980ec..97cfbc520a02c 100644 ---- a/drivers/platform/x86/intel/int0002_vgpio.c -+++ b/drivers/platform/x86/intel/int0002_vgpio.c -@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, void *data) +diff -rupN linux.orig/drivers/platform/x86/intel/int0002_vgpio.c linux/drivers/platform/x86/intel/int0002_vgpio.c +--- linux.orig/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-04 10:40:26.696034096 -0500 +@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, if (!(gpe_sts_reg & GPE0A_PME_B0_STS_BIT)) return IRQ_NONE; @@ -3133,10 +20769,9 @@ index 617dbf98980ec..97cfbc520a02c 100644 pm_wakeup_hard_event(chip->parent); -diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c -index 4b42f2302a8a8..d4f77f6688cf7 100644 ---- a/drivers/spi/spi.c -+++ b/drivers/spi/spi.c +diff -rupN linux.orig/drivers/spi/spi.c linux/drivers/spi/spi.c +--- linux.orig/drivers/spi/spi.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/spi/spi.c 2022-12-04 10:40:26.700034085 -0500 @@ -127,10 +127,10 @@ do { \ unsigned int start; \ pcpu_stats = per_cpu_ptr(in, i); \ @@ -3150,11 +20785,10 @@ index 4b42f2302a8a8..d4f77f6688cf7 100644 &pcpu_stats->syncp, start)); \ ret += inc; \ } \ -diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c -index 2de3896489c84..897cb8db5084f 100644 ---- a/drivers/ssb/driver_gpio.c -+++ b/drivers/ssb/driver_gpio.c -@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/ssb/driver_gpio.c linux/drivers/ssb/driver_gpio.c +--- linux.orig/drivers/ssb/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/ssb/driver_gpio.c 2022-12-04 10:40:26.700034085 -0500 +@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_h return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3164,7 +20798,7 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_chipco_gpio_polarity(chipco, irqs, val & irqs); return IRQ_HANDLED; -@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_handler(int irq, void *dev_id) +@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_ha 
return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3174,11 +20808,207 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_extif_gpio_polarity(extif, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h -index 287153d325365..81f5fce6e895f 100644 ---- a/drivers/tty/serial/8250/8250.h -+++ b/drivers/tty/serial/8250/8250.h -@@ -177,12 +177,74 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c linux/drivers/tty/serial/8250/8250_aspeed_vuart.c +--- linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-04 10:40:26.700034085 -0500 +@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle( + up->ier &= ~irqs; + if (!throttle) + up->ier |= irqs; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) + { +diff -rupN linux.orig/drivers/tty/serial/8250/8250_bcm7271.c linux/drivers/tty/serial/8250/8250_bcm7271.c +--- linux.orig/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-04 10:40:26.700034085 -0500 +@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_ + * will handle this. + */ + up->ier &= ~UART_IER_RDI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + priv->tx_running = false; + priv->dma.rx_dma = NULL; +@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct ua + unsigned int iir = serial_port_in(p, UART_IIR); + struct brcmuart_priv *priv = p->private_data; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; + unsigned int ier; + unsigned int mcr; ++ bool is_console; + int handled = 0; + + /* +@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct ua + spin_lock_irqsave(&p->lock, flags); + status = serial_port_in(p, UART_LSR); + if ((status & UART_LSR_DR) == 0) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); + + ier = serial_port_in(p, UART_IER); + /* +@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct ua + serial_port_in(p, UART_RX); + } + ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); ++ + handled = 1; + } + spin_unlock_irqrestore(&p->lock, flags); +@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrt + struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); + struct uart_port *p = priv->up; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; ++ bool is_console; + + if (priv->shutdown) + return HRTIMER_NORESTART; +@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrt + /* re-enable receive unless upper layer has disabled it */ + if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == + (UART_IER_RLSI | UART_IER_RDI)) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + status = serial_port_in(p, UART_IER); + status |= (UART_IER_RLSI | UART_IER_RDI); + serial_port_out(p, UART_IER, status); + status = serial_port_in(p, UART_MCR); + status |= UART_MCR_RTS; + serial_port_out(p, UART_MCR, status); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + spin_unlock_irqrestore(&p->lock, flags); + 
return HRTIMER_NORESTART; +diff -rupN linux.orig/drivers/tty/serial/8250/8250_core.c linux/drivers/tty/serial/8250/8250_core.c +--- linux.orig/drivers/tty/serial/8250/8250_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_core.c 2022-12-04 10:40:26.700034085 -0500 +@@ -255,8 +255,11 @@ static void serial8250_timeout(struct ti + static void serial8250_backup_timeout(struct timer_list *t) + { + struct uart_8250_port *up = from_timer(up, t, timer); ++ struct uart_port *port = &up->port; + unsigned int iir, ier = 0, lsr; ++ unsigned long cs_flags; + unsigned long flags; ++ bool is_console; + + spin_lock_irqsave(&up->port.lock, flags); + +@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(st + * based handler. + */ + if (up->port.irq) { ++ is_console = uart_console(port); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, 0); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + + iir = serial_in(up, UART_IIR); +@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(st + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_dr + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -668,6 +687,7 @@ static int univ8250_console_match(struct + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_ + spin_lock_irqsave(&port->lock, flags); + up->ier |= UART_IER_RLSI | UART_IER_RDI; + up->port.read_status_mask |= UART_LSR_DR; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + spin_unlock_irqrestore(&port->lock, flags); + } + +diff -rupN linux.orig/drivers/tty/serial/8250/8250_exar.c linux/drivers/tty/serial/8250/8250_exar.c +--- linux.orig/drivers/tty/serial/8250/8250_exar.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_exar.c 2022-12-04 10:40:26.700034085 -0500 +@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct + + static int xr17v35x_startup(struct uart_port *port) + { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ + /* + * First enable access to IER [7:5], ISR [5:4], FCR [5:4], + * MCR [7:5] and MSR [7:0] +@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_ + * Make sure all interrups are masked until initialization is + * complete and the FIFOs are cleared + */ +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + + return serial8250_do_startup(port); + } +diff -rupN linux.orig/drivers/tty/serial/8250/8250_fsl.c linux/drivers/tty/serial/8250/8250_fsl.c +--- linux.orig/drivers/tty/serial/8250/8250_fsl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_fsl.c 2022-12-04 10:40:26.700034085 -0500 +@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned long delay; + 
+- up->ier = port->serial_in(port, UART_IER); ++ up->ier = serial8250_in_IER(up); ++ + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff -rupN linux.orig/drivers/tty/serial/8250/8250.h linux/drivers/tty/serial/8250/8250.h +--- linux.orig/drivers/tty/serial/8250/8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250.h 2022-12-04 10:40:26.700034085 -0500 +@@ -177,12 +177,74 @@ static inline void serial_dl_write(struc up->dl_write(up, value); } @@ -3254,7 +21084,7 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) +@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; @@ -3263,213 +21093,10 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c -index 9d2a7856784f7..7cc6b527c088b 100644 ---- a/drivers/tty/serial/8250/8250_aspeed_vuart.c -+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c -@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up, - up->ier &= ~irqs; - if (!throttle) - up->ier |= irqs; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - } - static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) - { -diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c -index 8efdc271eb75f..d30c74618411f 100644 ---- a/drivers/tty/serial/8250/8250_bcm7271.c -+++ b/drivers/tty/serial/8250/8250_bcm7271.c -@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_port *port) - * will handle this. 
- */ - up->ier &= ~UART_IER_RDI; -- serial_port_out(port, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - - priv->tx_running = false; - priv->dma.rx_dma = NULL; -@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct uart_port *p) - unsigned int iir = serial_port_in(p, UART_IIR); - struct brcmuart_priv *priv = p->private_data; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; - unsigned int ier; - unsigned int mcr; -+ bool is_console; - int handled = 0; - - /* -@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct uart_port *p) - spin_lock_irqsave(&p->lock, flags); - status = serial_port_in(p, UART_LSR); - if ((status & UART_LSR_DR) == 0) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); - - ier = serial_port_in(p, UART_IER); - /* -@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct uart_port *p) - serial_port_in(p, UART_RX); - } - -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); -+ - handled = 1; - } - spin_unlock_irqrestore(&p->lock, flags); -@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); - struct uart_port *p = priv->up; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; -+ bool is_console; - - if (priv->shutdown) - return HRTIMER_NORESTART; -@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - /* re-enable receive unless upper layer has disabled it */ - if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == - (UART_IER_RLSI | UART_IER_RDI)) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - status = serial_port_in(p, UART_IER); - status |= (UART_IER_RLSI | UART_IER_RDI); - serial_port_out(p, UART_IER, status); - status = serial_port_in(p, UART_MCR); - status |= UART_MCR_RTS; - serial_port_out(p, UART_MCR, status); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - spin_unlock_irqrestore(&p->lock, flags); - return HRTIMER_NORESTART; -diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c -index 94fbf0add2ce2..196d0c55dfe99 100644 ---- a/drivers/tty/serial/8250/8250_core.c -+++ b/drivers/tty/serial/8250/8250_core.c -@@ -255,8 +255,11 @@ static void serial8250_timeout(struct timer_list *t) - static void serial8250_backup_timeout(struct timer_list *t) - { - struct uart_8250_port *up = from_timer(up, t, timer); -+ struct uart_port *port = &up->port; - unsigned int iir, ier = 0, lsr; -+ unsigned long cs_flags; - unsigned long flags; -+ bool is_console; - - spin_lock_irqsave(&up->port.lock, flags); - -@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(struct timer_list *t) - * based handler. 
- */ - if (up->port.irq) { -+ is_console = uart_console(port); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - - iir = serial_in(up, UART_IIR); -@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(struct timer_list *t) - serial8250_tx_chars(up); - - if (up->port.irq) -- serial_out(up, UART_IER, ier); -+ serial8250_set_IER(up, ier); - - spin_unlock_irqrestore(&up->port.lock, flags); - -@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) - - #ifdef CONFIG_SERIAL_8250_CONSOLE - -+static void univ8250_console_write_atomic(struct console *co, const char *s, -+ unsigned int count) -+{ -+ struct uart_8250_port *up = &serial8250_ports[co->index]; -+ -+ serial8250_console_write_atomic(up, s, count); -+} -+ - static void univ8250_console_write(struct console *co, const char *s, - unsigned int count) - { -@@ -668,6 +687,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, - - static struct console univ8250_console = { - .name = "ttyS", -+ .write_atomic = univ8250_console_write_atomic, - .write = univ8250_console_write, - .device = uart_console_device, - .setup = univ8250_console_setup, -@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_work(struct work_struct *work) - spin_lock_irqsave(&port->lock, flags); - up->ier |= UART_IER_RLSI | UART_IER_RDI; - up->port.read_status_mask |= UART_LSR_DR; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - spin_unlock_irqrestore(&port->lock, flags); - } - -diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c -index 314a05e009df9..9809517de8270 100644 ---- a/drivers/tty/serial/8250/8250_exar.c -+++ b/drivers/tty/serial/8250/8250_exar.c -@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud, - - static int xr17v35x_startup(struct uart_port *port) - { -+ struct uart_8250_port *up = up_to_u8250p(port); -+ - /* - * First enable access to IER [7:5], ISR [5:4], FCR [5:4], - * MCR [7:5] and MSR [7:0] -@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_port *port) - * Make sure all interrups are masked until initialization is - * complete and the FIFOs are cleared - */ -- serial_port_out(port, UART_IER, 0); -+ serial8250_set_IER(up, 0); - - return serial8250_do_startup(port); - } -diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c -index 8aad15622a2e5..74bb85b705e7f 100644 ---- a/drivers/tty/serial/8250/8250_fsl.c -+++ b/drivers/tty/serial/8250/8250_fsl.c -@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port *port) - if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { - unsigned long delay; - -- up->ier = port->serial_in(port, UART_IER); -+ up->ier = serial8250_in_IER(up); -+ - if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { - port->ops->stop_rx(port); - } else { -diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c -index 2b2f5d8d24b91..2b78e6c394fb9 100644 ---- a/drivers/tty/serial/8250/8250_ingenic.c -+++ b/drivers/tty/serial/8250/8250_ingenic.c -@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", +diff -rupN linux.orig/drivers/tty/serial/8250/8250_ingenic.c linux/drivers/tty/serial/8250/8250_ingenic.c +--- linux.orig/drivers/tty/serial/8250/8250_ingenic.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/drivers/tty/serial/8250/8250_ingenic.c 2022-12-04 10:40:26.700034085 -0500 +@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { @@ -3477,7 +21104,7 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 int ier; switch (offset) { -@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) +@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(stru * If we have enabled modem status IRQs we should enable * modem mode. */ @@ -3486,11 +21113,10 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; -diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c -index 54051ec7b4992..6092c75808fb9 100644 ---- a/drivers/tty/serial/8250/8250_mtk.c -+++ b/drivers/tty/serial/8250/8250_mtk.c -@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart_port *port) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_mtk.c linux/drivers/tty/serial/8250/8250_mtk.c +--- linux.orig/drivers/tty/serial/8250/8250_mtk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_mtk.c 2022-12-04 10:40:26.700034085 -0500 +@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { @@ -3533,20 +21159,19 @@ index 54051ec7b4992..6092c75808fb9 100644 } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) -diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c -index 38ee3e42251af..8dc983a8cad15 100644 ---- a/drivers/tty/serial/8250/8250_omap.c -+++ b/drivers/tty/serial/8250/8250_omap.c -@@ -325,7 +325,7 @@ static void omap8250_restore_regs(struct uart_8250_port *up) - +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c linux/drivers/tty/serial/8250/8250_omap.c +--- linux.orig/drivers/tty/serial/8250/8250_omap.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c 2022-12-04 10:41:15.271907054 -0500 +@@ -328,7 +328,7 @@ static void omap8250_restore_regs(struct /* drop TCR + TLR access, we setup XON/XOFF later */ - serial8250_out_MCR(up, up->mcr); + serial8250_out_MCR(up, mcr); + - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_dl_write(up, priv->quot); -@@ -515,7 +515,7 @@ static void omap_8250_pm(struct uart_port *port, unsigned int state, +@@ -518,7 +518,7 @@ static void omap_8250_pm(struct uart_por serial_out(up, UART_EFR, efr | UART_EFR_ECB); serial_out(up, UART_LCR, 0); @@ -3555,7 +21180,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, efr); serial_out(up, UART_LCR, 0); -@@ -636,7 +636,7 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id) +@@ -639,7 +639,7 @@ static irqreturn_t omap8250_irq(int irq, if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { unsigned long delay; @@ -3564,7 +21189,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { -@@ -696,7 +696,7 @@ static int omap_8250_startup(struct uart_port *port) +@@ -698,7 +698,7 @@ static int omap_8250_startup(struct uart goto err; up->ier = UART_IER_RLSI | UART_IER_RDI; @@ -3573,7 +21198,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 #ifdef CONFIG_PM up->capabilities |= UART_CAP_RPM; -@@ -737,7 +737,7 @@ static void 
omap_8250_shutdown(struct uart_port *port) +@@ -739,7 +739,7 @@ static void omap_8250_shutdown(struct ua serial_out(up, UART_OMAP_EFR2, 0x0); up->ier = 0; @@ -3582,7 +21207,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->dma) serial8250_release_dma(up); -@@ -785,7 +785,7 @@ static void omap_8250_unthrottle(struct uart_port *port) +@@ -787,7 +787,7 @@ static void omap_8250_unthrottle(struct up->dma->rx_dma(up); up->ier |= UART_IER_RLSI | UART_IER_RDI; port->read_status_mask |= UART_LSR_DR; @@ -3591,7 +21216,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 spin_unlock_irqrestore(&port->lock, flags); pm_runtime_mark_last_busy(port->dev); -@@ -876,7 +876,7 @@ static void __dma_rx_complete(void *param) +@@ -878,7 +878,7 @@ static void __dma_rx_complete(void *para __dma_rx_do_complete(p); if (!priv->throttled) { p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3600,7 +21225,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (!(priv->habit & UART_HAS_EFR2)) omap_8250_rx_dma(p); } -@@ -933,7 +933,7 @@ static int omap_8250_rx_dma(struct uart_8250_port *p) +@@ -935,7 +935,7 @@ static int omap_8250_rx_dma(struct uart_ * callback to run. */ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3609,7 +21234,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 } goto out; } -@@ -1148,12 +1148,12 @@ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, +@@ -1150,12 +1150,12 @@ static void am654_8250_handle_rx_dma(str * periodic timeouts, re-enable interrupts. */ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3624,11 +21249,1731 @@ index 38ee3e42251af..8dc983a8cad15 100644 } } -diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c -index 2030a92ac66e7..326549603740d 100644 ---- a/drivers/tty/serial/8250/8250_port.c -+++ b/drivers/tty/serial/8250/8250_port.c -@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c.orig linux/drivers/tty/serial/8250/8250_omap.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_omap.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,1716 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * 8250-core based driver for the OMAP internal UART ++ * ++ * based on omap-serial.c, Copyright (C) 2010 Texas Instruments. ++ * ++ * Copyright (C) 2014 Sebastian Andrzej Siewior ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "8250.h" ++ ++#define DEFAULT_CLK_SPEED 48000000 ++ ++#define UART_ERRATA_i202_MDR1_ACCESS (1 << 0) ++#define OMAP_UART_WER_HAS_TX_WAKEUP (1 << 1) ++#define OMAP_DMA_TX_KICK (1 << 2) ++/* ++ * See Advisory 21 in AM437x errata SPRZ408B, updated April 2015. ++ * The same errata is applicable to AM335x and DRA7x processors too. 
++ */ ++#define UART_ERRATA_CLOCK_DISABLE (1 << 3) ++#define UART_HAS_EFR2 BIT(4) ++#define UART_HAS_RHR_IT_DIS BIT(5) ++#define UART_RX_TIMEOUT_QUIRK BIT(6) ++ ++#define OMAP_UART_FCR_RX_TRIG 6 ++#define OMAP_UART_FCR_TX_TRIG 4 ++ ++/* SCR register bitmasks */ ++#define OMAP_UART_SCR_RX_TRIG_GRANU1_MASK (1 << 7) ++#define OMAP_UART_SCR_TX_TRIG_GRANU1_MASK (1 << 6) ++#define OMAP_UART_SCR_TX_EMPTY (1 << 3) ++#define OMAP_UART_SCR_DMAMODE_MASK (3 << 1) ++#define OMAP_UART_SCR_DMAMODE_1 (1 << 1) ++#define OMAP_UART_SCR_DMAMODE_CTL (1 << 0) ++ ++/* MVR register bitmasks */ ++#define OMAP_UART_MVR_SCHEME_SHIFT 30 ++#define OMAP_UART_LEGACY_MVR_MAJ_MASK 0xf0 ++#define OMAP_UART_LEGACY_MVR_MAJ_SHIFT 4 ++#define OMAP_UART_LEGACY_MVR_MIN_MASK 0x0f ++#define OMAP_UART_MVR_MAJ_MASK 0x700 ++#define OMAP_UART_MVR_MAJ_SHIFT 8 ++#define OMAP_UART_MVR_MIN_MASK 0x3f ++ ++/* SYSC register bitmasks */ ++#define OMAP_UART_SYSC_SOFTRESET (1 << 1) ++ ++/* SYSS register bitmasks */ ++#define OMAP_UART_SYSS_RESETDONE (1 << 0) ++ ++#define UART_TI752_TLR_TX 0 ++#define UART_TI752_TLR_RX 4 ++ ++#define TRIGGER_TLR_MASK(x) ((x & 0x3c) >> 2) ++#define TRIGGER_FCR_MASK(x) (x & 3) ++ ++/* Enable XON/XOFF flow control on output */ ++#define OMAP_UART_SW_TX 0x08 ++/* Enable XON/XOFF flow control on input */ ++#define OMAP_UART_SW_RX 0x02 ++ ++#define OMAP_UART_WER_MOD_WKUP 0x7f ++#define OMAP_UART_TX_WAKEUP_EN (1 << 7) ++ ++#define TX_TRIGGER 1 ++#define RX_TRIGGER 48 ++ ++#define OMAP_UART_TCR_RESTORE(x) ((x / 4) << 4) ++#define OMAP_UART_TCR_HALT(x) ((x / 4) << 0) ++ ++#define UART_BUILD_REVISION(x, y) (((x) << 8) | (y)) ++ ++#define OMAP_UART_REV_46 0x0406 ++#define OMAP_UART_REV_52 0x0502 ++#define OMAP_UART_REV_63 0x0603 ++ ++/* Interrupt Enable Register 2 */ ++#define UART_OMAP_IER2 0x1B ++#define UART_OMAP_IER2_RHR_IT_DIS BIT(2) ++ ++/* Enhanced features register 2 */ ++#define UART_OMAP_EFR2 0x23 ++#define UART_OMAP_EFR2_TIMEOUT_BEHAVE BIT(6) ++ ++/* RX FIFO occupancy indicator */ ++#define UART_OMAP_RX_LVL 0x19 ++ ++struct omap8250_priv { ++ int line; ++ u8 habit; ++ u8 mdr1; ++ u8 efr; ++ u8 scr; ++ u8 wer; ++ u8 xon; ++ u8 xoff; ++ u8 delayed_restore; ++ u16 quot; ++ ++ u8 tx_trigger; ++ u8 rx_trigger; ++ bool is_suspending; ++ int wakeirq; ++ int wakeups_enabled; ++ u32 latency; ++ u32 calc_latency; ++ struct pm_qos_request pm_qos_request; ++ struct work_struct qos_work; ++ struct uart_8250_dma omap8250_dma; ++ spinlock_t rx_dma_lock; ++ bool rx_dma_broken; ++ bool throttled; ++}; ++ ++struct omap8250_dma_params { ++ u32 rx_size; ++ u8 rx_trigger; ++ u8 tx_trigger; ++}; ++ ++struct omap8250_platdata { ++ struct omap8250_dma_params *dma_params; ++ u8 habit; ++}; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p); ++#else ++static inline void omap_8250_rx_dma_flush(struct uart_8250_port *p) { } ++#endif ++ ++static u32 uart_read(struct uart_8250_port *up, u32 reg) ++{ ++ return readl(up->port.membase + (reg << up->port.regshift)); ++} ++ ++/* ++ * Called on runtime PM resume path from omap8250_restore_regs(), and ++ * omap8250_set_mctrl(). 
++ */ ++static void __omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u8 lcr; ++ ++ serial8250_do_set_mctrl(port, mctrl); ++ ++ if (!mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS)) { ++ /* ++ * Turn off autoRTS if RTS is lowered and restore autoRTS ++ * setting if RTS is raised ++ */ ++ lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS)) ++ priv->efr |= UART_EFR_RTS; ++ else ++ priv->efr &= ~UART_EFR_RTS; ++ serial_out(up, UART_EFR, priv->efr); ++ serial_out(up, UART_LCR, lcr); ++ } ++} ++ ++static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ int err; ++ ++ err = pm_runtime_resume_and_get(port->dev); ++ if (err) ++ return; ++ ++ __omap8250_set_mctrl(port, mctrl); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++/* ++ * Work Around for Errata i202 (2430, 3430, 3630, 4430 and 4460) ++ * The access to uart register after MDR1 Access ++ * causes UART to corrupt data. ++ * ++ * Need a delay = ++ * 5 L4 clock cycles + 5 UART functional clock cycle (@48MHz = ~0.2uS) ++ * give 10 times as much ++ */ ++static void omap_8250_mdr1_errataset(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++ udelay(2); ++ serial_out(up, UART_FCR, up->fcr | UART_FCR_CLEAR_XMIT | ++ UART_FCR_CLEAR_RCVR); ++} ++ ++static void omap_8250_get_divisor(struct uart_port *port, unsigned int baud, ++ struct omap8250_priv *priv) ++{ ++ unsigned int uartclk = port->uartclk; ++ unsigned int div_13, div_16; ++ unsigned int abs_d13, abs_d16; ++ ++ /* ++ * Old custom speed handling. ++ */ ++ if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) { ++ priv->quot = port->custom_divisor & UART_DIV_MAX; ++ /* ++ * I assume that nobody is using this. But hey, if somebody ++ * would like to specify the divisor _and_ the mode then the ++ * driver is ready and waiting for it. ++ */ ++ if (port->custom_divisor & (1 << 16)) ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ else ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ return; ++ } ++ div_13 = DIV_ROUND_CLOSEST(uartclk, 13 * baud); ++ div_16 = DIV_ROUND_CLOSEST(uartclk, 16 * baud); ++ ++ if (!div_13) ++ div_13 = 1; ++ if (!div_16) ++ div_16 = 1; ++ ++ abs_d13 = abs(baud - uartclk / 13 / div_13); ++ abs_d16 = abs(baud - uartclk / 16 / div_16); ++ ++ if (abs_d13 >= abs_d16) { ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ priv->quot = div_16; ++ } else { ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ priv->quot = div_13; ++ } ++} ++ ++static void omap8250_update_scr(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ u8 old_scr; ++ ++ old_scr = serial_in(up, UART_OMAP_SCR); ++ if (old_scr == priv->scr) ++ return; ++ ++ /* ++ * The manual recommends not to enable the DMA mode selector in the SCR ++ * (instead of the FCR) register _and_ selecting the DMA mode as one ++ * register write because this may lead to malfunction. 
++ */ ++ if (priv->scr & OMAP_UART_SCR_DMAMODE_MASK) ++ serial_out(up, UART_OMAP_SCR, ++ priv->scr & ~OMAP_UART_SCR_DMAMODE_MASK); ++ serial_out(up, UART_OMAP_SCR, priv->scr); ++} ++ ++static void omap8250_update_mdr1(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ if (priv->habit & UART_ERRATA_i202_MDR1_ACCESS) ++ omap_8250_mdr1_errataset(up, priv); ++ else ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++} ++ ++static void omap8250_restore_regs(struct uart_8250_port *up) ++{ ++ struct omap8250_priv *priv = up->port.private_data; ++ struct uart_8250_dma *dma = up->dma; ++ u8 mcr = serial8250_in_MCR(up); ++ ++ if (dma && dma->tx_running) { ++ /* ++ * TCSANOW requests the change to occur immediately however if ++ * we have a TX-DMA operation in progress then it has been ++ * observed that it might stall and never complete. Therefore we ++ * delay DMA completes to prevent this hang from happen. ++ */ ++ priv->delayed_restore = 1; ++ return; ++ } ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial8250_out_MCR(up, mcr | UART_MCR_TCRTLR); ++ serial_out(up, UART_FCR, up->fcr); ++ ++ omap8250_update_scr(up, priv); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ serial_out(up, UART_TI752_TCR, OMAP_UART_TCR_RESTORE(16) | ++ OMAP_UART_TCR_HALT(52)); ++ serial_out(up, UART_TI752_TLR, ++ TRIGGER_TLR_MASK(priv->tx_trigger) << UART_TI752_TLR_TX | ++ TRIGGER_TLR_MASK(priv->rx_trigger) << UART_TI752_TLR_RX); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ /* drop TCR + TLR access, we setup XON/XOFF later */ ++ serial8250_out_MCR(up, mcr); ++ ++ serial_out(up, UART_IER, up->ier); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_dl_write(up, priv->quot); ++ ++ serial_out(up, UART_EFR, priv->efr); ++ ++ /* Configure flow control */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_XON1, priv->xon); ++ serial_out(up, UART_XOFF1, priv->xoff); ++ ++ serial_out(up, UART_LCR, up->lcr); ++ ++ omap8250_update_mdr1(up, priv); ++ ++ __omap8250_set_mctrl(&up->port, up->port.mctrl); ++ ++ if (up->port.rs485.flags & SER_RS485_ENABLED) ++ serial8250_em485_stop_tx(up); ++} ++ ++/* ++ * OMAP can use "CLK / (16 or 13) / div" for baud rate. And then we have have ++ * some differences in how we want to handle flow control. ++ */ ++static void omap_8250_set_termios(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ unsigned char cval = 0; ++ unsigned int baud; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(termios->c_cflag)); ++ ++ if (termios->c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (termios->c_cflag & PARENB) ++ cval |= UART_LCR_PARITY; ++ if (!(termios->c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (termios->c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ */ ++ baud = uart_get_baud_rate(port, termios, old, ++ port->uartclk / 16 / UART_DIV_MAX, ++ port->uartclk / 13); ++ omap_8250_get_divisor(port, baud, priv); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ pm_runtime_get_sync(port->dev); ++ spin_lock_irq(&port->lock); ++ ++ /* ++ * Update the per-port timeout. 
++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ up->port.read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ up->port.read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | PARMRK)) ++ up->port.read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ up->port.ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ up->port.ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ up->port.ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * Modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ ++ up->lcr = cval; ++ /* Up to here it was mostly serial8250_do_set_termios() */ ++ ++ /* ++ * We enable TRIG_GRANU for RX and TX and additionally we set ++ * SCR_TX_EMPTY bit. The result is the following: ++ * - RX_TRIGGER amount of bytes in the FIFO will cause an interrupt. ++ * - less than RX_TRIGGER number of bytes will also cause an interrupt ++ * once the UART decides that there no new bytes arriving. ++ * - Once THRE is enabled, the interrupt will be fired once the FIFO is ++ * empty - the trigger level is ignored here. ++ * ++ * Once DMA is enabled: ++ * - UART will assert the TX DMA line once there is room for TX_TRIGGER ++ * bytes in the TX FIFO. On each assert the DMA engine will move ++ * TX_TRIGGER bytes into the FIFO. ++ * - UART will assert the RX DMA line once there are RX_TRIGGER bytes in ++ * the FIFO and move RX_TRIGGER bytes. ++ * This is because threshold and trigger values are the same. ++ */ ++ up->fcr = UART_FCR_ENABLE_FIFO; ++ up->fcr |= TRIGGER_FCR_MASK(priv->tx_trigger) << OMAP_UART_FCR_TX_TRIG; ++ up->fcr |= TRIGGER_FCR_MASK(priv->rx_trigger) << OMAP_UART_FCR_RX_TRIG; ++ ++ priv->scr = OMAP_UART_SCR_RX_TRIG_GRANU1_MASK | OMAP_UART_SCR_TX_EMPTY | ++ OMAP_UART_SCR_TX_TRIG_GRANU1_MASK; ++ ++ if (up->dma) ++ priv->scr |= OMAP_UART_SCR_DMAMODE_1 | ++ OMAP_UART_SCR_DMAMODE_CTL; ++ ++ priv->xon = termios->c_cc[VSTART]; ++ priv->xoff = termios->c_cc[VSTOP]; ++ ++ priv->efr = 0; ++ up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF); ++ ++ if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS) && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_CTS)) { ++ /* Enable AUTOCTS (autoRTS is enabled when RTS is raised) */ ++ up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; ++ priv->efr |= UART_EFR_CTS; ++ } else if (up->port.flags & UPF_SOFT_FLOW) { ++ /* ++ * OMAP rx s/w flow control is borked; the transmitter remains ++ * stuck off even if rx flow control is subsequently disabled ++ */ ++ ++ /* ++ * IXOFF Flag: ++ * Enable XON/XOFF flow control on output. 
++ * Transmit XON1, XOFF1 ++ */ ++ if (termios->c_iflag & IXOFF) { ++ up->port.status |= UPSTAT_AUTOXOFF; ++ priv->efr |= OMAP_UART_SW_TX; ++ } ++ } ++ omap8250_restore_regs(up); ++ ++ spin_unlock_irq(&up->port.lock); ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ ++ /* calculate wakeup latency constraint */ ++ priv->calc_latency = USEC_PER_SEC * 64 * 8 / baud; ++ priv->latency = priv->calc_latency; ++ ++ schedule_work(&priv->qos_work); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++ ++/* same as 8250 except that we may have extra flow bits set in EFR */ ++static void omap_8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ u8 efr; ++ ++ pm_runtime_get_sync(port->dev); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ efr = serial_in(up, UART_EFR); ++ serial_out(up, UART_EFR, efr | UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, efr); ++ serial_out(up, UART_LCR, 0); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_serial_fill_features_erratas(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ static const struct soc_device_attribute k3_soc_devices[] = { ++ { .family = "AM65X", }, ++ { .family = "J721E", .revision = "SR1.0" }, ++ { /* sentinel */ } ++ }; ++ u32 mvr, scheme; ++ u16 revision, major, minor; ++ ++ mvr = uart_read(up, UART_OMAP_MVER); ++ ++ /* Check revision register scheme */ ++ scheme = mvr >> OMAP_UART_MVR_SCHEME_SHIFT; ++ ++ switch (scheme) { ++ case 0: /* Legacy Scheme: OMAP2/3 */ ++ /* MINOR_REV[0:4], MAJOR_REV[4:7] */ ++ major = (mvr & OMAP_UART_LEGACY_MVR_MAJ_MASK) >> ++ OMAP_UART_LEGACY_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_LEGACY_MVR_MIN_MASK); ++ break; ++ case 1: ++ /* New Scheme: OMAP4+ */ ++ /* MINOR_REV[0:5], MAJOR_REV[8:10] */ ++ major = (mvr & OMAP_UART_MVR_MAJ_MASK) >> ++ OMAP_UART_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_MVR_MIN_MASK); ++ break; ++ default: ++ dev_warn(up->port.dev, ++ "Unknown revision, defaulting to highest\n"); ++ /* highest possible revision */ ++ major = 0xff; ++ minor = 0xff; ++ } ++ /* normalize revision for the driver */ ++ revision = UART_BUILD_REVISION(major, minor); ++ ++ switch (revision) { ++ case OMAP_UART_REV_46: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS; ++ break; ++ case OMAP_UART_REV_52: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ case OMAP_UART_REV_63: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * AM65x SR1.0, AM65x SR2.0 and J721e SR1.0 don't ++ * don't have RHR_IT_DIS bit in IER2 register. So drop to flag ++ * to enable errata workaround. 
++ */ ++ if (soc_device_match(k3_soc_devices)) ++ priv->habit &= ~UART_HAS_RHR_IT_DIS; ++} ++ ++static void omap8250_uart_qos_work(struct work_struct *work) ++{ ++ struct omap8250_priv *priv; ++ ++ priv = container_of(work, struct omap8250_priv, qos_work); ++ cpu_latency_qos_update_request(&priv->pm_qos_request, priv->latency); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_dma_handle_irq(struct uart_port *port); ++#endif ++ ++static irqreturn_t omap8250_irq(int irq, void *dev_id) ++{ ++ struct uart_port *port = dev_id; ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir, lsr; ++ int ret; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++ if (up->dma) { ++ ret = omap_8250_dma_handle_irq(port); ++ return IRQ_RETVAL(ret); ++ } ++#endif ++ ++ serial8250_rpm_get(up); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ /* ++ * On K3 SoCs, it is observed that RX TIMEOUT is signalled after ++ * FIFO has been drained, in which case a dummy read of RX FIFO ++ * is required to clear RX TIMEOUT condition. ++ */ ++ if (priv->habit & UART_RX_TIMEOUT_QUIRK && ++ (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT && ++ serial_port_in(port, UART_OMAP_RX_LVL) == 0) { ++ serial_port_in(port, UART_RX); ++ } ++ ++ /* Stop processing interrupts on input overrun */ ++ if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { ++ unsigned long delay; ++ ++ up->ier = port->serial_in(port, UART_IER); ++ if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { ++ port->ops->stop_rx(port); ++ } else { ++ /* Keep restarting the timer until ++ * the input overrun subsides. ++ */ ++ cancel_delayed_work(&up->overrun_backoff); ++ } ++ ++ delay = msecs_to_jiffies(up->overrun_backoff_time_ms); ++ schedule_delayed_work(&up->overrun_backoff, delay); ++ } ++ ++ serial8250_rpm_put(up); ++ ++ return IRQ_RETVAL(ret); ++} ++ ++static int omap_8250_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ int ret; ++ ++ if (priv->wakeirq) { ++ ret = dev_pm_set_dedicated_wake_irq(port->dev, priv->wakeirq); ++ if (ret) ++ return ret; ++ } ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* Disable DMA for console UART */ ++ if (uart_console(port)) ++ up->dma = NULL; ++ ++ if (up->dma) { ++ ret = serial8250_request_dma(up); ++ if (ret) { ++ dev_warn_ratelimited(port->dev, ++ "failed to request DMA\n"); ++ up->dma = NULL; ++ } ++ } ++ ++ ret = request_irq(port->irq, omap8250_irq, IRQF_SHARED, ++ dev_name(port->dev), port); ++ if (ret < 0) ++ goto err; ++ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ ++#ifdef CONFIG_PM ++ up->capabilities |= UART_CAP_RPM; ++#endif ++ ++ /* Enable module level wake up */ ++ priv->wer = OMAP_UART_WER_MOD_WKUP; ++ if (priv->habit & OMAP_UART_WER_HAS_TX_WAKEUP) ++ priv->wer |= OMAP_UART_TX_WAKEUP_EN; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ ++ if (up->dma && !(priv->habit & UART_HAS_EFR2)) ++ up->dma->rx_dma(up); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ return 0; ++err: ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ dev_pm_clear_wake_irq(port->dev); ++ return ret; ++} ++ 
++static void omap_8250_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ ++ flush_work(&priv->qos_work); ++ if (up->dma) ++ omap_8250_rx_dma_flush(up); ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_OMAP_WER, 0); ++ if (priv->habit & UART_HAS_EFR2) ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ ++ up->ier = 0; ++ serial_out(up, UART_IER, 0); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ if (up->lcr & UART_LCR_SBC) ++ serial_out(up, UART_LCR, up->lcr & ~UART_LCR_SBC); ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ free_irq(port->irq, port); ++ dev_pm_clear_wake_irq(port->dev); ++} ++ ++static void omap_8250_throttle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ port->ops->stop_rx(port); ++ priv->throttled = true; ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_8250_unthrottle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ priv->throttled = false; ++ if (up->dma) ++ up->dma->rx_dma(up); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ port->read_status_mask |= UART_LSR_DR; ++ serial_out(up, UART_IER, up->ier); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_rx_dma(struct uart_8250_port *p); ++ ++/* Must be called while priv->rx_dma_lock is held */ ++static void __dma_rx_do_complete(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct tty_port *tty_port = &p->port.state->port; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct dma_chan *rxchan = dma->rxchan; ++ dma_cookie_t cookie; ++ struct dma_tx_state state; ++ int count; ++ int ret; ++ u32 reg; ++ ++ if (!dma->rx_running) ++ goto out; ++ ++ cookie = dma->rx_cookie; ++ dma->rx_running = 0; ++ ++ /* Re-enable RX FIFO interrupt now that transfer is complete */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg &= ~UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dmaengine_tx_status(rxchan, cookie, &state); ++ ++ count = dma->rx_size - state.residue + state.in_flight_bytes; ++ if (count < dma->rx_size) { ++ dmaengine_terminate_async(rxchan); ++ ++ /* ++ * Poll for teardown to complete which guarantees in ++ * flight data is drained. 
++ */ ++ if (state.in_flight_bytes) { ++ int poll_count = 25; ++ ++ while (dmaengine_tx_status(rxchan, cookie, NULL) && ++ poll_count--) ++ cpu_relax(); ++ ++ if (poll_count == -1) ++ dev_err(p->port.dev, "teardown incomplete\n"); ++ } ++ } ++ if (!count) ++ goto out; ++ ret = tty_insert_flip_string(tty_port, dma->rx_buf, count); ++ ++ p->port.icount.rx += ret; ++ p->port.icount.buf_overrun += count - ret; ++out: ++ ++ tty_flip_buffer_push(tty_port); ++} ++ ++static void __dma_rx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ /* ++ * If the tx status is not DMA_COMPLETE, then this is a delayed ++ * completion callback. A previous RX timeout flush would have ++ * already pushed the data, so exit. ++ */ ++ if (dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state) != ++ DMA_COMPLETE) { ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ return; ++ } ++ __dma_rx_do_complete(p); ++ if (!priv->throttled) { ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(p, UART_IER, p->ier); ++ if (!(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (!dma->rx_running) { ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return; ++ } ++ ++ ret = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); ++ if (ret == DMA_IN_PROGRESS) { ++ ret = dmaengine_pause(dma->rxchan); ++ if (WARN_ON_ONCE(ret)) ++ priv->rx_dma_broken = true; ++ } ++ __dma_rx_do_complete(p); ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++} ++ ++static int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ int err = 0; ++ struct dma_async_tx_descriptor *desc; ++ unsigned long flags; ++ u32 reg; ++ ++ if (priv->rx_dma_broken) ++ return -EINVAL; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (dma->rx_running) { ++ enum dma_status state; ++ ++ state = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, NULL); ++ if (state == DMA_COMPLETE) { ++ /* ++ * Disable RX interrupts to allow RX DMA completion ++ * callback to run. ++ */ ++ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(p, UART_IER, p->ier); ++ } ++ goto out; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->rxchan, dma->rx_addr, ++ dma->rx_size, DMA_DEV_TO_MEM, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ dma->rx_running = 1; ++ desc->callback = __dma_rx_complete; ++ desc->callback_param = p; ++ ++ dma->rx_cookie = dmaengine_submit(desc); ++ ++ /* ++ * Disable RX FIFO interrupt while RX DMA is enabled, else ++ * spurious interrupt may be raised when data is in the RX FIFO ++ * but is yet to be drained by DMA. 
++ */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg |= UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dma_async_issue_pending(dma->rxchan); ++out: ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return err; ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p); ++ ++static void omap_8250_dma_tx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct uart_8250_dma *dma = p->dma; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ unsigned long flags; ++ bool en_thri = false; ++ struct omap8250_priv *priv = p->port.private_data; ++ ++ dma_sync_single_for_cpu(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ dma->tx_running = 0; ++ ++ xmit->tail += dma->tx_size; ++ xmit->tail &= UART_XMIT_SIZE - 1; ++ p->port.icount.tx += dma->tx_size; ++ ++ if (priv->delayed_restore) { ++ priv->delayed_restore = 0; ++ omap8250_restore_regs(p); ++ } ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(&p->port); ++ ++ if (!uart_circ_empty(xmit) && !uart_tx_stopped(&p->port)) { ++ int ret; ++ ++ ret = omap_8250_tx_dma(p); ++ if (ret) ++ en_thri = true; ++ } else if (p->capabilities & UART_CAP_RPM) { ++ en_thri = true; ++ } ++ ++ if (en_thri) { ++ dma->tx_err = 1; ++ serial8250_set_THRI(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ struct dma_async_tx_descriptor *desc; ++ unsigned int skip_byte = 0; ++ int ret; ++ ++ if (dma->tx_running) ++ return 0; ++ if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) { ++ ++ /* ++ * Even if no data, we need to return an error for the two cases ++ * below so serial8250_tx_chars() is invoked and properly clears ++ * THRI and/or runtime suspend. ++ */ ++ if (dma->tx_err || p->capabilities & UART_CAP_RPM) { ++ ret = -EBUSY; ++ goto err; ++ } ++ serial8250_clear_THRI(p); ++ return 0; ++ } ++ ++ dma->tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); ++ if (priv->habit & OMAP_DMA_TX_KICK) { ++ u8 tx_lvl; ++ ++ /* ++ * We need to put the first byte into the FIFO in order to start ++ * the DMA transfer. For transfers smaller than four bytes we ++ * don't bother doing DMA at all. It seem not matter if there ++ * are still bytes in the FIFO from the last transfer (in case ++ * we got here directly from omap_8250_dma_tx_complete()). Bytes ++ * leaving the FIFO seem not to trigger the DMA transfer. It is ++ * really the byte that we put into the FIFO. ++ * If the FIFO is already full then we most likely got here from ++ * omap_8250_dma_tx_complete(). And this means the DMA engine ++ * just completed its work. We don't have to wait the complete ++ * 86us at 115200,8n1 but around 60us (not to mention lower ++ * baudrates). So in that case we take the interrupt and try ++ * again with an empty FIFO. 
++ */ ++ tx_lvl = serial_in(p, UART_OMAP_TX_LVL); ++ if (tx_lvl == p->tx_loadsz) { ++ ret = -EBUSY; ++ goto err; ++ } ++ if (dma->tx_size < 4) { ++ ret = -EINVAL; ++ goto err; ++ } ++ skip_byte = 1; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->txchan, ++ dma->tx_addr + xmit->tail + skip_byte, ++ dma->tx_size - skip_byte, DMA_MEM_TO_DEV, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ dma->tx_running = 1; ++ ++ desc->callback = omap_8250_dma_tx_complete; ++ desc->callback_param = p; ++ ++ dma->tx_cookie = dmaengine_submit(desc); ++ ++ dma_sync_single_for_device(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ dma_async_issue_pending(dma->txchan); ++ if (dma->tx_err) ++ dma->tx_err = 0; ++ ++ serial8250_clear_THRI(p); ++ if (skip_byte) ++ serial_out(p, UART_TX, xmit->buf[xmit->tail]); ++ return 0; ++err: ++ dma->tx_err = 1; ++ return ret; ++} ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ case UART_IIR_RDI: ++ omap_8250_rx_dma_flush(up); ++ return true; ++ } ++ return omap_8250_rx_dma(up); ++} ++ ++static u16 omap_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, u16 status) ++{ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (iir & UART_IIR_RDI)) { ++ if (handle_rx_dma(up, iir)) { ++ status = serial8250_rx_chars(up, status); ++ omap_8250_rx_dma(up); ++ } ++ } ++ ++ return status; ++} ++ ++static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, ++ u16 status) ++{ ++ /* ++ * Queue a new transfer if FIFO has data. ++ */ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (up->ier & UART_IER_RDI)) { ++ omap_8250_rx_dma(up); ++ serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE); ++ } else if ((iir & 0x3f) == UART_IIR_RX_TIMEOUT) { ++ /* ++ * Disable RX timeout, read IIR to clear ++ * current timeout condition, clear EFR2 to ++ * periodic timeouts, re-enable interrupts. ++ */ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(up, UART_IER, up->ier); ++ omap_8250_rx_dma_flush(up); ++ serial_in(up, UART_IIR); ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ } ++} ++ ++/* ++ * This is mostly serial8250_handle_irq(). We have a slightly different DMA ++ * hoook for RX/TX and need different logic for them in the ISR. Therefore we ++ * use the default routine in the non-DMA case and this one for with DMA. ++ */ ++static int omap_8250_dma_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u16 status; ++ u8 iir; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ if (iir & UART_IIR_NO_INT) { ++ serial8250_rpm_put(up); ++ return IRQ_HANDLED; ++ } ++ ++ spin_lock(&port->lock); ++ ++ status = serial_port_in(port, UART_LSR); ++ ++ if (priv->habit & UART_HAS_EFR2) ++ am654_8250_handle_rx_dma(up, iir, status); ++ else ++ status = omap_8250_handle_rx_dma(up, iir, status); ++ ++ serial8250_modem_status(up); ++ if (status & UART_LSR_THRE && up->dma->tx_err) { ++ if (uart_tx_stopped(&up->port) || ++ uart_circ_empty(&up->port.state->xmit)) { ++ up->dma->tx_err = 0; ++ serial8250_tx_chars(up); ++ } else { ++ /* ++ * try again due to an earlier failer which ++ * might have been resolved by now. 
++ */ ++ if (omap_8250_tx_dma(up)) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ uart_unlock_and_check_sysrq(port); ++ ++ serial8250_rpm_put(up); ++ return 1; ++} ++ ++static bool the_no_dma_filter_fn(struct dma_chan *chan, void *param) ++{ ++ return false; ++} ++ ++#else ++ ++static inline int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ return -EINVAL; ++} ++#endif ++ ++static int omap8250_no_handle_irq(struct uart_port *port) ++{ ++ /* IRQ has not been requested but handling irq? */ ++ WARN_ONCE(1, "Unexpected irq handling before port startup\n"); ++ return 0; ++} ++ ++static struct omap8250_dma_params am654_dma = { ++ .rx_size = SZ_2K, ++ .rx_trigger = 1, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_dma_params am33xx_dma = { ++ .rx_size = RX_TRIGGER, ++ .rx_trigger = RX_TRIGGER, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_platdata am654_platdata = { ++ .dma_params = &am654_dma, ++ .habit = UART_HAS_EFR2 | UART_HAS_RHR_IT_DIS | ++ UART_RX_TIMEOUT_QUIRK, ++}; ++ ++static struct omap8250_platdata am33xx_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = OMAP_DMA_TX_KICK | UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static struct omap8250_platdata omap4_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static const struct of_device_id omap8250_dt_ids[] = { ++ { .compatible = "ti,am654-uart", .data = &am654_platdata, }, ++ { .compatible = "ti,omap2-uart" }, ++ { .compatible = "ti,omap3-uart" }, ++ { .compatible = "ti,omap4-uart", .data = &omap4_platdata, }, ++ { .compatible = "ti,am3352-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,am4372-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,dra742-uart", .data = &omap4_platdata, }, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, omap8250_dt_ids); ++ ++static int omap8250_probe(struct platform_device *pdev) ++{ ++ struct device_node *np = pdev->dev.of_node; ++ struct omap8250_priv *priv; ++ const struct omap8250_platdata *pdata; ++ struct uart_8250_port up; ++ struct resource *regs; ++ void __iomem *membase; ++ int irq, ret; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!regs) { ++ dev_err(&pdev->dev, "missing registers\n"); ++ return -EINVAL; ++ } ++ ++ priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ membase = devm_ioremap(&pdev->dev, regs->start, ++ resource_size(regs)); ++ if (!membase) ++ return -ENODEV; ++ ++ memset(&up, 0, sizeof(up)); ++ up.port.dev = &pdev->dev; ++ up.port.mapbase = regs->start; ++ up.port.membase = membase; ++ up.port.irq = irq; ++ /* ++ * It claims to be 16C750 compatible however it is a little different. ++ * It has EFR and has no FCR7_64byte bit. The AFE (which it claims to ++ * have) is enabled via EFR instead of MCR. The type is set here 8250 ++ * just to get things going. UNKNOWN does not work for a few reasons and ++ * we don't need our own type since we don't use 8250's set_termios() ++ * or pm callback. ++ */ ++ up.port.type = PORT_8250; ++ up.port.iotype = UPIO_MEM; ++ up.port.flags = UPF_FIXED_PORT | UPF_FIXED_TYPE | UPF_SOFT_FLOW | ++ UPF_HARD_FLOW; ++ up.port.private_data = priv; ++ ++ up.port.regshift = 2; ++ up.port.fifosize = 64; ++ up.tx_loadsz = 64; ++ up.capabilities = UART_CAP_FIFO; ++#ifdef CONFIG_PM ++ /* ++ * Runtime PM is mostly transparent. However to do it right we need to a ++ * TX empty interrupt before we can put the device to auto idle. 
So if ++ * PM is not enabled we don't add that flag and can spare that one extra ++ * interrupt in the TX path. ++ */ ++ up.capabilities |= UART_CAP_RPM; ++#endif ++ up.port.set_termios = omap_8250_set_termios; ++ up.port.set_mctrl = omap8250_set_mctrl; ++ up.port.pm = omap_8250_pm; ++ up.port.startup = omap_8250_startup; ++ up.port.shutdown = omap_8250_shutdown; ++ up.port.throttle = omap_8250_throttle; ++ up.port.unthrottle = omap_8250_unthrottle; ++ up.port.rs485_config = serial8250_em485_config; ++ up.port.rs485_supported = serial8250_em485_supported; ++ up.rs485_start_tx = serial8250_em485_start_tx; ++ up.rs485_stop_tx = serial8250_em485_stop_tx; ++ up.port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ ret = of_alias_get_id(np, "serial"); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "failed to get alias\n"); ++ return ret; ++ } ++ up.port.line = ret; ++ ++ if (of_property_read_u32(np, "clock-frequency", &up.port.uartclk)) { ++ struct clk *clk; ++ ++ clk = devm_clk_get(&pdev->dev, NULL); ++ if (IS_ERR(clk)) { ++ if (PTR_ERR(clk) == -EPROBE_DEFER) ++ return -EPROBE_DEFER; ++ } else { ++ up.port.uartclk = clk_get_rate(clk); ++ } ++ } ++ ++ if (of_property_read_u32(np, "overrun-throttle-ms", ++ &up.overrun_backoff_time_ms) != 0) ++ up.overrun_backoff_time_ms = 0; ++ ++ priv->wakeirq = irq_of_parse_and_map(np, 1); ++ ++ pdata = of_device_get_match_data(&pdev->dev); ++ if (pdata) ++ priv->habit |= pdata->habit; ++ ++ if (!up.port.uartclk) { ++ up.port.uartclk = DEFAULT_CLK_SPEED; ++ dev_warn(&pdev->dev, ++ "No clock speed specified: using default: %d\n", ++ DEFAULT_CLK_SPEED); ++ } ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ priv->calc_latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ cpu_latency_qos_add_request(&priv->pm_qos_request, priv->latency); ++ INIT_WORK(&priv->qos_work, omap8250_uart_qos_work); ++ ++ spin_lock_init(&priv->rx_dma_lock); ++ ++ device_init_wakeup(&pdev->dev, true); ++ pm_runtime_enable(&pdev->dev); ++ pm_runtime_use_autosuspend(&pdev->dev); ++ ++ /* ++ * Disable runtime PM until autosuspend delay unless specifically ++ * enabled by the user via sysfs. This is the historic way to ++ * prevent an unsafe default policy with lossy characters on wake-up. ++ * For serdev devices this is not needed, the policy can be managed by ++ * the serdev driver. ++ */ ++ if (!of_get_available_child_count(pdev->dev.of_node)) ++ pm_runtime_set_autosuspend_delay(&pdev->dev, -1); ++ ++ pm_runtime_irq_safe(&pdev->dev); ++ ++ pm_runtime_get_sync(&pdev->dev); ++ ++ omap_serial_fill_features_erratas(&up, priv); ++ up.port.handle_irq = omap8250_no_handle_irq; ++ priv->rx_trigger = RX_TRIGGER; ++ priv->tx_trigger = TX_TRIGGER; ++#ifdef CONFIG_SERIAL_8250_DMA ++ /* ++ * Oh DMA support. If there are no DMA properties in the DT then ++ * we will fall back to a generic DMA channel which does not ++ * really work here. To ensure that we do not get a generic DMA ++ * channel assigned, we have the the_no_dma_filter_fn() here. ++ * To avoid "failed to request DMA" messages we check for DMA ++ * properties in DT. 
++ */ ++ ret = of_property_count_strings(np, "dma-names"); ++ if (ret == 2) { ++ struct omap8250_dma_params *dma_params = NULL; ++ ++ up.dma = &priv->omap8250_dma; ++ up.dma->fn = the_no_dma_filter_fn; ++ up.dma->tx_dma = omap_8250_tx_dma; ++ up.dma->rx_dma = omap_8250_rx_dma; ++ if (pdata) ++ dma_params = pdata->dma_params; ++ ++ if (dma_params) { ++ up.dma->rx_size = dma_params->rx_size; ++ up.dma->rxconf.src_maxburst = dma_params->rx_trigger; ++ up.dma->txconf.dst_maxburst = dma_params->tx_trigger; ++ priv->rx_trigger = dma_params->rx_trigger; ++ priv->tx_trigger = dma_params->tx_trigger; ++ } else { ++ up.dma->rx_size = RX_TRIGGER; ++ up.dma->rxconf.src_maxburst = RX_TRIGGER; ++ up.dma->txconf.dst_maxburst = TX_TRIGGER; ++ } ++ } ++#endif ++ ret = serial8250_register_8250_port(&up); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "unable to register 8250 port\n"); ++ goto err; ++ } ++ priv->line = ret; ++ platform_set_drvdata(pdev, priv); ++ pm_runtime_mark_last_busy(&pdev->dev); ++ pm_runtime_put_autosuspend(&pdev->dev); ++ return 0; ++err: ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ pm_runtime_disable(&pdev->dev); ++ return ret; ++} ++ ++static int omap8250_remove(struct platform_device *pdev) ++{ ++ struct omap8250_priv *priv = platform_get_drvdata(pdev); ++ int err; ++ ++ err = pm_runtime_resume_and_get(&pdev->dev); ++ if (err) ++ return err; ++ ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ flush_work(&priv->qos_work); ++ pm_runtime_disable(&pdev->dev); ++ serial8250_unregister_port(priv->line); ++ cpu_latency_qos_remove_request(&priv->pm_qos_request); ++ device_init_wakeup(&pdev->dev, false); ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int omap8250_prepare(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return 0; ++ priv->is_suspending = true; ++ return 0; ++} ++ ++static void omap8250_complete(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return; ++ priv->is_suspending = false; ++} ++ ++static int omap8250_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ ++ serial8250_suspend_port(priv->line); ++ ++ pm_runtime_get_sync(dev); ++ if (!device_may_wakeup(dev)) ++ priv->wer = 0; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ pm_runtime_mark_last_busy(dev); ++ pm_runtime_put_autosuspend(dev); ++ ++ flush_work(&priv->qos_work); ++ return 0; ++} ++ ++static int omap8250_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ serial8250_resume_port(priv->line); ++ return 0; ++} ++#else ++#define omap8250_prepare NULL ++#define omap8250_complete NULL ++#endif ++ ++#ifdef CONFIG_PM ++static int omap8250_lost_context(struct uart_8250_port *up) ++{ ++ u32 val; ++ ++ val = serial_in(up, UART_OMAP_SCR); ++ /* ++ * If we lose context, then SCR is set to its reset value of zero. ++ * After set_termios() we set bit 3 of SCR (TX_EMPTY_CTL_IT) to 1, ++ * among other bits, to never set the register back to zero again. 
++ */ ++ if (!val) ++ return 1; ++ return 0; ++} ++ ++/* TODO: in future, this should happen via API in drivers/reset/ */ ++static int omap8250_soft_reset(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ int timeout = 100; ++ int sysc; ++ int syss; ++ ++ /* ++ * At least on omap4, unused uarts may not idle after reset without ++ * a basic scr dma configuration even with no dma in use. The ++ * module clkctrl status bits will be 1 instead of 3 blocking idle ++ * for the whole clockdomain. The softreset below will clear scr, ++ * and we restore it on resume so this is safe to do on all SoCs ++ * needing omap8250_soft_reset() quirk. Do it in two writes as ++ * recommended in the comment for omap8250_update_scr(). ++ */ ++ serial_out(up, UART_OMAP_SCR, OMAP_UART_SCR_DMAMODE_1); ++ serial_out(up, UART_OMAP_SCR, ++ OMAP_UART_SCR_DMAMODE_1 | OMAP_UART_SCR_DMAMODE_CTL); ++ ++ sysc = serial_in(up, UART_OMAP_SYSC); ++ ++ /* softreset the UART */ ++ sysc |= OMAP_UART_SYSC_SOFTRESET; ++ serial_out(up, UART_OMAP_SYSC, sysc); ++ ++ /* By experiments, 1us enough for reset complete on AM335x */ ++ do { ++ udelay(1); ++ syss = serial_in(up, UART_OMAP_SYSS); ++ } while (--timeout && !(syss & OMAP_UART_SYSS_RESETDONE)); ++ ++ if (!timeout) { ++ dev_err(dev, "timed out waiting for reset done\n"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++static int omap8250_runtime_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ /* ++ * When using 'no_console_suspend', the console UART must not be ++ * suspended. Since driver suspend is managed by runtime suspend, ++ * preventing runtime suspend (by returning error) will keep device ++ * active during suspend. 
++ */ ++ if (priv->is_suspending && !console_suspend_enabled) { ++ if (uart_console(&up->port)) ++ return -EBUSY; ++ } ++ ++ if (priv->habit & UART_ERRATA_CLOCK_DISABLE) { ++ int ret; ++ ++ ret = omap8250_soft_reset(dev); ++ if (ret) ++ return ret; ++ ++ /* Restore to UART mode after reset (for wakeup) */ ++ omap8250_update_mdr1(up, priv); ++ /* Restore wakeup enable register */ ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ } ++ ++ if (up->dma && up->dma->rxchan) ++ omap_8250_rx_dma_flush(up); ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ schedule_work(&priv->qos_work); ++ ++ return 0; ++} ++ ++static int omap8250_runtime_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ ++ if (omap8250_lost_context(up)) ++ omap8250_restore_regs(up); ++ ++ if (up->dma && up->dma->rxchan && !(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(up); ++ ++ priv->latency = priv->calc_latency; ++ schedule_work(&priv->qos_work); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SERIAL_8250_OMAP_TTYO_FIXUP ++static int __init omap8250_console_fixup(void) ++{ ++ char *omap_str; ++ char *options; ++ u8 idx; ++ ++ if (strstr(boot_command_line, "console=ttyS")) ++ /* user set a ttyS based name for the console */ ++ return 0; ++ ++ omap_str = strstr(boot_command_line, "console=ttyO"); ++ if (!omap_str) ++ /* user did not set ttyO based console, so we don't care */ ++ return 0; ++ ++ omap_str += 12; ++ if ('0' <= *omap_str && *omap_str <= '9') ++ idx = *omap_str - '0'; ++ else ++ return 0; ++ ++ omap_str++; ++ if (omap_str[0] == ',') { ++ omap_str++; ++ options = omap_str; ++ } else { ++ options = NULL; ++ } ++ ++ add_preferred_console("ttyS", idx, options); ++ pr_err("WARNING: Your 'console=ttyO%d' has been replaced by 'ttyS%d'\n", ++ idx, idx); ++ pr_err("This ensures that you still see kernel messages. 
Please\n"); ++ pr_err("update your kernel commandline.\n"); ++ return 0; ++} ++console_initcall(omap8250_console_fixup); ++#endif ++ ++static const struct dev_pm_ops omap8250_dev_pm_ops = { ++ SET_SYSTEM_SLEEP_PM_OPS(omap8250_suspend, omap8250_resume) ++ SET_RUNTIME_PM_OPS(omap8250_runtime_suspend, ++ omap8250_runtime_resume, NULL) ++ .prepare = omap8250_prepare, ++ .complete = omap8250_complete, ++}; ++ ++static struct platform_driver omap8250_platform_driver = { ++ .driver = { ++ .name = "omap8250", ++ .pm = &omap8250_dev_pm_ops, ++ .of_match_table = omap8250_dt_ids, ++ }, ++ .probe = omap8250_probe, ++ .remove = omap8250_remove, ++}; ++module_platform_driver(omap8250_platform_driver); ++ ++MODULE_AUTHOR("Sebastian Andrzej Siewior"); ++MODULE_DESCRIPTION("OMAP 8250 Driver"); ++MODULE_LICENSE("GPL v2"); +Binary files linux.orig/drivers/tty/serial/8250/.8250_omap.c.rej.swp and linux/drivers/tty/serial/8250/.8250_omap.c.rej.swp differ +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c linux/drivers/tty/serial/8250/8250_port.c +--- linux.orig/drivers/tty/serial/8250/8250_port.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c 2022-12-04 10:40:26.700034085 -0500 +@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } @@ -3637,7 +22982,7 @@ index 2030a92ac66e7..326549603740d 100644 if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); -@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_port *up) +@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_p */ static void autoconfig_16550a(struct uart_8250_port *up) { @@ -3649,7 +22994,7 @@ index 2030a92ac66e7..326549603740d 100644 up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; -@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uar return; } @@ -3661,7 +23006,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's -@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uar } serial_out(up, UART_IER, iersave); @@ -3671,7 +23016,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. -@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_ unsigned char status1, scratch, scratch2, scratch3; unsigned char save_lcr, save_mcr; struct uart_port *port = &up->port; @@ -3682,7 +23027,7 @@ index 2030a92ac66e7..326549603740d 100644 if (!port->iobase && !port->mapbase && !port->membase) return; -@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_ up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { @@ -3694,7 +23039,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. 
-@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_ #endif scratch3 = serial_in(up, UART_IER) & 0x0f; serial_out(up, UART_IER, scratch); @@ -3705,7 +23050,7 @@ index 2030a92ac66e7..326549603740d 100644 if (scratch2 != 0 || scratch3 != 0x0F) { /* * We failed; there's nothing here -@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_ serial8250_out_MCR(up, save_mcr); serial8250_clear_fifos(up); serial_in(up, UART_RX); @@ -3717,7 +23062,7 @@ index 2030a92ac66e7..326549603740d 100644 out_unlock: spin_unlock_irqrestore(&port->lock, flags); -@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8 unsigned char save_mcr, save_ier; unsigned char save_ICP = 0; unsigned int ICP = 0; @@ -3727,7 +23072,7 @@ index 2030a92ac66e7..326549603740d 100644 int irq; if (port->flags & UPF_FOURPORT) { -@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8 inb_p(ICP); } @@ -3741,7 +23086,7 @@ index 2030a92ac66e7..326549603740d 100644 /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); -@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8 if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); @@ -3753,7 +23098,7 @@ index 2030a92ac66e7..326549603740d 100644 port->irq = (irq > 0) ? irq : 0; } -@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct uart_port *port) +@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct ua up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; @@ -3762,7 +23107,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) +@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uar serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3771,7 +23116,7 @@ index 2030a92ac66e7..326549603740d 100644 } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); -@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct uart_port *port) +@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; @@ -3780,7 +23125,7 @@ index 2030a92ac66e7..326549603740d 100644 } static void serial8250_enable_ms(struct uart_port *port) -@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct uart_port *port) +@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct up->ier |= UART_IER_MSI; serial8250_rpm_get(up); @@ -3789,7 +23134,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -2144,14 +2171,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2147,14 +2174,7 @@ static void serial8250_put_poll_char(str struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); @@ -3805,7 +23150,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* -@@ -2164,7 +2184,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2167,7 +2187,7 @@ static void serial8250_put_poll_char(str * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); @@ -3814,7 +23159,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } 
-@@ -2173,8 +2193,10 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2176,8 +2196,10 @@ static void serial8250_put_poll_char(str int serial8250_do_startup(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); @@ -3825,7 +23170,7 @@ index 2030a92ac66e7..326549603740d 100644 int retval; u16 lsr; -@@ -2195,7 +2217,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2198,7 +2220,7 @@ int serial8250_do_startup(struct uart_po up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); @@ -3834,7 +23179,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); -@@ -2205,7 +2227,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2208,7 +2230,7 @@ int serial8250_do_startup(struct uart_po if (port->type == PORT_DA830) { /* Reset the port */ @@ -3843,7 +23188,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); mdelay(10); -@@ -2304,6 +2326,8 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2307,6 +2329,8 @@ int serial8250_do_startup(struct uart_po if (retval) goto out; @@ -3852,7 +23197,7 @@ index 2030a92ac66e7..326549603740d 100644 if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; -@@ -2320,6 +2344,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2323,6 +2347,9 @@ int serial8250_do_startup(struct uart_po */ spin_lock_irqsave(&port->lock, flags); @@ -3862,7 +23207,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ -@@ -2330,6 +2357,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2333,6 +2360,9 @@ int serial8250_do_startup(struct uart_po iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); @@ -3872,7 +23217,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); if (port->irqflags & IRQF_SHARED) -@@ -2384,10 +2414,14 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2387,10 +2417,14 @@ int serial8250_do_startup(struct uart_po * Do a quick test to see if we receive an interrupt when we enable * the TX irq. 
*/ @@ -3887,7 +23232,7 @@ index 2030a92ac66e7..326549603740d 100644 if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { if (!(up->bugs & UART_BUG_TXEN)) { -@@ -2419,7 +2453,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2422,7 +2456,7 @@ dont_test_tx_en: if (up->dma) { const char *msg = NULL; @@ -3896,7 +23241,7 @@ index 2030a92ac66e7..326549603740d 100644 msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; -@@ -2470,7 +2504,7 @@ void serial8250_do_shutdown(struct uart_port *port) +@@ -2473,7 +2507,7 @@ void serial8250_do_shutdown(struct uart_ */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; @@ -3905,7 +23250,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); -@@ -2836,7 +2870,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, +@@ -2839,7 +2873,7 @@ serial8250_do_set_termios(struct uart_po if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; @@ -3914,7 +23259,7 @@ index 2030a92ac66e7..326549603740d 100644 if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; -@@ -3301,7 +3335,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); +@@ -3304,7 +3338,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_default #ifdef CONFIG_SERIAL_8250_CONSOLE @@ -3923,7 +23268,7 @@ index 2030a92ac66e7..326549603740d 100644 { struct uart_8250_port *up = up_to_u8250p(port); -@@ -3309,6 +3343,18 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) +@@ -3312,6 +3346,18 @@ static void serial8250_console_putchar(s serial_port_out(port, UART_TX, ch); } @@ -3942,7 +23287,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Restore serial console when h/w power-off detected */ -@@ -3335,6 +3381,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) +@@ -3338,6 +3384,32 @@ static void serial8250_console_restore(s serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } @@ -3975,7 +23320,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Print a string to the serial port using the device FIFO * -@@ -3380,24 +3452,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3383,24 +3455,12 @@ void serial8250_console_write(struct uar struct uart_port *port = &up->port; unsigned long flags; unsigned int ier, use_fifo; @@ -3987,13 +23332,13 @@ index 2030a92ac66e7..326549603740d 100644 - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); -+ spin_lock_irqsave(&port->lock, flags); - +- - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); -- ++ spin_lock_irqsave(&port->lock, flags); + - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else @@ -4002,7 +23347,7 @@ index 2030a92ac66e7..326549603740d 100644 /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { -@@ -3431,10 +3491,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3434,10 +3494,12 @@ void serial8250_console_write(struct uar */ !(up->port.flags & UPF_CONS_FLOW); @@ -4015,7 +23360,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Finally, wait for transmitter to become empty -@@ -3447,8 +3509,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3450,8 +3512,7 @@ void serial8250_console_write(struct uar if (em485->tx_stopped) 
up->rs485_stop_tx(up); } @@ -4025,7 +23370,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * The receive handling will happen properly because the -@@ -3460,8 +3521,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3463,8 +3524,7 @@ void serial8250_console_write(struct uar if (up->msr_saved_flags) serial8250_modem_status(up); @@ -4035,7 +23380,7 @@ index 2030a92ac66e7..326549603740d 100644 } static unsigned int probe_baud(struct uart_port *port) -@@ -3481,6 +3541,7 @@ static unsigned int probe_baud(struct uart_port *port) +@@ -3484,6 +3544,7 @@ static unsigned int probe_baud(struct ua int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { @@ -4043,7 +23388,7 @@ index 2030a92ac66e7..326549603740d 100644 int baud = 9600; int bits = 8; int parity = 'n'; -@@ -3490,6 +3551,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) +@@ -3493,6 +3554,8 @@ int serial8250_console_setup(struct uart if (!port->iobase && !port->membase) return -ENODEV; @@ -4052,10 +23397,3534 @@ index 2030a92ac66e7..326549603740d 100644 if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) -diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig -index d0b49e15fbf5e..02c308467339c 100644 ---- a/drivers/tty/serial/8250/Kconfig -+++ b/drivers/tty/serial/8250/Kconfig +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c.orig linux/drivers/tty/serial/8250/8250_port.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_port.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,3521 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Base port operations for 8250/16550-type serial ports ++ * ++ * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. ++ * Split from 8250_core.c, Copyright (C) 2001 Russell King. ++ * ++ * A note about mapbase / membase ++ * ++ * mapbase is the physical address of the IO port. ++ * membase is an 'ioremapped' cookie. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "8250.h" ++ ++/* Nuvoton NPCM timeout register */ ++#define UART_NPCM_TOR 7 ++#define UART_NPCM_TOIE BIT(7) /* Timeout Interrupt Enable */ ++ ++/* ++ * Debugging. ++ */ ++#if 0 ++#define DEBUG_AUTOCONF(fmt...) printk(fmt) ++#else ++#define DEBUG_AUTOCONF(fmt...) do { } while (0) ++#endif ++ ++/* ++ * Here we define the default xmit fifo size used for each type of UART. 
++ */ ++static const struct serial8250_config uart_config[] = { ++ [PORT_UNKNOWN] = { ++ .name = "unknown", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_8250] = { ++ .name = "8250", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16450] = { ++ .name = "16450", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550] = { ++ .name = "16550", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550A] = { ++ .name = "16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_CIRRUS] = { ++ .name = "Cirrus", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16650] = { ++ .name = "ST16650", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16650V2] = { ++ .name = "ST16650V2", ++ .fifo_size = 32, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_00, ++ .rxtrig_bytes = {8, 16, 24, 28}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16750] = { ++ .name = "TI16750", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .rxtrig_bytes = {1, 16, 32, 56}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE, ++ }, ++ [PORT_STARTECH] = { ++ .name = "Startech", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16C950] = { ++ .name = "16C950/954", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, ++ .rxtrig_bytes = {16, 32, 112, 120}, ++ /* UART_CAP_EFR breaks billionon CF bluetooth card. */ ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_16654] = { ++ .name = "ST16654", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_10, ++ .rxtrig_bytes = {8, 16, 56, 60}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16850] = { ++ .name = "XR16850", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_RSA] = { ++ .name = "RSA", ++ .fifo_size = 2048, ++ .tx_loadsz = 2048, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NS16550A] = { ++ .name = "NS16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_NATSEMI, ++ }, ++ [PORT_XSCALE] = { ++ .name = "XScale", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE, ++ }, ++ [PORT_OCTEON] = { ++ .name = "OCTEON", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_AR7] = { ++ .name = "AR7", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .flags = UART_CAP_FIFO /* | UART_CAP_AFE */, ++ }, ++ [PORT_U6_16550A] = { ++ .name = "U6_16550A", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_TEGRA] = { ++ .name = "Tegra", ++ .fifo_size = 32, ++ .tx_loadsz = 8, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_01, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_RTOIE, ++ }, ++ [PORT_XR17D15X] = { 
++ .name = "XR17D15X", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_XR17V35X] = { ++ .name = "XR17V35X", ++ .fifo_size = 256, ++ .tx_loadsz = 256, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 | ++ UART_FCR_T_TRIG_11, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_LPC3220] = { ++ .name = "LPC3220", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_BRCM_TRUMANAGE] = { ++ .name = "TruManage", ++ .fifo_size = 1, ++ .tx_loadsz = 1024, ++ .flags = UART_CAP_HFIFO, ++ }, ++ [PORT_8250_CIR] = { ++ .name = "CIR port" ++ }, ++ [PORT_ALTR_16550_F32] = { ++ .name = "Altera 16550 FIFO32", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 8, 16, 30}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F64] = { ++ .name = "Altera 16550 FIFO64", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 16, 32, 62}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F128] = { ++ .name = "Altera 16550 FIFO128", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 126}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ /* ++ * tx_loadsz is set to 63-bytes instead of 64-bytes to implement ++ * workaround of errata A-008006 which states that tx_loadsz should ++ * be configured less than Maximum supported fifo bytes. ++ */ ++ [PORT_16550A_FSL64] = { ++ .name = "16550A_FSL64", ++ .fifo_size = 64, ++ .tx_loadsz = 63, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .flags = UART_CAP_FIFO | UART_CAP_NOTEMT, ++ }, ++ [PORT_RT2880] = { ++ .name = "Palmchip BK-3103", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_DA830] = { ++ .name = "TI DA8xx/66AK2x", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_MTK_BTIF] = { ++ .name = "MediaTek BTIF", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NPCM] = { ++ .name = "Nuvoton 16550", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_SUNIX] = { ++ .name = "Sunix", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 112}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_ASPEED_VUART] = { ++ .name = "ASPEED VUART", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++}; ++ ++/* Uart divisor latch read */ ++static int default_serial_dl_read(struct uart_8250_port *up) ++{ ++ /* Assign these in pieces to truncate any bits above 7. 
*/ ++ unsigned char dll = serial_in(up, UART_DLL); ++ unsigned char dlm = serial_in(up, UART_DLM); ++ ++ return dll | dlm << 8; ++} ++ ++/* Uart divisor latch write */ ++static void default_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ serial_out(up, UART_DLL, value & 0xff); ++ serial_out(up, UART_DLM, value >> 8 & 0xff); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ ++#define UART_REG_UNMAPPED -1 ++ ++/* Au1x00/RT288x UART hardware has a weird register layout */ ++static const s8 au_io_in_map[8] = { ++ [UART_RX] = 0, ++ [UART_IER] = 2, ++ [UART_IIR] = 3, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = 7, ++ [UART_MSR] = 8, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++static const s8 au_io_out_map[8] = { ++ [UART_TX] = 1, ++ [UART_IER] = 2, ++ [UART_FCR] = 4, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = UART_REG_UNMAPPED, ++ [UART_MSR] = UART_REG_UNMAPPED, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++unsigned int au_serial_in(struct uart_port *p, int offset) ++{ ++ if (offset >= ARRAY_SIZE(au_io_in_map)) ++ return UINT_MAX; ++ offset = au_io_in_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return UINT_MAX; ++ return __raw_readl(p->membase + (offset << p->regshift)); ++} ++ ++void au_serial_out(struct uart_port *p, int offset, int value) ++{ ++ if (offset >= ARRAY_SIZE(au_io_out_map)) ++ return; ++ offset = au_io_out_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return; ++ __raw_writel(value, p->membase + (offset << p->regshift)); ++} ++ ++/* Au1x00 haven't got a standard divisor latch */ ++static int au_serial_dl_read(struct uart_8250_port *up) ++{ ++ return __raw_readl(up->port.membase + 0x28); ++} ++ ++static void au_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ __raw_writel(value, up->port.membase + 0x28); ++} ++ ++#endif ++ ++static unsigned int hub6_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ return inb(p->iobase + 1); ++} ++ ++static void hub6_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ outb(value, p->iobase + 1); ++} ++ ++static unsigned int mem_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readb(p->membase + offset); ++} ++ ++static void mem_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writeb(value, p->membase + offset); ++} ++ ++static void mem16_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writew(value, p->membase + offset); ++} ++ ++static unsigned int mem16_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readw(p->membase + offset); ++} ++ ++static void mem32_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writel(value, p->membase + offset); ++} ++ ++static unsigned int mem32_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readl(p->membase + offset); ++} ++ ++static void mem32be_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ iowrite32be(value, p->membase + offset); ++} ++ ++static unsigned int mem32be_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return ioread32be(p->membase + offset); ++} ++ ++static unsigned int io_serial_in(struct uart_port *p, int offset) ++{ ++ offset = 
offset << p->regshift; ++ return inb(p->iobase + offset); ++} ++ ++static void io_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(value, p->iobase + offset); ++} ++ ++static int serial8250_default_handle_irq(struct uart_port *port); ++ ++static void set_io_from_upio(struct uart_port *p) ++{ ++ struct uart_8250_port *up = up_to_u8250p(p); ++ ++ up->dl_read = default_serial_dl_read; ++ up->dl_write = default_serial_dl_write; ++ ++ switch (p->iotype) { ++ case UPIO_HUB6: ++ p->serial_in = hub6_serial_in; ++ p->serial_out = hub6_serial_out; ++ break; ++ ++ case UPIO_MEM: ++ p->serial_in = mem_serial_in; ++ p->serial_out = mem_serial_out; ++ break; ++ ++ case UPIO_MEM16: ++ p->serial_in = mem16_serial_in; ++ p->serial_out = mem16_serial_out; ++ break; ++ ++ case UPIO_MEM32: ++ p->serial_in = mem32_serial_in; ++ p->serial_out = mem32_serial_out; ++ break; ++ ++ case UPIO_MEM32BE: ++ p->serial_in = mem32be_serial_in; ++ p->serial_out = mem32be_serial_out; ++ break; ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ case UPIO_AU: ++ p->serial_in = au_serial_in; ++ p->serial_out = au_serial_out; ++ up->dl_read = au_serial_dl_read; ++ up->dl_write = au_serial_dl_write; ++ break; ++#endif ++ ++ default: ++ p->serial_in = io_serial_in; ++ p->serial_out = io_serial_out; ++ break; ++ } ++ /* Remember loaded iotype */ ++ up->cur_iotype = p->iotype; ++ p->handle_irq = serial8250_default_handle_irq; ++} ++ ++static void ++serial_port_out_sync(struct uart_port *p, int offset, int value) ++{ ++ switch (p->iotype) { ++ case UPIO_MEM: ++ case UPIO_MEM16: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_AU: ++ p->serial_out(p, offset, value); ++ p->serial_in(p, UART_LCR); /* safe, no side-effects */ ++ break; ++ default: ++ p->serial_out(p, offset, value); ++ } ++} ++ ++/* ++ * FIFO support. ++ */ ++static void serial8250_clear_fifos(struct uart_8250_port *p) ++{ ++ if (p->capabilities & UART_CAP_FIFO) { ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial_out(p, UART_FCR, 0); ++ } ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); ++ ++void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) ++{ ++ serial8250_clear_fifos(p); ++ serial_out(p, UART_FCR, p->fcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos); ++ ++void serial8250_rpm_get(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get); ++ ++void serial8250_rpm_put(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put); ++ ++/** ++ * serial8250_em485_init() - put uart_8250_port into rs485 emulating ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to start rs485 software emulating on the ++ * &struct uart_8250_port* @p. Namely, RTS is switched before/after ++ * transmission. The function is idempotent, so it is safe to call it ++ * multiple times. ++ * ++ * The caller MUST enable interrupt on empty shift register before ++ * calling serial8250_em485_init(). This interrupt is not a part of ++ * 8250 standard, but implementation defined. 
++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_destroy() ++ * ++ * Return 0 - success, -errno - otherwise ++ */ ++static int serial8250_em485_init(struct uart_8250_port *p) ++{ ++ if (p->em485) ++ goto deassert_rts; ++ ++ p->em485 = kmalloc(sizeof(struct uart_8250_em485), GFP_ATOMIC); ++ if (!p->em485) ++ return -ENOMEM; ++ ++ hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; ++ p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; ++ p->em485->port = p; ++ p->em485->active_timer = NULL; ++ p->em485->tx_stopped = true; ++ ++deassert_rts: ++ if (p->em485->tx_stopped) ++ p->rs485_stop_tx(p); ++ ++ return 0; ++} ++ ++/** ++ * serial8250_em485_destroy() - put uart_8250_port into normal state ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to stop rs485 software emulating on the ++ * &struct uart_8250_port* @p. The function is idempotent, so it is safe to ++ * call it multiple times. ++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_init() ++ */ ++void serial8250_em485_destroy(struct uart_8250_port *p) ++{ ++ if (!p->em485) ++ return; ++ ++ hrtimer_cancel(&p->em485->start_tx_timer); ++ hrtimer_cancel(&p->em485->stop_tx_timer); ++ ++ kfree(p->em485); ++ p->em485 = NULL; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_destroy); ++ ++struct serial_rs485 serial8250_em485_supported = { ++ .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | ++ SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX, ++ .delay_rts_before_send = 1, ++ .delay_rts_after_send = 1, ++}; ++EXPORT_SYMBOL_GPL(serial8250_em485_supported); ++ ++/** ++ * serial8250_em485_config() - generic ->rs485_config() callback ++ * @port: uart port ++ * @rs485: rs485 settings ++ * ++ * Generic callback usable by 8250 uart drivers to activate rs485 settings ++ * if the uart is incapable of driving RTS as a Transmit Enable signal in ++ * hardware, relying on software emulation instead. ++ */ ++int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, ++ struct serial_rs485 *rs485) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* pick sane settings if the user hasn't */ ++ if (!!(rs485->flags & SER_RS485_RTS_ON_SEND) == ++ !!(rs485->flags & SER_RS485_RTS_AFTER_SEND)) { ++ rs485->flags |= SER_RS485_RTS_ON_SEND; ++ rs485->flags &= ~SER_RS485_RTS_AFTER_SEND; ++ } ++ ++ /* ++ * Both serial8250_em485_init() and serial8250_em485_destroy() ++ * are idempotent. ++ */ ++ if (rs485->flags & SER_RS485_ENABLED) ++ return serial8250_em485_init(up); ++ ++ serial8250_em485_destroy(up); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_config); ++ ++/* ++ * These two wrappers ensure that enable_runtime_pm_tx() can be called more than ++ * once and disable_runtime_pm_tx() will still disable RPM because the fifo is ++ * empty and the HW can idle again. 
++ */ ++void serial8250_rpm_get_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 1); ++ if (rpm_active) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get_tx); ++ ++void serial8250_rpm_put_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 0); ++ if (!rpm_active) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put_tx); ++ ++/* ++ * IER sleep support. UARTs which have EFRs need the "extended ++ * capability" bit enabled. Note that on XR16C850s, we need to ++ * reset LCR to write to IER. ++ */ ++static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) ++{ ++ unsigned char lcr = 0, efr = 0; ++ ++ serial8250_rpm_get(p); ++ ++ if (p->capabilities & UART_CAP_SLEEP) { ++ if (p->capabilities & UART_CAP_EFR) { ++ lcr = serial_in(p, UART_LCR); ++ efr = serial_in(p, UART_EFR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, UART_EFR_ECB); ++ serial_out(p, UART_LCR, 0); ++ } ++ serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ if (p->capabilities & UART_CAP_EFR) { ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, efr); ++ serial_out(p, UART_LCR, lcr); ++ } ++ } ++ ++ serial8250_rpm_put(p); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++/* ++ * Attempts to turn on the RSA FIFO. Returns zero on failure. ++ * We set the port uart clock rate if we succeed. ++ */ ++static int __enable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16; ++ ++ return result; ++} ++ ++static void enable_rsa(struct uart_8250_port *up) ++{ ++ if (up->port.type == PORT_RSA) { ++ if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ __enable_rsa(up); ++ spin_unlock_irq(&up->port.lock); ++ } ++ if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) ++ serial_out(up, UART_RSA_FRR, 0); ++ } ++} ++ ++/* ++ * Attempts to turn off the RSA FIFO. Returns zero on failure. ++ * It is unknown why interrupts were disabled in here. However, ++ * the caller is expected to preserve this behaviour by grabbing ++ * the spinlock before calling this function. ++ */ ++static void disable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ if (up->port.type == PORT_RSA && ++ up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; ++ spin_unlock_irq(&up->port.lock); ++ } ++} ++#endif /* CONFIG_SERIAL_8250_RSA */ ++ ++/* ++ * This is a quickie test to see how big the FIFO is. ++ * It doesn't work at all the time, more's the pity. 
++ */ ++static int size_fifo(struct uart_8250_port *up) ++{ ++ unsigned char old_fcr, old_mcr, old_lcr; ++ unsigned short old_dl; ++ int count; ++ ++ old_lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, 0); ++ old_fcr = serial_in(up, UART_FCR); ++ old_mcr = serial8250_in_MCR(up); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial8250_out_MCR(up, UART_MCR_LOOP); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(up); ++ serial_dl_write(up, 0x0001); ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ for (count = 0; count < 256; count++) ++ serial_out(up, UART_TX, count); ++ mdelay(20);/* FIXME - schedule_timeout */ ++ for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) && ++ (count < 256); count++) ++ serial_in(up, UART_RX); ++ serial_out(up, UART_FCR, old_fcr); ++ serial8250_out_MCR(up, old_mcr); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_dl_write(up, old_dl); ++ serial_out(up, UART_LCR, old_lcr); ++ ++ return count; ++} ++ ++/* ++ * Read UART ID using the divisor method - set DLL and DLM to zero ++ * and the revision will be in DLL and device type in DLM. We ++ * preserve the device state across this. ++ */ ++static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p) ++{ ++ unsigned char old_lcr; ++ unsigned int id, old_dl; ++ ++ old_lcr = serial_in(p, UART_LCR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(p); ++ serial_dl_write(p, 0); ++ id = serial_dl_read(p); ++ serial_dl_write(p, old_dl); ++ ++ serial_out(p, UART_LCR, old_lcr); ++ ++ return id; ++} ++ ++/* ++ * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's. ++ * When this function is called we know it is at least a StarTech ++ * 16650 V2, but it might be one of several StarTech UARTs, or one of ++ * its clones. (We treat the broken original StarTech 16650 V1 as a ++ * 16550, and why not? Startech doesn't seem to even acknowledge its ++ * existence.) ++ * ++ * What evil have men's minds wrought... ++ */ ++static void autoconfig_has_efr(struct uart_8250_port *up) ++{ ++ unsigned int id1, id2, id3, rev; ++ ++ /* ++ * Everything with an EFR has SLEEP ++ */ ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ ++ /* ++ * First we check to see if it's an Oxford Semiconductor UART. ++ * ++ * If we have to do this here because some non-National ++ * Semiconductor clone chips lock up if you try writing to the ++ * LSR register (which serial_icr_read does) ++ */ ++ ++ /* ++ * Check for Oxford Semiconductor 16C950. ++ * ++ * EFR [4] must be set else this test fails. ++ * ++ * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca) ++ * claims that it's needed for 952 dual UART's (which are not ++ * recommended for new designs). ++ */ ++ up->acr = 0; ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0x00); ++ id1 = serial_icr_read(up, UART_ID1); ++ id2 = serial_icr_read(up, UART_ID2); ++ id3 = serial_icr_read(up, UART_ID3); ++ rev = serial_icr_read(up, UART_REV); ++ ++ DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev); ++ ++ if (id1 == 0x16 && id2 == 0xC9 && ++ (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { ++ up->port.type = PORT_16C950; ++ ++ /* ++ * Enable work around for the Oxford Semiconductor 952 rev B ++ * chip which causes it to seriously miscalculate baud rates ++ * when DLL is 0. 
++ */ ++ if (id3 == 0x52 && rev == 0x01) ++ up->bugs |= UART_BUG_QUOT; ++ return; ++ } ++ ++ /* ++ * We check for a XR16C850 by setting DLL and DLM to 0, and then ++ * reading back DLL and DLM. The chip type depends on the DLM ++ * value read back: ++ * 0x10 - XR16C850 and the DLL contains the chip revision. ++ * 0x12 - XR16C2850. ++ * 0x14 - XR16C854. ++ */ ++ id1 = autoconfig_read_divisor_id(up); ++ DEBUG_AUTOCONF("850id=%04x ", id1); ++ ++ id2 = id1 >> 8; ++ if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { ++ up->port.type = PORT_16850; ++ return; ++ } ++ ++ /* ++ * It wasn't an XR16C850. ++ * ++ * We distinguish between the '654 and the '650 by counting ++ * how many bytes are in the FIFO. I'm using this for now, ++ * since that's the technique that was sent to me in the ++ * serial driver update, but I'm not convinced this works. ++ * I've had problems doing this in the past. -TYT ++ */ ++ if (size_fifo(up) == 64) ++ up->port.type = PORT_16654; ++ else ++ up->port.type = PORT_16650V2; ++} ++ ++/* ++ * We detected a chip without a FIFO. Only two fall into ++ * this category - the original 8250 and the 16450. The ++ * 16450 has a scratch register (accessible with LCR=0) ++ */ ++static void autoconfig_8250(struct uart_8250_port *up) ++{ ++ unsigned char scratch, status1, status2; ++ ++ up->port.type = PORT_8250; ++ ++ scratch = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0xa5); ++ status1 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0x5a); ++ status2 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, scratch); ++ ++ if (status1 == 0xa5 && status2 == 0x5a) ++ up->port.type = PORT_16450; ++} ++ ++static int broken_efr(struct uart_8250_port *up) ++{ ++ /* ++ * Exar ST16C2550 "A2" devices incorrectly detect as ++ * having an EFR, and report an ID of 0x0201. See ++ * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html ++ */ ++ if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * We know that the chip has FIFOs. Does it have an EFR? The ++ * EFR is located in the same register position as the IIR and ++ * we know the top two bits of the IIR are currently set. The ++ * EFR should contain zero. Try to read the EFR. ++ */ ++static void autoconfig_16550a(struct uart_8250_port *up) ++{ ++ unsigned char status1, status2; ++ unsigned int iersave; ++ ++ up->port.type = PORT_16550A; ++ up->capabilities |= UART_CAP_FIFO; ++ ++ if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) && ++ !(up->port.flags & UPF_FULL_PROBE)) ++ return; ++ ++ /* ++ * Check for presence of the EFR when DLAB is set. ++ * Only ST16C650V1 UARTs pass this test. ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ if (serial_in(up, UART_EFR) == 0) { ++ serial_out(up, UART_EFR, 0xA8); ++ if (serial_in(up, UART_EFR) != 0) { ++ DEBUG_AUTOCONF("EFRv1 "); ++ up->port.type = PORT_16650; ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ } else { ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ if (status1 == 7) ++ up->port.type = PORT_16550A_FSL64; ++ else ++ DEBUG_AUTOCONF("Motorola 8xxx DUART "); ++ } ++ serial_out(up, UART_EFR, 0); ++ return; ++ } ++ ++ /* ++ * Maybe it requires 0xbf to be written to the LCR. 
++ * (other ST16C650V2 UARTs, TI16C752A, etc) ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) { ++ DEBUG_AUTOCONF("EFRv2 "); ++ autoconfig_has_efr(up); ++ return; ++ } ++ ++ /* ++ * Check for a National Semiconductor SuperIO chip. ++ * Attempt to switch to bank 2, read the value of the LOOP bit ++ * from EXCR1. Switch back to bank 0, change it in MCR. Then ++ * switch back to bank 2, read it from EXCR1 again and check ++ * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2 ++ */ ++ serial_out(up, UART_LCR, 0); ++ status1 = serial8250_in_MCR(up); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ ++ if (!((status2 ^ status1) & UART_MCR_LOOP)) { ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1); ++ ++ if ((status2 ^ status1) & UART_MCR_LOOP) { ++ unsigned short quot; ++ ++ serial_out(up, UART_LCR, 0xE0); ++ ++ quot = serial_dl_read(up); ++ quot <<= 3; ++ ++ if (ns16550a_goto_highspeed(up)) ++ serial_dl_write(up, quot); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ up->port.uartclk = 921600*16; ++ up->port.type = PORT_NS16550A; ++ up->capabilities |= UART_NATSEMI; ++ return; ++ } ++ } ++ ++ /* ++ * No EFR. Try to detect a TI16750, which only sets bit 5 of ++ * the IIR when 64 byte FIFO mode is enabled when DLAB is set. ++ * Try setting it with and without DLAB set. Cheap clones ++ * set bit 5 without DLAB set. ++ */ ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status2 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, 0); ++ ++ DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2); ++ ++ if (status1 == 6 && status2 == 7) { ++ up->port.type = PORT_16750; ++ up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP; ++ return; ++ } ++ ++ /* ++ * Try writing and reading the UART_IER_UUE bit (b6). ++ * If it works, this is probably one of the Xscale platform's ++ * internal UARTs. ++ * We're going to explicitly set the UUE bit to 0 before ++ * trying to write and read a 1 just to make sure it's not ++ * already a 1 and maybe locked there before we even start start. ++ */ ++ iersave = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, iersave & ~UART_IER_UUE); ++ if (!(serial_in(up, UART_IER) & UART_IER_UUE)) { ++ /* ++ * OK it's in a known zero state, try writing and reading ++ * without disturbing the current state of the other bits. ++ */ ++ serial_out(up, UART_IER, iersave | UART_IER_UUE); ++ if (serial_in(up, UART_IER) & UART_IER_UUE) { ++ /* ++ * It's an Xscale. ++ * We'll leave the UART_IER_UUE bit set to 1 (enabled). ++ */ ++ DEBUG_AUTOCONF("Xscale "); ++ up->port.type = PORT_XSCALE; ++ up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE; ++ return; ++ } ++ } else { ++ /* ++ * If we got here we couldn't force the IER_UUE bit to 0. ++ * Log it and continue. ++ */ ++ DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 "); ++ } ++ serial_out(up, UART_IER, iersave); ++ ++ /* ++ * We distinguish between 16550A and U6 16550A by counting ++ * how many bytes are in the FIFO. 
++ */ ++ if (up->port.type == PORT_16550A && size_fifo(up) == 64) { ++ up->port.type = PORT_U6_16550A; ++ up->capabilities |= UART_CAP_AFE; ++ } ++} ++ ++/* ++ * This routine is called by rs_init() to initialize a specific serial ++ * port. It determines what type of UART chip this serial port is ++ * using: 8250, 16450, 16550, 16550A. The important question is ++ * whether or not this UART is a 16550A or not, since this will ++ * determine whether or not we can use its FIFO features or not. ++ */ ++static void autoconfig(struct uart_8250_port *up) ++{ ++ unsigned char status1, scratch, scratch2, scratch3; ++ unsigned char save_lcr, save_mcr; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int old_capabilities; ++ ++ if (!port->iobase && !port->mapbase && !port->membase) ++ return; ++ ++ DEBUG_AUTOCONF("%s: autoconf (0x%04lx, 0x%p): ", ++ port->name, port->iobase, port->membase); ++ ++ /* ++ * We really do need global IRQs disabled here - we're going to ++ * be frobbing the chips IRQ enable register to see if it exists. ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->capabilities = 0; ++ up->bugs = 0; ++ ++ if (!(port->flags & UPF_BUGGY_UART)) { ++ /* ++ * Do a simple existence test first; if we fail this, ++ * there's no point trying anything else. ++ * ++ * 0x80 is used as a nonsense port to prevent against ++ * false positives due to ISA bus float. The ++ * assumption is that 0x80 is a non-existent port; ++ * which should be safe since include/asm/io.h also ++ * makes this assumption. ++ * ++ * Note: this is safe as long as MCR bit 4 is clear ++ * and the device is in "PC" mode. ++ */ ++ scratch = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, 0); ++#ifdef __i386__ ++ outb(0xff, 0x080); ++#endif ++ /* ++ * Mask out IER[7:4] bits for test as some UARTs (e.g. TL ++ * 16C754B) allow only to modify them if an EFR bit is set. ++ */ ++ scratch2 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, 0x0F); ++#ifdef __i386__ ++ outb(0, 0x080); ++#endif ++ scratch3 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, scratch); ++ if (scratch2 != 0 || scratch3 != 0x0F) { ++ /* ++ * We failed; there's nothing here ++ */ ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("IER test failed (%02x, %02x) ", ++ scratch2, scratch3); ++ goto out; ++ } ++ } ++ ++ save_mcr = serial8250_in_MCR(up); ++ save_lcr = serial_in(up, UART_LCR); ++ ++ /* ++ * Check to see if a UART is really there. Certain broken ++ * internal modems based on the Rockwell chipset fail this ++ * test, because they apparently don't implement the loopback ++ * test mode. So this test is skipped on the COM 1 through ++ * COM 4 ports. This *should* be safe, since no board ++ * manufacturer would be stupid enough to design a board ++ * that conflicts with COM 1-4 --- we hope! ++ */ ++ if (!(port->flags & UPF_SKIP_TEST)) { ++ serial8250_out_MCR(up, UART_MCR_LOOP | 0x0A); ++ status1 = serial_in(up, UART_MSR) & 0xF0; ++ serial8250_out_MCR(up, save_mcr); ++ if (status1 != 0x90) { ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("LOOP test failed (%02x) ", ++ status1); ++ goto out; ++ } ++ } ++ ++ /* ++ * We're pretty sure there's a port here. Lets find out what ++ * type of port it is. The IIR top two bits allows us to find ++ * out if it's 8250 or 16450, 16550, 16550A or later. This ++ * determines what we test for next. ++ * ++ * We also initialise the EFR (if any) to zero for later. The ++ * EFR occupies the same register location as the FCR and IIR. 
++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ ++ /* Assign this as it is to truncate any bits above 7. */ ++ scratch = serial_in(up, UART_IIR); ++ ++ switch (scratch >> 6) { ++ case 0: ++ autoconfig_8250(up); ++ break; ++ case 1: ++ port->type = PORT_UNKNOWN; ++ break; ++ case 2: ++ port->type = PORT_16550; ++ break; ++ case 3: ++ autoconfig_16550a(up); ++ break; ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Only probe for RSA ports if we got the region. ++ */ ++ if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA && ++ __enable_rsa(up)) ++ port->type = PORT_RSA; ++#endif ++ ++ serial_out(up, UART_LCR, save_lcr); ++ ++ port->fifosize = uart_config[up->port.type].fifo_size; ++ old_capabilities = up->capabilities; ++ up->capabilities = uart_config[port->type].flags; ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ ++ if (port->type == PORT_UNKNOWN) ++ goto out_unlock; ++ ++ /* ++ * Reset the UART. ++ */ ++#ifdef CONFIG_SERIAL_8250_RSA ++ if (port->type == PORT_RSA) ++ serial_out(up, UART_RSA_FRR, 0); ++#endif ++ serial8250_out_MCR(up, save_mcr); ++ serial8250_clear_fifos(up); ++ serial_in(up, UART_RX); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_out(up, UART_IER, UART_IER_UUE); ++ else ++ serial_out(up, UART_IER, 0); ++ ++out_unlock: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Check if the device is a Fintek F81216A ++ */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_PORT) ++ fintek_8250_probe(up); ++ ++ if (up->capabilities != old_capabilities) { ++ dev_warn(port->dev, "detected caps %08x should be %08x\n", ++ old_capabilities, up->capabilities); ++ } ++out: ++ DEBUG_AUTOCONF("iir=%d ", scratch); ++ DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name); ++} ++ ++static void autoconfig_irq(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char save_mcr, save_ier; ++ unsigned char save_ICP = 0; ++ unsigned int ICP = 0; ++ unsigned long irqs; ++ int irq; ++ ++ if (port->flags & UPF_FOURPORT) { ++ ICP = (port->iobase & 0xfe0) | 0x1f; ++ save_ICP = inb_p(ICP); ++ outb_p(0x80, ICP); ++ inb_p(ICP); ++ } ++ ++ if (uart_console(port)) ++ console_lock(); ++ ++ /* forget possible initially masked and pending IRQ */ ++ probe_irq_off(probe_irq_on()); ++ save_mcr = serial8250_in_MCR(up); ++ save_ier = serial_in(up, UART_IER); ++ serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2); ++ ++ irqs = probe_irq_on(); ++ serial8250_out_MCR(up, 0); ++ udelay(10); ++ if (port->flags & UPF_FOURPORT) { ++ serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); ++ } else { ++ serial8250_out_MCR(up, ++ UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2); ++ } ++ serial_out(up, UART_IER, 0x0f); /* enable all intrs */ ++ serial_in(up, UART_LSR); ++ serial_in(up, UART_RX); ++ serial_in(up, UART_IIR); ++ serial_in(up, UART_MSR); ++ serial_out(up, UART_TX, 0xFF); ++ udelay(20); ++ irq = probe_irq_off(irqs); ++ ++ serial8250_out_MCR(up, save_mcr); ++ serial_out(up, UART_IER, save_ier); ++ ++ if (port->flags & UPF_FOURPORT) ++ outb_p(save_ICP, ICP); ++ ++ if (uart_console(port)) ++ console_unlock(); ++ ++ port->irq = (irq > 0) ? 
irq : 0; ++} ++ ++static void serial8250_stop_rx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ up->port.read_status_mask &= ~UART_LSR_DR; ++ serial_port_out(port, UART_IER, up->ier); ++ ++ serial8250_rpm_put(up); ++} ++ ++/** ++ * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback ++ * @p: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to stop rs485 transmission. ++ */ ++void serial8250_em485_stop_tx(struct uart_8250_port *p) ++{ ++ unsigned char mcr = serial8250_in_MCR(p); ++ ++ if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(p, mcr); ++ ++ /* ++ * Empty the RX FIFO, we are not interested in anything ++ * received during the half-duplex transmission. ++ * Enable previously disabled RX interrupts. ++ */ ++ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { ++ serial8250_clear_and_reinit_fifos(p); ++ ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_port_out(&p->port, UART_IER, p->ier); ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); ++ ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ stop_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ serial8250_rpm_get(p); ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->stop_tx_timer) { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ serial8250_rpm_put(p); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) ++{ ++ hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL); ++} ++ ++static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC; ++ ++ /* ++ * rs485_stop_tx() is going to set RTS according to config ++ * AND flush RX FIFO if required. ++ */ ++ if (stop_delay > 0) { ++ em485->active_timer = &em485->stop_tx_timer; ++ hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL); ++ } else { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++} ++ ++static inline void __stop_tx(struct uart_8250_port *p) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ if (em485) { ++ u16 lsr = serial_lsr_in(p); ++ u64 stop_delay = 0; ++ ++ p->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS; ++ ++ if (!(lsr & UART_LSR_THRE)) ++ return; ++ /* ++ * To provide required timing and allow FIFO transfer, ++ * __stop_tx_rs485() must be called only when both FIFO and ++ * shift register are empty. The device driver should either ++ * enable interrupt on TEMT or set UART_CAP_NOTEMT that will ++ * enlarge stop_tx_timer by the tx time of one frame to cover ++ * for emptying of the shift register. ++ */ ++ if (!(lsr & UART_LSR_TEMT)) { ++ if (!(p->capabilities & UART_CAP_NOTEMT)) ++ return; ++ /* ++ * RTS might get deasserted too early with the normal ++ * frame timing formula. It seems to suggest THRE might ++ * get asserted already during tx of the stop bit ++ * rather than after it is fully sent. ++ * Roughly estimate 1 extra bit here with / 7. 
++ */ ++ stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7); ++ } ++ ++ __stop_tx_rs485(p, stop_delay); ++ } ++ ++ if (serial8250_clear_THRI(p)) ++ serial8250_rpm_put_tx(p); ++} ++ ++static void serial8250_stop_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ __stop_tx(up); ++ ++ /* ++ * We really want to stop the transmitter from sending. ++ */ ++ if (port->type == PORT_16C950) { ++ up->acr |= UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++ serial8250_rpm_put(up); ++} ++ ++static inline void __start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ if (up->dma && !up->dma->tx_dma(up)) ++ return; ++ ++ if (serial8250_set_THRI(up)) { ++ if (up->bugs & UART_BUG_TXEN) { ++ u16 lsr = serial_lsr_in(up); ++ ++ if (lsr & UART_LSR_THRE) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ /* ++ * Re-enable the transmitter if we disabled it. ++ */ ++ if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) { ++ up->acr &= ~UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++} ++ ++/** ++ * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback ++ * @up: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to start rs485 transmission. ++ * Assumes that setting the RTS bit in the MCR register means RTS is high. ++ * (Some chips use inverse semantics.) Further assumes that reception is ++ * stoppable by disabling the UART_IER_RDI interrupt. (Some chips set the ++ * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.) ++ */ ++void serial8250_em485_start_tx(struct uart_8250_port *up) ++{ ++ unsigned char mcr = serial8250_in_MCR(up); ++ ++ if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX)) ++ serial8250_stop_rx(&up->port); ++ ++ if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_start_tx); ++ ++/* Returns false, if start_tx_timer was setup to defer TX start */ ++static bool start_tx_rs485(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ /* ++ * While serial8250_em485_handle_stop_tx() is a noop if ++ * em485->active_timer != &em485->stop_tx_timer, it might happen that ++ * the timer is still armed and triggers only after the current bunch of ++ * chars is send and em485->active_timer == &em485->stop_tx_timer again. ++ * So cancel the timer. There is still a theoretical race condition if ++ * the timer is already running and only comes around to check for ++ * em485->active_timer when &em485->stop_tx_timer is armed again. 
++ */ ++ if (em485->active_timer == &em485->stop_tx_timer) ++ hrtimer_try_to_cancel(&em485->stop_tx_timer); ++ ++ em485->active_timer = NULL; ++ ++ if (em485->tx_stopped) { ++ em485->tx_stopped = false; ++ ++ up->rs485_start_tx(up); ++ ++ if (up->port.rs485.delay_rts_before_send > 0) { ++ em485->active_timer = &em485->start_tx_timer; ++ start_hrtimer_ms(&em485->start_tx_timer, ++ up->port.rs485.delay_rts_before_send); ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ start_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->start_tx_timer) { ++ __start_tx(&p->port); ++ em485->active_timer = NULL; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void serial8250_start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ if (!port->x_char && uart_circ_empty(&port->state->xmit)) ++ return; ++ ++ serial8250_rpm_get_tx(up); ++ ++ if (em485) { ++ if ((em485->active_timer == &em485->start_tx_timer) || ++ !start_tx_rs485(port)) ++ return; ++ } ++ __start_tx(port); ++} ++ ++static void serial8250_throttle(struct uart_port *port) ++{ ++ port->throttle(port); ++} ++ ++static void serial8250_unthrottle(struct uart_port *port) ++{ ++ port->unthrottle(port); ++} ++ ++static void serial8250_disable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_disable_ms(up->gpios); ++ ++ up->ier &= ~UART_IER_MSI; ++ serial_port_out(port, UART_IER, up->ier); ++} ++ ++static void serial8250_enable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_enable_ms(up->gpios); ++ ++ up->ier |= UART_IER_MSI; ++ ++ serial8250_rpm_get(up); ++ serial_port_out(port, UART_IER, up->ier); ++ serial8250_rpm_put(up); ++} ++ ++void serial8250_read_char(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char ch; ++ char flag = TTY_NORMAL; ++ ++ if (likely(lsr & UART_LSR_DR)) ++ ch = serial_in(up, UART_RX); ++ else ++ /* ++ * Intel 82571 has a Serial Over Lan device that will ++ * set UART_LSR_BI without setting UART_LSR_DR when ++ * it receives a break. To avoid reading from the ++ * receive buffer without UART_LSR_DR bit set, we ++ * just force the read character to be 0 ++ */ ++ ch = 0; ++ ++ port->icount.rx++; ++ ++ lsr |= up->lsr_saved_flags; ++ up->lsr_saved_flags = 0; ++ ++ if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) { ++ if (lsr & UART_LSR_BI) { ++ lsr &= ~(UART_LSR_FE | UART_LSR_PE); ++ port->icount.brk++; ++ /* ++ * We do the SysRQ and SAK checking ++ * here because otherwise the break ++ * may get masked by ignore_status_mask ++ * or read_status_mask. ++ */ ++ if (uart_handle_break(port)) ++ return; ++ } else if (lsr & UART_LSR_PE) ++ port->icount.parity++; ++ else if (lsr & UART_LSR_FE) ++ port->icount.frame++; ++ if (lsr & UART_LSR_OE) ++ port->icount.overrun++; ++ ++ /* ++ * Mask off conditions which should be ignored. 
++ */ ++ lsr &= port->read_status_mask; ++ ++ if (lsr & UART_LSR_BI) { ++ dev_dbg(port->dev, "handling break\n"); ++ flag = TTY_BREAK; ++ } else if (lsr & UART_LSR_PE) ++ flag = TTY_PARITY; ++ else if (lsr & UART_LSR_FE) ++ flag = TTY_FRAME; ++ } ++ if (uart_prepare_sysrq_char(port, ch)) ++ return; ++ ++ uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); ++} ++EXPORT_SYMBOL_GPL(serial8250_read_char); ++ ++/* ++ * serial8250_rx_chars - Read characters. The first LSR value must be passed in. ++ * ++ * Returns LSR bits. The caller should rely only on non-Rx related LSR bits ++ * (such as THRE) because the LSR value might come from an already consumed ++ * character. ++ */ ++u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ int max_count = 256; ++ ++ do { ++ serial8250_read_char(up, lsr); ++ if (--max_count == 0) ++ break; ++ lsr = serial_in(up, UART_LSR); ++ } while (lsr & (UART_LSR_DR | UART_LSR_BI)); ++ ++ tty_flip_buffer_push(&port->state->port); ++ return lsr; ++} ++EXPORT_SYMBOL_GPL(serial8250_rx_chars); ++ ++void serial8250_tx_chars(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct circ_buf *xmit = &port->state->xmit; ++ int count; ++ ++ if (port->x_char) { ++ uart_xchar_out(port, UART_TX); ++ return; ++ } ++ if (uart_tx_stopped(port)) { ++ serial8250_stop_tx(port); ++ return; ++ } ++ if (uart_circ_empty(xmit)) { ++ __stop_tx(up); ++ return; ++ } ++ ++ count = up->tx_loadsz; ++ do { ++ serial_out(up, UART_TX, xmit->buf[xmit->tail]); ++ if (up->bugs & UART_BUG_TXRACE) { ++ /* ++ * The Aspeed BMC virtual UARTs have a bug where data ++ * may get stuck in the BMC's Tx FIFO from bursts of ++ * writes on the APB interface. ++ * ++ * Delay back-to-back writes by a read cycle to avoid ++ * stalling the VUART. Read a register that won't have ++ * side-effects and discard the result. ++ */ ++ serial_in(up, UART_SCR); ++ } ++ xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); ++ port->icount.tx++; ++ if (uart_circ_empty(xmit)) ++ break; ++ if ((up->capabilities & UART_CAP_HFIFO) && ++ !uart_lsr_tx_empty(serial_in(up, UART_LSR))) ++ break; ++ /* The BCM2835 MINI UART THRE bit is really a not-full bit. */ ++ if ((up->capabilities & UART_CAP_MINI) && ++ !(serial_in(up, UART_LSR) & UART_LSR_THRE)) ++ break; ++ } while (--count > 0); ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(port); ++ ++ /* ++ * With RPM enabled, we have to wait until the FIFO is empty before the ++ * HW can go idle. 
So we get here once again with empty FIFO and disable ++ * the interrupt and RPM in __stop_tx() ++ */ ++ if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM)) ++ __stop_tx(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_tx_chars); ++ ++/* Caller holds uart port lock */ ++unsigned int serial8250_modem_status(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int status = serial_in(up, UART_MSR); ++ ++ status |= up->msr_saved_flags; ++ up->msr_saved_flags = 0; ++ if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI && ++ port->state != NULL) { ++ if (status & UART_MSR_TERI) ++ port->icount.rng++; ++ if (status & UART_MSR_DDSR) ++ port->icount.dsr++; ++ if (status & UART_MSR_DDCD) ++ uart_handle_dcd_change(port, status & UART_MSR_DCD); ++ if (status & UART_MSR_DCTS) ++ uart_handle_cts_change(port, status & UART_MSR_CTS); ++ ++ wake_up_interruptible(&port->state->port.delta_msr_wait); ++ } ++ ++ return status; ++} ++EXPORT_SYMBOL_GPL(serial8250_modem_status); ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RDI: ++ if (!up->dma->rx_running) ++ break; ++ fallthrough; ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ serial8250_rx_dma_flush(up); ++ return true; ++ } ++ return up->dma->rx_dma(up); ++} ++ ++/* ++ * This handles the interrupt from one port. ++ */ ++int serial8250_handle_irq(struct uart_port *port, unsigned int iir) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ bool skip_rx = false; ++ unsigned long flags; ++ u16 status; ++ ++ if (iir & UART_IIR_NO_INT) ++ return 0; ++ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ status = serial_lsr_in(up); ++ ++ /* ++ * If port is stopped and there are no error conditions in the ++ * FIFO, then don't drain the FIFO, as this may lead to TTY buffer ++ * overflow. Not servicing, RX FIFO would trigger auto HW flow ++ * control when FIFO occupancy reaches preset threshold, thus ++ * halting RX. This only works when auto HW flow control is ++ * available. ++ */ ++ if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && ++ (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && ++ !(port->read_status_mask & UART_LSR_DR)) ++ skip_rx = true; ++ ++ if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { ++ if (!up->dma || handle_rx_dma(up, iir)) ++ status = serial8250_rx_chars(up, status); ++ } ++ serial8250_modem_status(up); ++ if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { ++ if (!up->dma || up->dma->tx_err) ++ serial8250_tx_chars(up); ++ else if (!up->dma->tx_running) ++ __stop_tx(up); ++ } ++ ++ uart_unlock_and_check_sysrq_irqrestore(port, flags); ++ ++ return 1; ++} ++EXPORT_SYMBOL_GPL(serial8250_handle_irq); ++ ++static int serial8250_default_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir; ++ int ret; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ serial8250_rpm_put(up); ++ return ret; ++} ++ ++/* ++ * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP ++ * have a programmable TX threshold that triggers the THRE interrupt in ++ * the IIR register. In this case, the THRE interrupt indicates the FIFO ++ * has space available. Load it up with tx_loadsz bytes. 
++ */ ++static int serial8250_tx_threshold_handle_irq(struct uart_port *port) ++{ ++ unsigned long flags; ++ unsigned int iir = serial_port_in(port, UART_IIR); ++ ++ /* TX Threshold IRQ triggered so load up FIFO */ ++ if ((iir & UART_IIR_ID) == UART_IIR_THRI) { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ serial8250_tx_chars(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ } ++ ++ iir = serial_port_in(port, UART_IIR); ++ return serial8250_handle_irq(port, iir); ++} ++ ++static unsigned int serial8250_tx_empty(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ lsr = serial_lsr_in(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ serial8250_rpm_put(up); ++ ++ return uart_lsr_tx_empty(lsr) ? TIOCSER_TEMT : 0; ++} ++ ++unsigned int serial8250_do_get_mctrl(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int status; ++ unsigned int val; ++ ++ serial8250_rpm_get(up); ++ status = serial8250_modem_status(up); ++ serial8250_rpm_put(up); ++ ++ val = serial8250_MSR_to_TIOCM(status); ++ if (up->gpios) ++ return mctrl_gpio_get(up->gpios, &val); ++ ++ return val; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); ++ ++static unsigned int serial8250_get_mctrl(struct uart_port *port) ++{ ++ if (port->get_mctrl) ++ return port->get_mctrl(port); ++ return serial8250_do_get_mctrl(port); ++} ++ ++void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char mcr; ++ ++ mcr = serial8250_TIOCM_to_MCR(mctrl); ++ ++ mcr |= up->mcr; ++ ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); ++ ++static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ if (port->rs485.flags & SER_RS485_ENABLED) ++ return; ++ ++ if (port->set_mctrl) ++ port->set_mctrl(port, mctrl); ++ else ++ serial8250_do_set_mctrl(port, mctrl); ++} ++ ++static void serial8250_break_ctl(struct uart_port *port, int break_state) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ if (break_state == -1) ++ up->lcr |= UART_LCR_SBC; ++ else ++ up->lcr &= ~UART_LCR_SBC; ++ serial_port_out(port, UART_LCR, up->lcr); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++} ++ ++static void wait_for_lsr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int status, tmout = 10000; ++ ++ /* Wait up to 10ms for the character(s) to be sent. 
*/ ++ for (;;) { ++ status = serial_lsr_in(up); ++ ++ if ((status & bits) == bits) ++ break; ++ if (--tmout == 0) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++} ++ ++/* ++ * Wait for transmitter & holding register to empty ++ */ ++static void wait_for_xmitr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int tmout; ++ ++ wait_for_lsr(up, bits); ++ ++ /* Wait up to 1s for flow control if necessary */ ++ if (up->port.flags & UPF_CONS_FLOW) { ++ for (tmout = 1000000; tmout; tmout--) { ++ unsigned int msr = serial_in(up, UART_MSR); ++ up->msr_saved_flags |= msr & MSR_SAVE_FLAGS; ++ if (msr & UART_MSR_CTS) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ ++#ifdef CONFIG_CONSOLE_POLL ++/* ++ * Console polling routines for writing and reading from the uart while ++ * in an interrupt or debug context. ++ */ ++ ++static int serial8250_get_poll_char(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int status; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ lsr = serial_port_in(port, UART_LSR); ++ ++ if (!(lsr & UART_LSR_DR)) { ++ status = NO_POLL_CHAR; ++ goto out; ++ } ++ ++ status = serial_port_in(port, UART_RX); ++out: ++ serial8250_rpm_put(up); ++ return status; ++} ++ ++ ++static void serial8250_put_poll_char(struct uart_port *port, ++ unsigned char c) ++{ ++ unsigned int ier; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ /* ++ * Send the character out. ++ */ ++ serial_port_out(port, UART_TX, c); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ serial_port_out(port, UART_IER, ier); ++ serial8250_rpm_put(up); ++} ++ ++#endif /* CONFIG_CONSOLE_POLL */ ++ ++int serial8250_do_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ unsigned char iir; ++ int retval; ++ u16 lsr; ++ ++ if (!port->fifosize) ++ port->fifosize = uart_config[port->type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[port->type].flags; ++ up->mcr = 0; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ serial8250_rpm_get(up); ++ if (port->type == PORT_16C950) { ++ /* Wake up and initialize UART */ ++ up->acr = 0; ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_LCR, 0); ++ serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ if (port->type == PORT_DA830) { ++ /* Reset the port */ ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); ++ mdelay(10); ++ ++ /* Enable Tx, Rx and free run mode */ ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, ++ UART_DA830_PWREMU_MGMT_UTRST | ++ UART_DA830_PWREMU_MGMT_URRST | ++ UART_DA830_PWREMU_MGMT_FREE); ++ } ++ ++ if (port->type == PORT_NPCM) { ++ /* ++ * Nuvoton calls the scratch register 'UART_TOR' (timeout ++ * 
register). Enable it, and set TIOC (timeout interrupt ++ * comparator) to be 0x20 for correct operation. ++ */ ++ serial_port_out(port, UART_NPCM_TOR, UART_NPCM_TOIE | 0x20); ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * If this is an RSA port, see if we can kick it up to the ++ * higher speed clock. ++ */ ++ enable_rsa(up); ++#endif ++ ++ /* ++ * Clear the FIFO buffers and disable them. ++ * (they will be reenabled in set_termios()) ++ */ ++ serial8250_clear_fifos(up); ++ ++ /* ++ * Clear the interrupt registers. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ ++ /* ++ * At this point, there's no way the LSR could still be 0xff; ++ * if it is, then bail out, because there's likely no UART ++ * here. ++ */ ++ if (!(port->flags & UPF_BUGGY_UART) && ++ (serial_port_in(port, UART_LSR) == 0xff)) { ++ dev_info_ratelimited(port->dev, "LSR safety check engaged!\n"); ++ retval = -ENODEV; ++ goto out; ++ } ++ ++ /* ++ * For a XR16C850, we need to set the trigger levels ++ */ ++ if (port->type == PORT_16850) { ++ unsigned char fctr; ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_RX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_TX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ /* ++ * For the Altera 16550 variants, set TX threshold trigger level. ++ */ ++ if (((port->type == PORT_ALTR_16550_F32) || ++ (port->type == PORT_ALTR_16550_F64) || ++ (port->type == PORT_ALTR_16550_F128)) && (port->fifosize > 1)) { ++ /* Bounds checking of TX threshold (valid 0 to fifosize-2) */ ++ if ((up->tx_loadsz < 2) || (up->tx_loadsz > port->fifosize)) { ++ dev_err(port->dev, "TX FIFO Threshold errors, skipping\n"); ++ } else { ++ serial_port_out(port, UART_ALTR_AFR, ++ UART_ALTR_EN_TXFIFO_LW); ++ serial_port_out(port, UART_ALTR_TX_LOW, ++ port->fifosize - up->tx_loadsz); ++ port->handle_irq = serial8250_tx_threshold_handle_irq; ++ } ++ } ++ ++ /* Check if we need to have shared IRQs */ ++ if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) ++ up->port.irqflags |= IRQF_SHARED; ++ ++ retval = up->ops->setup_irq(up); ++ if (retval) ++ goto out; ++ ++ if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { ++ unsigned char iir1; ++ ++ if (port->irqflags & IRQF_SHARED) ++ disable_irq_nosync(port->irq); ++ ++ /* ++ * Test for UARTs that do not reassert THRE when the ++ * transmitter is idle and the interrupt has already ++ * been cleared. Real 16550s should always reassert ++ * this interrupt whenever the transmitter is idle and ++ * the interrupt is enabled. Delays are necessary to ++ * allow register changes to become visible. 
++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow THRE to set */ ++ iir1 = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow a working UART time to re-assert THRE */ ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ if (port->irqflags & IRQF_SHARED) ++ enable_irq(port->irq); ++ ++ /* ++ * If the interrupt is not reasserted, or we otherwise ++ * don't trust the iir, setup a timer to kick the UART ++ * on a regular basis. ++ */ ++ if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) || ++ up->port.flags & UPF_BUG_THRE) { ++ up->bugs |= UART_BUG_THRE; ++ } ++ } ++ ++ up->ops->setup_timer(up); ++ ++ /* ++ * Now, initialize the UART ++ */ ++ serial_port_out(port, UART_LCR, UART_LCR_WLEN8); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (up->port.flags & UPF_FOURPORT) { ++ if (!up->port.irq) ++ up->port.mctrl |= TIOCM_OUT1; ++ } else ++ /* ++ * Most PC uarts need OUT2 raised to enable interrupts. ++ */ ++ if (port->irq) ++ up->port.mctrl |= TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ ++ /* ++ * Serial over Lan (SoL) hack: ++ * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be ++ * used for Serial Over Lan. Those chips take a longer time than a ++ * normal serial device to signalize that a transmission data was ++ * queued. Due to that, the above test generally fails. One solution ++ * would be to delay the reading of iir. However, this is not ++ * reliable, since the timeout is variable. So, let's just don't ++ * test if we receive TX irq. This way, we'll never enable ++ * UART_BUG_TXEN. ++ */ ++ if (up->port.quirks & UPQ_NO_TXEN_TEST) ++ goto dont_test_tx_en; ++ ++ /* ++ * Do a quick test to see if we receive an interrupt when we enable ++ * the TX irq. ++ */ ++ serial_port_out(port, UART_IER, UART_IER_THRI); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { ++ if (!(up->bugs & UART_BUG_TXEN)) { ++ up->bugs |= UART_BUG_TXEN; ++ dev_dbg(port->dev, "enabling bad tx status workarounds\n"); ++ } ++ } else { ++ up->bugs &= ~UART_BUG_TXEN; ++ } ++ ++dont_test_tx_en: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Clear the interrupt registers again for luck, and clear the ++ * saved flags to avoid getting false values from polling ++ * routines or the previous session. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* ++ * Request DMA channels for both RX and TX. ++ */ ++ if (up->dma) { ++ const char *msg = NULL; ++ ++ if (uart_console(port)) ++ msg = "forbid DMA for kernel console"; ++ else if (serial8250_request_dma(up)) ++ msg = "failed to request DMA"; ++ if (msg) { ++ dev_warn_ratelimited(port->dev, "%s\n", msg); ++ up->dma = NULL; ++ } ++ } ++ ++ /* ++ * Set the IER shadow for rx interrupts but defer actual interrupt ++ * enable until after the FIFOs are enabled; otherwise, an already- ++ * active sender can swamp the interrupt handler with "too much work". 
++ */ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ ++ if (port->flags & UPF_FOURPORT) { ++ unsigned int icp; ++ /* ++ * Enable interrupts on the AST Fourport board ++ */ ++ icp = (port->iobase & 0xfe0) | 0x01f; ++ outb_p(0x80, icp); ++ inb_p(icp); ++ } ++ retval = 0; ++out: ++ serial8250_rpm_put(up); ++ return retval; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_startup); ++ ++static int serial8250_startup(struct uart_port *port) ++{ ++ if (port->startup) ++ return port->startup(port); ++ return serial8250_do_startup(port); ++} ++ ++void serial8250_do_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ /* ++ * Disable interrupts from this port ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ up->ier = 0; ++ serial_port_out(port, UART_IER, 0); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ synchronize_irq(port->irq); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (port->flags & UPF_FOURPORT) { ++ /* reset interrupts on the AST Fourport board */ ++ inb((port->iobase & 0xfe0) | 0x1f); ++ port->mctrl |= TIOCM_OUT1; ++ } else ++ port->mctrl &= ~TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ serial_port_out(port, UART_LCR, ++ serial_port_in(port, UART_LCR) & ~UART_LCR_SBC); ++ serial8250_clear_fifos(up); ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Reset the RSA board back to 115kbps compat mode. ++ */ ++ disable_rsa(up); ++#endif ++ ++ /* ++ * Read data port to reset things, and then unlink from ++ * the IRQ chain. ++ */ ++ serial_port_in(port, UART_RX); ++ serial8250_rpm_put(up); ++ ++ up->ops->release_irq(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_shutdown); ++ ++static void serial8250_shutdown(struct uart_port *port) ++{ ++ if (port->shutdown) ++ port->shutdown(port); ++ else ++ serial8250_do_shutdown(port); ++} ++ ++/* Nuvoton NPCM UARTs have a custom divisor calculation */ ++static unsigned int npcm_get_divisor(struct uart_8250_port *up, ++ unsigned int baud) ++{ ++ struct uart_port *port = &up->port; ++ ++ return DIV_ROUND_CLOSEST(port->uartclk, 16 * baud + 2) - 2; ++} ++ ++static unsigned int serial8250_do_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int quot; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. We clamp custom rates from clk/6 and clk/12 ++ * up to clk/4 (0x8001) and clk/8 (0x8002) respectively. These ++ * magic divisors actually reprogram the baud rate generator's ++ * reference clock derived from chips's 14.318MHz clock input. ++ * ++ * Documentation claims that with these magic divisors the base ++ * frequencies of 7.3728MHz and 3.6864MHz are used respectively ++ * for the extra baud rates of 460800bps and 230400bps rather ++ * than the usual base frequency of 1.8462MHz. However empirical ++ * evidence contradicts that. ++ * ++ * Instead bit 7 of the DLM register (bit 15 of the divisor) is ++ * effectively used as a clock prescaler selection bit for the ++ * base frequency of 7.3728MHz, always used. If set to 0, then ++ * the base frequency is divided by 4 for use by the Baud Rate ++ * Generator, for the usual arrangement where the value of 1 of ++ * the divisor produces the baud rate of 115200bps. 
Conversely, ++ * if set to 1 and high-speed operation has been enabled with the ++ * Serial Port Mode Register in the Device Configuration Space, ++ * then the base frequency is supplied directly to the Baud Rate ++ * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003, ++ * 0x8004, etc. the respective baud rates produced are 460800bps, ++ * 230400bps, 153600bps, 115200bps, etc. ++ * ++ * In all cases only low 15 bits of the divisor are used to divide ++ * the baud base and therefore 32767 is the maximum divisor value ++ * possible, even though documentation says that the programmable ++ * Baud Rate Generator is capable of dividing the internal PLL ++ * clock by any divisor from 1 to 65535. ++ */ ++ if (magic_multiplier && baud >= port->uartclk / 6) ++ quot = 0x8001; ++ else if (magic_multiplier && baud >= port->uartclk / 12) ++ quot = 0x8002; ++ else if (up->port.type == PORT_NPCM) ++ quot = npcm_get_divisor(up, baud); ++ else ++ quot = uart_get_divisor(port, baud); ++ ++ /* ++ * Oxford Semi 952 rev B workaround ++ */ ++ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) ++ quot++; ++ ++ return quot; ++} ++ ++static unsigned int serial8250_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ if (port->get_divisor) ++ return port->get_divisor(port, baud, frac); ++ ++ return serial8250_do_get_divisor(port, baud, frac); ++} ++ ++static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, ++ tcflag_t c_cflag) ++{ ++ unsigned char cval; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(c_cflag)); ++ ++ if (c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (c_cflag & PARENB) { ++ cval |= UART_LCR_PARITY; ++ if (up->bugs & UART_BUG_PARITY) ++ up->fifo_bug = true; ++ } ++ if (!(c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ return cval; ++} ++ ++void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* Workaround to enable 115200 baud on OMAP1510 internal ports */ ++ if (is_omap1510_8250(up)) { ++ if (baud == 115200) { ++ quot = 1; ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1); ++ } else ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0); ++ } ++ ++ /* ++ * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2, ++ * otherwise just set DLAB ++ */ ++ if (up->capabilities & UART_NATSEMI) ++ serial_port_out(port, UART_LCR, 0xe0); ++ else ++ serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB); ++ ++ serial_dl_write(up, quot); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_divisor); ++ ++static void serial8250_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ if (port->set_divisor) ++ port->set_divisor(port, baud, quot, quot_frac); ++ else ++ serial8250_do_set_divisor(port, baud, quot, quot_frac); ++} ++ ++static unsigned int serial8250_get_baud_rate(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ unsigned int tolerance = port->uartclk / 100; ++ unsigned int min; ++ unsigned int max; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. Enable custom rates of clk/4 and clk/8, but ++ * disable divisor values beyond 32767, which are unavailable. 
++ */ ++ if (port->flags & UPF_MAGIC_MULTIPLIER) { ++ min = port->uartclk / 16 / UART_DIV_MAX >> 1; ++ max = (port->uartclk + tolerance) / 4; ++ } else { ++ min = port->uartclk / 16 / UART_DIV_MAX; ++ max = (port->uartclk + tolerance) / 16; ++ } ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ * Allow 1% tolerance at the upper limit so uart clks marginally ++ * slower than nominal still match standard baud rates without ++ * causing transmission errors. ++ */ ++ return uart_get_baud_rate(port, termios, old, min, max); ++} ++ ++/* ++ * Note in order to avoid the tty port mutex deadlock don't use the next method ++ * within the uart port callbacks. Primarily it's supposed to be utilized to ++ * handle a sudden reference clock rate change. ++ */ ++void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct tty_port *tport = &port->state->port; ++ unsigned int baud, quot, frac = 0; ++ struct ktermios *termios; ++ struct tty_struct *tty; ++ unsigned long flags; ++ ++ tty = tty_port_tty_get(tport); ++ if (!tty) { ++ mutex_lock(&tport->mutex); ++ port->uartclk = uartclk; ++ mutex_unlock(&tport->mutex); ++ return; ++ } ++ ++ down_write(&tty->termios_rwsem); ++ mutex_lock(&tport->mutex); ++ ++ if (port->uartclk == uartclk) ++ goto out_unlock; ++ ++ port->uartclk = uartclk; ++ ++ if (!tty_port_initialized(tport)) ++ goto out_unlock; ++ ++ termios = &tty->termios; ++ ++ baud = serial8250_get_baud_rate(port, termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++out_unlock: ++ mutex_unlock(&tport->mutex); ++ up_write(&tty->termios_rwsem); ++ tty_kref_put(tty); ++} ++EXPORT_SYMBOL_GPL(serial8250_update_uartclk); ++ ++void ++serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char cval; ++ unsigned long flags; ++ unsigned int baud, quot, frac = 0; ++ ++ if (up->capabilities & UART_CAP_MINI) { ++ termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR); ++ if ((termios->c_cflag & CSIZE) == CS5 || ++ (termios->c_cflag & CSIZE) == CS6) ++ termios->c_cflag = (termios->c_cflag & ~CSIZE) | CS7; ++ } ++ cval = serial8250_compute_lcr(up, termios->c_cflag); ++ ++ baud = serial8250_get_baud_rate(port, termios, old); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->lcr = cval; /* Save computed LCR */ ++ ++ if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) { ++ /* NOTE: If fifo_bug is not set, a user can set RX_trigger. */ ++ if ((baud < 2400 && !up->dma) || up->fifo_bug) { ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= UART_FCR_TRIGGER_1; ++ } ++ } ++ ++ /* ++ * MCR-based auto flow control. When AFE is enabled, RTS will be ++ * deasserted when the receive FIFO contains more characters than ++ * the trigger, or the MCR RTS bit is cleared. 
++ */ ++ if (up->capabilities & UART_CAP_AFE) { ++ up->mcr &= ~UART_MCR_AFE; ++ if (termios->c_cflag & CRTSCTS) ++ up->mcr |= UART_MCR_AFE; ++ } ++ ++ /* ++ * Update the per-port timeout. ++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) ++ port->read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ port->ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ port->ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ port->ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * CTS flow control flag and modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (!(up->bugs & UART_BUG_NOMSR) && ++ UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ if (up->capabilities & UART_CAP_UUE) ++ up->ier |= UART_IER_UUE; ++ if (up->capabilities & UART_CAP_RTOIE) ++ up->ier |= UART_IER_RTOIE; ++ ++ serial_port_out(port, UART_IER, up->ier); ++ ++ if (up->capabilities & UART_CAP_EFR) { ++ unsigned char efr = 0; ++ /* ++ * TI16C752/Startech hardware flow control. FIXME: ++ * - TI16C752 requires control thresholds to be set. ++ * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled. ++ */ ++ if (termios->c_cflag & CRTSCTS) ++ efr |= UART_EFR_CTS; ++ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (port->flags & UPF_EXAR_EFR) ++ serial_port_out(port, UART_XR_EFR, efr); ++ else ++ serial_port_out(port, UART_EFR, efr); ++ } ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ ++ /* ++ * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR ++ * is written without DLAB set, this mode will be disabled. 
++ */ ++ if (port->type == PORT_16750) ++ serial_port_out(port, UART_FCR, up->fcr); ++ ++ serial_port_out(port, UART_LCR, up->lcr); /* reset DLAB */ ++ if (port->type != PORT_16750) { ++ /* emulated UARTs (Lucent Venus 167x) need two steps */ ++ if (up->fcr & UART_FCR_ENABLE_FIFO) ++ serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_port_out(port, UART_FCR, up->fcr); /* set fcr */ ++ } ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++EXPORT_SYMBOL(serial8250_do_set_termios); ++ ++static void ++serial8250_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ if (port->set_termios) ++ port->set_termios(port, termios, old); ++ else ++ serial8250_do_set_termios(port, termios, old); ++} ++ ++void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (termios->c_line == N_PPS) { ++ port->flags |= UPF_HARDPPS_CD; ++ spin_lock_irq(&port->lock); ++ serial8250_enable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } else { ++ port->flags &= ~UPF_HARDPPS_CD; ++ if (!UART_ENABLE_MS(port, termios->c_cflag)) { ++ spin_lock_irq(&port->lock); ++ serial8250_disable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc); ++ ++static void ++serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (port->set_ldisc) ++ port->set_ldisc(port, termios); ++ else ++ serial8250_do_set_ldisc(port, termios); ++} ++ ++void serial8250_do_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *p = up_to_u8250p(port); ++ ++ serial8250_set_sleep(p, state != 0); ++} ++EXPORT_SYMBOL(serial8250_do_pm); ++ ++static void ++serial8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ if (port->pm) ++ port->pm(port, state, oldstate); ++ else ++ serial8250_do_pm(port, state, oldstate); ++} ++ ++static unsigned int serial8250_port_size(struct uart_8250_port *pt) ++{ ++ if (pt->port.mapsize) ++ return pt->port.mapsize; ++ if (pt->port.iotype == UPIO_AU) { ++ if (pt->port.type == PORT_RT2880) ++ return 0x100; ++ return 0x1000; ++ } ++ if (is_omap1_8250(pt)) ++ return 0x16 << pt->port.regshift; ++ ++ return 8 << pt->port.regshift; ++} ++ ++/* ++ * Resource handling. 
++ */ ++static int serial8250_request_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ int ret = 0; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (!request_mem_region(port->mapbase, size, "serial")) { ++ ret = -EBUSY; ++ break; ++ } ++ ++ if (port->flags & UPF_IOREMAP) { ++ port->membase = ioremap(port->mapbase, size); ++ if (!port->membase) { ++ release_mem_region(port->mapbase, size); ++ ret = -ENOMEM; ++ } ++ } ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ if (!request_region(port->iobase, size, "serial")) ++ ret = -EBUSY; ++ break; ++ } ++ return ret; ++} ++ ++static void serial8250_release_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) ++ break; ++ ++ if (port->flags & UPF_IOREMAP) { ++ iounmap(port->membase); ++ port->membase = NULL; ++ } ++ ++ release_mem_region(port->mapbase, size); ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ release_region(port->iobase, size); ++ break; ++ } ++} ++ ++static void serial8250_release_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_release_std_resource(up); ++} ++ ++static int serial8250_request_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ return serial8250_request_std_resource(up); ++} ++ ++static int fcr_get_rxtrig_bytes(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ unsigned char bytes; ++ ++ bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)]; ++ ++ return bytes ? 
bytes : -EOPNOTSUPP; ++} ++ ++static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ int i; ++ ++ if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)]) ++ return -EOPNOTSUPP; ++ ++ for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) { ++ if (bytes < conf_type->rxtrig_bytes[i]) ++ /* Use the nearest lower value */ ++ return (--i) << UART_FCR_R_TRIG_SHIFT; ++ } ++ ++ return UART_FCR_R_TRIG_11; ++} ++ ++static int do_get_rxtrig(struct tty_port *port) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) ++ return -EINVAL; ++ ++ return fcr_get_rxtrig_bytes(up); ++} ++ ++static int do_serial8250_get_rxtrig(struct tty_port *port) ++{ ++ int rxtrig_bytes; ++ ++ mutex_lock(&port->mutex); ++ rxtrig_bytes = do_get_rxtrig(port); ++ mutex_unlock(&port->mutex); ++ ++ return rxtrig_bytes; ++} ++ ++static ssize_t rx_trig_bytes_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ int rxtrig_bytes; ++ ++ rxtrig_bytes = do_serial8250_get_rxtrig(port); ++ if (rxtrig_bytes < 0) ++ return rxtrig_bytes; ++ ++ return sysfs_emit(buf, "%d\n", rxtrig_bytes); ++} ++ ++static int do_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ int rxtrig; ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1 || ++ up->fifo_bug) ++ return -EINVAL; ++ ++ rxtrig = bytes_to_fcr_rxtrig(up, bytes); ++ if (rxtrig < 0) ++ return rxtrig; ++ ++ serial8250_clear_fifos(up); ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= (unsigned char)rxtrig; ++ serial_out(up, UART_FCR, up->fcr); ++ return 0; ++} ++ ++static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ int ret; ++ ++ mutex_lock(&port->mutex); ++ ret = do_set_rxtrig(port, bytes); ++ mutex_unlock(&port->mutex); ++ ++ return ret; ++} ++ ++static ssize_t rx_trig_bytes_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ unsigned char bytes; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ ret = kstrtou8(buf, 10, &bytes); ++ if (ret < 0) ++ return ret; ++ ++ ret = do_serial8250_set_rxtrig(port, bytes); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(rx_trig_bytes); ++ ++static struct attribute *serial8250_dev_attrs[] = { ++ &dev_attr_rx_trig_bytes.attr, ++ NULL ++}; ++ ++static struct attribute_group serial8250_dev_attr_group = { ++ .attrs = serial8250_dev_attrs, ++}; ++ ++static void register_dev_spec_attr_grp(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ ++ if (conf_type->rxtrig_bytes[0]) ++ up->port.attr_group = &serial8250_dev_attr_group; ++} ++ ++static void serial8250_config_port(struct uart_port *port, int flags) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int ret; ++ ++ /* ++ * Find the region that we can probe for. This in turn ++ * tells us whether we can probe for the type of port. 
++ */ ++ ret = serial8250_request_std_resource(up); ++ if (ret < 0) ++ return; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ if (flags & UART_CONFIG_TYPE) ++ autoconfig(up); ++ ++ /* if access method is AU, it is a 16550 with a quirk */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_AU) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ /* HW bugs may trigger IRQ while IIR == NO_INT */ ++ if (port->type == PORT_TEGRA) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) ++ autoconfig_irq(up); ++ ++ if (port->type == PORT_UNKNOWN) ++ serial8250_release_std_resource(up); ++ ++ register_dev_spec_attr_grp(up); ++ up->fcr = uart_config[up->port.type].fcr; ++} ++ ++static int ++serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) ++{ ++ if (ser->irq >= nr_irqs || ser->irq < 0 || ++ ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ++ ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ++ ser->type == PORT_STARTECH) ++ return -EINVAL; ++ return 0; ++} ++ ++static const char *serial8250_type(struct uart_port *port) ++{ ++ int type = port->type; ++ ++ if (type >= ARRAY_SIZE(uart_config)) ++ type = 0; ++ return uart_config[type].name; ++} ++ ++static const struct uart_ops serial8250_pops = { ++ .tx_empty = serial8250_tx_empty, ++ .set_mctrl = serial8250_set_mctrl, ++ .get_mctrl = serial8250_get_mctrl, ++ .stop_tx = serial8250_stop_tx, ++ .start_tx = serial8250_start_tx, ++ .throttle = serial8250_throttle, ++ .unthrottle = serial8250_unthrottle, ++ .stop_rx = serial8250_stop_rx, ++ .enable_ms = serial8250_enable_ms, ++ .break_ctl = serial8250_break_ctl, ++ .startup = serial8250_startup, ++ .shutdown = serial8250_shutdown, ++ .set_termios = serial8250_set_termios, ++ .set_ldisc = serial8250_set_ldisc, ++ .pm = serial8250_pm, ++ .type = serial8250_type, ++ .release_port = serial8250_release_port, ++ .request_port = serial8250_request_port, ++ .config_port = serial8250_config_port, ++ .verify_port = serial8250_verify_port, ++#ifdef CONFIG_CONSOLE_POLL ++ .poll_get_char = serial8250_get_poll_char, ++ .poll_put_char = serial8250_put_poll_char, ++#endif ++}; ++ ++void serial8250_init_port(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ spin_lock_init(&port->lock); ++ port->ops = &serial8250_pops; ++ port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ up->cur_iotype = 0xFF; ++} ++EXPORT_SYMBOL_GPL(serial8250_init_port); ++ ++void serial8250_set_defaults(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ if (up->port.flags & UPF_FIXED_TYPE) { ++ unsigned int type = up->port.type; ++ ++ if (!up->port.fifosize) ++ up->port.fifosize = uart_config[type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[type].flags; ++ } ++ ++ set_io_from_upio(port); ++ ++ /* default dma handlers */ ++ if (up->dma) { ++ if (!up->dma->tx_dma) ++ up->dma->tx_dma = serial8250_tx_dma; ++ if (!up->dma->rx_dma) ++ up->dma->rx_dma = serial8250_rx_dma; ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_set_defaults); ++ ++#ifdef CONFIG_SERIAL_8250_CONSOLE ++ ++static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out(port, UART_TX, ch); ++} ++ ++/* ++ * Restore serial console when h/w power-off detected ++ */ ++static void 
serial8250_console_restore(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct ktermios termios; ++ unsigned int baud, quot, frac = 0; ++ ++ termios.c_cflag = port->cons->cflag; ++ termios.c_ispeed = port->cons->ispeed; ++ termios.c_ospeed = port->cons->ospeed; ++ if (port->state->port.tty && termios.c_cflag == 0) { ++ termios.c_cflag = port->state->port.tty->termios.c_cflag; ++ termios.c_ispeed = port->state->port.tty->termios.c_ispeed; ++ termios.c_ospeed = port->state->port.tty->termios.c_ospeed; ++ } ++ ++ baud = serial8250_get_baud_rate(port, &termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); ++} ++ ++/* ++ * Print a string to the serial port using the device FIFO ++ * ++ * It sends fifosize bytes and then waits for the fifo ++ * to get empty. ++ */ ++static void serial8250_console_fifo_write(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ int i; ++ const char *end = s + count; ++ unsigned int fifosize = up->tx_loadsz; ++ bool cr_sent = false; ++ ++ while (s != end) { ++ wait_for_lsr(up, UART_LSR_THRE); ++ ++ for (i = 0; i < fifosize && s != end; ++i) { ++ if (*s == '\n' && !cr_sent) { ++ serial_out(up, UART_TX, '\r'); ++ cr_sent = true; ++ } else { ++ serial_out(up, UART_TX, *s++); ++ cr_sent = false; ++ } ++ } ++ } ++} ++ ++/* ++ * Print a string to the serial port trying not to disturb ++ * any possible real use of the port... ++ * ++ * The console_lock must be held when we get here. ++ * ++ * Doing runtime PM is really a bad idea for the kernel console. ++ * Thus, we assume the function is called when device is powered up. ++ */ ++void serial8250_console_write(struct uart_8250_port *up, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_em485 *em485 = up->em485; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier, use_fifo; ++ int locked = 1; ++ ++ touch_nmi_watchdog(); ++ ++ if (oops_in_progress) ++ locked = spin_trylock_irqsave(&port->lock, flags); ++ else ++ spin_lock_irqsave(&port->lock, flags); ++ ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ /* check scratch reg to see if port powered off during system sleep */ ++ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { ++ serial8250_console_restore(up); ++ up->canary = 0; ++ } ++ ++ if (em485) { ++ if (em485->tx_stopped) ++ up->rs485_start_tx(up); ++ mdelay(port->rs485.delay_rts_before_send); ++ } ++ ++ use_fifo = (up->capabilities & UART_CAP_FIFO) && ++ /* ++ * BCM283x requires to check the fifo ++ * after each byte. ++ */ ++ !(up->capabilities & UART_CAP_MINI) && ++ /* ++ * tx_loadsz contains the transmit fifo size ++ */ ++ up->tx_loadsz > 1 && ++ (up->fcr & UART_FCR_ENABLE_FIFO) && ++ port->state && ++ test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && ++ /* ++ * After we put a data in the fifo, the controller will send ++ * it regardless of the CTS state. Therefore, only use fifo ++ * if we don't use control flow. 
++ */ ++ !(up->port.flags & UPF_CONS_FLOW); ++ ++ if (likely(use_fifo)) ++ serial8250_console_fifo_write(up, s, count); ++ else ++ uart_console_write(port, s, count, serial8250_console_putchar); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ ++ if (em485) { ++ mdelay(port->rs485.delay_rts_after_send); ++ if (em485->tx_stopped) ++ up->rs485_stop_tx(up); ++ } ++ ++ serial_port_out(port, UART_IER, ier); ++ ++ /* ++ * The receive handling will happen properly because the ++ * receive ready bit will still be set; it is not cleared ++ * on read. However, modem control will not, we must ++ * call it if we have saved something in the saved flags ++ * while processing with interrupts off. ++ */ ++ if (up->msr_saved_flags) ++ serial8250_modem_status(up); ++ ++ if (locked) ++ spin_unlock_irqrestore(&port->lock, flags); ++} ++ ++static unsigned int probe_baud(struct uart_port *port) ++{ ++ unsigned char lcr, dll, dlm; ++ unsigned int quot; ++ ++ lcr = serial_port_in(port, UART_LCR); ++ serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB); ++ dll = serial_port_in(port, UART_DLL); ++ dlm = serial_port_in(port, UART_DLM); ++ serial_port_out(port, UART_LCR, lcr); ++ ++ quot = (dlm << 8) | dll; ++ return (port->uartclk / 16) / quot; ++} ++ ++int serial8250_console_setup(struct uart_port *port, char *options, bool probe) ++{ ++ int baud = 9600; ++ int bits = 8; ++ int parity = 'n'; ++ int flow = 'n'; ++ int ret; ++ ++ if (!port->iobase && !port->membase) ++ return -ENODEV; ++ ++ if (options) ++ uart_parse_options(options, &baud, &parity, &bits, &flow); ++ else if (probe) ++ baud = probe_baud(port); ++ ++ ret = uart_set_options(port, port->cons, baud, parity, bits, flow); ++ if (ret) ++ return ret; ++ ++ if (port->dev) ++ pm_runtime_get_sync(port->dev); ++ ++ return 0; ++} ++ ++int serial8250_console_exit(struct uart_port *port) ++{ ++ if (port->dev) ++ pm_runtime_put_sync(port->dev); ++ ++ return 0; ++} ++ ++#endif /* CONFIG_SERIAL_8250_CONSOLE */ ++ ++MODULE_LICENSE("GPL"); +diff -rupN linux.orig/drivers/tty/serial/8250/Kconfig linux/drivers/tty/serial/8250/Kconfig +--- linux.orig/drivers/tty/serial/8250/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/Kconfig 2022-12-04 10:40:26.708034065 -0500 @@ -9,6 +9,7 @@ config SERIAL_8250 depends on !S390 select SERIAL_CORE @@ -4064,11 +26933,10 @@ index d0b49e15fbf5e..02c308467339c 100644 help This selects whether you want to include the driver for the standard serial ports. The standard answer is Y. 
People who might say N -diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c -index 15f0e4d88c5a0..ffdb001e3d109 100644 ---- a/drivers/tty/serial/amba-pl011.c -+++ b/drivers/tty/serial/amba-pl011.c -@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +diff -rupN linux.orig/drivers/tty/serial/amba-pl011.c linux/drivers/tty/serial/amba-pl011.c +--- linux.orig/drivers/tty/serial/amba-pl011.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/amba-pl011.c 2022-12-04 10:40:26.708034065 -0500 +@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; @@ -4097,7 +26965,7 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 /* * First save the CR then disable the interrupts -@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, pl011_write(old_cr, uap, REG_CR); if (locked) @@ -4107,11 +26975,10 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 clk_disable(uap->clk); } -diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c -index 0aa666e247d57..d7130d1ae64c0 100644 ---- a/drivers/tty/serial/omap-serial.c -+++ b/drivers/tty/serial/omap-serial.c -@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console *co, const char *s, +diff -rupN linux.orig/drivers/tty/serial/omap-serial.c linux/drivers/tty/serial/omap-serial.c +--- linux.orig/drivers/tty/serial/omap-serial.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/omap-serial.c 2022-12-04 10:40:26.708034065 -0500 +@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console unsigned int ier; int locked = 1; @@ -4128,7 +26995,7 @@ index 0aa666e247d57..d7130d1ae64c0 100644 /* * First save the IER then disable the interrupts -@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console *co, const char *s, +@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console check_modem_status(up); if (locked) @@ -4138,11 +27005,10 @@ index 0aa666e247d57..d7130d1ae64c0 100644 } static int __init -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index d2b2720db6ca7..18e623325887f 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_mask) +diff -rupN linux.orig/drivers/tty/sysrq.c linux/drivers/tty/sysrq.c +--- linux.orig/drivers/tty/sysrq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/sysrq.c 2022-12-04 10:40:26.708034065 -0500 +@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_ rcu_sysrq_start(); rcu_read_lock(); @@ -4150,7 +27016,7 @@ index d2b2720db6ca7..18e623325887f 100644 /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. 
We do not -@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_mask) +@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_ pr_cont("\n"); console_loglevel = orig_log_level; } @@ -4158,10 +27024,9 @@ index d2b2720db6ca7..18e623325887f 100644 rcu_read_unlock(); rcu_sysrq_end(); -diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h -index 4e0e50e7ac153..173e979b84a93 100644 ---- a/drivers/vdpa/vdpa_user/iova_domain.h -+++ b/drivers/vdpa/vdpa_user/iova_domain.h +diff -rupN linux.orig/drivers/vdpa/vdpa_user/iova_domain.h linux/drivers/vdpa/vdpa_user/iova_domain.h +--- linux.orig/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-04 10:40:26.708034065 -0500 @@ -14,7 +14,6 @@ #include #include @@ -4170,10 +27035,9 @@ index 4e0e50e7ac153..173e979b84a93 100644 #define IOVA_START_PFN 1 -diff --git a/fs/dcache.c b/fs/dcache.c -index bb0c4d0038dbd..2ee8636016ee9 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c +diff -rupN linux.orig/fs/dcache.c linux/fs/dcache.c +--- linux.orig/fs/dcache.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/fs/dcache.c 2022-12-04 10:40:26.708034065 -0500 @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) @@ -4191,7 +27055,7 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) -@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, +@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct in wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); @@ -4201,10 +27065,9 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 wake_up_all(d_wait); } -diff --git a/include/linux/console.h b/include/linux/console.h -index 8c1686e2c2337..8a813cbaf9285 100644 ---- a/include/linux/console.h -+++ b/include/linux/console.h +diff -rupN linux.orig/include/linux/console.h linux/include/linux/console.h +--- linux.orig/include/linux/console.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/console.h 2022-12-04 10:40:26.712034055 -0500 @@ -16,6 +16,7 @@ #include @@ -4269,10 +27132,9 @@ index 8c1686e2c2337..8a813cbaf9285 100644 CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; -diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h -index 84a466b176cf4..df6d17bc30aa3 100644 ---- a/include/linux/entry-common.h -+++ b/include/linux/entry-common.h +diff -rupN linux.orig/include/linux/entry-common.h linux/include/linux/entry-common.h +--- linux.orig/include/linux/entry-common.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/entry-common.h 2022-12-04 10:40:26.712034055 -0500 @@ -57,9 +57,15 @@ # define ARCH_EXIT_TO_USER_MODE_WORK (0) #endif @@ -4290,11 +27152,10 @@ index 84a466b176cf4..df6d17bc30aa3 100644 ARCH_EXIT_TO_USER_MODE_WORK) /** -diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index a92bce40b04b3..bf82980f569df 100644 ---- a/include/linux/interrupt.h -+++ b/include/linux/interrupt.h -@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); +diff -rupN linux.orig/include/linux/interrupt.h linux/include/linux/interrupt.h +--- linux.orig/include/linux/interrupt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/interrupt.h 2022-12-04 10:40:26.712034055 -0500 +@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsig extern void raise_softirq_irqoff(unsigned int nr); extern void 
raise_softirq(unsigned int nr); @@ -4330,11 +27191,10 @@ index a92bce40b04b3..bf82980f569df 100644 DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) -diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h -index 1cd4e36890fbf..844a8e30e6de5 100644 ---- a/include/linux/irqdesc.h -+++ b/include/linux/irqdesc.h -@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq); +diff -rupN linux.orig/include/linux/irqdesc.h linux/include/linux/irqdesc.h +--- linux.orig/include/linux/irqdesc.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/irqdesc.h 2022-12-04 10:40:26.712034055 -0500 +@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); @@ -4342,10 +27202,9 @@ index 1cd4e36890fbf..844a8e30e6de5 100644 int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif -diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 1f1099dac3f05..1023f349af716 100644 ---- a/include/linux/lockdep.h -+++ b/include/linux/lockdep.h +diff -rupN linux.orig/include/linux/lockdep.h linux/include/linux/lockdep.h +--- linux.orig/include/linux/lockdep.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/lockdep.h 2022-12-04 10:40:26.712034055 -0500 @@ -435,7 +435,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; @@ -4354,11 +27213,10 @@ index 1f1099dac3f05..1023f349af716 100644 /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. -diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h -index 15ae78cd28536..b8728d11c9490 100644 ---- a/include/linux/mmdebug.h -+++ b/include/linux/mmdebug.h -@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); +diff -rupN linux.orig/include/linux/mmdebug.h linux/include/linux/mmdebug.h +--- linux.orig/include/linux/mmdebug.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/mmdebug.h 2022-12-04 10:40:26.712034055 -0500 +@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif @@ -4371,10 +27229,9 @@ index 15ae78cd28536..b8728d11c9490 100644 #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h -index 05d6f3facd5a5..5e6b840f5a9ac 100644 ---- a/include/linux/netdevice.h -+++ b/include/linux/netdevice.h +diff -rupN linux.orig/include/linux/netdevice.h linux/include/linux/netdevice.h +--- linux.orig/include/linux/netdevice.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/netdevice.h 2022-12-04 10:40:26.712034055 -0500 @@ -3156,7 +3156,11 @@ struct softnet_data { int defer_count; int defer_ipi_scheduled; @@ -4387,10 +27244,9 @@ index 05d6f3facd5a5..5e6b840f5a9ac 100644 }; static inline void input_queue_head_incr(struct softnet_data *sd) -diff --git a/include/linux/preempt.h b/include/linux/preempt.h -index b4381f255a5ca..12f59cdaaedda 100644 ---- a/include/linux/preempt.h -+++ b/include/linux/preempt.h +diff -rupN linux.orig/include/linux/preempt.h linux/include/linux/preempt.h +--- linux.orig/include/linux/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/preempt.h 2022-12-04 10:40:26.712034055 -0500 @@ -196,6 +196,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) @@ -4537,10 +27393,9 @@ index b4381f255a5ca..12f59cdaaedda 100644 +} + #endif /* __LINUX_PREEMPT_H */ -diff --git a/include/linux/printk.h b/include/linux/printk.h -index cf7d666ab1f8e..f88ec15f83dcc 100644 ---- a/include/linux/printk.h -+++ b/include/linux/printk.h +diff -rupN linux.orig/include/linux/printk.h linux/include/linux/printk.h +--- linux.orig/include/linux/printk.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/printk.h 2022-12-04 10:40:26.712034055 -0500 @@ -169,7 +169,11 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit @@ -4553,7 +27408,7 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 /* * Please don't use printk_ratelimit(), because it shares ratelimiting state -@@ -221,11 +225,23 @@ static inline void printk_deferred_exit(void) +@@ -221,11 +225,23 @@ static inline void printk_deferred_exit( { } @@ -4577,10 +27432,9 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 static inline int printk_ratelimit(void) { return 0; -diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h -index 8f416c5e929ea..c0ef596f340b5 100644 ---- a/include/linux/rwlock.h -+++ b/include/linux/rwlock.h +diff -rupN linux.orig/include/linux/rwlock.h linux/include/linux/rwlock.h +--- linux.orig/include/linux/rwlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/rwlock.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H @@ -4590,11 +27444,10 @@ index 8f416c5e929ea..c0ef596f340b5 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 8d82d6d326701..e1623b3001c5b 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) +diff -rupN linux.orig/include/linux/sched.h linux/include/linux/sched.h +--- linux.orig/include/linux/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/sched.h 2022-12-04 10:40:26.712034055 -0500 +@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched( return 
unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } @@ -4638,10 +27491,9 @@ index 8d82d6d326701..e1623b3001c5b 100644 /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return -diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h -index 16e3d75a324c7..ee1f719a21678 100644 ---- a/include/linux/serial_8250.h -+++ b/include/linux/serial_8250.h +diff -rupN linux.orig/include/linux/serial_8250.h linux/include/linux/serial_8250.h +--- linux.orig/include/linux/serial_8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/serial_8250.h 2022-12-04 10:40:26.712034055 -0500 @@ -7,6 +7,7 @@ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H @@ -4659,7 +27511,7 @@ index 16e3d75a324c7..ee1f719a21678 100644 struct uart_8250_dma *dma; const struct uart_8250_ops *ops; -@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); +@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_82 void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); @@ -4668,28 +27520,9 @@ index 16e3d75a324c7..ee1f719a21678 100644 int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h -index 5c0c5174155d0..1341f7d62da44 100644 ---- a/include/linux/spinlock.h -+++ b/include/linux/spinlock.h -@@ -1,6 +1,7 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - #ifndef __LINUX_SPINLOCK_H - #define __LINUX_SPINLOCK_H -+#define __LINUX_INSIDE_SPINLOCK_H - - /* - * include/linux/spinlock.h - generic spinlock/rwlock declarations -@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, - - void free_bucket_spinlocks(spinlock_t *locks); - -+#undef __LINUX_INSIDE_SPINLOCK_H - #endif /* __LINUX_SPINLOCK_H */ -diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h -index 51fa0dab68c4d..89eb6f4c659c7 100644 ---- a/include/linux/spinlock_api_smp.h -+++ b/include/linux/spinlock_api_smp.h +diff -rupN linux.orig/include/linux/spinlock_api_smp.h linux/include/linux/spinlock_api_smp.h +--- linux.orig/include/linux/spinlock_api_smp.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_smp.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H @@ -4699,10 +27532,9 @@ index 51fa0dab68c4d..89eb6f4c659c7 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h -index b8ba00ccccdeb..819aeba1c87e6 100644 ---- a/include/linux/spinlock_api_up.h -+++ b/include/linux/spinlock_api_up.h +diff -rupN linux.orig/include/linux/spinlock_api_up.h linux/include/linux/spinlock_api_up.h +--- linux.orig/include/linux/spinlock_api_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_up.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H @@ -4712,10 +27544,26 @@ index b8ba00ccccdeb..819aeba1c87e6 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h -index 835aedaf68acd..61c49b16f69ab 100644 ---- a/include/linux/spinlock_rt.h -+++ b/include/linux/spinlock_rt.h +diff -rupN linux.orig/include/linux/spinlock.h 
linux/include/linux/spinlock.h +--- linux.orig/include/linux/spinlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock.h 2022-12-04 10:40:26.712034055 -0500 +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + #ifndef __LINUX_SPINLOCK_H + #define __LINUX_SPINLOCK_H ++#define __LINUX_INSIDE_SPINLOCK_H + + /* + * include/linux/spinlock.h - generic spinlock/rwlock declarations +@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t + + void free_bucket_spinlocks(spinlock_t *locks); + ++#undef __LINUX_INSIDE_SPINLOCK_H + #endif /* __LINUX_SPINLOCK_H */ +diff -rupN linux.orig/include/linux/spinlock_rt.h linux/include/linux/spinlock_rt.h +--- linux.orig/include/linux/spinlock_rt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_rt.h 2022-12-04 10:40:26.712034055 -0500 @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H @@ -4725,10 +27573,9 @@ index 835aedaf68acd..61c49b16f69ab 100644 #error Do not include directly. Use spinlock.h #endif -diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h -index 16521074b6f7c..c87204247592f 100644 ---- a/include/linux/spinlock_up.h -+++ b/include/linux/spinlock_up.h +diff -rupN linux.orig/include/linux/spinlock_up.h linux/include/linux/spinlock_up.h +--- linux.orig/include/linux/spinlock_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_up.h 2022-12-04 10:40:26.716034044 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H @@ -4738,11 +27585,10 @@ index 16521074b6f7c..c87204247592f 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h -index 9f392ec76f2bb..779e0e96b9cb0 100644 ---- a/include/linux/thread_info.h -+++ b/include/linux/thread_info.h -@@ -177,7 +177,17 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti +diff -rupN linux.orig/include/linux/thread_info.h linux/include/linux/thread_info.h +--- linux.orig/include/linux/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/thread_info.h 2022-12-04 10:40:26.716034044 -0500 +@@ -177,7 +177,17 @@ static __always_inline unsigned long rea clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ @@ -4761,10 +27607,9 @@ index 9f392ec76f2bb..779e0e96b9cb0 100644 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, -diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 20749bd9db718..224bf60d6563c 100644 ---- a/include/linux/trace_events.h -+++ b/include/linux/trace_events.h +diff -rupN linux.orig/include/linux/trace_events.h linux/include/linux/trace_events.h +--- linux.orig/include/linux/trace_events.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/trace_events.h 2022-12-04 10:40:26.716034044 -0500 @@ -70,6 +70,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; @@ -4773,7 +27618,7 @@ index 20749bd9db718..224bf60d6563c 100644 }; #define TRACE_EVENT_TYPE_MAX \ -@@ -159,9 +160,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, +@@ -159,9 +160,10 @@ static inline void tracing_generic_entry unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; @@ -4799,10 +27644,9 @@ index 20749bd9db718..224bf60d6563c 100644 TRACE_FLAG_NMI = 0x40, TRACE_FLAG_BH_OFF = 0x80, }; -diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h 
-index 6ad4e9032d538..ffe48e69b3f3a 100644 ---- a/include/linux/u64_stats_sync.h -+++ b/include/linux/u64_stats_sync.h +diff -rupN linux.orig/include/linux/u64_stats_sync.h linux/include/linux/u64_stats_sync.h +--- linux.orig/include/linux/u64_stats_sync.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/u64_stats_sync.h 2022-12-04 10:40:26.716034044 -0500 @@ -8,7 +8,7 @@ * * Key points : @@ -4843,7 +27687,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 seqcount_t seq; #endif }; -@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_sta local64_inc(&p->v); } @@ -4867,7 +27711,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 typedef struct { u64 v; -@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_sta { p->v++; } @@ -4944,25 +27788,50 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 -#else - return 0; -#endif -+} -+ + } + +-static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_disable(); +-#endif +- return __u64_stats_fetch_begin(syncp); + return read_seqcount_retry(&syncp->seq, start); -+} + } +#endif /* !64 bit */ -+ + +-static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +- return read_seqcount_retry(&syncp->seq, start); +-#else +- return false; +-#endif + __u64_stats_update_begin(syncp); -+} -+ + } + +-static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_enable(); +-#endif +- return __u64_stats_fetch_retry(syncp, start); + __u64_stats_update_end(syncp); -+} -+ + } + +-/* +- * In case irq handlers can update u64 counters, readers can use following helpers +- * - SMP 32bit arches use seqcount protection, irq safe. +- * - UP 32bit must disable irqs. +- * - 64bit have no problem atomically reading u64 values, irq safe. 
+- */ +-static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); @@ -4976,54 +27845,23 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); - } - - static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) ++} ++ ++static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_disable(); --#endif - return __u64_stats_fetch_begin(syncp); - } - --static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, -- unsigned int start) --{ --#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -- return read_seqcount_retry(&syncp->seq, start); --#else -- return false; --#endif --} -- - static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) - { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_enable(); --#endif -- return __u64_stats_fetch_retry(syncp, start); --} -- --/* -- * In case irq handlers can update u64 counters, readers can use following helpers -- * - SMP 32bit arches use seqcount protection, irq safe. -- * - UP 32bit must disable irqs. -- * - 64bit have no problem atomically reading u64 values, irq safe. -- */ --static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) --{ -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif -- return __u64_stats_fetch_begin(syncp); --} -- + return __u64_stats_fetch_begin(syncp); + } + -static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, - unsigned int start) --{ ++static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, ++ unsigned int start) + { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) @@ -5032,10 +27870,9 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 return __u64_stats_fetch_retry(syncp, start); } -diff --git a/init/Kconfig b/init/Kconfig -index 532362fcfe31f..08ec5f25e6642 100644 ---- a/init/Kconfig -+++ b/init/Kconfig +diff -rupN linux.orig/init/Kconfig linux/init/Kconfig +--- linux.orig/init/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/init/Kconfig 2022-12-04 10:40:26.716034044 -0500 @@ -1574,6 +1574,10 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. 
@@ -5047,27 +27884,10 @@ index 532362fcfe31f..08ec5f25e6642 100644 config BUG bool "BUG() support" if EXPERT default y -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a8214..260c08efeb486 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -1,5 +1,11 @@ - # SPDX-License-Identifier: GPL-2.0-only - -+config HAVE_PREEMPT_LAZY -+ bool -+ -+config PREEMPT_LAZY -+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT -+ - config PREEMPT_NONE_BUILD - bool - -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index 22e7a805c6723..b492e482b63a9 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -2107,11 +2107,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, +diff -rupN linux.orig/kernel/bpf/syscall.c linux/kernel/bpf/syscall.c +--- linux.orig/kernel/bpf/syscall.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/bpf/syscall.c 2022-12-04 10:40:26.716034044 -0500 +@@ -2118,11 +2118,11 @@ static void bpf_prog_get_stats(const str st = per_cpu_ptr(prog->stats, cpu); do { @@ -5081,11 +27901,5333 @@ index 22e7a805c6723..b492e482b63a9 100644 nsecs += tnsecs; cnt += tcnt; misses += tmisses; -diff --git a/kernel/entry/common.c b/kernel/entry/common.c -index 063068a9ea9b3..26b772720b227 100644 ---- a/kernel/entry/common.c -+++ b/kernel/entry/common.c -@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, +diff -rupN linux.orig/kernel/bpf/syscall.c.orig linux/kernel/bpf/syscall.c.orig +--- linux.orig/kernel/bpf/syscall.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/kernel/bpf/syscall.c.orig 2022-12-04 10:40:18.684054629 -0500 +@@ -0,0 +1,5319 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ++#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) ++#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) ++#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ ++ IS_FD_HASH(map)) ++ ++#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) ++ ++DEFINE_PER_CPU(int, bpf_prog_active); ++static DEFINE_IDR(prog_idr); ++static DEFINE_SPINLOCK(prog_idr_lock); ++static DEFINE_IDR(map_idr); ++static DEFINE_SPINLOCK(map_idr_lock); ++static DEFINE_IDR(link_idr); ++static DEFINE_SPINLOCK(link_idr_lock); ++ ++int sysctl_unprivileged_bpf_disabled __read_mostly = ++ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; ++ ++static const struct bpf_map_ops * const bpf_map_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) \ ++ [_id] = &_ops, ++#define BPF_LINK_TYPE(_id, _name) ++#include ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++/* ++ * If we're handed a bigger struct than we know of, ensure all the unknown bits ++ * are 0 - i.e. new user-space does not rely on any kernel feature extensions ++ * we don't know about yet. 
++ * ++ * There is a ToCToU between this function call and the following ++ * copy_from_user() call. However, this is not a concern since this function is ++ * meant to be a future-proofing of bits. ++ */ ++int bpf_check_uarg_tail_zero(bpfptr_t uaddr, ++ size_t expected_size, ++ size_t actual_size) ++{ ++ int res; ++ ++ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ ++ return -E2BIG; ++ ++ if (actual_size <= expected_size) ++ return 0; ++ ++ if (uaddr.is_kernel) ++ res = memchr_inv(uaddr.kernel + expected_size, 0, ++ actual_size - expected_size) == NULL; ++ else ++ res = check_zeroed_user(uaddr.user + expected_size, ++ actual_size - expected_size); ++ if (res < 0) ++ return res; ++ return res ? 0 : -E2BIG; ++} ++ ++const struct bpf_map_ops bpf_map_offload_ops = { ++ .map_meta_equal = bpf_map_meta_equal, ++ .map_alloc = bpf_map_offload_map_alloc, ++ .map_free = bpf_map_offload_map_free, ++ .map_check_btf = map_check_no_btf, ++}; ++ ++static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) ++{ ++ const struct bpf_map_ops *ops; ++ u32 type = attr->map_type; ++ struct bpf_map *map; ++ int err; ++ ++ if (type >= ARRAY_SIZE(bpf_map_types)) ++ return ERR_PTR(-EINVAL); ++ type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); ++ ops = bpf_map_types[type]; ++ if (!ops) ++ return ERR_PTR(-EINVAL); ++ ++ if (ops->map_alloc_check) { ++ err = ops->map_alloc_check(attr); ++ if (err) ++ return ERR_PTR(err); ++ } ++ if (attr->map_ifindex) ++ ops = &bpf_map_offload_ops; ++ map = ops->map_alloc(attr); ++ if (IS_ERR(map)) ++ return map; ++ map->ops = ops; ++ map->map_type = type; ++ return map; ++} ++ ++static void bpf_map_write_active_inc(struct bpf_map *map) ++{ ++ atomic64_inc(&map->writecnt); ++} ++ ++static void bpf_map_write_active_dec(struct bpf_map *map) ++{ ++ atomic64_dec(&map->writecnt); ++} ++ ++bool bpf_map_write_active(const struct bpf_map *map) ++{ ++ return atomic64_read(&map->writecnt) != 0; ++} ++ ++static u32 bpf_map_value_size(const struct bpf_map *map) ++{ ++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || ++ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) ++ return round_up(map->value_size, 8) * num_possible_cpus(); ++ else if (IS_FD_MAP(map)) ++ return sizeof(u32); ++ else ++ return map->value_size; ++} ++ ++static void maybe_wait_bpf_programs(struct bpf_map *map) ++{ ++ /* Wait for any running BPF programs to complete so that ++ * userspace, when we return to it, knows that all programs ++ * that could be running use the new map value. 
++ */ ++ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || ++ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ++ synchronize_rcu(); ++} ++ ++static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, ++ void *value, __u64 flags) ++{ ++ int err; ++ ++ /* Need to create a kthread, thus must support schedule */ ++ if (bpf_map_is_dev_bound(map)) { ++ return bpf_map_offload_update_elem(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || ++ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { ++ return map->ops->map_update_elem(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || ++ map->map_type == BPF_MAP_TYPE_SOCKMAP) { ++ return sock_map_update_elem_sys(map, key, value, flags); ++ } else if (IS_FD_PROG_ARRAY(map)) { ++ return bpf_fd_array_map_update_elem(map, f.file, key, value, ++ flags); ++ } ++ ++ bpf_disable_instrumentation(); ++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ err = bpf_percpu_hash_update(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { ++ err = bpf_percpu_array_update(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { ++ err = bpf_percpu_cgroup_storage_update(map, key, value, ++ flags); ++ } else if (IS_FD_ARRAY(map)) { ++ rcu_read_lock(); ++ err = bpf_fd_array_map_update_elem(map, f.file, key, value, ++ flags); ++ rcu_read_unlock(); ++ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { ++ rcu_read_lock(); ++ err = bpf_fd_htab_map_update_elem(map, f.file, key, value, ++ flags); ++ rcu_read_unlock(); ++ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { ++ /* rcu_read_lock() is not needed */ ++ err = bpf_fd_reuseport_array_update_elem(map, key, value, ++ flags); ++ } else if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK || ++ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ err = map->ops->map_push_elem(map, value, flags); ++ } else { ++ rcu_read_lock(); ++ err = map->ops->map_update_elem(map, key, value, flags); ++ rcu_read_unlock(); ++ } ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++ ++ return err; ++} ++ ++static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, ++ __u64 flags) ++{ ++ void *ptr; ++ int err; ++ ++ if (bpf_map_is_dev_bound(map)) ++ return bpf_map_offload_lookup_elem(map, key, value); ++ ++ bpf_disable_instrumentation(); ++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ err = bpf_percpu_hash_copy(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { ++ err = bpf_percpu_array_copy(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { ++ err = bpf_percpu_cgroup_storage_copy(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { ++ err = bpf_stackmap_copy(map, key, value); ++ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { ++ err = bpf_fd_array_map_lookup_elem(map, key, value); ++ } else if (IS_FD_HASH(map)) { ++ err = bpf_fd_htab_map_lookup_elem(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { ++ err = bpf_fd_reuseport_array_lookup_elem(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK || ++ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ err = map->ops->map_peek_elem(map, value); ++ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 
++ /* struct_ops map requires directly updating "value" */ ++ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); ++ } else { ++ rcu_read_lock(); ++ if (map->ops->map_lookup_elem_sys_only) ++ ptr = map->ops->map_lookup_elem_sys_only(map, key); ++ else ++ ptr = map->ops->map_lookup_elem(map, key); ++ if (IS_ERR(ptr)) { ++ err = PTR_ERR(ptr); ++ } else if (!ptr) { ++ err = -ENOENT; ++ } else { ++ err = 0; ++ if (flags & BPF_F_LOCK) ++ /* lock 'ptr' and copy everything but lock */ ++ copy_map_value_locked(map, value, ptr, true); ++ else ++ copy_map_value(map, value, ptr); ++ /* mask lock and timer, since value wasn't zero inited */ ++ check_and_init_map_value(map, value); ++ } ++ rcu_read_unlock(); ++ } ++ ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++ ++ return err; ++} ++ ++/* Please, do not use this function outside from the map creation path ++ * (e.g. in map update path) without taking care of setting the active ++ * memory cgroup (see at bpf_map_kmalloc_node() for example). ++ */ ++static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) ++{ ++ /* We really just want to fail instead of triggering OOM killer ++ * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, ++ * which is used for lower order allocation requests. ++ * ++ * It has been observed that higher order allocation requests done by ++ * vmalloc with __GFP_NORETRY being set might fail due to not trying ++ * to reclaim memory from the page cache, thus we set ++ * __GFP_RETRY_MAYFAIL to avoid such situations. ++ */ ++ ++ const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; ++ unsigned int flags = 0; ++ unsigned long align = 1; ++ void *area; ++ ++ if (size >= SIZE_MAX) ++ return NULL; ++ ++ /* kmalloc()'ed memory can't be mmap()'ed */ ++ if (mmapable) { ++ BUG_ON(!PAGE_ALIGNED(size)); ++ align = SHMLBA; ++ flags = VM_USERMAP; ++ } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { ++ area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, ++ numa_node); ++ if (area != NULL) ++ return area; ++ } ++ ++ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, ++ gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, ++ flags, numa_node, __builtin_return_address(0)); ++} ++ ++void *bpf_map_area_alloc(u64 size, int numa_node) ++{ ++ return __bpf_map_area_alloc(size, numa_node, false); ++} ++ ++void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) ++{ ++ return __bpf_map_area_alloc(size, numa_node, true); ++} ++ ++void bpf_map_area_free(void *area) ++{ ++ kvfree(area); ++} ++ ++static u32 bpf_map_flags_retain_permanent(u32 flags) ++{ ++ /* Some map creation flags are not tied to the map object but ++ * rather to the map fd instead, so they have no meaning upon ++ * map object inspection since multiple file descriptors with ++ * different (access) properties can exist here. Thus, given ++ * this has zero meaning for the map itself, lets clear these ++ * from here. 
++ */ ++ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); ++} ++ ++void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) ++{ ++ map->map_type = attr->map_type; ++ map->key_size = attr->key_size; ++ map->value_size = attr->value_size; ++ map->max_entries = attr->max_entries; ++ map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); ++ map->numa_node = bpf_map_attr_numa_node(attr); ++ map->map_extra = attr->map_extra; ++} ++ ++static int bpf_map_alloc_id(struct bpf_map *map) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&map_idr_lock); ++ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); ++ if (id > 0) ++ map->id = id; ++ spin_unlock_bh(&map_idr_lock); ++ idr_preload_end(); ++ ++ if (WARN_ON_ONCE(!id)) ++ return -ENOSPC; ++ ++ return id > 0 ? 0 : id; ++} ++ ++void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) ++{ ++ unsigned long flags; ++ ++ /* Offloaded maps are removed from the IDR store when their device ++ * disappears - even if someone holds an fd to them they are unusable, ++ * the memory is gone, all ops will fail; they are simply waiting for ++ * refcnt to drop to be freed. ++ */ ++ if (!map->id) ++ return; ++ ++ if (do_idr_lock) ++ spin_lock_irqsave(&map_idr_lock, flags); ++ else ++ __acquire(&map_idr_lock); ++ ++ idr_remove(&map_idr, map->id); ++ map->id = 0; ++ ++ if (do_idr_lock) ++ spin_unlock_irqrestore(&map_idr_lock, flags); ++ else ++ __release(&map_idr_lock); ++} ++ ++#ifdef CONFIG_MEMCG_KMEM ++static void bpf_map_save_memcg(struct bpf_map *map) ++{ ++ /* Currently if a map is created by a process belonging to the root ++ * memory cgroup, get_obj_cgroup_from_current() will return NULL. ++ * So we have to check map->objcg for being NULL each time it's ++ * being used. ++ */ ++ map->objcg = get_obj_cgroup_from_current(); ++} ++ ++static void bpf_map_release_memcg(struct bpf_map *map) ++{ ++ if (map->objcg) ++ obj_cgroup_put(map->objcg); ++} ++ ++static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) ++{ ++ if (map->objcg) ++ return get_mem_cgroup_from_objcg(map->objcg); ++ ++ return root_mem_cgroup; ++} ++ ++void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, ++ int node) ++{ ++ struct mem_cgroup *memcg, *old_memcg; ++ void *ptr; ++ ++ memcg = bpf_map_get_memcg(map); ++ old_memcg = set_active_memcg(memcg); ++ ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); ++ set_active_memcg(old_memcg); ++ mem_cgroup_put(memcg); ++ ++ return ptr; ++} ++ ++void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) ++{ ++ struct mem_cgroup *memcg, *old_memcg; ++ void *ptr; ++ ++ memcg = bpf_map_get_memcg(map); ++ old_memcg = set_active_memcg(memcg); ++ ptr = kzalloc(size, flags | __GFP_ACCOUNT); ++ set_active_memcg(old_memcg); ++ mem_cgroup_put(memcg); ++ ++ return ptr; ++} ++ ++void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, ++ size_t align, gfp_t flags) ++{ ++ struct mem_cgroup *memcg, *old_memcg; ++ void __percpu *ptr; ++ ++ memcg = bpf_map_get_memcg(map); ++ old_memcg = set_active_memcg(memcg); ++ ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); ++ set_active_memcg(old_memcg); ++ mem_cgroup_put(memcg); ++ ++ return ptr; ++} ++ ++#else ++static void bpf_map_save_memcg(struct bpf_map *map) ++{ ++} ++ ++static void bpf_map_release_memcg(struct bpf_map *map) ++{ ++} ++#endif ++ ++static int bpf_map_kptr_off_cmp(const void *a, const void *b) ++{ ++ const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; ++ ++ if 
(off_desc1->offset < off_desc2->offset) ++ return -1; ++ else if (off_desc1->offset > off_desc2->offset) ++ return 1; ++ return 0; ++} ++ ++struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) ++{ ++ /* Since members are iterated in btf_find_field in increasing order, ++ * offsets appended to kptr_off_tab are in increasing order, so we can ++ * do bsearch to find exact match. ++ */ ++ struct bpf_map_value_off *tab; ++ ++ if (!map_value_has_kptrs(map)) ++ return NULL; ++ tab = map->kptr_off_tab; ++ return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); ++} ++ ++void bpf_map_free_kptr_off_tab(struct bpf_map *map) ++{ ++ struct bpf_map_value_off *tab = map->kptr_off_tab; ++ int i; ++ ++ if (!map_value_has_kptrs(map)) ++ return; ++ for (i = 0; i < tab->nr_off; i++) { ++ if (tab->off[i].kptr.module) ++ module_put(tab->off[i].kptr.module); ++ btf_put(tab->off[i].kptr.btf); ++ } ++ kfree(tab); ++ map->kptr_off_tab = NULL; ++} ++ ++struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) ++{ ++ struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; ++ int size, i; ++ ++ if (!map_value_has_kptrs(map)) ++ return ERR_PTR(-ENOENT); ++ size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); ++ new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); ++ if (!new_tab) ++ return ERR_PTR(-ENOMEM); ++ /* Do a deep copy of the kptr_off_tab */ ++ for (i = 0; i < tab->nr_off; i++) { ++ btf_get(tab->off[i].kptr.btf); ++ if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { ++ while (i--) { ++ if (tab->off[i].kptr.module) ++ module_put(tab->off[i].kptr.module); ++ btf_put(tab->off[i].kptr.btf); ++ } ++ kfree(new_tab); ++ return ERR_PTR(-ENXIO); ++ } ++ } ++ return new_tab; ++} ++ ++bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) ++{ ++ struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; ++ bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); ++ int size; ++ ++ if (!a_has_kptr && !b_has_kptr) ++ return true; ++ if (a_has_kptr != b_has_kptr) ++ return false; ++ if (tab_a->nr_off != tab_b->nr_off) ++ return false; ++ size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); ++ return !memcmp(tab_a, tab_b, size); ++} ++ ++/* Caller must ensure map_value_has_kptrs is true. Note that this function can ++ * be called on a map value while the map_value is visible to BPF programs, as ++ * it ensures the correct synchronization, and we already enforce the same using ++ * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. 
++ */ ++void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) ++{ ++ struct bpf_map_value_off *tab = map->kptr_off_tab; ++ unsigned long *btf_id_ptr; ++ int i; ++ ++ for (i = 0; i < tab->nr_off; i++) { ++ struct bpf_map_value_off_desc *off_desc = &tab->off[i]; ++ unsigned long old_ptr; ++ ++ btf_id_ptr = map_value + off_desc->offset; ++ if (off_desc->type == BPF_KPTR_UNREF) { ++ u64 *p = (u64 *)btf_id_ptr; ++ ++ WRITE_ONCE(*p, 0); ++ continue; ++ } ++ old_ptr = xchg(btf_id_ptr, 0); ++ off_desc->kptr.dtor((void *)old_ptr); ++ } ++} ++ ++/* called from workqueue */ ++static void bpf_map_free_deferred(struct work_struct *work) ++{ ++ struct bpf_map *map = container_of(work, struct bpf_map, work); ++ ++ security_bpf_map_free(map); ++ kfree(map->off_arr); ++ bpf_map_release_memcg(map); ++ /* implementation dependent freeing, map_free callback also does ++ * bpf_map_free_kptr_off_tab, if needed. ++ */ ++ map->ops->map_free(map); ++} ++ ++static void bpf_map_put_uref(struct bpf_map *map) ++{ ++ if (atomic64_dec_and_test(&map->usercnt)) { ++ if (map->ops->map_release_uref) ++ map->ops->map_release_uref(map); ++ } ++} ++ ++/* decrement map refcnt and schedule it for freeing via workqueue ++ * (unrelying map implementation ops->map_free() might sleep) ++ */ ++static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) ++{ ++ if (atomic64_dec_and_test(&map->refcnt)) { ++ /* bpf_map_free_id() must be called first */ ++ bpf_map_free_id(map, do_idr_lock); ++ btf_put(map->btf); ++ INIT_WORK(&map->work, bpf_map_free_deferred); ++ schedule_work(&map->work); ++ } ++} ++ ++void bpf_map_put(struct bpf_map *map) ++{ ++ __bpf_map_put(map, true); ++} ++EXPORT_SYMBOL_GPL(bpf_map_put); ++ ++void bpf_map_put_with_uref(struct bpf_map *map) ++{ ++ bpf_map_put_uref(map); ++ bpf_map_put(map); ++} ++ ++static int bpf_map_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_map *map = filp->private_data; ++ ++ if (map->ops->map_release) ++ map->ops->map_release(map, filp); ++ ++ bpf_map_put_with_uref(map); ++ return 0; ++} ++ ++static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) ++{ ++ fmode_t mode = f.file->f_mode; ++ ++ /* Our file permissions may have been overridden by global ++ * map permissions facing syscall side. ++ */ ++ if (READ_ONCE(map->frozen)) ++ mode &= ~FMODE_CAN_WRITE; ++ return mode; ++} ++ ++#ifdef CONFIG_PROC_FS ++/* Provides an approximation of the map's memory footprint. ++ * Used only to provide a backward compatibility and display ++ * a reasonable "memlock" info. 
++ */ ++static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) ++{ ++ unsigned long size; ++ ++ size = round_up(map->key_size + bpf_map_value_size(map), 8); ++ ++ return round_up(map->max_entries * size, PAGE_SIZE); ++} ++ ++static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ struct bpf_map *map = filp->private_data; ++ u32 type = 0, jited = 0; ++ ++ if (map_type_contains_progs(map)) { ++ spin_lock(&map->owner.lock); ++ type = map->owner.type; ++ jited = map->owner.jited; ++ spin_unlock(&map->owner.lock); ++ } ++ ++ seq_printf(m, ++ "map_type:\t%u\n" ++ "key_size:\t%u\n" ++ "value_size:\t%u\n" ++ "max_entries:\t%u\n" ++ "map_flags:\t%#x\n" ++ "map_extra:\t%#llx\n" ++ "memlock:\t%lu\n" ++ "map_id:\t%u\n" ++ "frozen:\t%u\n", ++ map->map_type, ++ map->key_size, ++ map->value_size, ++ map->max_entries, ++ map->map_flags, ++ (unsigned long long)map->map_extra, ++ bpf_map_memory_footprint(map), ++ map->id, ++ READ_ONCE(map->frozen)); ++ if (type) { ++ seq_printf(m, "owner_prog_type:\t%u\n", type); ++ seq_printf(m, "owner_jited:\t%u\n", jited); ++ } ++} ++#endif ++ ++static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, ++ loff_t *ppos) ++{ ++ /* We need this handler such that alloc_file() enables ++ * f_mode with FMODE_CAN_READ. ++ */ ++ return -EINVAL; ++} ++ ++static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, ++ size_t siz, loff_t *ppos) ++{ ++ /* We need this handler such that alloc_file() enables ++ * f_mode with FMODE_CAN_WRITE. ++ */ ++ return -EINVAL; ++} ++ ++/* called for any extra memory-mapped regions (except initial) */ ++static void bpf_map_mmap_open(struct vm_area_struct *vma) ++{ ++ struct bpf_map *map = vma->vm_file->private_data; ++ ++ if (vma->vm_flags & VM_MAYWRITE) ++ bpf_map_write_active_inc(map); ++} ++ ++/* called for all unmapped memory region (including initial) */ ++static void bpf_map_mmap_close(struct vm_area_struct *vma) ++{ ++ struct bpf_map *map = vma->vm_file->private_data; ++ ++ if (vma->vm_flags & VM_MAYWRITE) ++ bpf_map_write_active_dec(map); ++} ++ ++static const struct vm_operations_struct bpf_map_default_vmops = { ++ .open = bpf_map_mmap_open, ++ .close = bpf_map_mmap_close, ++}; ++ ++static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) ++{ ++ struct bpf_map *map = filp->private_data; ++ int err; ++ ++ if (!map->ops->map_mmap || map_value_has_spin_lock(map) || ++ map_value_has_timer(map) || map_value_has_kptrs(map)) ++ return -ENOTSUPP; ++ ++ if (!(vma->vm_flags & VM_SHARED)) ++ return -EINVAL; ++ ++ mutex_lock(&map->freeze_mutex); ++ ++ if (vma->vm_flags & VM_WRITE) { ++ if (map->frozen) { ++ err = -EPERM; ++ goto out; ++ } ++ /* map is meant to be read-only, so do not allow mapping as ++ * writable, because it's possible to leak a writable page ++ * reference and allows user-space to still modify it after ++ * freezing, while verifier will assume contents do not change ++ */ ++ if (map->map_flags & BPF_F_RDONLY_PROG) { ++ err = -EACCES; ++ goto out; ++ } ++ } ++ ++ /* set default open/close callbacks */ ++ vma->vm_ops = &bpf_map_default_vmops; ++ vma->vm_private_data = map; ++ vma->vm_flags &= ~VM_MAYEXEC; ++ if (!(vma->vm_flags & VM_WRITE)) ++ /* disallow re-mapping with PROT_WRITE */ ++ vma->vm_flags &= ~VM_MAYWRITE; ++ ++ err = map->ops->map_mmap(map, vma); ++ if (err) ++ goto out; ++ ++ if (vma->vm_flags & VM_MAYWRITE) ++ bpf_map_write_active_inc(map); ++out: ++ mutex_unlock(&map->freeze_mutex); ++ return err; ++} ++ ++static __poll_t 
bpf_map_poll(struct file *filp, struct poll_table_struct *pts) ++{ ++ struct bpf_map *map = filp->private_data; ++ ++ if (map->ops->map_poll) ++ return map->ops->map_poll(map, filp, pts); ++ ++ return EPOLLERR; ++} ++ ++const struct file_operations bpf_map_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_map_show_fdinfo, ++#endif ++ .release = bpf_map_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++ .mmap = bpf_map_mmap, ++ .poll = bpf_map_poll, ++}; ++ ++int bpf_map_new_fd(struct bpf_map *map, int flags) ++{ ++ int ret; ++ ++ ret = security_bpf_map(map, OPEN_FMODE(flags)); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-map", &bpf_map_fops, map, ++ flags | O_CLOEXEC); ++} ++ ++int bpf_get_file_flag(int flags) ++{ ++ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) ++ return -EINVAL; ++ if (flags & BPF_F_RDONLY) ++ return O_RDONLY; ++ if (flags & BPF_F_WRONLY) ++ return O_WRONLY; ++ return O_RDWR; ++} ++ ++/* helper macro to check that unused fields 'union bpf_attr' are zero */ ++#define CHECK_ATTR(CMD) \ ++ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ ++ sizeof(attr->CMD##_LAST_FIELD), 0, \ ++ sizeof(*attr) - \ ++ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ ++ sizeof(attr->CMD##_LAST_FIELD)) != NULL ++ ++/* dst and src must have at least "size" number of bytes. ++ * Return strlen on success and < 0 on error. ++ */ ++int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) ++{ ++ const char *end = src + size; ++ const char *orig_src = src; ++ ++ memset(dst, 0, size); ++ /* Copy all isalnum(), '_' and '.' chars. */ ++ while (src < end && *src) { ++ if (!isalnum(*src) && ++ *src != '_' && *src != '.') ++ return -EINVAL; ++ *dst++ = *src++; ++ } ++ ++ /* No '\0' found in "size" number of bytes */ ++ if (src == end) ++ return -EINVAL; ++ ++ return src - orig_src; ++} ++ ++int map_check_no_btf(const struct bpf_map *map, ++ const struct btf *btf, ++ const struct btf_type *key_type, ++ const struct btf_type *value_type) ++{ ++ return -ENOTSUPP; ++} ++ ++static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) ++{ ++ const u32 a = *(const u32 *)_a; ++ const u32 b = *(const u32 *)_b; ++ ++ if (a < b) ++ return -1; ++ else if (a > b) ++ return 1; ++ return 0; ++} ++ ++static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) ++{ ++ struct bpf_map *map = (struct bpf_map *)priv; ++ u32 *off_base = map->off_arr->field_off; ++ u32 *a = _a, *b = _b; ++ u8 *sz_a, *sz_b; ++ ++ sz_a = map->off_arr->field_sz + (a - off_base); ++ sz_b = map->off_arr->field_sz + (b - off_base); ++ ++ swap(*a, *b); ++ swap(*sz_a, *sz_b); ++} ++ ++static int bpf_map_alloc_off_arr(struct bpf_map *map) ++{ ++ bool has_spin_lock = map_value_has_spin_lock(map); ++ bool has_timer = map_value_has_timer(map); ++ bool has_kptrs = map_value_has_kptrs(map); ++ struct bpf_map_off_arr *off_arr; ++ u32 i; ++ ++ if (!has_spin_lock && !has_timer && !has_kptrs) { ++ map->off_arr = NULL; ++ return 0; ++ } ++ ++ off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); ++ if (!off_arr) ++ return -ENOMEM; ++ map->off_arr = off_arr; ++ ++ off_arr->cnt = 0; ++ if (has_spin_lock) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->spin_lock_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); ++ off_arr->cnt++; ++ } ++ if (has_timer) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->timer_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_timer); ++ off_arr->cnt++; ++ } ++ if (has_kptrs) { ++ struct bpf_map_value_off 
*tab = map->kptr_off_tab; ++ u32 *off = &off_arr->field_off[off_arr->cnt]; ++ u8 *sz = &off_arr->field_sz[off_arr->cnt]; ++ ++ for (i = 0; i < tab->nr_off; i++) { ++ *off++ = tab->off[i].offset; ++ *sz++ = sizeof(u64); ++ } ++ off_arr->cnt += tab->nr_off; ++ } ++ ++ if (off_arr->cnt == 1) ++ return 0; ++ sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), ++ map_off_arr_cmp, map_off_arr_swap, map); ++ return 0; ++} ++ ++static int map_check_btf(struct bpf_map *map, const struct btf *btf, ++ u32 btf_key_id, u32 btf_value_id) ++{ ++ const struct btf_type *key_type, *value_type; ++ u32 key_size, value_size; ++ int ret = 0; ++ ++ /* Some maps allow key to be unspecified. */ ++ if (btf_key_id) { ++ key_type = btf_type_id_size(btf, &btf_key_id, &key_size); ++ if (!key_type || key_size != map->key_size) ++ return -EINVAL; ++ } else { ++ key_type = btf_type_by_id(btf, 0); ++ if (!map->ops->map_check_btf) ++ return -EINVAL; ++ } ++ ++ value_type = btf_type_id_size(btf, &btf_value_id, &value_size); ++ if (!value_type || value_size != map->value_size) ++ return -EINVAL; ++ ++ map->spin_lock_off = btf_find_spin_lock(btf, value_type); ++ ++ if (map_value_has_spin_lock(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY && ++ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && ++ map->map_type != BPF_MAP_TYPE_SK_STORAGE && ++ map->map_type != BPF_MAP_TYPE_INODE_STORAGE && ++ map->map_type != BPF_MAP_TYPE_TASK_STORAGE) ++ return -ENOTSUPP; ++ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > ++ map->value_size) { ++ WARN_ONCE(1, ++ "verifier bug spin_lock_off %d value_size %d\n", ++ map->spin_lock_off, map->value_size); ++ return -EFAULT; ++ } ++ } ++ ++ map->timer_off = btf_find_timer(btf, value_type); ++ if (map_value_has_timer(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) ++ return -EOPNOTSUPP; ++ } ++ ++ map->kptr_off_tab = btf_parse_kptrs(btf, value_type); ++ if (map_value_has_kptrs(map)) { ++ if (!bpf_capable()) { ++ ret = -EPERM; ++ goto free_map_tab; ++ } ++ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { ++ ret = -EACCES; ++ goto free_map_tab; ++ } ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) { ++ ret = -EOPNOTSUPP; ++ goto free_map_tab; ++ } ++ } ++ ++ if (map->ops->map_check_btf) { ++ ret = map->ops->map_check_btf(map, btf, key_type, value_type); ++ if (ret < 0) ++ goto free_map_tab; ++ } ++ ++ return ret; ++free_map_tab: ++ bpf_map_free_kptr_off_tab(map); ++ return ret; ++} ++ ++#define BPF_MAP_CREATE_LAST_FIELD map_extra ++/* called via syscall */ ++static int map_create(union bpf_attr *attr) ++{ ++ int numa_node = bpf_map_attr_numa_node(attr); ++ struct bpf_map *map; ++ int f_flags; ++ int err; ++ ++ err = CHECK_ATTR(BPF_MAP_CREATE); ++ if (err) ++ return -EINVAL; ++ ++ if (attr->btf_vmlinux_value_type_id) { ++ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || ++ attr->btf_key_type_id || attr->btf_value_type_id) ++ return -EINVAL; ++ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { ++ return -EINVAL; ++ } ++ ++ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && ++ attr->map_extra != 0) ++ return -EINVAL; ++ ++ f_flags = bpf_get_file_flag(attr->map_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ if 
(numa_node != NUMA_NO_NODE && ++ ((unsigned int)numa_node >= nr_node_ids || ++ !node_online(numa_node))) ++ return -EINVAL; ++ ++ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ ++ map = find_and_alloc_map(attr); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ err = bpf_obj_name_cpy(map->name, attr->map_name, ++ sizeof(attr->map_name)); ++ if (err < 0) ++ goto free_map; ++ ++ atomic64_set(&map->refcnt, 1); ++ atomic64_set(&map->usercnt, 1); ++ mutex_init(&map->freeze_mutex); ++ spin_lock_init(&map->owner.lock); ++ ++ map->spin_lock_off = -EINVAL; ++ map->timer_off = -EINVAL; ++ if (attr->btf_key_type_id || attr->btf_value_type_id || ++ /* Even the map's value is a kernel's struct, ++ * the bpf_prog.o must have BTF to begin with ++ * to figure out the corresponding kernel's ++ * counter part. Thus, attr->btf_fd has ++ * to be valid also. ++ */ ++ attr->btf_vmlinux_value_type_id) { ++ struct btf *btf; ++ ++ btf = btf_get_by_fd(attr->btf_fd); ++ if (IS_ERR(btf)) { ++ err = PTR_ERR(btf); ++ goto free_map; ++ } ++ if (btf_is_kernel(btf)) { ++ btf_put(btf); ++ err = -EACCES; ++ goto free_map; ++ } ++ map->btf = btf; ++ ++ if (attr->btf_value_type_id) { ++ err = map_check_btf(map, btf, attr->btf_key_type_id, ++ attr->btf_value_type_id); ++ if (err) ++ goto free_map; ++ } ++ ++ map->btf_key_type_id = attr->btf_key_type_id; ++ map->btf_value_type_id = attr->btf_value_type_id; ++ map->btf_vmlinux_value_type_id = ++ attr->btf_vmlinux_value_type_id; ++ } ++ ++ err = bpf_map_alloc_off_arr(map); ++ if (err) ++ goto free_map; ++ ++ err = security_bpf_map_alloc(map); ++ if (err) ++ goto free_map_off_arr; ++ ++ err = bpf_map_alloc_id(map); ++ if (err) ++ goto free_map_sec; ++ ++ bpf_map_save_memcg(map); ++ ++ err = bpf_map_new_fd(map, f_flags); ++ if (err < 0) { ++ /* failed to allocate fd. ++ * bpf_map_put_with_uref() is needed because the above ++ * bpf_map_alloc_id() has published the map ++ * to the userspace and the userspace may ++ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. ++ */ ++ bpf_map_put_with_uref(map); ++ return err; ++ } ++ ++ return err; ++ ++free_map_sec: ++ security_bpf_map_free(map); ++free_map_off_arr: ++ kfree(map->off_arr); ++free_map: ++ btf_put(map->btf); ++ map->ops->map_free(map); ++ return err; ++} ++ ++/* if error is returned, fd is released. 
++ * On success caller should complete fd access with matching fdput() ++ */ ++struct bpf_map *__bpf_map_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_map_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_map_inc(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc); ++ ++void bpf_map_inc_with_uref(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++ atomic64_inc(&map->usercnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); ++ ++struct bpf_map *bpf_map_get(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc(map); ++ fdput(f); ++ ++ return map; ++} ++EXPORT_SYMBOL(bpf_map_get); ++ ++struct bpf_map *bpf_map_get_with_uref(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc_with_uref(map); ++ fdput(f); ++ ++ return map; ++} ++ ++/* map_idr_lock should have been held */ ++static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ if (uref) ++ atomic64_inc(&map->usercnt); ++ ++ return map; ++} ++ ++struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) ++{ ++ spin_lock_bh(&map_idr_lock); ++ map = __bpf_map_inc_not_zero(map, false); ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); ++ ++int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) ++{ ++ return -ENOTSUPP; ++} ++ ++static void *__bpf_copy_key(void __user *ukey, u64 key_size) ++{ ++ if (key_size) ++ return vmemdup_user(ukey, key_size); ++ ++ if (ukey) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) ++{ ++ if (key_size) ++ return kvmemdup_bpfptr(ukey, key_size); ++ ++ if (!bpfptr_is_null(ukey)) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags ++ ++static int map_lookup_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ if (copy_from_user(value, uvalue, value_size)) ++ err = -EFAULT; ++ else ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ goto free_value; ++ } ++ ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ if (err) ++ goto free_value; ++ ++ 
err = -EFAULT; ++ if (copy_to_user(uvalue, value, value_size) != 0) ++ goto free_value; ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++ ++#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags ++ ++static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); ++ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = ___bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(value, uvalue, value_size) != 0) ++ goto free_value; ++ ++ err = bpf_map_update_value(map, f, key, value, attr->flags); ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_DELETE_ELEM_LAST_FIELD key ++ ++static int map_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ void *key; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ goto out; ++ } else if (IS_FD_PROG_ARRAY(map) || ++ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { ++ /* These maps require sleepable context */ ++ err = map->ops->map_delete_elem(map, key); ++ goto out; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++out: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key ++ ++static int map_get_next_key(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *unext_key = u64_to_user_ptr(attr->next_key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *next_key; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (ukey) { ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto 
err_put; ++ } ++ } else { ++ key = NULL; ++ } ++ ++ err = -ENOMEM; ++ next_key = kvmalloc(map->key_size, GFP_USER); ++ if (!next_key) ++ goto free_key; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_get_next_key(map, key, next_key); ++ goto out; ++ } ++ ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, key, next_key); ++ rcu_read_unlock(); ++out: ++ if (err) ++ goto free_next_key; ++ ++ err = -EFAULT; ++ if (copy_to_user(unext_key, next_key, map->key_size) != 0) ++ goto free_next_key; ++ ++ err = 0; ++ ++free_next_key: ++ kvfree(next_key); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++int generic_map_delete_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 cp, max_count; ++ int err = 0; ++ void *key; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size)) ++ break; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ break; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ if (err) ++ break; ++ cond_resched(); ++ } ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(key); ++ ++ maybe_wait_bpf_programs(map); ++ return err; ++} ++ ++int generic_map_update_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *values = u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 value_size, cp, max_count; ++ int ufd = attr->batch.map_fd; ++ void *key, *value; ++ struct fd f; ++ int err = 0; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) { ++ kvfree(key); ++ return -ENOMEM; ++ } ++ ++ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size) || ++ copy_from_user(value, values + cp * value_size, value_size)) ++ break; ++ ++ err = bpf_map_update_value(map, f, key, value, ++ attr->batch.elem_flags); ++ ++ if (err) ++ break; ++ cond_resched(); ++ } ++ ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(value); ++ kvfree(key); ++ fdput(f); ++ return err; ++} ++ ++#define MAP_LOOKUP_RETRIES 3 ++ ++int generic_map_lookup_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); ++ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); ++ void __user *values 
= u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ void *buf, *buf_prevkey, *prev_key, *key, *value; ++ int err, retry = MAP_LOOKUP_RETRIES; ++ u32 value_size, cp, max_count; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) ++ return -EINVAL; ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ if (put_user(0, &uattr->batch.count)) ++ return -EFAULT; ++ ++ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!buf_prevkey) ++ return -ENOMEM; ++ ++ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); ++ if (!buf) { ++ kvfree(buf_prevkey); ++ return -ENOMEM; ++ } ++ ++ err = -EFAULT; ++ prev_key = NULL; ++ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) ++ goto free_buf; ++ key = buf; ++ value = key + map->key_size; ++ if (ubatch) ++ prev_key = buf_prevkey; ++ ++ for (cp = 0; cp < max_count;) { ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, prev_key, key); ++ rcu_read_unlock(); ++ if (err) ++ break; ++ err = bpf_map_copy_value(map, key, value, ++ attr->batch.elem_flags); ++ ++ if (err == -ENOENT) { ++ if (retry) { ++ retry--; ++ continue; ++ } ++ err = -EINTR; ++ break; ++ } ++ ++ if (err) ++ goto free_buf; ++ ++ if (copy_to_user(keys + cp * map->key_size, key, ++ map->key_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ if (copy_to_user(values + cp * value_size, value, value_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ ++ if (!prev_key) ++ prev_key = buf_prevkey; ++ ++ swap(prev_key, key); ++ retry = MAP_LOOKUP_RETRIES; ++ cp++; ++ cond_resched(); ++ } ++ ++ if (err == -EFAULT) ++ goto free_buf; ++ ++ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || ++ (cp && copy_to_user(uobatch, prev_key, map->key_size)))) ++ err = -EFAULT; ++ ++free_buf: ++ kvfree(buf_prevkey); ++ kvfree(buf); ++ return err; ++} ++ ++#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags ++ ++static int map_lookup_and_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || ++ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (attr->flags && ++ (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -ENOTSUPP; ++ if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK) { ++ err = map->ops->map_pop_elem(map, value); ++ } else if (map->map_type == BPF_MAP_TYPE_HASH || ++ 
map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ if (!bpf_map_is_dev_bound(map)) { ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ } ++ } ++ ++ if (err) ++ goto free_value; ++ ++ if (copy_to_user(uvalue, value, value_size) != 0) { ++ err = -EFAULT; ++ goto free_value; ++ } ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_FREEZE_LAST_FIELD map_fd ++ ++static int map_freeze(const union bpf_attr *attr) ++{ ++ int err = 0, ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_FREEZE)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || ++ map_value_has_timer(map) || map_value_has_kptrs(map)) { ++ fdput(f); ++ return -ENOTSUPP; ++ } ++ ++ mutex_lock(&map->freeze_mutex); ++ if (bpf_map_write_active(map)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (READ_ONCE(map->frozen)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (!bpf_capable()) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ WRITE_ONCE(map->frozen, true); ++err_put: ++ mutex_unlock(&map->freeze_mutex); ++ fdput(f); ++ return err; ++} ++ ++static const struct bpf_prog_ops * const bpf_prog_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ ++ [_id] = & _name ## _prog_ops, ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) ++#include ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) ++{ ++ const struct bpf_prog_ops *ops; ++ ++ if (type >= ARRAY_SIZE(bpf_prog_types)) ++ return -EINVAL; ++ type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); ++ ops = bpf_prog_types[type]; ++ if (!ops) ++ return -EINVAL; ++ ++ if (!bpf_prog_is_dev_bound(prog->aux)) ++ prog->aux->ops = ops; ++ else ++ prog->aux->ops = &bpf_offload_prog_ops; ++ prog->type = type; ++ return 0; ++} ++ ++enum bpf_audit { ++ BPF_AUDIT_LOAD, ++ BPF_AUDIT_UNLOAD, ++ BPF_AUDIT_MAX, ++}; ++ ++static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { ++ [BPF_AUDIT_LOAD] = "LOAD", ++ [BPF_AUDIT_UNLOAD] = "UNLOAD", ++}; ++ ++static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) ++{ ++ struct audit_context *ctx = NULL; ++ struct audit_buffer *ab; ++ ++ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) ++ return; ++ if (audit_enabled == AUDIT_OFF) ++ return; ++ if (op == BPF_AUDIT_LOAD) ++ ctx = audit_context(); ++ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); ++ if (unlikely(!ab)) ++ return; ++ audit_log_format(ab, "prog-id=%u op=%s", ++ prog->aux->id, bpf_audit_str[op]); ++ audit_log_end(ab); ++} ++ ++static int bpf_prog_alloc_id(struct bpf_prog *prog) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&prog_idr_lock); ++ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); ++ if (id > 0) ++ prog->aux->id = id; ++ spin_unlock_bh(&prog_idr_lock); ++ idr_preload_end(); ++ ++ /* id is in [1, INT_MAX) */ ++ if (WARN_ON_ONCE(!id)) ++ return -ENOSPC; ++ ++ return id > 0 ? 
0 : id; ++} ++ ++void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ unsigned long flags; ++ ++ /* cBPF to eBPF migrations are currently not in the idr store. ++ * Offloaded programs are removed from the store when their device ++ * disappears - even if someone grabs an fd to them they are unusable, ++ * simply waiting for refcnt to drop to be freed. ++ */ ++ if (!prog->aux->id) ++ return; ++ ++ if (do_idr_lock) ++ spin_lock_irqsave(&prog_idr_lock, flags); ++ else ++ __acquire(&prog_idr_lock); ++ ++ idr_remove(&prog_idr, prog->aux->id); ++ prog->aux->id = 0; ++ ++ if (do_idr_lock) ++ spin_unlock_irqrestore(&prog_idr_lock, flags); ++ else ++ __release(&prog_idr_lock); ++} ++ ++static void __bpf_prog_put_rcu(struct rcu_head *rcu) ++{ ++ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); ++ ++ kvfree(aux->func_info); ++ kfree(aux->func_info_aux); ++ free_uid(aux->user); ++ security_bpf_prog_free(aux); ++ bpf_prog_free(aux->prog); ++} ++ ++static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) ++{ ++ bpf_prog_kallsyms_del_all(prog); ++ btf_put(prog->aux->btf); ++ kvfree(prog->aux->jited_linfo); ++ kvfree(prog->aux->linfo); ++ kfree(prog->aux->kfunc_tab); ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ ++ if (deferred) { ++ if (prog->aux->sleepable) ++ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); ++ else ++ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); ++ } else { ++ __bpf_prog_put_rcu(&prog->aux->rcu); ++ } ++} ++ ++static void bpf_prog_put_deferred(struct work_struct *work) ++{ ++ struct bpf_prog_aux *aux; ++ struct bpf_prog *prog; ++ ++ aux = container_of(work, struct bpf_prog_aux, work); ++ prog = aux->prog; ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); ++ __bpf_prog_put_noref(prog, true); ++} ++ ++static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ struct bpf_prog_aux *aux = prog->aux; ++ ++ if (atomic64_dec_and_test(&aux->refcnt)) { ++ /* bpf_prog_free_id() must be called first */ ++ bpf_prog_free_id(prog, do_idr_lock); ++ ++ if (in_irq() || irqs_disabled()) { ++ INIT_WORK(&aux->work, bpf_prog_put_deferred); ++ schedule_work(&aux->work); ++ } else { ++ bpf_prog_put_deferred(&aux->work); ++ } ++ } ++} ++ ++void bpf_prog_put(struct bpf_prog *prog) ++{ ++ __bpf_prog_put(prog, true); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_put); ++ ++static int bpf_prog_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_prog *prog = filp->private_data; ++ ++ bpf_prog_put(prog); ++ return 0; ++} ++ ++struct bpf_prog_kstats { ++ u64 nsecs; ++ u64 cnt; ++ u64 misses; ++}; ++ ++void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) ++{ ++ struct bpf_prog_stats *stats; ++ unsigned int flags; ++ ++ stats = this_cpu_ptr(prog->stats); ++ flags = u64_stats_update_begin_irqsave(&stats->syncp); ++ u64_stats_inc(&stats->misses); ++ u64_stats_update_end_irqrestore(&stats->syncp, flags); ++} ++ ++static void bpf_prog_get_stats(const struct bpf_prog *prog, ++ struct bpf_prog_kstats *stats) ++{ ++ u64 nsecs = 0, cnt = 0, misses = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ const struct bpf_prog_stats *st; ++ unsigned int start; ++ u64 tnsecs, tcnt, tmisses; ++ ++ st = per_cpu_ptr(prog->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&st->syncp); ++ tnsecs = u64_stats_read(&st->nsecs); ++ tcnt = u64_stats_read(&st->cnt); ++ tmisses = u64_stats_read(&st->misses); ++ } while (u64_stats_fetch_retry_irq(&st->syncp, start)); 
++ nsecs += tnsecs; ++ cnt += tcnt; ++ misses += tmisses; ++ } ++ stats->nsecs = nsecs; ++ stats->cnt = cnt; ++ stats->misses = misses; ++} ++ ++#ifdef CONFIG_PROC_FS ++static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_prog *prog = filp->private_data; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ struct bpf_prog_kstats stats; ++ ++ bpf_prog_get_stats(prog, &stats); ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "prog_type:\t%u\n" ++ "prog_jited:\t%u\n" ++ "prog_tag:\t%s\n" ++ "memlock:\t%llu\n" ++ "prog_id:\t%u\n" ++ "run_time_ns:\t%llu\n" ++ "run_cnt:\t%llu\n" ++ "recursion_misses:\t%llu\n" ++ "verified_insns:\t%u\n", ++ prog->type, ++ prog->jited, ++ prog_tag, ++ prog->pages * 1ULL << PAGE_SHIFT, ++ prog->aux->id, ++ stats.nsecs, ++ stats.cnt, ++ stats.misses, ++ prog->aux->verified_insns); ++} ++#endif ++ ++const struct file_operations bpf_prog_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_prog_show_fdinfo, ++#endif ++ .release = bpf_prog_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++int bpf_prog_new_fd(struct bpf_prog *prog) ++{ ++ int ret; ++ ++ ret = security_bpf_prog(prog); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, ++ O_RDWR | O_CLOEXEC); ++} ++ ++static struct bpf_prog *____bpf_prog_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_prog_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_prog_add(struct bpf_prog *prog, int i) ++{ ++ atomic64_add(i, &prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_add); ++ ++void bpf_prog_sub(struct bpf_prog *prog, int i) ++{ ++ /* Only to be used for undoing previous bpf_prog_add() in some ++ * error path. We still know that another entity in our call ++ * path holds a reference to the program, thus atomic_sub() can ++ * be safely used in such cases! 
++ */ ++ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_sub); ++ ++void bpf_prog_inc(struct bpf_prog *prog) ++{ ++ atomic64_inc(&prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc); ++ ++/* prog_idr_lock should have been held */ ++struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); ++ ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ ++ return prog; ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); ++ ++bool bpf_prog_get_ok(struct bpf_prog *prog, ++ enum bpf_prog_type *attach_type, bool attach_drv) ++{ ++ /* not an attachment, just a refcount inc, always allow */ ++ if (!attach_type) ++ return true; ++ ++ if (prog->type != *attach_type) ++ return false; ++ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) ++ return false; ++ ++ return true; ++} ++ ++static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, ++ bool attach_drv) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_prog *prog; ++ ++ prog = ____bpf_prog_get(f); ++ if (IS_ERR(prog)) ++ return prog; ++ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { ++ prog = ERR_PTR(-EINVAL); ++ goto out; ++ } ++ ++ bpf_prog_inc(prog); ++out: ++ fdput(f); ++ return prog; ++} ++ ++struct bpf_prog *bpf_prog_get(u32 ufd) ++{ ++ return __bpf_prog_get(ufd, NULL, false); ++} ++ ++struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, ++ bool attach_drv) ++{ ++ return __bpf_prog_get(ufd, &type, attach_drv); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); ++ ++/* Initially all BPF programs could be loaded w/o specifying ++ * expected_attach_type. Later for some of them specifying expected_attach_type ++ * at load time became required so that program could be validated properly. ++ * Programs of types that are allowed to be loaded both w/ and w/o (for ++ * backward compatibility) expected_attach_type, should have the default attach ++ * type assigned to expected_attach_type for the latter case, so that it can be ++ * validated later at attach time. ++ * ++ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if ++ * prog type requires it but has some attach types that have to be backward ++ * compatible. ++ */ ++static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) ++{ ++ switch (attr->prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't ++ * exist so checking for non-zero is the way to go here. 
++ */ ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_CGROUP_INET_SOCK_CREATE; ++ break; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_SK_REUSEPORT_SELECT; ++ break; ++ } ++} ++ ++static int ++bpf_prog_load_check_attach(enum bpf_prog_type prog_type, ++ enum bpf_attach_type expected_attach_type, ++ struct btf *attach_btf, u32 btf_id, ++ struct bpf_prog *dst_prog) ++{ ++ if (btf_id) { ++ if (btf_id > BTF_MAX_TYPE) ++ return -EINVAL; ++ ++ if (!attach_btf && !dst_prog) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: ++ case BPF_PROG_TYPE_EXT: ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (attach_btf && (!btf_id || dst_prog)) ++ return -EINVAL; ++ ++ if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && ++ prog_type != BPF_PROG_TYPE_EXT) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_CGROUP_GETSOCKOPT: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ if (expected_attach_type == BPF_SK_LOOKUP) ++ return 0; ++ return -EINVAL; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ switch (expected_attach_type) { ++ case BPF_SK_REUSEPORT_SELECT: ++ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SYSCALL: ++ case BPF_PROG_TYPE_EXT: ++ if (expected_attach_type) ++ return -EINVAL; ++ fallthrough; ++ default: ++ return 0; ++ } ++} ++ ++static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_SCHED_CLS: ++ case BPF_PROG_TYPE_SCHED_ACT: ++ case BPF_PROG_TYPE_XDP: ++ case BPF_PROG_TYPE_LWT_IN: ++ case BPF_PROG_TYPE_LWT_OUT: ++ case BPF_PROG_TYPE_LWT_XMIT: ++ case BPF_PROG_TYPE_LWT_SEG6LOCAL: ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ /* always unpriv */ ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ ++ default: ++ return false; ++ } ++} ++ ++static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_KPROBE: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size ++ ++static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type type = attr->prog_type; ++ struct bpf_prog *prog, *dst_prog = NULL; ++ struct btf *attach_btf = NULL; ++ int err; ++ char license[128]; ++ bool is_gpl; ++ ++ if (CHECK_ATTR(BPF_PROG_LOAD)) ++ return -EINVAL; ++ ++ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | ++ BPF_F_ANY_ALIGNMENT | ++ BPF_F_TEST_STATE_FREQ | ++ BPF_F_SLEEPABLE | ++ BPF_F_TEST_RND_HI32 | ++ BPF_F_XDP_HAS_FRAGS)) ++ return -EINVAL; ++ ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && ++ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && ++ !bpf_capable()) ++ return -EPERM; ++ ++ /* copy eBPF program license from user space */ ++ if (strncpy_from_bpfptr(license, ++ make_bpfptr(attr->license, uattr.is_kernel), ++ sizeof(license) - 1) < 0) ++ return -EFAULT; ++ license[sizeof(license) - 1] = 0; ++ ++ /* eBPF programs must be GPL compatible to use GPL-ed functions */ ++ is_gpl = license_is_gpl_compatible(license); ++ ++ if (attr->insn_cnt == 0 || ++ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) ++ return -E2BIG; ++ if (type != BPF_PROG_TYPE_SOCKET_FILTER && ++ type != BPF_PROG_TYPE_CGROUP_SKB && ++ !bpf_capable()) ++ return -EPERM; ++ ++ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ if (is_perfmon_prog_type(type) && !perfmon_capable()) ++ return -EPERM; ++ ++ /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog ++ * or btf, we need to check which one it is ++ */ ++ if (attr->attach_prog_fd) { ++ dst_prog = bpf_prog_get(attr->attach_prog_fd); ++ if (IS_ERR(dst_prog)) { ++ dst_prog = NULL; ++ attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); ++ if (IS_ERR(attach_btf)) ++ return -EINVAL; ++ if (!btf_is_kernel(attach_btf)) { ++ /* attaching through specifying bpf_prog's BTF ++ * objects directly might be supported eventually ++ */ ++ btf_put(attach_btf); ++ return -ENOTSUPP; ++ } ++ } ++ } else if (attr->attach_btf_id) { ++ /* fall back to vmlinux BTF, if BTF type ID is specified */ ++ attach_btf = bpf_get_btf_vmlinux(); ++ if (IS_ERR(attach_btf)) ++ return PTR_ERR(attach_btf); ++ if (!attach_btf) ++ return -EINVAL; ++ btf_get(attach_btf); ++ } ++ ++ bpf_prog_load_fixup_attach_type(attr); ++ if (bpf_prog_load_check_attach(type, attr->expected_attach_type, ++ attach_btf, attr->attach_btf_id, ++ dst_prog)) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -EINVAL; ++ } ++ ++ /* plain bpf_prog allocation */ ++ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); ++ if (!prog) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -ENOMEM; ++ } ++ ++ prog->expected_attach_type = attr->expected_attach_type; ++ prog->aux->attach_btf = attach_btf; ++ 
prog->aux->attach_btf_id = attr->attach_btf_id; ++ prog->aux->dst_prog = dst_prog; ++ prog->aux->offload_requested = !!attr->prog_ifindex; ++ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; ++ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; ++ ++ err = security_bpf_prog_alloc(prog->aux); ++ if (err) ++ goto free_prog; ++ ++ prog->aux->user = get_current_user(); ++ prog->len = attr->insn_cnt; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(prog->insns, ++ make_bpfptr(attr->insns, uattr.is_kernel), ++ bpf_prog_insn_size(prog)) != 0) ++ goto free_prog_sec; ++ ++ prog->orig_prog = NULL; ++ prog->jited = 0; ++ ++ atomic64_set(&prog->aux->refcnt, 1); ++ prog->gpl_compatible = is_gpl ? 1 : 0; ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_init(prog, attr); ++ if (err) ++ goto free_prog_sec; ++ } ++ ++ /* find program type: socket_filter vs tracing_filter */ ++ err = find_prog_type(type, prog); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ prog->aux->load_time = ktime_get_boottime_ns(); ++ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, ++ sizeof(attr->prog_name)); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ /* run eBPF verifier */ ++ err = bpf_check(&prog, attr, uattr); ++ if (err < 0) ++ goto free_used_maps; ++ ++ prog = bpf_prog_select_runtime(prog, &err); ++ if (err < 0) ++ goto free_used_maps; ++ ++ err = bpf_prog_alloc_id(prog); ++ if (err) ++ goto free_used_maps; ++ ++ /* Upon success of bpf_prog_alloc_id(), the BPF prog is ++ * effectively publicly exposed. However, retrieving via ++ * bpf_prog_get_fd_by_id() will take another reference, ++ * therefore it cannot be gone underneath us. ++ * ++ * Only for the time /after/ successful bpf_prog_new_fd() ++ * and before returning to userspace, we might just hold ++ * one reference and any parallel close on that fd could ++ * rip everything out. Hence, below notifications must ++ * happen before bpf_prog_new_fd(). ++ * ++ * Also, any failure handling from this point onwards must ++ * be using bpf_prog_put() given the program is exposed. ++ */ ++ bpf_prog_kallsyms_add(prog); ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_LOAD); ++ ++ err = bpf_prog_new_fd(prog); ++ if (err < 0) ++ bpf_prog_put(prog); ++ return err; ++ ++free_used_maps: ++ /* In case we have subprogs, we need to wait for a grace ++ * period before we can tear down JIT memory since symbols ++ * are already exposed under kallsyms. 
++ */ ++ __bpf_prog_put_noref(prog, prog->aux->func_cnt); ++ return err; ++free_prog_sec: ++ free_uid(prog->aux->user); ++ security_bpf_prog_free(prog->aux); ++free_prog: ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ bpf_prog_free(prog); ++ return err; ++} ++ ++#define BPF_OBJ_LAST_FIELD file_flags ++ ++static int bpf_obj_pin(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) ++ return -EINVAL; ++ ++ return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); ++} ++ ++static int bpf_obj_get(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || ++ attr->file_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), ++ attr->file_flags); ++} ++ ++void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, ++ const struct bpf_link_ops *ops, struct bpf_prog *prog) ++{ ++ atomic64_set(&link->refcnt, 1); ++ link->type = type; ++ link->id = 0; ++ link->ops = ops; ++ link->prog = prog; ++} ++ ++static void bpf_link_free_id(int id) ++{ ++ if (!id) ++ return; ++ ++ spin_lock_bh(&link_idr_lock); ++ idr_remove(&link_idr, id); ++ spin_unlock_bh(&link_idr_lock); ++} ++ ++/* Clean up bpf_link and corresponding anon_inode file and FD. After ++ * anon_inode is created, bpf_link can't be just kfree()'d due to deferred ++ * anon_inode's release() call. This helper marksbpf_link as ++ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt ++ * is not decremented, it's the responsibility of a calling code that failed ++ * to complete bpf_link initialization. ++ */ ++void bpf_link_cleanup(struct bpf_link_primer *primer) ++{ ++ primer->link->prog = NULL; ++ bpf_link_free_id(primer->id); ++ fput(primer->file); ++ put_unused_fd(primer->fd); ++} ++ ++void bpf_link_inc(struct bpf_link *link) ++{ ++ atomic64_inc(&link->refcnt); ++} ++ ++/* bpf_link_free is guaranteed to be called from process context */ ++static void bpf_link_free(struct bpf_link *link) ++{ ++ bpf_link_free_id(link->id); ++ if (link->prog) { ++ /* detach BPF program, clean up used resources */ ++ link->ops->release(link); ++ bpf_prog_put(link->prog); ++ } ++ /* free bpf_link and its containing memory */ ++ link->ops->dealloc(link); ++} ++ ++static void bpf_link_put_deferred(struct work_struct *work) ++{ ++ struct bpf_link *link = container_of(work, struct bpf_link, work); ++ ++ bpf_link_free(link); ++} ++ ++/* bpf_link_put can be called from atomic context, but ensures that resources ++ * are freed from process context ++ */ ++void bpf_link_put(struct bpf_link *link) ++{ ++ if (!atomic64_dec_and_test(&link->refcnt)) ++ return; ++ ++ if (in_atomic()) { ++ INIT_WORK(&link->work, bpf_link_put_deferred); ++ schedule_work(&link->work); ++ } else { ++ bpf_link_free(link); ++ } ++} ++EXPORT_SYMBOL(bpf_link_put); ++ ++static int bpf_link_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_link *link = filp->private_data; ++ ++ bpf_link_put(link); ++ return 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, ++static const char *bpf_link_type_strs[] = { ++ [BPF_LINK_TYPE_UNSPEC] = "", ++#include ++}; ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++ ++static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_link *link = filp->private_data; ++ const struct bpf_prog *prog = 
link->prog; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "link_type:\t%s\n" ++ "link_id:\t%u\n" ++ "prog_tag:\t%s\n" ++ "prog_id:\t%u\n", ++ bpf_link_type_strs[link->type], ++ link->id, ++ prog_tag, ++ prog->aux->id); ++ if (link->ops->show_fdinfo) ++ link->ops->show_fdinfo(link, m); ++} ++#endif ++ ++static const struct file_operations bpf_link_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_link_show_fdinfo, ++#endif ++ .release = bpf_link_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++static int bpf_link_alloc_id(struct bpf_link *link) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&link_idr_lock); ++ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); ++ spin_unlock_bh(&link_idr_lock); ++ idr_preload_end(); ++ ++ return id; ++} ++ ++/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, ++ * reserving unused FD and allocating ID from link_idr. This is to be paired ++ * with bpf_link_settle() to install FD and ID and expose bpf_link to ++ * user-space, if bpf_link is successfully attached. If not, bpf_link and ++ * pre-allocated resources are to be freed with bpf_cleanup() call. All the ++ * transient state is passed around in struct bpf_link_primer. ++ * This is preferred way to create and initialize bpf_link, especially when ++ * there are complicated and expensive operations in between creating bpf_link ++ * itself and attaching it to BPF hook. By using bpf_link_prime() and ++ * bpf_link_settle() kernel code using bpf_link doesn't have to perform ++ * expensive (and potentially failing) roll back operations in a rare case ++ * that file, FD, or ID can't be allocated. ++ */ ++int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) ++{ ++ struct file *file; ++ int fd, id; ++ ++ fd = get_unused_fd_flags(O_CLOEXEC); ++ if (fd < 0) ++ return fd; ++ ++ ++ id = bpf_link_alloc_id(link); ++ if (id < 0) { ++ put_unused_fd(fd); ++ return id; ++ } ++ ++ file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); ++ if (IS_ERR(file)) { ++ bpf_link_free_id(id); ++ put_unused_fd(fd); ++ return PTR_ERR(file); ++ } ++ ++ primer->link = link; ++ primer->file = file; ++ primer->fd = fd; ++ primer->id = id; ++ return 0; ++} ++ ++int bpf_link_settle(struct bpf_link_primer *primer) ++{ ++ /* make bpf_link fetchable by ID */ ++ spin_lock_bh(&link_idr_lock); ++ primer->link->id = primer->id; ++ spin_unlock_bh(&link_idr_lock); ++ /* make bpf_link fetchable by FD */ ++ fd_install(primer->fd, primer->file); ++ /* pass through installed FD */ ++ return primer->fd; ++} ++ ++int bpf_link_new_fd(struct bpf_link *link) ++{ ++ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); ++} ++ ++struct bpf_link *bpf_link_get_from_fd(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_link *link; ++ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_link_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ link = f.file->private_data; ++ bpf_link_inc(link); ++ fdput(f); ++ ++ return link; ++} ++EXPORT_SYMBOL(bpf_link_get_from_fd); ++ ++static void bpf_tracing_link_release(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, ++ tr_link->trampoline)); ++ ++ bpf_trampoline_put(tr_link->trampoline); ++ ++ /* tgt_prog is NULL if target is a kernel function */ 
++ if (tr_link->tgt_prog) ++ bpf_prog_put(tr_link->tgt_prog); ++} ++ ++static void bpf_tracing_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ kfree(tr_link); ++} ++ ++static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ seq_printf(seq, ++ "attach_type:\t%d\n", ++ tr_link->attach_type); ++} ++ ++static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ info->tracing.attach_type = tr_link->attach_type; ++ bpf_trampoline_unpack_key(tr_link->trampoline->key, ++ &info->tracing.target_obj_id, ++ &info->tracing.target_btf_id); ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_tracing_link_lops = { ++ .release = bpf_tracing_link_release, ++ .dealloc = bpf_tracing_link_dealloc, ++ .show_fdinfo = bpf_tracing_link_show_fdinfo, ++ .fill_link_info = bpf_tracing_link_fill_link_info, ++}; ++ ++static int bpf_tracing_prog_attach(struct bpf_prog *prog, ++ int tgt_prog_fd, ++ u32 btf_id, ++ u64 bpf_cookie) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_prog *tgt_prog = NULL; ++ struct bpf_trampoline *tr = NULL; ++ struct bpf_tracing_link *link; ++ u64 key = 0; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ if (prog->expected_attach_type != BPF_TRACE_FENTRY && ++ prog->expected_attach_type != BPF_TRACE_FEXIT && ++ prog->expected_attach_type != BPF_MODIFY_RETURN) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_EXT: ++ if (prog->expected_attach_type != 0) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_LSM: ++ if (prog->expected_attach_type != BPF_LSM_MAC) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ default: ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (!!tgt_prog_fd != !!btf_id) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (tgt_prog_fd) { ++ /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ ++ if (prog->type != BPF_PROG_TYPE_EXT) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ tgt_prog = bpf_prog_get(tgt_prog_fd); ++ if (IS_ERR(tgt_prog)) { ++ err = PTR_ERR(tgt_prog); ++ tgt_prog = NULL; ++ goto out_put_prog; ++ } ++ ++ key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_prog; ++ } ++ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, ++ &bpf_tracing_link_lops, prog); ++ link->attach_type = prog->expected_attach_type; ++ link->link.cookie = bpf_cookie; ++ ++ mutex_lock(&prog->aux->dst_mutex); ++ ++ /* There are a few possible cases here: ++ * ++ * - if prog->aux->dst_trampoline is set, the program was just loaded ++ * and not yet attached to anything, so we can use the values stored ++ * in prog->aux ++ * ++ * - if prog->aux->dst_trampoline is NULL, the program has already been ++ * attached to a target and its initial target was cleared (below) ++ * ++ * - if tgt_prog != NULL, the caller specified tgt_prog_fd + ++ * target_btf_id using the link_create API. 
++ * ++ * - if tgt_prog == NULL when this function was called using the old ++ * raw_tracepoint_open API, and we need a target from prog->aux ++ * ++ * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program ++ * was detached and is going for re-attachment. ++ */ ++ if (!prog->aux->dst_trampoline && !tgt_prog) { ++ /* ++ * Allow re-attach for TRACING and LSM programs. If it's ++ * currently linked, bpf_trampoline_link_prog will fail. ++ * EXT programs need to specify tgt_prog_fd, so they ++ * re-attach in separate code path. ++ */ ++ if (prog->type != BPF_PROG_TYPE_TRACING && ++ prog->type != BPF_PROG_TYPE_LSM) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ btf_id = prog->aux->attach_btf_id; ++ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); ++ } ++ ++ if (!prog->aux->dst_trampoline || ++ (key && key != prog->aux->dst_trampoline->key)) { ++ /* If there is no saved target, or the specified target is ++ * different from the destination specified at load time, we ++ * need a new trampoline and a check for compatibility ++ */ ++ struct bpf_attach_target_info tgt_info = {}; ++ ++ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, ++ &tgt_info); ++ if (err) ++ goto out_unlock; ++ ++ tr = bpf_trampoline_get(key, &tgt_info); ++ if (!tr) { ++ err = -ENOMEM; ++ goto out_unlock; ++ } ++ } else { ++ /* The caller didn't specify a target, or the target was the ++ * same as the destination supplied during program load. This ++ * means we can reuse the trampoline and reference from program ++ * load time, and there is no need to allocate a new one. This ++ * can only happen once for any program, as the saved values in ++ * prog->aux are cleared below. ++ */ ++ tr = prog->aux->dst_trampoline; ++ tgt_prog = prog->aux->dst_prog; ++ } ++ ++ err = bpf_link_prime(&link->link.link, &link_primer); ++ if (err) ++ goto out_unlock; ++ ++ err = bpf_trampoline_link_prog(&link->link, tr); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ link = NULL; ++ goto out_unlock; ++ } ++ ++ link->tgt_prog = tgt_prog; ++ link->trampoline = tr; ++ ++ /* Always clear the trampoline and target prog from prog->aux to make ++ * sure the original attach destination is not kept alive after a ++ * program is (re-)attached to another target. 
++ */ ++ if (prog->aux->dst_prog && ++ (tgt_prog_fd || tr != prog->aux->dst_trampoline)) ++ /* got extra prog ref from syscall, or attaching to different prog */ ++ bpf_prog_put(prog->aux->dst_prog); ++ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) ++ /* we allocated a new trampoline, so free the old one */ ++ bpf_trampoline_put(prog->aux->dst_trampoline); ++ ++ prog->aux->dst_prog = NULL; ++ prog->aux->dst_trampoline = NULL; ++ mutex_unlock(&prog->aux->dst_mutex); ++ ++ return bpf_link_settle(&link_primer); ++out_unlock: ++ if (tr && tr != prog->aux->dst_trampoline) ++ bpf_trampoline_put(tr); ++ mutex_unlock(&prog->aux->dst_mutex); ++ kfree(link); ++out_put_prog: ++ if (tgt_prog_fd && tgt_prog) ++ bpf_prog_put(tgt_prog); ++ return err; ++} ++ ++struct bpf_raw_tp_link { ++ struct bpf_link link; ++ struct bpf_raw_event_map *btp; ++}; ++ ++static void bpf_raw_tp_link_release(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); ++ bpf_put_raw_tracepoint(raw_tp->btp); ++} ++ ++static void bpf_raw_tp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ kfree(raw_tp); ++} ++ ++static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ seq_printf(seq, ++ "tp_name:\t%s\n", ++ raw_tp_link->btp->tp->name); ++} ++ ++static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); ++ const char *tp_name = raw_tp_link->btp->tp->name; ++ u32 ulen = info->raw_tracepoint.tp_name_len; ++ size_t tp_len = strlen(tp_name); ++ ++ if (!ulen ^ !ubuf) ++ return -EINVAL; ++ ++ info->raw_tracepoint.tp_name_len = tp_len + 1; ++ ++ if (!ubuf) ++ return 0; ++ ++ if (ulen >= tp_len + 1) { ++ if (copy_to_user(ubuf, tp_name, tp_len + 1)) ++ return -EFAULT; ++ } else { ++ char zero = '\0'; ++ ++ if (copy_to_user(ubuf, tp_name, ulen - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + ulen - 1)) ++ return -EFAULT; ++ return -ENOSPC; ++ } ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_raw_tp_link_lops = { ++ .release = bpf_raw_tp_link_release, ++ .dealloc = bpf_raw_tp_link_dealloc, ++ .show_fdinfo = bpf_raw_tp_link_show_fdinfo, ++ .fill_link_info = bpf_raw_tp_link_fill_link_info, ++}; ++ ++#ifdef CONFIG_PERF_EVENTS ++struct bpf_perf_link { ++ struct bpf_link link; ++ struct file *perf_file; ++}; ++ ++static void bpf_perf_link_release(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ struct perf_event *event = perf_link->perf_file->private_data; ++ ++ perf_event_free_bpf_prog(event); ++ fput(perf_link->perf_file); ++} ++ ++static void bpf_perf_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ ++ kfree(perf_link); ++} ++ ++static const struct bpf_link_ops bpf_perf_link_lops = { ++ .release = bpf_perf_link_release, ++ .dealloc = bpf_perf_link_dealloc, ++}; ++ ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_perf_link 
*link; ++ struct perf_event *event; ++ struct file *perf_file; ++ int err; ++ ++ if (attr->link_create.flags) ++ return -EINVAL; ++ ++ perf_file = perf_event_get(attr->link_create.target_fd); ++ if (IS_ERR(perf_file)) ++ return PTR_ERR(perf_file); ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_file; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); ++ link->perf_file = perf_file; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_file; ++ } ++ ++ event = perf_file->private_data; ++ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_file; ++ } ++ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ ++ bpf_prog_inc(prog); ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_file: ++ fput(perf_file); ++ return err; ++} ++#else ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ return -EOPNOTSUPP; ++} ++#endif /* CONFIG_PERF_EVENTS */ ++ ++static int bpf_raw_tp_link_attach(struct bpf_prog *prog, ++ const char __user *user_tp_name) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_raw_tp_link *link; ++ struct bpf_raw_event_map *btp; ++ const char *tp_name; ++ char buf[128]; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_EXT: ++ case BPF_PROG_TYPE_LSM: ++ if (user_tp_name) ++ /* The attach point for this category of programs ++ * should be specified via btf_id during program load. ++ */ ++ return -EINVAL; ++ if (prog->type == BPF_PROG_TYPE_TRACING && ++ prog->expected_attach_type == BPF_TRACE_RAW_TP) { ++ tp_name = prog->aux->attach_func_name; ++ break; ++ } ++ return bpf_tracing_prog_attach(prog, 0, 0, 0); ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) ++ return -EFAULT; ++ buf[sizeof(buf) - 1] = 0; ++ tp_name = buf; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ btp = bpf_get_raw_tracepoint(tp_name); ++ if (!btp) ++ return -ENOENT; ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_btp; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, ++ &bpf_raw_tp_link_lops, prog); ++ link->btp = btp; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_btp; ++ } ++ ++ err = bpf_probe_register(link->btp, prog); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_btp; ++ } ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_btp: ++ bpf_put_raw_tracepoint(btp); ++ return err; ++} ++ ++#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd ++ ++static int bpf_raw_tracepoint_open(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ return fd; ++} ++ ++static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ++ enum bpf_attach_type attach_type) ++{ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case 
BPF_PROG_TYPE_SK_LOOKUP: ++ return attach_type == prog->expected_attach_type ? 0 : -EINVAL; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ if (!capable(CAP_NET_ADMIN)) ++ /* cg-skb progs can be loaded by unpriv user. ++ * check permissions at attach time. ++ */ ++ return -EPERM; ++ return prog->enforce_expected_attach_type && ++ prog->expected_attach_type != attach_type ? ++ -EINVAL : 0; ++ default: ++ return 0; ++ } ++} ++ ++static enum bpf_prog_type ++attach_type_to_prog_type(enum bpf_attach_type attach_type) ++{ ++ switch (attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return BPF_PROG_TYPE_CGROUP_SKB; ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return BPF_PROG_TYPE_CGROUP_SOCK; ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; ++ case BPF_CGROUP_SOCK_OPS: ++ return BPF_PROG_TYPE_SOCK_OPS; ++ case BPF_CGROUP_DEVICE: ++ return BPF_PROG_TYPE_CGROUP_DEVICE; ++ case BPF_SK_MSG_VERDICT: ++ return BPF_PROG_TYPE_SK_MSG; ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return BPF_PROG_TYPE_SK_SKB; ++ case BPF_LIRC_MODE2: ++ return BPF_PROG_TYPE_LIRC_MODE2; ++ case BPF_FLOW_DISSECTOR: ++ return BPF_PROG_TYPE_FLOW_DISSECTOR; ++ case BPF_CGROUP_SYSCTL: ++ return BPF_PROG_TYPE_CGROUP_SYSCTL; ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ return BPF_PROG_TYPE_CGROUP_SOCKOPT; ++ case BPF_TRACE_ITER: ++ case BPF_TRACE_RAW_TP: ++ case BPF_TRACE_FENTRY: ++ case BPF_TRACE_FEXIT: ++ case BPF_MODIFY_RETURN: ++ return BPF_PROG_TYPE_TRACING; ++ case BPF_LSM_MAC: ++ return BPF_PROG_TYPE_LSM; ++ case BPF_SK_LOOKUP: ++ return BPF_PROG_TYPE_SK_LOOKUP; ++ case BPF_XDP: ++ return BPF_PROG_TYPE_XDP; ++ case BPF_LSM_CGROUP: ++ return BPF_PROG_TYPE_LSM; ++ default: ++ return BPF_PROG_TYPE_UNSPEC; ++ } ++} ++ ++#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd ++ ++#define BPF_F_ATTACH_MASK \ ++ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) ++ ++static int bpf_prog_attach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_PROG_ATTACH)) ++ return -EINVAL; ++ ++ if (attr->attach_flags & ~BPF_F_ATTACH_MASK) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ ret = sock_map_get_from_fd(attr, prog); ++ break; ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ ret = lirc_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ ret = netns_bpf_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case 
BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ if (ptype == BPF_PROG_TYPE_LSM && ++ prog->expected_attach_type != BPF_LSM_CGROUP) ++ return -EINVAL; ++ ++ ret = cgroup_bpf_prog_attach(attr, ptype, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_PROG_DETACH_LAST_FIELD attach_type ++ ++static int bpf_prog_detach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ ++ if (CHECK_ATTR(BPF_PROG_DETACH)) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_SK_SKB: ++ return sock_map_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ return lirc_prog_detach(attr); ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ return netns_bpf_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ return cgroup_bpf_prog_detach(attr, ptype); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags ++ ++static int bpf_prog_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ if (CHECK_ATTR(BPF_PROG_QUERY)) ++ return -EINVAL; ++ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) ++ return -EINVAL; ++ ++ switch (attr->query.attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ case BPF_CGROUP_SOCK_OPS: ++ case BPF_CGROUP_DEVICE: ++ case BPF_CGROUP_SYSCTL: ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_LSM_CGROUP: ++ return cgroup_bpf_prog_query(attr, uattr); ++ case BPF_LIRC_MODE2: ++ return lirc_prog_query(attr, uattr); ++ case BPF_FLOW_DISSECTOR: ++ case BPF_SK_LOOKUP: ++ return netns_bpf_prog_query(attr, uattr); ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_MSG_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return sock_map_bpf_prog_query(attr, uattr); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size ++ ++static int bpf_prog_test_run(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog *prog; ++ int ret = -ENOTSUPP; ++ ++ if (CHECK_ATTR(BPF_PROG_TEST_RUN)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_in && !attr->test.ctx_in) || ++ (!attr->test.ctx_size_in && attr->test.ctx_in)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_out && !attr->test.ctx_out) || ++ (!attr->test.ctx_size_out && attr->test.ctx_out)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->test.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (prog->aux->ops->test_run) ++ 
ret = prog->aux->ops->test_run(prog, attr, uattr); ++ ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id ++ ++static int bpf_obj_get_next_id(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ struct idr *idr, ++ spinlock_t *lock) ++{ ++ u32 next_id = attr->start_id; ++ int err = 0; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ next_id++; ++ spin_lock_bh(lock); ++ if (!idr_get_next(idr, &next_id)) ++ err = -ENOENT; ++ spin_unlock_bh(lock); ++ ++ if (!err) ++ err = put_user(next_id, &uattr->next_id); ++ ++ return err; ++} ++ ++struct bpf_map *bpf_map_get_curr_or_next(u32 *id) ++{ ++ struct bpf_map *map; ++ ++ spin_lock_bh(&map_idr_lock); ++again: ++ map = idr_get_next(&map_idr, id); ++ if (map) { ++ map = __bpf_map_inc_not_zero(map, false); ++ if (IS_ERR(map)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++ ++struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) ++{ ++ struct bpf_prog *prog; ++ ++ spin_lock_bh(&prog_idr_lock); ++again: ++ prog = idr_get_next(&prog_idr, id); ++ if (prog) { ++ prog = bpf_prog_inc_not_zero(prog); ++ if (IS_ERR(prog)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&prog_idr_lock); ++ ++ return prog; ++} ++ ++#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id ++ ++struct bpf_prog *bpf_prog_by_id(u32 id) ++{ ++ struct bpf_prog *prog; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&prog_idr_lock); ++ prog = idr_find(&prog_idr, id); ++ if (prog) ++ prog = bpf_prog_inc_not_zero(prog); ++ else ++ prog = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&prog_idr_lock); ++ return prog; ++} ++ ++static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ u32 id = attr->prog_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ prog = bpf_prog_by_id(id); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_prog_new_fd(prog); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ ++ return fd; ++} ++ ++#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags ++ ++static int bpf_map_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_map *map; ++ u32 id = attr->map_id; ++ int f_flags; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || ++ attr->open_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ f_flags = bpf_get_file_flag(attr->open_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ spin_lock_bh(&map_idr_lock); ++ map = idr_find(&map_idr, id); ++ if (map) ++ map = __bpf_map_inc_not_zero(map, true); ++ else ++ map = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&map_idr_lock); ++ ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ fd = bpf_map_new_fd(map, f_flags); ++ if (fd < 0) ++ bpf_map_put_with_uref(map); ++ ++ return fd; ++} ++ ++static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, ++ unsigned long addr, u32 *off, ++ u32 *type) ++{ ++ const struct bpf_map *map; ++ int i; ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { ++ map = prog->aux->used_maps[i]; ++ if (map == (void *)addr) { ++ *type = BPF_PSEUDO_MAP_FD; ++ goto out; ++ } ++ if (!map->ops->map_direct_value_meta) ++ continue; ++ if (!map->ops->map_direct_value_meta(map, addr, off)) { ++ *type = BPF_PSEUDO_MAP_VALUE; ++ goto out; ++ } ++ } ++ map = NULL; ++ 
++out: ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return map; ++} ++ ++static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, ++ const struct cred *f_cred) ++{ ++ const struct bpf_map *map; ++ struct bpf_insn *insns; ++ u32 off, type; ++ u64 imm; ++ u8 code; ++ int i; ++ ++ insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), ++ GFP_USER); ++ if (!insns) ++ return insns; ++ ++ for (i = 0; i < prog->len; i++) { ++ code = insns[i].code; ++ ++ if (code == (BPF_JMP | BPF_TAIL_CALL)) { ++ insns[i].code = BPF_JMP | BPF_CALL; ++ insns[i].imm = BPF_FUNC_tail_call; ++ /* fall-through */ ++ } ++ if (code == (BPF_JMP | BPF_CALL) || ++ code == (BPF_JMP | BPF_CALL_ARGS)) { ++ if (code == (BPF_JMP | BPF_CALL_ARGS)) ++ insns[i].code = BPF_JMP | BPF_CALL; ++ if (!bpf_dump_raw_ok(f_cred)) ++ insns[i].imm = 0; ++ continue; ++ } ++ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { ++ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; ++ continue; ++ } ++ ++ if (code != (BPF_LD | BPF_IMM | BPF_DW)) ++ continue; ++ ++ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; ++ map = bpf_map_from_imm(prog, imm, &off, &type); ++ if (map) { ++ insns[i].src_reg = type; ++ insns[i].imm = map->id; ++ insns[i + 1].imm = off; ++ continue; ++ } ++ } ++ ++ return insns; ++} ++ ++static int set_info_rec_size(struct bpf_prog_info *info) ++{ ++ /* ++ * Ensure info.*_rec_size is the same as kernel expected size ++ * ++ * or ++ * ++ * Only allow zero *_rec_size if both _rec_size and _cnt are ++ * zero. In this case, the kernel will set the expected ++ * _rec_size back to the info. ++ */ ++ ++ if ((info->nr_func_info || info->func_info_rec_size) && ++ info->func_info_rec_size != sizeof(struct bpf_func_info)) ++ return -EINVAL; ++ ++ if ((info->nr_line_info || info->line_info_rec_size) && ++ info->line_info_rec_size != sizeof(struct bpf_line_info)) ++ return -EINVAL; ++ ++ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && ++ info->jited_line_info_rec_size != sizeof(__u64)) ++ return -EINVAL; ++ ++ info->func_info_rec_size = sizeof(struct bpf_func_info); ++ info->line_info_rec_size = sizeof(struct bpf_line_info); ++ info->jited_line_info_rec_size = sizeof(__u64); ++ ++ return 0; ++} ++ ++static int bpf_prog_get_info_by_fd(struct file *file, ++ struct bpf_prog *prog, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct btf *attach_btf = bpf_prog_get_target_btf(prog); ++ struct bpf_prog_info info; ++ u32 info_len = attr->info.info_len; ++ struct bpf_prog_kstats stats; ++ char __user *uinsns; ++ u32 ulen; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = prog->type; ++ info.id = prog->aux->id; ++ info.load_time = prog->aux->load_time; ++ info.created_by_uid = from_kuid_munged(current_user_ns(), ++ prog->aux->user->uid); ++ info.gpl_compatible = prog->gpl_compatible; ++ ++ memcpy(info.tag, prog->tag, sizeof(prog->tag)); ++ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ ulen = info.nr_map_ids; ++ info.nr_map_ids = prog->aux->used_map_cnt; ++ ulen = min_t(u32, info.nr_map_ids, ulen); ++ if (ulen) { ++ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); ++ u32 i; 
++ ++ for (i = 0; i < ulen; i++) ++ if (put_user(prog->aux->used_maps[i]->id, ++ &user_map_ids[i])) { ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return -EFAULT; ++ } ++ } ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ ++ err = set_info_rec_size(&info); ++ if (err) ++ return err; ++ ++ bpf_prog_get_stats(prog, &stats); ++ info.run_time_ns = stats.nsecs; ++ info.run_cnt = stats.cnt; ++ info.recursion_misses = stats.misses; ++ ++ info.verified_insns = prog->aux->verified_insns; ++ ++ if (!bpf_capable()) { ++ info.jited_prog_len = 0; ++ info.xlated_prog_len = 0; ++ info.nr_jited_ksyms = 0; ++ info.nr_jited_func_lens = 0; ++ info.nr_func_info = 0; ++ info.nr_line_info = 0; ++ info.nr_jited_line_info = 0; ++ goto done; ++ } ++ ++ ulen = info.xlated_prog_len; ++ info.xlated_prog_len = bpf_prog_insn_size(prog); ++ if (info.xlated_prog_len && ulen) { ++ struct bpf_insn *insns_sanitized; ++ bool fault; ++ ++ if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { ++ info.xlated_prog_insns = 0; ++ goto done; ++ } ++ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); ++ if (!insns_sanitized) ++ return -ENOMEM; ++ uinsns = u64_to_user_ptr(info.xlated_prog_insns); ++ ulen = min_t(u32, info.xlated_prog_len, ulen); ++ fault = copy_to_user(uinsns, insns_sanitized, ulen); ++ kfree(insns_sanitized); ++ if (fault) ++ return -EFAULT; ++ } ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_info_fill(&info, prog); ++ if (err) ++ return err; ++ goto done; ++ } ++ ++ /* NOTE: the following code is supposed to be skipped for offload. ++ * bpf_prog_offload_info_fill() is the place to fill similar fields ++ * for offload. ++ */ ++ ulen = info.jited_prog_len; ++ if (prog->aux->func_cnt) { ++ u32 i; ++ ++ info.jited_prog_len = 0; ++ for (i = 0; i < prog->aux->func_cnt; i++) ++ info.jited_prog_len += prog->aux->func[i]->jited_len; ++ } else { ++ info.jited_prog_len = prog->jited_len; ++ } ++ ++ if (info.jited_prog_len && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ uinsns = u64_to_user_ptr(info.jited_prog_insns); ++ ulen = min_t(u32, info.jited_prog_len, ulen); ++ ++ /* for multi-function programs, copy the JITed ++ * instructions for all the functions ++ */ ++ if (prog->aux->func_cnt) { ++ u32 len, free, i; ++ u8 *img; ++ ++ free = ulen; ++ for (i = 0; i < prog->aux->func_cnt; i++) { ++ len = prog->aux->func[i]->jited_len; ++ len = min_t(u32, len, free); ++ img = (u8 *) prog->aux->func[i]->bpf_func; ++ if (copy_to_user(uinsns, img, len)) ++ return -EFAULT; ++ uinsns += len; ++ free -= len; ++ if (!free) ++ break; ++ } ++ } else { ++ if (copy_to_user(uinsns, prog->bpf_func, ulen)) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_prog_insns = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_ksyms; ++ info.nr_jited_ksyms = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long ksym_addr; ++ u64 __user *user_ksyms; ++ u32 i; ++ ++ /* copy the address of the kernel symbol ++ * corresponding to each function ++ */ ++ ulen = min_t(u32, info.nr_jited_ksyms, ulen); ++ user_ksyms = u64_to_user_ptr(info.jited_ksyms); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ ksym_addr = (unsigned long) ++ prog->aux->func[i]->bpf_func; ++ if (put_user((u64) ksym_addr, ++ &user_ksyms[i])) ++ return -EFAULT; ++ } ++ } else { ++ ksym_addr = (unsigned long) prog->bpf_func; ++ if (put_user((u64) ksym_addr, &user_ksyms[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_ksyms = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_func_lens; ++ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ u32 __user *user_lens; ++ u32 func_len, i; ++ ++ /* copy the JITed image lengths for each function */ ++ ulen = min_t(u32, info.nr_jited_func_lens, ulen); ++ user_lens = u64_to_user_ptr(info.jited_func_lens); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ func_len = ++ prog->aux->func[i]->jited_len; ++ if (put_user(func_len, &user_lens[i])) ++ return -EFAULT; ++ } ++ } else { ++ func_len = prog->jited_len; ++ if (put_user(func_len, &user_lens[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_func_lens = 0; ++ } ++ } ++ ++ if (prog->aux->btf) ++ info.btf_id = btf_obj_id(prog->aux->btf); ++ info.attach_btf_id = prog->aux->attach_btf_id; ++ if (attach_btf) ++ info.attach_btf_obj_id = btf_obj_id(attach_btf); ++ ++ ulen = info.nr_func_info; ++ info.nr_func_info = prog->aux->func_info_cnt; ++ if (info.nr_func_info && ulen) { ++ char __user *user_finfo; ++ ++ user_finfo = u64_to_user_ptr(info.func_info); ++ ulen = min_t(u32, info.nr_func_info, ulen); ++ if (copy_to_user(user_finfo, prog->aux->func_info, ++ info.func_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_line_info; ++ info.nr_line_info = prog->aux->nr_linfo; ++ if (info.nr_line_info && ulen) { ++ __u8 __user *user_linfo; ++ ++ user_linfo = u64_to_user_ptr(info.line_info); ++ ulen = min_t(u32, info.nr_line_info, ulen); ++ if (copy_to_user(user_linfo, prog->aux->linfo, ++ info.line_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_jited_line_info; ++ if (prog->aux->jited_linfo) ++ info.nr_jited_line_info = prog->aux->nr_linfo; ++ else ++ info.nr_jited_line_info = 0; ++ if (info.nr_jited_line_info && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long line_addr; ++ __u64 __user *user_linfo; ++ u32 i; ++ ++ user_linfo = u64_to_user_ptr(info.jited_line_info); ++ ulen = min_t(u32, info.nr_jited_line_info, ulen); ++ for (i = 0; i < ulen; i++) { ++ line_addr = (unsigned long)prog->aux->jited_linfo[i]; ++ if (put_user((__u64)line_addr, &user_linfo[i])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_line_info = 0; ++ } ++ } ++ ++ ulen = info.nr_prog_tags; ++ info.nr_prog_tags = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; ++ u32 i; ++ ++ user_prog_tags = u64_to_user_ptr(info.prog_tags); ++ ulen = min_t(u32, info.nr_prog_tags, ulen); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ if (copy_to_user(user_prog_tags[i], ++ prog->aux->func[i]->tag, ++ BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } else { ++ if (copy_to_user(user_prog_tags[0], ++ prog->tag, BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } ++ ++done: ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_map_get_info_by_fd(struct file *file, ++ struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_map_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ info.type = map->map_type; ++ info.id = map->id; ++ info.key_size = map->key_size; ++ info.value_size = map->value_size; ++ info.max_entries = map->max_entries; ++ info.map_flags = map->map_flags; ++ info.map_extra = map->map_extra; ++ memcpy(info.name, map->name, sizeof(map->name)); ++ ++ if (map->btf) { ++ info.btf_id = btf_obj_id(map->btf); ++ info.btf_key_type_id = map->btf_key_type_id; ++ info.btf_value_type_id = map->btf_value_type_id; ++ } ++ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_info_fill(&info, map); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_btf_get_info_by_fd(struct file *file, ++ struct btf *btf, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); ++ if (err) ++ return err; ++ ++ return btf_get_info_by_fd(btf, attr, uattr); ++} ++ ++static int bpf_link_get_info_by_fd(struct file *file, ++ struct bpf_link *link, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_link_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = link->type; ++ info.id = link->id; ++ info.prog_id = link->prog->aux->id; ++ ++ if (link->ops->fill_link_info) { ++ err = link->ops->fill_link_info(link, &info); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++ ++#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info ++ ++static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ int ufd = attr->info.bpf_fd; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ if 
(!f.file) ++ return -EBADFD; ++ ++ if (f.file->f_op == &bpf_prog_fops) ++ err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &bpf_map_fops) ++ err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &btf_fops) ++ err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); ++ else if (f.file->f_op == &bpf_link_fops) ++ err = bpf_link_get_info_by_fd(f.file, f.file->private_data, ++ attr, uattr); ++ else ++ err = -EINVAL; ++ ++ fdput(f); ++ return err; ++} ++ ++#define BPF_BTF_LOAD_LAST_FIELD btf_log_level ++ ++static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) ++{ ++ if (CHECK_ATTR(BPF_BTF_LOAD)) ++ return -EINVAL; ++ ++ if (!bpf_capable()) ++ return -EPERM; ++ ++ return btf_new_fd(attr, uattr); ++} ++ ++#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id ++ ++static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ return btf_get_fd_by_id(attr->btf_id); ++} ++ ++static int bpf_task_fd_query_copy(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ u32 prog_id, u32 fd_type, ++ const char *buf, u64 probe_offset, ++ u64 probe_addr) ++{ ++ char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); ++ u32 len = buf ? strlen(buf) : 0, input_len; ++ int err = 0; ++ ++ if (put_user(len, &uattr->task_fd_query.buf_len)) ++ return -EFAULT; ++ input_len = attr->task_fd_query.buf_len; ++ if (input_len && ubuf) { ++ if (!len) { ++ /* nothing to copy, just make ubuf NULL terminated */ ++ char zero = '\0'; ++ ++ if (put_user(zero, ubuf)) ++ return -EFAULT; ++ } else if (input_len >= len + 1) { ++ /* ubuf can hold the string with NULL terminator */ ++ if (copy_to_user(ubuf, buf, len + 1)) ++ return -EFAULT; ++ } else { ++ /* ubuf cannot hold the string with NULL terminator, ++ * do a partial copy with NULL terminator. 
++ */ ++ char zero = '\0'; ++ ++ err = -ENOSPC; ++ if (copy_to_user(ubuf, buf, input_len - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + input_len - 1)) ++ return -EFAULT; ++ } ++ } ++ ++ if (put_user(prog_id, &uattr->task_fd_query.prog_id) || ++ put_user(fd_type, &uattr->task_fd_query.fd_type) || ++ put_user(probe_offset, &uattr->task_fd_query.probe_offset) || ++ put_user(probe_addr, &uattr->task_fd_query.probe_addr)) ++ return -EFAULT; ++ ++ return err; ++} ++ ++#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr ++ ++static int bpf_task_fd_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ pid_t pid = attr->task_fd_query.pid; ++ u32 fd = attr->task_fd_query.fd; ++ const struct perf_event *event; ++ struct task_struct *task; ++ struct file *file; ++ int err; ++ ++ if (CHECK_ATTR(BPF_TASK_FD_QUERY)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (attr->task_fd_query.flags != 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ task = get_pid_task(find_vpid(pid), PIDTYPE_PID); ++ rcu_read_unlock(); ++ if (!task) ++ return -ENOENT; ++ ++ err = 0; ++ file = fget_task(task, fd); ++ put_task_struct(task); ++ if (!file) ++ return -EBADF; ++ ++ if (file->f_op == &bpf_link_fops) { ++ struct bpf_link *link = file->private_data; ++ ++ if (link->ops == &bpf_raw_tp_link_lops) { ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ struct bpf_raw_event_map *btp = raw_tp->btp; ++ ++ err = bpf_task_fd_query_copy(attr, uattr, ++ raw_tp->link.prog->aux->id, ++ BPF_FD_TYPE_RAW_TRACEPOINT, ++ btp->tp->name, 0, 0); ++ goto put_file; ++ } ++ goto out_not_supp; ++ } ++ ++ event = perf_get_event(file); ++ if (!IS_ERR(event)) { ++ u64 probe_offset, probe_addr; ++ u32 prog_id, fd_type; ++ const char *buf; ++ ++ err = bpf_get_perf_event_info(event, &prog_id, &fd_type, ++ &buf, &probe_offset, ++ &probe_addr); ++ if (!err) ++ err = bpf_task_fd_query_copy(attr, uattr, prog_id, ++ fd_type, buf, ++ probe_offset, ++ probe_addr); ++ goto put_file; ++ } ++ ++out_not_supp: ++ err = -ENOTSUPP; ++put_file: ++ fput(file); ++ return err; ++} ++ ++#define BPF_MAP_BATCH_LAST_FIELD batch.flags ++ ++#define BPF_DO_BATCH(fn) \ ++ do { \ ++ if (!fn) { \ ++ err = -ENOTSUPP; \ ++ goto err_put; \ ++ } \ ++ err = fn(map, attr, uattr); \ ++ } while (0) ++ ++static int bpf_map_do_batch(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ int cmd) ++{ ++ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || ++ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; ++ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; ++ struct bpf_map *map; ++ int err, ufd; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_BATCH)) ++ return -EINVAL; ++ ++ ufd = attr->batch.map_fd; ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (has_write) ++ bpf_map_write_active_inc(map); ++ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (cmd == BPF_MAP_LOOKUP_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_batch); ++ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); ++ else if (cmd == BPF_MAP_UPDATE_BATCH) ++ BPF_DO_BATCH(map->ops->map_update_batch); ++ else ++ BPF_DO_BATCH(map->ops->map_delete_batch); ++err_put: ++ if (has_write) ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define 
BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies ++static int link_create(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_CREATE)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->link_create.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ ret = bpf_prog_attach_check_attach_type(prog, ++ attr->link_create.attach_type); ++ if (ret) ++ goto out; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_EXT: ++ break; ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT && ++ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ default: ++ ptype = attach_type_to_prog_type(attr->link_create.attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ } ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ ret = cgroup_bpf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_EXT: ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_TRACING: ++ if (attr->link_create.attach_type != prog->expected_attach_type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) ++ ret = bpf_raw_tp_link_attach(prog, NULL); ++ else if (prog->expected_attach_type == BPF_TRACE_ITER) ++ ret = bpf_iter_link_attach(attr, uattr, prog); ++ else if (prog->expected_attach_type == BPF_LSM_CGROUP) ++ ret = cgroup_bpf_link_attach(attr, prog); ++ else ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ ret = netns_bpf_link_create(attr, prog); ++ break; ++#ifdef CONFIG_NET ++ case BPF_PROG_TYPE_XDP: ++ ret = bpf_xdp_link_attach(attr, prog); ++ break; ++#endif ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ ret = bpf_perf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type == BPF_PERF_EVENT) ++ ret = bpf_perf_link_attach(attr, prog); ++ else ++ ret = bpf_kprobe_multi_link_attach(attr, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++out: ++ if (ret < 0) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd ++ ++static int link_update(union bpf_attr *attr) ++{ ++ struct bpf_prog *old_prog = NULL, *new_prog; ++ struct bpf_link *link; ++ u32 flags; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_UPDATE)) ++ return -EINVAL; ++ ++ flags = attr->link_update.flags; ++ if (flags & ~BPF_F_REPLACE) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_update.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ new_prog = bpf_prog_get(attr->link_update.new_prog_fd); ++ if (IS_ERR(new_prog)) { ++ ret = PTR_ERR(new_prog); ++ goto out_put_link; ++ } ++ ++ if (flags & BPF_F_REPLACE) { 
++ old_prog = bpf_prog_get(attr->link_update.old_prog_fd); ++ if (IS_ERR(old_prog)) { ++ ret = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto out_put_progs; ++ } ++ } else if (attr->link_update.old_prog_fd) { ++ ret = -EINVAL; ++ goto out_put_progs; ++ } ++ ++ if (link->ops->update_prog) ++ ret = link->ops->update_prog(link, new_prog, old_prog); ++ else ++ ret = -EINVAL; ++ ++out_put_progs: ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ if (ret) ++ bpf_prog_put(new_prog); ++out_put_link: ++ bpf_link_put(link); ++ return ret; ++} ++ ++#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd ++ ++static int link_detach(union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_DETACH)) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_detach.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ if (link->ops->detach) ++ ret = link->ops->detach(link); ++ else ++ ret = -EOPNOTSUPP; ++ ++ bpf_link_put(link); ++ return ret; ++} ++ ++static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) ++{ ++ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); ++} ++ ++struct bpf_link *bpf_link_by_id(u32 id) ++{ ++ struct bpf_link *link; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&link_idr_lock); ++ /* before link is "settled", ID is 0, pretend it doesn't exist yet */ ++ link = idr_find(&link_idr, id); ++ if (link) { ++ if (link->id) ++ link = bpf_link_inc_not_zero(link); ++ else ++ link = ERR_PTR(-EAGAIN); ++ } else { ++ link = ERR_PTR(-ENOENT); ++ } ++ spin_unlock_bh(&link_idr_lock); ++ return link; ++} ++ ++struct bpf_link *bpf_link_get_curr_or_next(u32 *id) ++{ ++ struct bpf_link *link; ++ ++ spin_lock_bh(&link_idr_lock); ++again: ++ link = idr_get_next(&link_idr, id); ++ if (link) { ++ link = bpf_link_inc_not_zero(link); ++ if (IS_ERR(link)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&link_idr_lock); ++ ++ return link; ++} ++ ++#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id ++ ++static int bpf_link_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ u32 id = attr->link_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ link = bpf_link_by_id(id); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ fd = bpf_link_new_fd(link); ++ if (fd < 0) ++ bpf_link_put(link); ++ ++ return fd; ++} ++ ++DEFINE_MUTEX(bpf_stats_enabled_mutex); ++ ++static int bpf_stats_release(struct inode *inode, struct file *file) ++{ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ static_key_slow_dec(&bpf_stats_enabled_key.key); ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return 0; ++} ++ ++static const struct file_operations bpf_stats_fops = { ++ .release = bpf_stats_release, ++}; ++ ++static int bpf_enable_runtime_stats(void) ++{ ++ int fd; ++ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ ++ /* Set a very high limit to avoid overflow */ ++ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return -EBUSY; ++ } ++ ++ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); ++ if (fd >= 0) ++ static_key_slow_inc(&bpf_stats_enabled_key.key); ++ ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return fd; ++} ++ ++#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type ++ ++static int bpf_enable_stats(union bpf_attr *attr) ++{ ++ ++ if (CHECK_ATTR(BPF_ENABLE_STATS)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ 
++	switch (attr->enable_stats.type) {
++	case BPF_STATS_RUN_TIME:
++		return bpf_enable_runtime_stats();
++	default:
++		break;
++	}
++	return -EINVAL;
++}
++
++#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
++
++static int bpf_iter_create(union bpf_attr *attr)
++{
++	struct bpf_link *link;
++	int err;
++
++	if (CHECK_ATTR(BPF_ITER_CREATE))
++		return -EINVAL;
++
++	if (attr->iter_create.flags)
++		return -EINVAL;
++
++	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
++	if (IS_ERR(link))
++		return PTR_ERR(link);
++
++	err = bpf_iter_new_fd(link);
++	bpf_link_put(link);
++
++	return err;
++}
++
++#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
++
++static int bpf_prog_bind_map(union bpf_attr *attr)
++{
++	struct bpf_prog *prog;
++	struct bpf_map *map;
++	struct bpf_map **used_maps_old, **used_maps_new;
++	int i, ret = 0;
++
++	if (CHECK_ATTR(BPF_PROG_BIND_MAP))
++		return -EINVAL;
++
++	if (attr->prog_bind_map.flags)
++		return -EINVAL;
++
++	prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
++	if (IS_ERR(prog))
++		return PTR_ERR(prog);
++
++	map = bpf_map_get(attr->prog_bind_map.map_fd);
++	if (IS_ERR(map)) {
++		ret = PTR_ERR(map);
++		goto out_prog_put;
++	}
++
++	mutex_lock(&prog->aux->used_maps_mutex);
++
++	used_maps_old = prog->aux->used_maps;
++
++	for (i = 0; i < prog->aux->used_map_cnt; i++)
++		if (used_maps_old[i] == map) {
++			bpf_map_put(map);
++			goto out_unlock;
++		}
++
++	used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
++				      sizeof(used_maps_new[0]),
++				      GFP_KERNEL);
++	if (!used_maps_new) {
++		ret = -ENOMEM;
++		goto out_unlock;
++	}
++
++	memcpy(used_maps_new, used_maps_old,
++	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
++	used_maps_new[prog->aux->used_map_cnt] = map;
++
++	prog->aux->used_map_cnt++;
++	prog->aux->used_maps = used_maps_new;
++
++	kfree(used_maps_old);
++
++out_unlock:
++	mutex_unlock(&prog->aux->used_maps_mutex);
++
++	if (ret)
++		bpf_map_put(map);
++out_prog_put:
++	bpf_prog_put(prog);
++	return ret;
++}
++
++static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
++{
++	union bpf_attr attr;
++	bool capable;
++	int err;
++
++	capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
++
++	/* Intent here is for unprivileged_bpf_disabled to block key object
++	 * creation commands for unprivileged users; other actions depend
++	 * of fd availability and access to bpffs, so are dependent on
++	 * object creation success. Capabilities are later verified for
++	 * operations such as load and map create, so even with unprivileged
++	 * BPF disabled, capability checks are still carried out for these
++	 * and other operations.
++ */ ++ if (!capable && ++ (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) ++ return -EPERM; ++ ++ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); ++ if (err) ++ return err; ++ size = min_t(u32, size, sizeof(attr)); ++ ++ /* copy attributes from user space, may be less than sizeof(bpf_attr) */ ++ memset(&attr, 0, sizeof(attr)); ++ if (copy_from_bpfptr(&attr, uattr, size) != 0) ++ return -EFAULT; ++ ++ err = security_bpf(cmd, &attr, size); ++ if (err < 0) ++ return err; ++ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ err = map_create(&attr); ++ break; ++ case BPF_MAP_LOOKUP_ELEM: ++ err = map_lookup_elem(&attr); ++ break; ++ case BPF_MAP_UPDATE_ELEM: ++ err = map_update_elem(&attr, uattr); ++ break; ++ case BPF_MAP_DELETE_ELEM: ++ err = map_delete_elem(&attr); ++ break; ++ case BPF_MAP_GET_NEXT_KEY: ++ err = map_get_next_key(&attr); ++ break; ++ case BPF_MAP_FREEZE: ++ err = map_freeze(&attr); ++ break; ++ case BPF_PROG_LOAD: ++ err = bpf_prog_load(&attr, uattr); ++ break; ++ case BPF_OBJ_PIN: ++ err = bpf_obj_pin(&attr); ++ break; ++ case BPF_OBJ_GET: ++ err = bpf_obj_get(&attr); ++ break; ++ case BPF_PROG_ATTACH: ++ err = bpf_prog_attach(&attr); ++ break; ++ case BPF_PROG_DETACH: ++ err = bpf_prog_detach(&attr); ++ break; ++ case BPF_PROG_QUERY: ++ err = bpf_prog_query(&attr, uattr.user); ++ break; ++ case BPF_PROG_TEST_RUN: ++ err = bpf_prog_test_run(&attr, uattr.user); ++ break; ++ case BPF_PROG_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &prog_idr, &prog_idr_lock); ++ break; ++ case BPF_MAP_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &map_idr, &map_idr_lock); ++ break; ++ case BPF_BTF_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &btf_idr, &btf_idr_lock); ++ break; ++ case BPF_PROG_GET_FD_BY_ID: ++ err = bpf_prog_get_fd_by_id(&attr); ++ break; ++ case BPF_MAP_GET_FD_BY_ID: ++ err = bpf_map_get_fd_by_id(&attr); ++ break; ++ case BPF_OBJ_GET_INFO_BY_FD: ++ err = bpf_obj_get_info_by_fd(&attr, uattr.user); ++ break; ++ case BPF_RAW_TRACEPOINT_OPEN: ++ err = bpf_raw_tracepoint_open(&attr); ++ break; ++ case BPF_BTF_LOAD: ++ err = bpf_btf_load(&attr, uattr); ++ break; ++ case BPF_BTF_GET_FD_BY_ID: ++ err = bpf_btf_get_fd_by_id(&attr); ++ break; ++ case BPF_TASK_FD_QUERY: ++ err = bpf_task_fd_query(&attr, uattr.user); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_ELEM: ++ err = map_lookup_and_delete_elem(&attr); ++ break; ++ case BPF_MAP_LOOKUP_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, ++ BPF_MAP_LOOKUP_AND_DELETE_BATCH); ++ break; ++ case BPF_MAP_UPDATE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); ++ break; ++ case BPF_MAP_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); ++ break; ++ case BPF_LINK_CREATE: ++ err = link_create(&attr, uattr); ++ break; ++ case BPF_LINK_UPDATE: ++ err = link_update(&attr); ++ break; ++ case BPF_LINK_GET_FD_BY_ID: ++ err = bpf_link_get_fd_by_id(&attr); ++ break; ++ case BPF_LINK_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &link_idr, &link_idr_lock); ++ break; ++ case BPF_ENABLE_STATS: ++ err = bpf_enable_stats(&attr); ++ break; ++ case BPF_ITER_CREATE: ++ err = bpf_iter_create(&attr); ++ break; ++ case BPF_LINK_DETACH: ++ err = link_detach(&attr); ++ break; ++ case BPF_PROG_BIND_MAP: ++ err = bpf_prog_bind_map(&attr); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++ return 
err; ++} ++ ++SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) ++{ ++ return __sys_bpf(cmd, USER_BPFPTR(uattr), size); ++} ++ ++static bool syscall_prog_is_valid_access(int off, int size, ++ enum bpf_access_type type, ++ const struct bpf_prog *prog, ++ struct bpf_insn_access_aux *info) ++{ ++ if (off < 0 || off >= U16_MAX) ++ return false; ++ if (off % size != 0) ++ return false; ++ return true; ++} ++ ++BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) ++{ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ case BPF_MAP_UPDATE_ELEM: ++ case BPF_MAP_FREEZE: ++ case BPF_PROG_LOAD: ++ case BPF_BTF_LOAD: ++ case BPF_LINK_CREATE: ++ case BPF_RAW_TRACEPOINT_OPEN: ++ break; ++ default: ++ return -EINVAL; ++ } ++ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); ++} ++ ++ ++/* To shut up -Wmissing-prototypes. ++ * This function is used by the kernel light skeleton ++ * to load bpf programs when modules are loaded or during kernel boot. ++ * See tools/lib/bpf/skel_internal.h ++ */ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); ++ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) ++{ ++ struct bpf_prog * __maybe_unused prog; ++ struct bpf_tramp_run_ctx __maybe_unused run_ctx; ++ ++ switch (cmd) { ++#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ ++ case BPF_PROG_TEST_RUN: ++ if (attr->test.data_in || attr->test.data_out || ++ attr->test.ctx_out || attr->test.duration || ++ attr->test.repeat || attr->test.flags) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || ++ attr->test.ctx_size_in > U16_MAX) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ run_ctx.bpf_cookie = 0; ++ run_ctx.saved_run_ctx = NULL; ++ if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { ++ /* recursion detected */ ++ bpf_prog_put(prog); ++ return -EBUSY; ++ } ++ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); ++ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); ++ bpf_prog_put(prog); ++ return 0; ++#endif ++ default: ++ return ____bpf_sys_bpf(cmd, attr, size); ++ } ++} ++EXPORT_SYMBOL(kern_sys_bpf); ++ ++static const struct bpf_func_proto bpf_sys_bpf_proto = { ++ .func = bpf_sys_bpf, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_ANYTHING, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, ++ .arg3_type = ARG_CONST_SIZE, ++}; ++ ++const struct bpf_func_proto * __weak ++tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++{ ++ return bpf_base_func_proto(func_id); ++} ++ ++BPF_CALL_1(bpf_sys_close, u32, fd) ++{ ++ /* When bpf program calls this helper there should not be ++ * an fdget() without matching completed fdput(). 
++	 * This helper is allowed in the following callchain only:
++	 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
++	 */
++	return close_fd(fd);
++}
++
++static const struct bpf_func_proto bpf_sys_close_proto = {
++	.func		= bpf_sys_close,
++	.gpl_only	= false,
++	.ret_type	= RET_INTEGER,
++	.arg1_type	= ARG_ANYTHING,
++};
++
++BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
++{
++	if (flags)
++		return -EINVAL;
++
++	if (name_sz <= 1 || name[name_sz - 1])
++		return -EINVAL;
++
++	if (!bpf_dump_raw_ok(current_cred()))
++		return -EPERM;
++
++	*res = kallsyms_lookup_name(name);
++	return *res ? 0 : -ENOENT;
++}
++
++static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
++	.func		= bpf_kallsyms_lookup_name,
++	.gpl_only	= false,
++	.ret_type	= RET_INTEGER,
++	.arg1_type	= ARG_PTR_TO_MEM,
++	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
++	.arg3_type	= ARG_ANYTHING,
++	.arg4_type	= ARG_PTR_TO_LONG,
++};
++
++static const struct bpf_func_proto *
++syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
++{
++	switch (func_id) {
++	case BPF_FUNC_sys_bpf:
++		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
++	case BPF_FUNC_btf_find_by_name_kind:
++		return &bpf_btf_find_by_name_kind_proto;
++	case BPF_FUNC_sys_close:
++		return &bpf_sys_close_proto;
++	case BPF_FUNC_kallsyms_lookup_name:
++		return &bpf_kallsyms_lookup_name_proto;
++	default:
++		return tracing_prog_func_proto(func_id, prog);
++	}
++}
++
++const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
++	.get_func_proto  = syscall_prog_func_proto,
++	.is_valid_access = syscall_prog_is_valid_access,
++};
++
++const struct bpf_prog_ops bpf_syscall_prog_ops = {
++	.test_run = bpf_prog_test_run_syscall,
++};
++
++#ifdef CONFIG_SYSCTL
++static int bpf_stats_handler(struct ctl_table *table, int write,
++			     void *buffer, size_t *lenp, loff_t *ppos)
++{
++	struct static_key *key = (struct static_key *)table->data;
++	static int saved_val;
++	int val, ret;
++	struct ctl_table tmp = {
++		.data   = &val,
++		.maxlen = sizeof(val),
++		.mode   = table->mode,
++		.extra1 = SYSCTL_ZERO,
++		.extra2 = SYSCTL_ONE,
++	};
++
++	if (write && !capable(CAP_SYS_ADMIN))
++		return -EPERM;
++
++	mutex_lock(&bpf_stats_enabled_mutex);
++	val = saved_val;
++	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
++	if (write && !ret && val != saved_val) {
++		if (val)
++			static_key_slow_inc(key);
++		else
++			static_key_slow_dec(key);
++		saved_val = val;
++	}
++	mutex_unlock(&bpf_stats_enabled_mutex);
++	return ret;
++}
++
++void __weak unpriv_ebpf_notify(int new_state)
++{
++}
++
++static int bpf_unpriv_handler(struct ctl_table *table, int write,
++			      void *buffer, size_t *lenp, loff_t *ppos)
++{
++	int ret, unpriv_enable = *(int *)table->data;
++	bool locked_state = unpriv_enable == 1;
++	struct ctl_table tmp = *table;
++
++	if (write && !capable(CAP_SYS_ADMIN))
++		return -EPERM;
++
++	tmp.data = &unpriv_enable;
++	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
++	if (write && !ret) {
++		if (locked_state && unpriv_enable != 1)
++			return -EPERM;
++		*(int *)table->data = unpriv_enable;
++	}
++
++	unpriv_ebpf_notify(unpriv_enable);
++
++	return ret;
++}
++
++static struct ctl_table bpf_syscall_table[] = {
++	{
++		.procname	= "unprivileged_bpf_disabled",
++		.data		= &sysctl_unprivileged_bpf_disabled,
++		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
++		.mode		= 0644,
++		.proc_handler	= bpf_unpriv_handler,
++		.extra1		= SYSCTL_ZERO,
++		.extra2		= SYSCTL_TWO,
++	},
++	{
++		.procname	=
"bpf_stats_enabled", ++ .data = &bpf_stats_enabled_key.key, ++ .maxlen = sizeof(bpf_stats_enabled_key), ++ .mode = 0644, ++ .proc_handler = bpf_stats_handler, ++ }, ++ { } ++}; ++ ++static int __init bpf_syscall_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", bpf_syscall_table); ++ return 0; ++} ++late_initcall(bpf_syscall_sysctl_init); ++#endif /* CONFIG_SYSCTL */ +diff -rupN linux.orig/kernel/entry/common.c linux/kernel/entry/common.c +--- linux.orig/kernel/entry/common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/entry/common.c 2022-12-04 10:40:26.716034044 -0500 +@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_l local_irq_enable_exit_to_user(ti_work); @@ -5094,7 +33236,7 @@ index 063068a9ea9b3..26b772720b227 100644 schedule(); if (ti_work & _TIF_UPROBE) -@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void) +@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); @@ -5103,11 +33245,10 @@ index 063068a9ea9b3..26b772720b227 100644 preempt_schedule_irq(); } } -diff --git a/kernel/hung_task.c b/kernel/hung_task.c -index bb2354f73dedc..19c9de825d248 100644 ---- a/kernel/hung_task.c -+++ b/kernel/hung_task.c -@@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +diff -rupN linux.orig/kernel/hung_task.c linux/kernel/hung_task.c +--- linux.orig/kernel/hung_task.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/hung_task.c 2022-12-04 10:40:26.716034044 -0500 +@@ -127,6 +127,8 @@ static void check_hung_task(struct task_ * complain: */ if (sysctl_hung_task_warnings) { @@ -5116,7 +33257,7 @@ index bb2354f73dedc..19c9de825d248 100644 if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", -@@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +@@ -142,6 +144,8 @@ static void check_hung_task(struct task_ if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; @@ -5125,7 +33266,7 @@ index bb2354f73dedc..19c9de825d248 100644 } touch_nmi_watchdog(); -@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) +@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_t } unlock: rcu_read_unlock(); @@ -5144,11 +33285,10 @@ index bb2354f73dedc..19c9de825d248 100644 } if (hung_task_call_panic) -diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c -index 5db0230aa6b52..476a3fecb8c53 100644 ---- a/kernel/irq/irqdesc.c -+++ b/kernel/irq/irqdesc.c -@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +diff -rupN linux.orig/kernel/irq/irqdesc.c linux/kernel/irq/irqdesc.c +--- linux.orig/kernel/irq/irqdesc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/irq/irqdesc.c 2022-12-04 10:40:26.716034044 -0500 +@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); @@ -5179,10 +33319,24 @@ index 5db0230aa6b52..476a3fecb8c53 100644 /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. 
-diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c -index b1292a57c2a53..a6514db7ef58e 100644 ---- a/kernel/ksysfs.c -+++ b/kernel/ksysfs.c +diff -rupN linux.orig/kernel/Kconfig.preempt linux/kernel/Kconfig.preempt +--- linux.orig/kernel/Kconfig.preempt 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/Kconfig.preempt 2022-12-04 10:40:26.716034044 -0500 +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + config PREEMPT_NONE_BUILD + bool + +diff -rupN linux.orig/kernel/ksysfs.c linux/kernel/ksysfs.c +--- linux.orig/kernel/ksysfs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/ksysfs.c 2022-12-04 10:40:26.716034044 -0500 @@ -137,6 +137,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ @@ -5199,20 +33353,19 @@ index b1292a57c2a53..a6514db7ef58e 100644 /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -@@ -227,6 +236,9 @@ static struct attribute * kernel_attrs[] = { - #ifndef CONFIG_TINY_RCU +@@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] &rcu_expedited_attr.attr, &rcu_normal_attr.attr, -+#endif + #endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, - #endif ++#endif NULL }; -diff --git a/kernel/panic.c b/kernel/panic.c -index c6eb8f8db0c05..c4e8896e3caba 100644 ---- a/kernel/panic.c -+++ b/kernel/panic.c + +diff -rupN linux.orig/kernel/panic.c linux/kernel/panic.c +--- linux.orig/kernel/panic.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/panic.c 2022-12-04 10:40:26.716034044 -0500 @@ -257,7 +257,6 @@ void panic(const char *fmt, ...) panic_smp_self_stop(); @@ -5249,7 +33402,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 crash_smp_send_stop(); } -@@ -604,6 +610,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -604,6 +610,8 @@ void __warn(const char *file, int line, { disable_trace_on_warning(); @@ -5258,7 +33411,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, -@@ -633,6 +641,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -633,6 +641,8 @@ void __warn(const char *file, int line, /* Just a warning, don't kill lockdep. 
*/ add_taint(taint, LOCKDEP_STILL_OK); @@ -5267,10 +33420,9 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 } #ifndef __WARN_FLAGS -diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h -index d947ca6c84f99..e7d8578860adf 100644 ---- a/kernel/printk/internal.h -+++ b/kernel/printk/internal.h +diff -rupN linux.orig/kernel/printk/internal.h linux/kernel/printk/internal.h +--- linux.orig/kernel/printk/internal.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/internal.h 2022-12-04 10:40:26.716034044 -0500 @@ -20,6 +20,8 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -5280,10 +33432,9 @@ index d947ca6c84f99..e7d8578860adf 100644 __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, -diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c -index a1a81fd9889bb..f1f9ce9b23f60 100644 ---- a/kernel/printk/printk.c -+++ b/kernel/printk/printk.c +diff -rupN linux.orig/kernel/printk/printk.c linux/kernel/printk/printk.c +--- linux.orig/kernel/printk/printk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk.c 2022-12-04 10:40:26.720034034 -0500 @@ -44,6 +44,7 @@ #include #include @@ -5292,11 +33443,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #include #include #include -@@ -223,6 +224,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - /* Number of registered extended console drivers. */ +@@ -224,6 +225,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl static int nr_ext_console_drivers; -+/* + /* + * Used to synchronize printing kthreads against direct printing via + * console_trylock/console_unlock. + * @@ -5326,9 +33476,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 +/* Block console kthreads to avoid processing new messages. */ +bool block_console_kthreads; + - /* ++/* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. + */ @@ -271,14 +302,49 @@ static bool panic_in_progress(void) } @@ -5342,15 +33493,15 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + * Tracks whether kthread printers are all blocked. A value of true implies + * that the console is locked via console_lock() or the console is suspended. + * Writing to this variable requires holding @console_sem. - */ --static int console_locked, console_suspended; ++ */ +static bool console_kthreads_blocked; + +/* + * Block all kthread printers from a schedulable context. + * + * Requires holding @console_sem. -+ */ + */ +-static int console_locked, console_suspended; +static void console_kthreads_block(void) +{ + struct console *con; @@ -5386,7 +33537,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Array of consoles built from command line options (console=) -@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; +@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORM /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); @@ -5462,7 +33613,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. 
*/ /* the next printk record to read by syslog(READ) or /proc/kmsg */ -@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable_and_check(void) +@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable return 1; } @@ -5470,7 +33621,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /** * console_trylock_spinning - try to get console_lock by busy waiting * -@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void) +@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void return 1; } @@ -5478,7 +33629,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Call the specified console driver, asking it to write out the specified -@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void) +@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void * dropped, a dropped message will be written out first. */ static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5513,7 +33664,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } /* -@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ @@ -5538,7 +33689,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. -@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility if (console_trylock_spinning()) console_unlock(); preempt_enable(); @@ -5546,7 +33697,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } wake_up_klogd(); -@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const char *fmt, ...) 
+@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const c } EXPORT_SYMBOL(_printk); @@ -5627,7 +33778,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 -@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *c #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 @@ -5636,7 +33787,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static u64 syslog_seq; static size_t record_print_text(const struct printk_record *r, -@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, +@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char * static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5651,7 +33802,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #endif /* CONFIG_PRINTK */ -@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned int cpu) +@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned i /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); @@ -5708,7 +33859,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } EXPORT_SYMBOL(is_console_locked); -@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_panic(void) +@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_pani return atomic_read(&panic_cpu) != raw_smp_processor_id(); } @@ -5729,7 +33880,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return false; /* -@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(struct console *con) +@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(str * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ if (!cpu_online(raw_smp_processor_id()) && @@ -5907,7 +34058,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 suppress_panic_printk = 1; pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } -@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(str /* Skip record that has level above the console loglevel. */ if (suppress_message_printing(r.info->level)) { @@ -5916,7 +34067,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 goto skip; } -@@ -2715,31 +3072,65 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2715,32 +3072,66 @@ static bool console_emit_next_record(str len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } @@ -5969,7 +34120,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return true; } -+/* + /* + * Print a record for a given console, but allow another printk() caller to + * take over the console_lock and continue printing. + * @@ -5997,10 +34148,11 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + return __console_emit_next_record(con, text, ext_text, dropped_text, false, handover); +} + - /* ++/* * Print out all remaining records to all consoles. * -@@ -2758,8 +3149,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ + * @do_cond_resched is set by the caller. It can be true only in schedulable +@@ -2758,8 +3149,8 @@ skip: * were flushed to all usable consoles. 
A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this @@ -6011,7 +34163,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * * Requires the console_lock. */ -@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_co *handover = false; do { @@ -6045,7 +34197,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } if (*handover) return false; -@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_co return any_usable; } @@ -6141,7 +34293,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * If someone else is holding the console lock, trylock will fail * and may_schedule may be set. Ignore and proceed to unlock so -@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flush_mode mode) +@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flu seq = prb_first_valid_seq(prb); for_each_console(c) @@ -6150,7 +34302,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } console_unlock(); } -@@ -3189,16 +3652,27 @@ void register_console(struct console *newcon) +@@ -3189,16 +3652,27 @@ void register_console(struct console *ne if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; @@ -6189,7 +34341,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 struct console *con; int res; -@@ -3265,9 +3740,26 @@ int unregister_console(struct console *console) +@@ -3265,9 +3740,26 @@ int unregister_console(struct console *c console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; @@ -6237,7 +34389,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. 
*/ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) -@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *c for_each_console(c) { if (con && con != c) continue; @@ -6246,7 +34398,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 continue; printk_seq = c->seq; if (printk_seq < seq) -@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) +@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset } EXPORT_SYMBOL(pr_flush); @@ -6464,7 +34616,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static DEFINE_PER_CPU(int, printk_pending); -@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) +@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(stru { int pending = this_cpu_xchg(printk_pending, 0); @@ -6513,10 +34665,9 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } void printk_trigger_flush(void) -diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c -index ef0f9a2044da1..caac4de1ea59a 100644 ---- a/kernel/printk/printk_safe.c -+++ b/kernel/printk/printk_safe.c +diff -rupN linux.orig/kernel/printk/printk_safe.c linux/kernel/printk/printk_safe.c +--- linux.orig/kernel/printk/printk_safe.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk_safe.c 2022-12-04 10:40:26.720034034 -0500 @@ -8,7 +8,9 @@ #include #include @@ -6527,7 +34678,7 @@ index ef0f9a2044da1..caac4de1ea59a 100644 #include "internal.h" -@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, va_list args) +@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); @@ -6561,11 +34712,10 @@ index ef0f9a2044da1..caac4de1ea59a 100644 + timeout_ms -= 1; + } +} -diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c -index d8e1b270a065f..257cb6f5ea622 100644 ---- a/kernel/rcu/rcutorture.c -+++ b/kernel/rcu/rcutorture.c -@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsigned int cpu) +diff -rupN linux.orig/kernel/rcu/rcutorture.c linux/kernel/rcu/rcutorture.c +--- linux.orig/kernel/rcu/rcutorture.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/rcutorture.c 2022-12-04 10:40:26.720034034 -0500 +@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsig WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -6578,11 +34728,10 @@ index d8e1b270a065f..257cb6f5ea622 100644 } /* Don't allow time recalculation while creating a new task. */ -diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h -index c3fbbcc09327f..195cad14742dd 100644 ---- a/kernel/rcu/tree_stall.h -+++ b/kernel/rcu/tree_stall.h -@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned long gps) +diff -rupN linux.orig/kernel/rcu/tree_stall.h linux/kernel/rcu/tree_stall.h +--- linux.orig/kernel/rcu/tree_stall.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/tree_stall.h 2022-12-04 10:40:26.720034034 -0500 +@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned lon * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. 
*/ @@ -6590,7 +34739,7 @@ index c3fbbcc09327f..195cad14742dd 100644 trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); -@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned long gps) +@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned lon */ set_tsk_need_resched(current); set_preempt_need_resched(); @@ -6598,10 +34747,9 @@ index c3fbbcc09327f..195cad14742dd 100644 } static void check_cpu_stall(struct rcu_data *rdp) -diff --git a/kernel/reboot.c b/kernel/reboot.c -index 3c35445bf5ad3..80564ffafabff 100644 ---- a/kernel/reboot.c -+++ b/kernel/reboot.c +diff -rupN linux.orig/kernel/reboot.c linux/kernel/reboot.c +--- linux.orig/kernel/reboot.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/reboot.c 2022-12-04 10:40:26.720034034 -0500 @@ -82,6 +82,7 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); @@ -6610,7 +34758,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 usermodehelper_disable(); device_shutdown(); } -@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum system_states state) +@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; @@ -6630,7 +34778,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } return ret; -@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force) +@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force ret = run_cmd(poweroff_cmd); if (ret && force) { @@ -6638,7 +34786,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* -@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force) +@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force */ emergency_sync(); kernel_power_off(); @@ -6655,7 +34803,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off -@@ -916,6 +924,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) +@@ -916,6 +924,8 @@ static void hw_failure_emergency_powerof */ pr_emerg("Hardware protection shutdown failed. 
Trying emergency restart\n"); emergency_restart(); @@ -6664,7 +34812,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, -@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char * { static atomic_t allow_proceed = ATOMIC_INIT(1); @@ -6679,7 +34827,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * Queue a backup emergency shutdown in the event of -@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char * */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); @@ -6688,10 +34836,9 @@ index 3c35445bf5ad3..80564ffafabff 100644 } EXPORT_SYMBOL_GPL(hw_protection_shutdown); -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index ee28253c9ac0c..2ce515d3e6f8d 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c +diff -rupN linux.orig/kernel/sched/core.c linux/kernel/sched/core.c +--- linux.orig/kernel/sched/core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/core.c 2022-12-04 10:40:26.720034034 -0500 @@ -1046,6 +1046,46 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } @@ -6755,7 +34902,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); -@@ -3251,6 +3293,70 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3251,6 +3293,70 @@ out: } #endif /* CONFIG_NUMA_BALANCING */ @@ -6826,7 +34973,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * wait_task_inactive - wait for a thread to unschedule. * -@@ -3269,7 +3375,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3269,7 +3375,7 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { @@ -6835,7 +34982,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 struct rq_flags rf; unsigned long ncsw; struct rq *rq; -@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct * is actually now running somewhere else! */ while (task_running(rq, p)) { @@ -6844,7 +34991,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 return 0; cpu_relax(); } -@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); @@ -6859,7 +35006,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 task_rq_unlock(rq, p, &rf); /* -@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct * running right now), it's preempted, and we should * yield - it could be a while. 
*/ @@ -6868,7 +35015,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); -@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags p->on_cpu = 0; #endif init_task_preempt_count(p); @@ -6878,7 +35025,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) +@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(u next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); @@ -6886,7 +35033,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_schedule_common(void) +@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_sche } while (need_resched()); } @@ -6917,7 +35064,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption -@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) +@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrac */ if (likely(!preemptible())) return; @@ -6926,7 +35073,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); -@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) +@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrac if (likely(!preemptible())) return; @@ -6936,7 +35083,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 do { /* * Because the function tracer can trace preempt_count_sub() -@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -6947,11 +35094,10 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * The idle tasks have their own, simple scheduling class: */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 914096c5b1ae1..3cb55e6ede337 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +diff -rupN linux.orig/kernel/sched/fair.c linux/kernel/sched/fair.c +--- linux.orig/kernel/sched/fair.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/fair.c 2022-12-04 10:40:26.720034034 -0500 +@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { @@ -6960,7 +35106,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. -@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq return; if (delta > ideal_runtime) @@ -6969,7 +35115,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static void -@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc * validating it and just reschedule. 
*/ if (queued) { @@ -6978,7 +35124,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } /* -@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(str * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) @@ -6987,7 +35133,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static __always_inline -@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq if (delta < 0) { if (task_current(rq, p)) @@ -6996,7 +35142,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } hrtick_start(rq, delta); -@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct return; preempt: @@ -7005,7 +35151,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved -@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_struct *p) +@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_s * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); @@ -7014,7 +35160,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } se->vruntime -= cfs_rq->min_vruntime; -@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct */ if (task_current(rq, p)) { if (p->prio > oldprio) @@ -7023,10 +35169,9 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } else check_preempt_curr(rq, p, 0); } -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..e13090e33f3c4 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h +diff -rupN linux.orig/kernel/sched/features.h linux/kernel/sched/features.h +--- linux.orig/kernel/sched/features.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/features.h 2022-12-04 10:40:26.720034034 -0500 @@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true) #ifdef CONFIG_PREEMPT_RT @@ -7037,11 +35182,10 @@ index ee7f23c76bd33..e13090e33f3c4 100644 #else /* -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e26688d387aeb..5b889de29e3c9 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_struct *p, int prio); +diff -rupN linux.orig/kernel/sched/sched.h linux/kernel/sched/sched.h +--- linux.orig/kernel/sched/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/sched.h 2022-12-04 10:40:26.724034024 -0500 +@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_st extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -7057,11 +35201,10 @@ index e26688d387aeb..5b889de29e3c9 100644 extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -diff --git a/kernel/signal.c b/kernel/signal.c -index 6f86fda5e432a..139b965e4fafc 100644 ---- a/kernel/signal.c -+++ b/kernel/signal.c -@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, +diff -rupN linux.orig/kernel/signal.c linux/kernel/signal.c +--- linux.orig/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/signal.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, in /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -7079,10 +35222,9 @@ index 6f86fda5e432a..139b965e4fafc 100644 freezable_schedule(); cgroup_leave_frozen(true); -diff --git a/kernel/softirq.c b/kernel/softirq.c -index c8a6913c067d9..ab1fe34326bab 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c +diff -rupN linux.orig/kernel/softirq.c linux/kernel/softirq.c +--- linux.orig/kernel/softirq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/softirq.c 2022-12-04 10:40:26.724034024 -0500 @@ -637,6 +637,24 @@ static inline void tick_irq_exit(void) #endif } @@ -7124,7 +35266,7 @@ index c8a6913c067d9..ab1fe34326bab 100644 tick_irq_exit(); } -@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq_threads = { +@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq .thread_comm = "ksoftirqd/%u", }; @@ -7196,11 +35338,10 @@ index c8a6913c067d9..ab1fe34326bab 100644 return 0; } early_initcall(spawn_ksoftirqd); -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 23af5eca11b14..b0b4e44dd0968 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) +diff -rupN linux.orig/kernel/time/hrtimer.c linux/kernel/time/hrtimer.c +--- linux.orig/kernel/time/hrtimer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/hrtimer.c 2022-12-04 10:40:26.724034024 -0500 +@@ -1805,7 +1805,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; @@ -7218,11 +35359,10 @@ index 23af5eca11b14..b0b4e44dd0968 100644 } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); -diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c -index b0e3c9205946f..133e4160ed54b 100644 ---- a/kernel/time/tick-sched.c -+++ b/kernel/time/tick-sched.c -@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +diff -rupN linux.orig/kernel/time/tick-sched.c linux/kernel/time/tick-sched.c +--- linux.orig/kernel/time/tick-sched.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/tick-sched.c 2022-12-04 10:40:26.724034024 -0500 +@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tic static inline bool local_timer_softirq_pending(void) { @@ -7231,10 +35371,9 @@ index b0e3c9205946f..133e4160ed54b 100644 } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index 717fcb9fb14aa..e6219da89933d 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c +diff -rupN linux.orig/kernel/time/timer.c linux/kernel/time/timer.c +--- linux.orig/kernel/time/timer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/timer.c 2022-12-04 10:40:26.724034024 -0500 @@ -1822,7 +1822,7 @@ static void run_local_timers(void) if (time_before(jiffies, base->next_expiry)) return; @@ -7244,11 +35383,10 @@ index 717fcb9fb14aa..e6219da89933d 100644 } /* -diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index cc65887b31bd9..1d01756752676 100644 ---- a/kernel/trace/trace.c -+++ b/kernel/trace/trace.c -@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +diff -rupN linux.orig/kernel/trace/trace.c linux/kernel/trace/trace.c +--- linux.orig/kernel/trace/trace.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(un if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; @@ -7270,7 +35408,7 @@ index cc65887b31bd9..1d01756752676 100644 (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } -@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct trace_array *tr) +@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct static void print_lat_help_header(struct seq_file *m) { @@ -7297,7 +35435,7 @@ index cc65887b31bd9..1d01756752676 100644 } static void print_event_info(struct array_buffer *buf, struct seq_file *m) -@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file +@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(s print_event_info(buf, m); @@ -7322,11 +35460,10 @@ index cc65887b31bd9..1d01756752676 100644 } void -diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c -index 0356cae0cf74e..585380a3db753 100644 ---- a/kernel/trace/trace_events.c -+++ b/kernel/trace/trace_events.c -@@ -193,6 +193,7 @@ static int trace_define_common_fields(void) +diff -rupN linux.orig/kernel/trace/trace_events.c linux/kernel/trace/trace_events.c +--- linux.orig/kernel/trace/trace_events.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_events.c 2022-12-04 10:40:26.724034024 -0500 +@@ -193,6 +193,7 @@ static int trace_define_common_fields(vo /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); @@ -7334,11 +35471,10 @@ index 0356cae0cf74e..585380a3db753 100644 return ret; } -diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c -index 67f47ea27921d..de58eaaf1ac7a 100644 ---- a/kernel/trace/trace_output.c -+++ b/kernel/trace/trace_output.c -@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +diff -rupN linux.orig/kernel/trace/trace_output.c linux/kernel/trace/trace_output.c +--- linux.orig/kernel/trace/trace_output.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_output.c 2022-12-04 10:40:26.724034024 -0500 +@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq { char hardsoft_irq; char need_resched; @@ -7346,7 +35482,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 char irqs_off; int hardirq; int softirq; -@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) { @@ -7374,7 +35510,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : -@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq softirq ? 's' : '.' 
; @@ -7397,11 +35533,10 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else -diff --git a/kernel/watchdog.c b/kernel/watchdog.c -index 8e61f21e7e33e..41596c415111b 100644 ---- a/kernel/watchdog.c -+++ b/kernel/watchdog.c -@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +diff -rupN linux.orig/kernel/watchdog.c linux/kernel/watchdog.c +--- linux.orig/kernel/watchdog.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog.c 2022-12-04 10:40:26.724034024 -0500 +@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_tim /* Start period for the next softlockup warning. */ update_report_ts(); @@ -7410,7 +35545,7 @@ index 8e61f21e7e33e..41596c415111b 100644 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); -@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_tim add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); @@ -7419,11 +35554,10 @@ index 8e61f21e7e33e..41596c415111b 100644 } return HRTIMER_RESTART; -diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c -index 247bf0b1582ca..701f35f0e2d44 100644 ---- a/kernel/watchdog_hld.c -+++ b/kernel/watchdog_hld.c -@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +diff -rupN linux.orig/kernel/watchdog_hld.c linux/kernel/watchdog_hld.c +--- linux.orig/kernel/watchdog_hld.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog_hld.c 2022-12-04 10:40:26.724034024 -0500 +@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(s if (__this_cpu_read(hard_watchdog_warn) == true) return; @@ -7432,7 +35566,7 @@ index 247bf0b1582ca..701f35f0e2d44 100644 pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); -@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(s if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -7441,10 +35575,28 @@ index 247bf0b1582ca..701f35f0e2d44 100644 __this_cpu_write(hard_watchdog_warn, true); return; } -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index cb131fad117cc..c65e69bf4eebb 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug +diff -rupN linux.orig/lib/flex_proportions.c linux/lib/flex_proportions.c +--- linux.orig/lib/flex_proportions.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/flex_proportions.c 2022-12-04 10:40:26.728034014 -0500 +@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_globa + */ + if (events <= 1) + return false; ++ preempt_disable_nested(); + write_seqcount_begin(&p->sequence); + if (periods < 64) + events -= events >> periods; +@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_globa + percpu_counter_add(&p->events, -events); + p->period += periods; + write_seqcount_end(&p->sequence); ++ preempt_enable_nested(); + + return true; + } +diff -rupN linux.orig/lib/Kconfig.debug linux/lib/Kconfig.debug +--- linux.orig/lib/Kconfig.debug 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/Kconfig.debug 2022-12-04 10:40:26.724034024 -0500 @@ -811,6 +811,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. 
@@ -7455,31 +35607,10 @@ index cb131fad117cc..c65e69bf4eebb 100644 config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL -diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c -index 05cccbcf1661a..83332fefa6f42 100644 ---- a/lib/flex_proportions.c -+++ b/lib/flex_proportions.c -@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - */ - if (events <= 1) - return false; -+ preempt_disable_nested(); - write_seqcount_begin(&p->sequence); - if (periods < 64) - events -= events >> periods; -@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - percpu_counter_add(&p->events, -events); - p->period += periods; - write_seqcount_end(&p->sequence); -+ preempt_enable_nested(); - - return true; - } -diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 3c1853a9d1c09..ffaba68e6a290 100644 ---- a/lib/vsprintf.c -+++ b/lib/vsprintf.c -@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_enable(char *str) +diff -rupN linux.orig/lib/vsprintf.c linux/lib/vsprintf.c +--- linux.orig/lib/vsprintf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/vsprintf.c 2022-12-04 10:40:26.728034014 -0500 +@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_e } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); @@ -7543,34 +35674,14 @@ index 3c1853a9d1c09..ffaba68e6a290 100644 #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); -diff --git a/localversion-rt b/localversion-rt -new file mode 100644 -index 0000000000000..08b3e75841adc ---- /dev/null -+++ b/localversion-rt +diff -rupN linux.orig/localversion-rt linux/localversion-rt +--- linux.orig/localversion-rt 1969-12-31 19:00:00.000000000 -0500 ++++ linux/localversion-rt 2022-12-04 10:40:26.728034014 -0500 @@ -0,0 +1 @@ +-rt14 -diff --git a/mm/Kconfig b/mm/Kconfig -index 0331f1461f81c..3897e924e40f2 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -579,6 +579,12 @@ config COMPACTION - it and then we would be really interested to hear about that at - linux-mm@kvack.org. - -+config COMPACT_UNEVICTABLE_DEFAULT -+ int -+ depends on COMPACTION -+ default 0 if PREEMPT_RT -+ default 1 -+ - # - # support for free page reporting - config PAGE_REPORTING -diff --git a/mm/compaction.c b/mm/compaction.c -index 640fa76228dd9..10561cb1aaad9 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c +diff -rupN linux.orig/mm/compaction.c linux/mm/compaction.c +--- linux.orig/mm/compaction.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/compaction.c 2022-12-04 10:40:26.728034014 -0500 @@ -1727,11 +1727,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. @@ -7584,10 +35695,25 @@ index 640fa76228dd9..10561cb1aaad9 100644 static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index b69979c9ced5c..d35b6fa560f0a 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c +diff -rupN linux.orig/mm/Kconfig linux/mm/Kconfig +--- linux.orig/mm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/Kconfig 2022-12-04 10:40:26.728034014 -0500 +@@ -579,6 +579,12 @@ config COMPACTION + it and then we would be really interested to hear about that at + linux-mm@kvack.org. 
+ ++config COMPACT_UNEVICTABLE_DEFAULT ++ int ++ depends on COMPACTION ++ default 0 if PREEMPT_RT ++ default 1 ++ + # + # support for free page reporting + config PAGE_REPORTING +diff -rupN linux.orig/mm/memcontrol.c linux/mm/memcontrol.c +--- linux.orig/mm/memcontrol.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/memcontrol.c 2022-12-04 10:40:26.728034014 -0500 @@ -597,25 +597,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) @@ -7618,7 +35744,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) -@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lru * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); @@ -7627,7 +35753,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: -@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lru WARN_ON_ONCE(!in_task()); break; default: @@ -7636,10 +35762,9 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } } -diff --git a/mm/slub.c b/mm/slub.c -index 4b98dff9be8e3..59173fa5901a0 100644 ---- a/mm/slub.c -+++ b/mm/slub.c +diff -rupN linux.orig/mm/slub.c linux/mm/slub.c +--- linux.orig/mm/slub.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/slub.c 2022-12-04 10:40:26.728034014 -0500 @@ -50,7 +50,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) @@ -7705,7 +35830,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif #ifdef CONFIG_SLUB_DEBUG -@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache * /* * Per slab locking using the pagelock */ @@ -7714,7 +35839,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -455,7 +463,7 @@ static __always_inline void __slab_lock(struct slab *slab) +@@ -455,7 +463,7 @@ static __always_inline void __slab_lock( bit_spin_lock(PG_locked, &page->flags); } @@ -7723,7 +35848,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -463,31 +471,19 @@ static __always_inline void __slab_unlock(struct slab *slab) +@@ -463,31 +471,19 @@ static __always_inline void __slab_unloc __bit_spin_unlock(PG_locked, &page->flags); } @@ -7760,7 +35885,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) -@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab +@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab } else #endif { @@ -7782,7 +35907,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } cpu_relax(); -@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(s unsigned long flags; local_irq_save(flags); @@ -7802,7 +35927,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_irq_restore(flags); } -@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(s #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; @@ -7842,7 +35967,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) -@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, +@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_chec } static noinline int alloc_debug_processing(struct kmem_cache *s, @@ -7862,7 +35987,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; -@@ -1390,63 +1356,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, +@@ -1390,63 +1356,6 @@ static inline int free_consistency_check return 1; } @@ -7948,7 +36073,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, -@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) @@ -7963,7 +36088,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 account_slab(slab, oo_order(oo), s, flags); -@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct set_freepointer(s, p, NULL); } @@ -7979,11 +36104,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 return slab; } -@@ -2107,6 +2011,75 @@ static inline void remove_partial(struct kmem_cache_node *n, - n->nr_partial--; +@@ -2108,6 +2012,75 @@ static inline void remove_partial(struct } -+/* + /* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move @@ -8052,10 +36176,11 @@ index 4b98dff9be8e3..59173fa5901a0 100644 + return object; +} + - /* ++/* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. 
-@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + * +@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kme if (!pfmemalloc_match(slab, gfpflags)) continue; @@ -8069,7 +36194,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 t = acquire_slab(s, n, slab, object == NULL); if (!t) break; -@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs { return atomic_long_read(&n->total_objects); } @@ -8179,7 +36304,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) -@@ -3041,36 +3124,52 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3041,36 +3124,52 @@ new_objects: return NULL; } @@ -8245,7 +36370,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 retry_load_slab: -@@ -3094,11 +3193,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3094,11 +3193,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; @@ -8257,7 +36382,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } /* -@@ -3202,14 +3296,8 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l +@@ -3202,14 +3296,8 @@ redo: object = c->freelist; slab = c->slab; @@ -8274,7 +36399,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { -@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, +@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cach if (kfence_free(head)) return; @@ -8287,7 +36412,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 do { if (unlikely(n)) { -@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free void *tail_obj = tail ? 
: head; struct kmem_cache_cpu *c; unsigned long tid; @@ -8295,7 +36420,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 redo: /* -@@ -3482,9 +3572,13 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3482,9 +3572,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); @@ -8312,7 +36437,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 set_freepointer(s, tail_obj, freelist); -@@ -3496,16 +3590,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3496,16 +3590,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } @@ -8331,7 +36456,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { -@@ -3520,11 +3606,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3520,11 +3606,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); @@ -8345,7 +36470,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, -@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc( slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); @@ -8353,7 +36478,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); -@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc( n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; @@ -8361,7 +36486,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); -@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, +@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kme { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); @@ -8390,7 +36515,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif } -@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; @@ -8398,7 +36523,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } -@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) @@ -8407,7 +36532,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slabs_node(s, node)) ret = 1; -@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_ca { void *p; void *addr = slab_address(slab); @@ -8421,7 +36546,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); -@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_ca if (!check_object(s, slab, p, val)) break; } @@ -8430,7 +36555,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 } static int validate_slab_node(struct kmem_cache *s, -@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kmem_cache *s, +@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kme { int ret = -EINVAL; @@ -8439,11 +36564,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 ret = validate_slab_cache(s); if (ret >= 0) ret = length; -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 90af9a8572f5a..7a2d73f152304 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +diff -rupN linux.orig/mm/vmstat.c linux/mm/vmstat.c +--- linux.orig/mm/vmstat.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/vmstat.c 2022-12-04 10:40:26.728034014 -0500 +@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone * * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ @@ -8453,7 +36577,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone * } __this_cpu_write(*p, x); @@ -8463,7 +36587,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_zone_page_state); -@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist } /* See __mod_node_page_state */ @@ -8473,7 +36597,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist } __this_cpu_write(*p, x); @@ -8483,7 +36607,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_node_page_state); -@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8493,7 +36617,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, __this_cpu_write(*p, -overstep); } @@ -8503,7 +36627,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8513,7 +36637,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data __this_cpu_write(*p, -overstep); } @@ -8523,7 +36647,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) -@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8533,7 +36657,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = 
__this_cpu_read(pcp->stat_threshold); -@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, __this_cpu_write(*p, overstep); } @@ -8543,7 +36667,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8553,7 +36677,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data __this_cpu_write(*p, overstep); } @@ -8563,11 +36687,10 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) -diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c -index 035812b0461cc..ecdb47712d956 100644 ---- a/net/8021q/vlan_dev.c -+++ b/net/8021q/vlan_dev.c -@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/net/8021q/vlan_dev.c linux/net/8021q/vlan_dev.c +--- linux.orig/net/8021q/vlan_dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/8021q/vlan_dev.c 2022-12-04 10:40:26.728034014 -0500 +@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { @@ -8583,11 +36706,10 @@ index 035812b0461cc..ecdb47712d956 100644 stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; -diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c -index db4f2641d1cd1..7e2a9fb5786c9 100644 ---- a/net/bridge/br_multicast.c -+++ b/net/bridge/br_multicast.c -@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br, +diff -rupN linux.orig/net/bridge/br_multicast.c linux/net/bridge/br_multicast.c +--- linux.orig/net/bridge/br_multicast.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_multicast.c 2022-12-04 10:40:26.728034014 -0500 +@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct unsigned int start; do { @@ -8599,11 +36721,10 @@ index db4f2641d1cd1..7e2a9fb5786c9 100644 mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); -diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c -index 6e53dc9914094..f2fc284abab38 100644 ---- a/net/bridge/br_vlan.c -+++ b/net/bridge/br_vlan.c -@@ -1378,12 +1378,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, +diff -rupN linux.orig/net/bridge/br_vlan.c linux/net/bridge/br_vlan.c +--- linux.orig/net/bridge/br_vlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_vlan.c 2022-12-04 10:40:26.728034014 -0500 +@@ -1389,12 +1389,12 @@ void br_vlan_get_stats(const struct net_ cpu_stats = per_cpu_ptr(v->stats, i); do { @@ -8618,11 +36739,2324 @@ index 6e53dc9914094..f2fc284abab38 100644 u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); -diff --git a/net/core/dev.c b/net/core/dev.c -index 56c8b0921c9fd..d96506980d2f2 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *data) +diff -rupN linux.orig/net/bridge/br_vlan.c.orig linux/net/bridge/br_vlan.c.orig +--- 
linux.orig/net/bridge/br_vlan.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/bridge/br_vlan.c.orig 2022-12-04 10:40:18.724054527 -0500 +@@ -0,0 +1,2310 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++ ++#include "br_private.h" ++#include "br_private_tunnel.h" ++ ++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); ++ ++static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, ++ const void *ptr) ++{ ++ const struct net_bridge_vlan *vle = ptr; ++ u16 vid = *(u16 *)arg->key; ++ ++ return vle->vid != vid; ++} ++ ++static const struct rhashtable_params br_vlan_rht_params = { ++ .head_offset = offsetof(struct net_bridge_vlan, vnode), ++ .key_offset = offsetof(struct net_bridge_vlan, vid), ++ .key_len = sizeof(u16), ++ .nelem_hint = 3, ++ .max_size = VLAN_N_VID, ++ .obj_cmpfn = br_vlan_cmp, ++ .automatic_shrinking = true, ++}; ++ ++static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) ++{ ++ return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); ++} ++ ++static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, ++ const struct net_bridge_vlan *v) ++{ ++ if (vg->pvid == v->vid) ++ return; ++ ++ smp_wmb(); ++ br_vlan_set_pvid_state(vg, v->state); ++ vg->pvid = v->vid; ++} ++ ++static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ if (vg->pvid != vid) ++ return; ++ ++ smp_wmb(); ++ vg->pvid = 0; ++} ++ ++/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. ++ * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and ++ * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. ++ */ ++static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, ++ bool commit) ++{ ++ struct net_bridge_vlan_group *vg; ++ bool change; ++ ++ if (br_vlan_is_master(v)) ++ vg = br_vlan_group(v->br); ++ else ++ vg = nbp_vlan_group(v->port); ++ ++ /* check if anything would be changed on commit */ ++ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || ++ ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); ++ ++ if (!commit) ++ goto out; ++ ++ if (flags & BRIDGE_VLAN_INFO_PVID) ++ __vlan_add_pvid(vg, v); ++ else ++ __vlan_delete_pvid(vg, v->vid); ++ ++ if (flags & BRIDGE_VLAN_INFO_UNTAGGED) ++ v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; ++ else ++ v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; ++ ++out: ++ return change; ++} ++ ++static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) ++{ ++ return __vlan_flags_update(v, flags, false); ++} ++ ++static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) ++{ ++ __vlan_flags_update(v, flags, true); ++} ++ ++static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, ++ struct net_bridge_vlan *v, u16 flags, ++ struct netlink_ext_ack *extack) ++{ ++ int err; ++ ++ /* Try switchdev op first. In case it is not supported, fallback to ++ * 8021q add. 
++ */ ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); ++ if (err == -EOPNOTSUPP) ++ return vlan_vid_add(dev, br->vlan_proto, v->vid); ++ v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; ++ return err; ++} ++ ++static void __vlan_add_list(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct list_head *headp, *hpos; ++ struct net_bridge_vlan *vent; ++ ++ if (br_vlan_is_master(v)) ++ vg = br_vlan_group(v->br); ++ else ++ vg = nbp_vlan_group(v->port); ++ ++ headp = &vg->vlan_list; ++ list_for_each_prev(hpos, headp) { ++ vent = list_entry(hpos, struct net_bridge_vlan, vlist); ++ if (v->vid >= vent->vid) ++ break; ++ } ++ list_add_rcu(&v->vlist, hpos); ++} ++ ++static void __vlan_del_list(struct net_bridge_vlan *v) ++{ ++ list_del_rcu(&v->vlist); ++} ++ ++static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, ++ const struct net_bridge_vlan *v) ++{ ++ int err; ++ ++ /* Try switchdev op first. In case it is not supported, fallback to ++ * 8021q del. ++ */ ++ err = br_switchdev_port_vlan_del(dev, v->vid); ++ if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) ++ vlan_vid_del(dev, br->vlan_proto, v->vid); ++ return err == -EOPNOTSUPP ? 0 : err; ++} ++ ++/* Returns a master vlan, if it didn't exist it gets created. In all cases ++ * a reference is taken to the master vlan before returning. ++ */ ++static struct net_bridge_vlan * ++br_vlan_get_master(struct net_bridge *br, u16 vid, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *masterv; ++ ++ vg = br_vlan_group(br); ++ masterv = br_vlan_find(vg, vid); ++ if (!masterv) { ++ bool changed; ++ ++ /* missing global ctx, create it now */ ++ if (br_vlan_add(br, vid, 0, &changed, extack)) ++ return NULL; ++ masterv = br_vlan_find(vg, vid); ++ if (WARN_ON(!masterv)) ++ return NULL; ++ refcount_set(&masterv->refcnt, 1); ++ return masterv; ++ } ++ refcount_inc(&masterv->refcnt); ++ ++ return masterv; ++} ++ ++static void br_master_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(!br_vlan_is_master(v)); ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_put_master(struct net_bridge_vlan *masterv) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ if (!br_vlan_is_master(masterv)) ++ return; ++ ++ vg = br_vlan_group(masterv->br); ++ if (refcount_dec_and_test(&masterv->refcnt)) { ++ rhashtable_remove_fast(&vg->vlan_hash, ++ &masterv->vnode, br_vlan_rht_params); ++ __vlan_del_list(masterv); ++ br_multicast_toggle_one_vlan(masterv, false); ++ br_multicast_ctx_deinit(&masterv->br_mcast_ctx); ++ call_rcu(&masterv->rcu, br_master_vlan_rcu_free); ++ } ++} ++ ++static void nbp_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(br_vlan_is_master(v)); ++ /* if we had per-port stats configured then free them here */ ++ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_init_state(struct net_bridge_vlan *v) ++{ ++ struct net_bridge *br; ++ ++ if (br_vlan_is_master(v)) ++ br = v->br; ++ else ++ br = v->port->br; ++ ++ if (br_opt_get(br, BROPT_MST_ENABLED)) { ++ br_mst_vlan_init_state(v); ++ return; ++ } ++ ++ v->state = BR_STATE_FORWARDING; ++ v->msti = 0; ++} ++ ++/* This is the shared VLAN add function which works for both ports and bridge ++ * devices. 
There are four possible calls to this function in terms of the ++ * vlan entry type: ++ * 1. vlan is being added on a port (no master flags, global entry exists) ++ * 2. vlan is being added on a bridge (both master and brentry flags) ++ * 3. vlan is being added on a port, but a global entry didn't exist which ++ * is being created right now (master flag set, brentry flag unset), the ++ * global entry is used for global per-vlan features, but not for filtering ++ * 4. same as 3 but with both master and brentry flags set so the entry ++ * will be used for filtering in both the port and the bridge ++ */ ++static int __vlan_add(struct net_bridge_vlan *v, u16 flags, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *masterv = NULL; ++ struct net_bridge_port *p = NULL; ++ struct net_bridge_vlan_group *vg; ++ struct net_device *dev; ++ struct net_bridge *br; ++ int err; ++ ++ if (br_vlan_is_master(v)) { ++ br = v->br; ++ dev = br->dev; ++ vg = br_vlan_group(br); ++ } else { ++ p = v->port; ++ br = p->br; ++ dev = p->dev; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (p) { ++ /* Add VLAN to the device filter if it is supported. ++ * This ensures tagged traffic enters the bridge when ++ * promiscuous mode is disabled by br_manage_promisc(). ++ */ ++ err = __vlan_vid_add(dev, br, v, flags, extack); ++ if (err) ++ goto out; ++ ++ /* need to work on the master vlan too */ ++ if (flags & BRIDGE_VLAN_INFO_MASTER) { ++ bool changed; ++ ++ err = br_vlan_add(br, v->vid, ++ flags | BRIDGE_VLAN_INFO_BRENTRY, ++ &changed, extack); ++ if (err) ++ goto out_filt; ++ ++ if (changed) ++ br_vlan_notify(br, NULL, v->vid, 0, ++ RTM_NEWVLAN); ++ } ++ ++ masterv = br_vlan_get_master(br, v->vid, extack); ++ if (!masterv) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->brvlan = masterv; ++ if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { ++ v->stats = ++ netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!v->stats) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; ++ } else { ++ v->stats = masterv->stats; ++ } ++ br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); ++ } else { ++ if (br_vlan_should_use(v)) { ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, ++ false, extack); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ } ++ br_multicast_ctx_init(br, v, &v->br_mcast_ctx); ++ v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; ++ } ++ ++ /* Add the dev mac and count the vlan only if it's usable */ ++ if (br_vlan_should_use(v)) { ++ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); ++ if (err) { ++ br_err(br, "failed insert local address into bridge forwarding table\n"); ++ goto out_filt; ++ } ++ vg->num_vlans++; ++ } ++ ++ /* set the state before publishing */ ++ br_vlan_init_state(v); ++ ++ err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ if (err) ++ goto out_fdb_insert; ++ ++ __vlan_add_list(v); ++ __vlan_flags_commit(v, flags); ++ br_multicast_toggle_one_vlan(v, true); ++ ++ if (p) ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++out: ++ return err; ++ ++out_fdb_insert: ++ if (br_vlan_should_use(v)) { ++ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); ++ vg->num_vlans--; ++ } ++ ++out_filt: ++ if (p) { ++ __vlan_vid_del(dev, br, v); ++ if (masterv) { ++ if (v->stats && masterv->stats != v->stats) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ ++ br_vlan_put_master(masterv); ++ v->brvlan = NULL; ++ } ++ } else { ++ br_switchdev_port_vlan_del(dev, v->vid); ++ } ++ ++ goto out; ++} ++ ++static int 
__vlan_del(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan *masterv = v; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0; ++ ++ if (br_vlan_is_master(v)) { ++ vg = br_vlan_group(v->br); ++ } else { ++ p = v->port; ++ vg = nbp_vlan_group(v->port); ++ masterv = v->brvlan; ++ } ++ ++ __vlan_delete_pvid(vg, v->vid); ++ if (p) { ++ err = __vlan_vid_del(p->dev, p->br, v); ++ if (err) ++ goto out; ++ } else { ++ err = br_switchdev_port_vlan_del(v->br->dev, v->vid); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ err = 0; ++ } ++ ++ if (br_vlan_should_use(v)) { ++ v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans--; ++ } ++ ++ if (masterv != v) { ++ vlan_tunnel_info_del(vg, v); ++ rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ __vlan_del_list(v); ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++ br_multicast_toggle_one_vlan(v, false); ++ br_multicast_port_ctx_deinit(&v->port_mcast_ctx); ++ call_rcu(&v->rcu, nbp_vlan_rcu_free); ++ } ++ ++ br_vlan_put_master(masterv); ++out: ++ return err; ++} ++ ++static void __vlan_group_free(struct net_bridge_vlan_group *vg) ++{ ++ WARN_ON(!list_empty(&vg->vlan_list)); ++ rhashtable_destroy(&vg->vlan_hash); ++ vlan_tunnel_deinit(vg); ++ kfree(vg); ++} ++ ++static void __vlan_flush(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg) ++{ ++ struct net_bridge_vlan *vlan, *tmp; ++ u16 v_start = 0, v_end = 0; ++ int err; ++ ++ __vlan_delete_pvid(vg, vg->pvid); ++ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { ++ /* take care of disjoint ranges */ ++ if (!v_start) { ++ v_start = vlan->vid; ++ } else if (vlan->vid - v_end != 1) { ++ /* found range end, notify and start next one */ ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++ v_start = vlan->vid; ++ } ++ v_end = vlan->vid; ++ ++ err = __vlan_del(vlan); ++ if (err) { ++ br_err(br, ++ "port %u(%s) failed to delete vlan %d: %pe\n", ++ (unsigned int) p->port_no, p->dev->name, ++ vlan->vid, ERR_PTR(err)); ++ } ++ } ++ ++ /* notify about the last/whole vlan range */ ++ if (v_start) ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++} ++ ++struct sk_buff *br_handle_vlan(struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ goto out; ++ ++ /* At this point, we know that the frame was filtered and contains ++ * a valid vlan id. If the vlan id has untagged flag set, ++ * send untagged; otherwise, send tagged. ++ */ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ /* Vlan entry must be configured at this point. The ++ * only exception is the bridge is set in promisc mode and the ++ * packet is destined for the bridge device. In this case ++ * pass the packet as is. 
++ */ ++ if (!v || !br_vlan_should_use(v)) { ++ if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { ++ goto out; ++ } else { ++ kfree_skb(skb); ++ return NULL; ++ } ++ } ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->tx_bytes, skb->len); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ /* If the skb will be sent using forwarding offload, the assumption is ++ * that the switchdev will inject the packet into hardware together ++ * with the bridge VLAN, so that it can be forwarded according to that ++ * VLAN. The switchdev should deal with popping the VLAN header in ++ * hardware on each egress port as appropriate. So only strip the VLAN ++ * header if forwarding offload is not being used. ++ */ ++ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && ++ !br_switchdev_frame_uses_tx_fwd_offload(skb)) ++ __vlan_hwaccel_clear_tag(skb); ++ ++ if (p && (p->flags & BR_VLAN_TUNNEL) && ++ br_handle_egress_vlan_tunnel(skb, v)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++out: ++ return skb; ++} ++ ++/* Called under RCU */ ++static bool __allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb, u16 *vid, ++ u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ bool tagged; ++ ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = true; ++ /* If vlan tx offload is disabled on bridge device and frame was ++ * sent from vlan device on the bridge device, it does not have ++ * HW accelerated vlan tag. ++ */ ++ if (unlikely(!skb_vlan_tag_present(skb) && ++ skb->protocol == br->vlan_proto)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ return false; ++ } ++ ++ if (!br_vlan_get_tag(skb, vid)) { ++ /* Tagged frame */ ++ if (skb->vlan_proto != br->vlan_proto) { ++ /* Protocol-mismatch, empty out vlan_tci for new tag */ ++ skb_push(skb, ETH_HLEN); ++ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (unlikely(!skb)) ++ return false; ++ ++ skb_pull(skb, ETH_HLEN); ++ skb_reset_mac_len(skb); ++ *vid = 0; ++ tagged = false; ++ } else { ++ tagged = true; ++ } ++ } else { ++ /* Untagged frame */ ++ tagged = false; ++ } ++ ++ if (!*vid) { ++ u16 pvid = br_get_pvid(vg); ++ ++ /* Frame had a tag with VID 0 or did not have a tag. ++ * See if pvid is set on this port. That tells us which ++ * vlan untagged or priority-tagged traffic belongs to. ++ */ ++ if (!pvid) ++ goto drop; ++ ++ /* PVID is set on this port. Any untagged or priority-tagged ++ * ingress frame is considered to belong to this vlan. ++ */ ++ *vid = pvid; ++ if (likely(!tagged)) ++ /* Untagged Frame. */ ++ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); ++ else ++ /* Priority-tagged Frame. ++ * At this point, we know that skb->vlan_tci VID ++ * field was 0. ++ * We update only VID field and preserve PCP field. 
++ */ ++ skb->vlan_tci |= pvid; ++ ++ /* if snooping and stats are disabled we can avoid the lookup */ ++ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && ++ !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_pvid_state(vg); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ return true; ++ } ++ } ++ v = br_vlan_find(vg, *vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto drop; ++ ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_state(v); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->rx_bytes, skb->len); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ *vlan = v; ++ ++ return true; ++ ++drop: ++ kfree_skb(skb); ++ return false; ++} ++ ++bool br_allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, struct sk_buff *skb, ++ u16 *vid, u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ /* If VLAN filtering is disabled on the bridge, all packets are ++ * permitted. ++ */ ++ *vlan = NULL; ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = false; ++ return true; ++ } ++ ++ return __allowed_ingress(br, vg, skb, vid, state, vlan); ++} ++ ++/* Called under RCU. */ ++bool br_allowed_egress(struct net_bridge_vlan_group *vg, ++ const struct sk_buff *skb) ++{ ++ const struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ return true; ++ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ if (v && br_vlan_should_use(v) && ++ br_vlan_state_allowed(br_vlan_get_state(v), false)) ++ return true; ++ ++ return false; ++} ++ ++/* Called under RCU */ ++bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge *br = p->br; ++ struct net_bridge_vlan *v; ++ ++ /* If filtering was disabled at input, let it pass. */ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return true; ++ ++ vg = nbp_vlan_group_rcu(p); ++ if (!vg || !vg->num_vlans) ++ return false; ++ ++ if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) ++ *vid = 0; ++ ++ if (!*vid) { ++ *vid = br_get_pvid(vg); ++ if (!*vid || ++ !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) ++ return false; ++ ++ return true; ++ } ++ ++ v = br_vlan_find(vg, *vid); ++ if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) ++ return true; ++ ++ return false; ++} ++ ++static int br_vlan_add_existing(struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct net_bridge_vlan *vlan, ++ u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ bool becomes_brentry = false; ++ int err; ++ ++ if (!br_vlan_is_brentry(vlan)) { ++ /* Trying to change flags of non-existent bridge vlan */ ++ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) ++ return -EINVAL; ++ ++ becomes_brentry = true; ++ } ++ ++ /* Master VLANs that aren't brentries weren't notified before, ++ * time to notify them now. 
++ */ ++ if (becomes_brentry || would_change) { ++ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, ++ would_change, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ if (becomes_brentry) { ++ /* It was only kept for port vlans, now make it real */ ++ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); ++ if (err) { ++ br_err(br, "failed to insert local address into bridge forwarding table\n"); ++ goto err_fdb_insert; ++ } ++ ++ refcount_inc(&vlan->refcnt); ++ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans++; ++ *changed = true; ++ br_multicast_toggle_one_vlan(vlan, true); ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ if (would_change) ++ *changed = true; ++ ++ return 0; ++ ++err_fdb_insert: ++ br_switchdev_port_vlan_del(br->dev, vlan->vid); ++ return err; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vg = br_vlan_group(br); ++ vlan = br_vlan_find(vg, vid); ++ if (vlan) ++ return br_vlan_add_existing(br, vg, vlan, flags, changed, ++ extack); ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!vlan->stats) { ++ kfree(vlan); ++ return -ENOMEM; ++ } ++ vlan->vid = vid; ++ vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; ++ vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; ++ vlan->br = br; ++ if (flags & BRIDGE_VLAN_INFO_BRENTRY) ++ refcount_set(&vlan->refcnt, 1); ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) { ++ free_percpu(vlan->stats); ++ kfree(vlan); ++ } else { ++ *changed = true; ++ } ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ */ ++int br_vlan_delete(struct net_bridge *br, u16 vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_is_brentry(v)) ++ return -ENOENT; ++ ++ br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); ++ br_fdb_delete_by_port(br, NULL, vid, 0); ++ ++ vlan_tunnel_info_del(vg, v); ++ ++ return __vlan_del(v); ++} ++ ++void br_vlan_flush(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ __vlan_flush(br, NULL, vg); ++ RCU_INIT_POINTER(br->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ if (!vg) ++ return NULL; ++ ++ return br_vlan_lookup(&vg->vlan_hash, vid); ++} ++ ++/* Must be protected by RTNL. */ ++static void recalculate_group_addr(struct net_bridge *br) ++{ ++ if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) ++ return; ++ ++ spin_lock_bh(&br->lock); ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) { ++ /* Bridge Group Address */ ++ br->group_addr[5] = 0x00; ++ } else { /* vlan_enabled && ETH_P_8021AD */ ++ /* Provider Bridge Group Address */ ++ br->group_addr[5] = 0x08; ++ } ++ spin_unlock_bh(&br->lock); ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_recalculate_fwd_mask(struct net_bridge *br) ++{ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) ++ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; ++ else /* vlan_enabled && ETH_P_8021AD */ ++ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ++ ~(1u << br->group_addr[5]); ++} ++ ++int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = val, ++ }; ++ int err; ++ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) ++ return 0; ++ ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) { ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); ++ return err; ++ } ++ ++ br_manage_promisc(br); ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { ++ br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); ++ br_multicast_toggle_vlan_snooping(br, false, NULL); ++ } ++ ++ return 0; ++} ++ ++bool br_vlan_enabled(const struct net_device *dev) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ return br_opt_get(br, BROPT_VLAN_ENABLED); ++} ++EXPORT_SYMBOL_GPL(br_vlan_enabled); ++ ++int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ *p_proto = ntohs(br->vlan_proto); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_proto); ++ ++int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_protocol = ntohs(proto), ++ }; ++ int err = 0; ++ struct net_bridge_port *p; ++ struct net_bridge_vlan *vlan; ++ struct net_bridge_vlan_group *vg; ++ __be16 oldproto = br->vlan_proto; ++ ++ if (br->vlan_proto == proto) ++ return 0; ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ ++ /* Add VLANs for the new proto to the device filter. */ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ err = vlan_vid_add(p->dev, proto, vlan->vid); ++ if (err) ++ goto err_filt; ++ } ++ } ++ ++ br->vlan_proto = proto; ++ ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ ++ /* Delete VLANs for the old proto from the device filter. 
*/ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, oldproto, vlan->vid); ++ } ++ } ++ ++ return 0; ++ ++err_filt: ++ attr.u.vlan_protocol = ntohs(oldproto); ++ switchdev_port_attr_set(br->dev, &attr, NULL); ++ ++ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ } ++ ++ return err; ++} ++ ++int br_vlan_set_proto(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ if (!eth_type_vlan(htons(val))) ++ return -EPROTONOSUPPORT; ++ ++ return __br_vlan_set_proto(br, htons(val), extack); ++} ++ ++int br_vlan_set_stats(struct net_bridge *br, unsigned long val) ++{ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) ++{ ++ struct net_bridge_port *p; ++ ++ /* allow to change the option if there are no port vlans configured */ ++ list_for_each_entry(p, &br->port_list, list) { ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ ++ if (vg->num_vlans) ++ return -EBUSY; ++ } ++ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ if (vid != vg->pvid) ++ return false; ++ ++ v = br_vlan_lookup(&vg->vlan_hash, vid); ++ if (v && br_vlan_should_use(v) && ++ (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return true; ++ ++ return false; ++} ++ ++static void br_vlan_disable_default_pvid(struct net_bridge *br) ++{ ++ struct net_bridge_port *p; ++ u16 pvid = br->default_pvid; ++ ++ /* Disable default_pvid on all ports where it is still ++ * configured. ++ */ ++ if (vlan_default_pvid(br_vlan_group(br), pvid)) { ++ if (!br_vlan_delete(br, pvid)) ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ if (vlan_default_pvid(nbp_vlan_group(p), pvid) && ++ !nbp_vlan_delete(p, pvid)) ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ br->default_pvid = 0; ++} ++ ++int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_bridge_vlan *pvent; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ unsigned long *changed; ++ bool vlchange; ++ u16 old_pvid; ++ int err = 0; ++ ++ if (!pvid) { ++ br_vlan_disable_default_pvid(br); ++ return 0; ++ } ++ ++ changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); ++ if (!changed) ++ return -ENOMEM; ++ ++ old_pvid = br->default_pvid; ++ ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. 
++ */ ++ vg = br_vlan_group(br); ++ pvent = br_vlan_find(vg, pvid); ++ if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && ++ (!pvent || !br_vlan_should_use(pvent))) { ++ err = br_vlan_add(br, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, extack); ++ if (err) ++ goto out; ++ ++ if (br_vlan_delete(br, old_pvid)) ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); ++ __set_bit(0, changed); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. ++ */ ++ vg = nbp_vlan_group(p); ++ if ((old_pvid && ++ !vlan_default_pvid(vg, old_pvid)) || ++ br_vlan_find(vg, pvid)) ++ continue; ++ ++ err = nbp_vlan_add(p, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, extack); ++ if (err) ++ goto err_port; ++ if (nbp_vlan_delete(p, old_pvid)) ++ br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); ++ __set_bit(p->port_no, changed); ++ } ++ ++ br->default_pvid = pvid; ++ ++out: ++ bitmap_free(changed); ++ return err; ++ ++err_port: ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ if (!test_bit(p->port_no, changed)) ++ continue; ++ ++ if (old_pvid) { ++ nbp_vlan_add(p, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, NULL); ++ br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); ++ } ++ nbp_vlan_delete(p, pvid); ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ if (test_bit(0, changed)) { ++ if (old_pvid) { ++ br_vlan_add(br, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, NULL); ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); ++ } ++ br_vlan_delete(br, pvid); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ goto out; ++} ++ ++int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ u16 pvid = val; ++ int err = 0; ++ ++ if (val >= VLAN_VID_MASK) ++ return -EINVAL; ++ ++ if (pvid == br->default_pvid) ++ goto out; ++ ++ /* Only allow default pvid change when filtering is disabled */ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ pr_info_once("Please disable vlan filtering to change default_pvid\n"); ++ err = -EPERM; ++ goto out; ++ } ++ err = __br_vlan_set_default_pvid(br, pvid, extack); ++out: ++ return err; ++} ++ ++int br_vlan_init(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(*vg), GFP_KERNEL); ++ if (!vg) ++ goto out; ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ br->vlan_proto = htons(ETH_P_8021Q); ++ br->default_pvid = 1; ++ rcu_assign_pointer(br->vlgrp, vg); ++ ++out: ++ return ret; ++ ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++ kfree(vg); ++ ++ goto out; ++} ++ ++int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = p->br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), ++ }; ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); 
++ if (!vg) ++ goto out; ++ ++ ret = switchdev_port_attr_set(p->dev, &attr, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ goto err_vlan_enabled; ++ ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ rcu_assign_pointer(p->vlgrp, vg); ++ if (p->br->default_pvid) { ++ bool changed; ++ ++ ret = nbp_vlan_add(p, p->br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &changed, extack); ++ if (ret) ++ goto err_vlan_add; ++ br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); ++ } ++out: ++ return ret; ++ ++err_vlan_add: ++ RCU_INIT_POINTER(p->vlgrp, NULL); ++ synchronize_rcu(); ++ vlan_tunnel_deinit(vg); ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++err_vlan_enabled: ++ kfree(vg); ++ ++ goto out; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, ++ bool *changed, struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vlan = br_vlan_find(nbp_vlan_group(port), vid); ++ if (vlan) { ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ ++ if (would_change) { ++ /* Pass the flags to the hardware bridge */ ++ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, ++ true, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ return ret; ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ *changed = would_change; ++ ++ return 0; ++ } ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->vid = vid; ++ vlan->port = port; ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) ++ kfree(vlan); ++ else ++ *changed = true; ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. 
++ */ ++int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ v = br_vlan_find(nbp_vlan_group(port), vid); ++ if (!v) ++ return -ENOENT; ++ br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); ++ br_fdb_delete_by_port(port->br, port, vid, 0); ++ ++ return __vlan_del(v); ++} ++ ++void nbp_vlan_flush(struct net_bridge_port *port) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = nbp_vlan_group(port); ++ __vlan_flush(port->br, port, vg); ++ RCU_INIT_POINTER(port->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++void br_vlan_get_stats(const struct net_bridge_vlan *v, ++ struct pcpu_sw_netstats *stats) ++{ ++ int i; ++ ++ memset(stats, 0, sizeof(*stats)); ++ for_each_possible_cpu(i) { ++ u64 rxpackets, rxbytes, txpackets, txbytes; ++ struct pcpu_sw_netstats *cpu_stats; ++ unsigned int start; ++ ++ cpu_stats = per_cpu_ptr(v->stats, i); ++ do { ++ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); ++ rxpackets = u64_stats_read(&cpu_stats->rx_packets); ++ rxbytes = u64_stats_read(&cpu_stats->rx_bytes); ++ txbytes = u64_stats_read(&cpu_stats->tx_bytes); ++ txpackets = u64_stats_read(&cpu_stats->tx_packets); ++ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); ++ ++ u64_stats_add(&stats->rx_packets, rxpackets); ++ u64_stats_add(&stats->rx_bytes, rxbytes); ++ u64_stats_add(&stats->tx_bytes, txbytes); ++ u64_stats_add(&stats->tx_packets, txpackets); ++ } ++} ++ ++int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid); ++ ++int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); ++ ++void br_vlan_fill_forward_path_pvid(struct net_bridge *br, ++ struct net_device_path_ctx *ctx, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ int idx = ctx->num_vlans - 1; ++ u16 vid; ++ ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return; ++ ++ vg = br_vlan_group(br); ++ ++ if (idx >= 0 && ++ ctx->vlan[idx].proto == br->vlan_proto) { ++ vid = ctx->vlan[idx].id; ++ } else { ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; ++ vid = br_get_pvid(vg); ++ } ++ ++ path->bridge.vlan_id = vid; ++ path->bridge.vlan_proto = br->vlan_proto; ++} ++ ++int br_vlan_fill_forward_path_mode(struct net_bridge *br, ++ struct net_bridge_port *dst, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return 0; ++ ++ vg = nbp_vlan_group_rcu(dst); ++ v = br_vlan_find(vg, path->bridge.vlan_id); ++ if (!v || !br_vlan_should_use(v)) ++ return -EINVAL; ++ ++ if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return 0; ++ ++ if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) ++ path->bridge.vlan_mode = 
DEV_PATH_BR_VLAN_KEEP; ++ else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; ++ else ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; ++ ++ return 0; ++} ++ ++int br_vlan_get_info(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info); ++ ++int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); ++ ++static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) ++{ ++ return is_vlan_dev(dev) && ++ !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); ++} ++ ++static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, ++ __always_unused struct netdev_nested_priv *priv) ++{ ++ return br_vlan_is_bind_vlan_dev(dev); ++} ++ ++static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) ++{ ++ int found; ++ ++ rcu_read_lock(); ++ found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, ++ NULL); ++ rcu_read_unlock(); ++ ++ return !!found; ++} ++ ++struct br_vlan_bind_walk_data { ++ u16 vid; ++ struct net_device *result; ++}; ++ ++static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_bind_walk_data *data = priv->data; ++ int found = 0; ++ ++ if (br_vlan_is_bind_vlan_dev(dev) && ++ vlan_dev_priv(dev)->vlan_id == data->vid) { ++ data->result = dev; ++ found = 1; ++ } ++ ++ return found; ++} ++ ++static struct net_device * ++br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) ++{ ++ struct br_vlan_bind_walk_data data = { ++ .vid = vid, ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, ++ &priv); ++ rcu_read_unlock(); ++ ++ return data.result; ++} ++ ++static bool br_vlan_is_dev_up(const struct net_device *dev) ++{ ++ return !!(dev->flags & IFF_UP) && netif_oper_up(dev); ++} ++ ++static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, ++ struct net_device *vlan_dev) ++{ ++ u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ bool has_carrier = false; ++ ++ if (!netif_carrier_ok(br->dev)) { ++ netif_carrier_off(vlan_dev); ++ return; ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { ++ has_carrier = 
true; ++ break; ++ } ++ } ++ ++ if (has_carrier) ++ netif_carrier_on(vlan_dev); ++ else ++ netif_carrier_off(vlan_dev); ++} ++ ++static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) ++{ ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ struct net_bridge_vlan *vlan; ++ struct net_device *vlan_dev; ++ ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, ++ vlan->vid); ++ if (vlan_dev) { ++ if (br_vlan_is_dev_up(p->dev)) { ++ if (netif_carrier_ok(p->br->dev)) ++ netif_carrier_on(vlan_dev); ++ } else { ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++ } ++ } ++ } ++} ++ ++static void br_vlan_upper_change(struct net_device *dev, ++ struct net_device *upper_dev, ++ bool linking) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ if (!br_vlan_is_bind_vlan_dev(upper_dev)) ++ return; ++ ++ if (linking) { ++ br_vlan_set_vlan_dev_state(br, upper_dev); ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); ++ } else { ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, ++ br_vlan_has_upper_bind_vlan_dev(dev)); ++ } ++} ++ ++struct br_vlan_link_state_walk_data { ++ struct net_bridge *br; ++}; ++ ++static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_link_state_walk_data *data = priv->data; ++ ++ if (br_vlan_is_bind_vlan_dev(vlan_dev)) ++ br_vlan_set_vlan_dev_state(data->br, vlan_dev); ++ ++ return 0; ++} ++ ++static void br_vlan_link_state_change(struct net_device *dev, ++ struct net_bridge *br) ++{ ++ struct br_vlan_link_state_walk_data data = { ++ .br = br ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, ++ &priv); ++ rcu_read_unlock(); ++} ++ ++/* Must be protected by RTNL. */ ++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) ++{ ++ struct net_device *vlan_dev; ++ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); ++ if (vlan_dev) ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++} ++ ++/* Must be protected by RTNL. */ ++int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) ++{ ++ struct netdev_notifier_changeupper_info *info; ++ struct net_bridge *br = netdev_priv(dev); ++ int vlcmd = 0, ret = 0; ++ bool changed = false; ++ ++ switch (event) { ++ case NETDEV_REGISTER: ++ ret = br_vlan_add(br, br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); ++ vlcmd = RTM_NEWVLAN; ++ break; ++ case NETDEV_UNREGISTER: ++ changed = !br_vlan_delete(br, br->default_pvid); ++ vlcmd = RTM_DELVLAN; ++ break; ++ case NETDEV_CHANGEUPPER: ++ info = ptr; ++ br_vlan_upper_change(dev, info->upper_dev, info->linking); ++ break; ++ ++ case NETDEV_CHANGE: ++ case NETDEV_UP: ++ if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) ++ break; ++ br_vlan_link_state_change(dev, br); ++ break; ++ } ++ if (changed) ++ br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) ++{ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ switch (event) { ++ case NETDEV_CHANGE: ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ br_vlan_set_all_vlan_dev_state(p); ++ break; ++ } ++} ++ ++static bool br_vlan_stats_fill(struct sk_buff *skb, ++ const struct net_bridge_vlan *v) ++{ ++ struct pcpu_sw_netstats stats; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); ++ if (!nest) ++ return false; ++ ++ br_vlan_get_stats(v, &stats); ++ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, ++ u64_stats_read(&stats.rx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, ++ u64_stats_read(&stats.rx_packets), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, ++ u64_stats_read(&stats.tx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, ++ u64_stats_read(&stats.tx_packets), ++ BRIDGE_VLANDB_STATS_PAD)) ++ goto out_err; ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++/* v_opts is used to dump the options which must be equal in the whole range */ ++static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, ++ const struct net_bridge_vlan *v_opts, ++ u16 flags, ++ bool dump_stats) ++{ ++ struct bridge_vlan_info info; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); ++ if (!nest) ++ return false; ++ ++ memset(&info, 0, sizeof(info)); ++ info.vid = vid; ++ if (flags & BRIDGE_VLAN_INFO_UNTAGGED) ++ info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; ++ if (flags & BRIDGE_VLAN_INFO_PVID) ++ info.flags |= BRIDGE_VLAN_INFO_PVID; ++ ++ if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) ++ goto out_err; ++ ++ if (vid_range && vid < vid_range && ++ !(flags & BRIDGE_VLAN_INFO_PVID) && ++ nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) ++ goto out_err; ++ ++ if (v_opts) { ++ if (!br_vlan_opts_fill(skb, v_opts)) ++ goto out_err; ++ ++ if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) ++ goto out_err; ++ } ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++static size_t rtnl_vlan_nlmsg_size(void) ++{ ++ return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) ++ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ ++ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ ++ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ ++ + br_vlan_opts_nl_size(); /* bridge vlan options */ ++} ++ ++void br_vlan_notify(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ u16 vid, u16 vid_range, ++ int cmd) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v = NULL; ++ struct br_vlan_msg *bvm; ++ struct nlmsghdr *nlh; ++ struct sk_buff *skb; ++ int err = -ENOBUFS; ++ struct net *net; ++ u16 flags = 0; ++ int ifindex; ++ ++ /* right now notifications are done only with rtnl held */ ++ ASSERT_RTNL(); ++ ++ if (p) { ++ ifindex = p->dev->ifindex; ++ vg = nbp_vlan_group(p); ++ net = dev_net(p->dev); ++ } else { ++ ifindex = br->dev->ifindex; ++ vg = br_vlan_group(br); ++ net = dev_net(br->dev); ++ } ++ ++ skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); ++ if (!skb) ++ goto out_err; ++ ++ err = -EMSGSIZE; ++ nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); ++ if (!nlh) ++ goto out_err; ++ bvm = nlmsg_data(nlh); ++ 
memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = AF_BRIDGE; ++ bvm->ifindex = ifindex; ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ /* need to find the vlan due to flags/options */ ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto out_kfree; ++ ++ flags = v->flags; ++ if (br_get_pvid(vg) == v->vid) ++ flags |= BRIDGE_VLAN_INFO_PVID; ++ break; ++ case RTM_DELVLAN: ++ break; ++ default: ++ goto out_kfree; ++ } ++ ++ if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false)) ++ goto out_err; ++ ++ nlmsg_end(skb, nlh); ++ rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); ++ return; ++ ++out_err: ++ rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); ++out_kfree: ++ kfree_skb(skb); ++} ++ ++/* check if v_curr can enter a range ending in range_end */ ++bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, ++ const struct net_bridge_vlan *range_end) ++{ ++ return v_curr->vid - range_end->vid == 1 && ++ range_end->flags == v_curr->flags && ++ br_vlan_opts_eq_range(v_curr, range_end); ++} ++ ++static int br_vlan_dump_dev(const struct net_device *dev, ++ struct sk_buff *skb, ++ struct netlink_callback *cb, ++ u32 dump_flags) ++{ ++ struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; ++ bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); ++ bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); ++ struct net_bridge_vlan_group *vg; ++ int idx = 0, s_idx = cb->args[1]; ++ struct nlmsghdr *nlh = NULL; ++ struct net_bridge_port *p; ++ struct br_vlan_msg *bvm; ++ struct net_bridge *br; ++ int err = 0; ++ u16 pvid; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) ++ return -EINVAL; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group_rcu(br); ++ p = NULL; ++ } else { ++ /* global options are dumped only for bridge devices */ ++ if (dump_global) ++ return 0; ++ ++ p = br_port_get_rcu(dev); ++ if (WARN_ON(!p)) ++ return -EINVAL; ++ vg = nbp_vlan_group_rcu(p); ++ br = p->br; ++ } ++ ++ if (!vg) ++ return 0; ++ ++ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); ++ if (!nlh) ++ return -EMSGSIZE; ++ bvm = nlmsg_data(nlh); ++ memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = PF_BRIDGE; ++ bvm->ifindex = dev->ifindex; ++ pvid = br_get_pvid(vg); ++ ++ /* idx must stay at range's beginning until it is filled in */ ++ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { ++ if (!dump_global && !br_vlan_should_use(v)) ++ continue; ++ if (idx < s_idx) { ++ idx++; ++ continue; ++ } ++ ++ if (!range_start) { ++ range_start = v; ++ range_end = v; ++ continue; ++ } ++ ++ if (dump_global) { ++ if (br_vlan_global_opts_can_enter_range(v, range_end)) ++ goto update_end; ++ if (!br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, ++ range_start)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } else if (dump_stats || v->vid == pvid || ++ !br_vlan_can_enter_range(v, range_end)) { ++ u16 vlan_flags = br_vlan_flags(range_start, pvid); ++ ++ if (!br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ vlan_flags, dump_stats)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } ++update_end: ++ range_end = v; ++ } ++ ++ /* err will be 0 and range_start will be set in 3 cases here: ++ * - first vlan 
(range_start == range_end) ++ * - last vlan (range_start == range_end, not in range) ++ * - last vlan range (range_start != range_end, in range) ++ */ ++ if (!err && range_start) { ++ if (dump_global && ++ !br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, range_start)) ++ err = -EMSGSIZE; ++ else if (!dump_global && ++ !br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ br_vlan_flags(range_start, pvid), ++ dump_stats)) ++ err = -EMSGSIZE; ++ } ++ ++ cb->args[1] = err ? idx : 0; ++ ++ nlmsg_end(skb, nlh); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { ++ [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, ++}; ++ ++static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; ++ int idx = 0, err = 0, s_idx = cb->args[0]; ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ u32 dump_flags = 0; ++ ++ err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, ++ br_vlan_db_dump_pol, cb->extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(cb->nlh); ++ if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) ++ dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); ++ ++ rcu_read_lock(); ++ if (bvm->ifindex) { ++ dev = dev_get_by_index_rcu(net, bvm->ifindex); ++ if (!dev) { ++ err = -ENODEV; ++ goto out_err; ++ } ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ /* if the dump completed without an error we return 0 here */ ++ if (err != -EMSGSIZE) ++ goto out_err; ++ } else { ++ for_each_netdev_rcu(net, dev) { ++ if (idx < s_idx) ++ goto skip; ++ ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ if (err == -EMSGSIZE) ++ break; ++skip: ++ idx++; ++ } ++ } ++ cb->args[0] = idx; ++ rcu_read_unlock(); ++ ++ return skb->len; ++ ++out_err: ++ rcu_read_unlock(); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { ++ [BRIDGE_VLANDB_ENTRY_INFO] = ++ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), ++ [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, ++ [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, ++ [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, ++ [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, ++}; ++ ++static int br_vlan_rtm_process_one(struct net_device *dev, ++ const struct nlattr *attr, ++ int cmd, struct netlink_ext_ack *extack) ++{ ++ struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; ++ struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; ++ bool changed = false, skip_processing = false; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0, cmdmap = 0; ++ struct net_bridge *br; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group(br); ++ } else { ++ p = br_port_get_rtnl(dev); ++ if (WARN_ON(!p)) ++ return -ENODEV; ++ br = p->br; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (WARN_ON(!vg)) ++ return -ENODEV; ++ ++ err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, ++ br_vlan_db_policy, extack); ++ if (err) ++ return err; ++ ++ if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { ++ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); ++ return -EINVAL; ++ } ++ memset(&vrange_end, 0, sizeof(vrange_end)); ++ ++ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); ++ if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | ++ BRIDGE_VLAN_INFO_RANGE_END)) { ++ NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed 
when using RTM vlan calls"); ++ return -EINVAL; ++ } ++ if (!br_vlan_valid_id(vinfo->vid, extack)) ++ return -EINVAL; ++ ++ if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { ++ vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); ++ /* validate user-provided flags without RANGE_BEGIN */ ++ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; ++ vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; ++ ++ /* vinfo_last is the range start, vinfo the range end */ ++ vinfo_last = vinfo; ++ vinfo = &vrange_end; ++ ++ if (!br_vlan_valid_id(vinfo->vid, extack) || ++ !br_vlan_valid_range(vinfo, vinfo_last, extack)) ++ return -EINVAL; ++ } ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ cmdmap = RTM_SETLINK; ++ skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); ++ break; ++ case RTM_DELVLAN: ++ cmdmap = RTM_DELLINK; ++ break; ++ } ++ ++ if (!skip_processing) { ++ struct bridge_vlan_info *tmp_last = vinfo_last; ++ ++ /* br_process_vlan_info may overwrite vinfo_last */ ++ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, ++ &changed, extack); ++ ++ /* notify first if anything changed */ ++ if (changed) ++ br_ifinfo_notify(cmdmap, br, p); ++ ++ if (err) ++ return err; ++ } ++ ++ /* deal with options */ ++ if (cmd == RTM_NEWVLAN) { ++ struct net_bridge_vlan *range_start, *range_end; ++ ++ if (vinfo_last) { ++ range_start = br_vlan_find(vg, vinfo_last->vid); ++ range_end = br_vlan_find(vg, vinfo->vid); ++ } else { ++ range_start = br_vlan_find(vg, vinfo->vid); ++ range_end = range_start; ++ } ++ ++ err = br_vlan_process_options(br, p, range_start, range_end, ++ tb, extack); ++ } ++ ++ return err; ++} ++ ++static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, ++ struct netlink_ext_ack *extack) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ struct nlattr *attr; ++ int err, vlans = 0; ++ int rem; ++ ++ /* this should validate the header and check for remaining bytes */ ++ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, ++ extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(nlh); ++ dev = __dev_get_by_index(net, bvm->ifindex); ++ if (!dev) ++ return -ENODEV; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { ++ NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); ++ return -EINVAL; ++ } ++ ++ nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { ++ switch (nla_type(attr)) { ++ case BRIDGE_VLANDB_ENTRY: ++ err = br_vlan_rtm_process_one(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ case BRIDGE_VLANDB_GLOBAL_OPTIONS: ++ err = br_vlan_rtm_process_global_options(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ default: ++ continue; ++ } ++ ++ vlans++; ++ if (err) ++ break; ++ } ++ if (!vlans) { ++ NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++void br_vlan_rtnl_init(void) ++{ ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, ++ br_vlan_rtm_dump, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, ++ br_vlan_rtm_process, NULL, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, ++ br_vlan_rtm_process, NULL, 0); ++} ++ ++void br_vlan_rtnl_uninit(void) ++{ ++ rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); ++} +diff -rupN linux.orig/net/core/dev.c linux/net/core/dev.c +--- linux.orig/net/core/dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/net/core/dev.c 2022-12-04 10:40:26.732034003 -0500 +@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *da #endif /* CONFIG_RPS */ @@ -8638,7 +39072,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 -@@ -6661,6 +6652,30 @@ static void skb_defer_free_flush(struct softnet_data *sd) +@@ -6665,6 +6656,30 @@ static void skb_defer_free_flush(struct } } @@ -8669,7 +39103,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); -@@ -10492,12 +10507,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, +@@ -10496,12 +10511,12 @@ void dev_fetch_sw_netstats(struct rtnl_l stats = per_cpu_ptr(netstats, cpu); do { @@ -8684,7 +39118,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; -@@ -11412,7 +11427,11 @@ static int __init net_dev_init(void) +@@ -11416,7 +11431,11 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif @@ -8696,11 +39130,11469 @@ index 56c8b0921c9fd..d96506980d2f2 100644 spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); -diff --git a/net/core/devlink.c b/net/core/devlink.c -index b50bcc18b8d9e..cfa6a099457ae 100644 ---- a/net/core/devlink.c -+++ b/net/core/devlink.c -@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, +diff -rupN linux.orig/net/core/dev.c.orig linux/net/core/dev.c.orig +--- linux.orig/net/core/dev.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/dev.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,11455 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Mark Evans, ++ * ++ * Additional Authors: ++ * Florian la Roche ++ * Alan Cox ++ * David Hinds ++ * Alexey Kuznetsov ++ * Adam Sulmicki ++ * Pekka Riikonen ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set ++ * to 2 if register_netdev gets called ++ * before net_dev_init & also removed a ++ * few lines of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant ++ * stunts to keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into ++ * drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before ++ * calling netif_rx. Saves a function ++ * call a packet. ++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close ++ * changes. ++ * Rudi Cilibrasi : Pass the right thing to ++ * set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to ++ * make it work out on a Sparc. 
++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait ++ * indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "net-sysfs.h" ++ ++ ++static DEFINE_SPINLOCK(ptype_lock); ++struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; ++struct list_head ptype_all __read_mostly; /* Taps */ ++ ++static int netif_rx_internal(struct sk_buff *skb); ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info); ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack); ++static struct napi_struct *napi_by_id(unsigned int napi_id); ++ ++/* ++ * The @dev_base_head list is protected by @dev_base_lock and the rtnl ++ * semaphore. ++ * ++ * Pure readers hold dev_base_lock for reading, or rcu_read_lock() ++ * ++ * Writers must hold the rtnl semaphore while they loop through the ++ * dev_base_head list, and hold dev_base_lock for writing when they do the ++ * actual updates. This allows pure readers to access the list even ++ * while a writer is preparing to update it. ++ * ++ * To put it another way, dev_base_lock is held for writing only to ++ * protect against pure readers; the rtnl semaphore provides the ++ * protection against other writers. ++ * ++ * See, for example usages, register_netdevice() and ++ * unregister_netdevice(), which must be called with the rtnl ++ * semaphore held. 
++ */ ++DEFINE_RWLOCK(dev_base_lock); ++EXPORT_SYMBOL(dev_base_lock); ++ ++static DEFINE_MUTEX(ifalias_mutex); ++ ++/* protects napi_hash addition/deletion and napi_gen_id */ ++static DEFINE_SPINLOCK(napi_hash_lock); ++ ++static unsigned int napi_gen_id = NR_CPUS; ++static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); ++ ++static DECLARE_RWSEM(devnet_rename_sem); ++ ++static inline void dev_base_seq_inc(struct net *net) ++{ ++ while (++net->dev_base_seq == 0) ++ ; ++} ++ ++static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) ++{ ++ unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); ++ ++ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; ++} ++ ++static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) ++{ ++ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; ++} ++ ++static inline void rps_lock_irqsave(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_save(*flags); ++} ++ ++static inline void rps_lock_irq_disable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); ++} ++ ++static inline void rps_unlock_irq_restore(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_restore(*flags); ++} ++ ++static inline void rps_unlock_irq_enable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); ++} ++ ++static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, ++ const char *name) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); ++ if (!name_node) ++ return NULL; ++ INIT_HLIST_NODE(&name_node->hlist); ++ name_node->dev = dev; ++ name_node->name = name; ++ return name_node; ++} ++ ++static struct netdev_name_node * ++netdev_name_node_head_alloc(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = netdev_name_node_alloc(dev, dev->name); ++ if (!name_node) ++ return NULL; ++ INIT_LIST_HEAD(&name_node->list); ++ return name_node; ++} ++ ++static void netdev_name_node_free(struct netdev_name_node *name_node) ++{ ++ kfree(name_node); ++} ++ ++static void netdev_name_node_add(struct net *net, ++ struct netdev_name_node *name_node) ++{ ++ hlist_add_head_rcu(&name_node->hlist, ++ dev_name_hash(net, name_node->name)); ++} ++ ++static void netdev_name_node_del(struct netdev_name_node *name_node) ++{ ++ hlist_del_rcu(&name_node->hlist); ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return NULL; ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry_rcu(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return 
NULL; ++} ++ ++bool netdev_name_in_use(struct net *net, const char *name) ++{ ++ return netdev_name_node_lookup(net, name); ++} ++EXPORT_SYMBOL(netdev_name_in_use); ++ ++int netdev_name_node_alt_create(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (name_node) ++ return -EEXIST; ++ name_node = netdev_name_node_alloc(dev, name); ++ if (!name_node) ++ return -ENOMEM; ++ netdev_name_node_add(net, name_node); ++ /* The node that holds dev->name acts as a head of per-device list. */ ++ list_add_tail(&name_node->list, &dev->name_node->list); ++ ++ return 0; ++} ++ ++static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) ++{ ++ list_del(&name_node->list); ++ netdev_name_node_del(name_node); ++ kfree(name_node->name); ++ netdev_name_node_free(name_node); ++} ++ ++int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (!name_node) ++ return -ENOENT; ++ /* lookup might have found our primary name or a name belonging ++ * to another device. ++ */ ++ if (name_node == dev->name_node || name_node->dev != dev) ++ return -EINVAL; ++ ++ __netdev_name_node_alt_destroy(name_node); ++ ++ return 0; ++} ++ ++static void netdev_name_node_alt_flush(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node, *tmp; ++ ++ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) ++ __netdev_name_node_alt_destroy(name_node); ++} ++ ++/* Device list insertion */ ++static void list_netdevice(struct net_device *dev) ++{ ++ struct net *net = dev_net(dev); ++ ++ ASSERT_RTNL(); ++ ++ write_lock(&dev_base_lock); ++ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); ++ netdev_name_node_add(net, dev->name_node); ++ hlist_add_head_rcu(&dev->index_hlist, ++ dev_index_hash(net, dev->ifindex)); ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(net); ++} ++ ++/* Device list removal ++ * caller must respect a RCU grace period before freeing/reusing dev ++ */ ++static void unlist_netdevice(struct net_device *dev, bool lock) ++{ ++ ASSERT_RTNL(); ++ ++ /* Unlink dev from the device chain */ ++ if (lock) ++ write_lock(&dev_base_lock); ++ list_del_rcu(&dev->dev_list); ++ netdev_name_node_del(dev->name_node); ++ hlist_del_rcu(&dev->index_hlist); ++ if (lock) ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(dev_net(dev)); ++} ++ ++/* ++ * Our notifier list ++ */ ++ ++static RAW_NOTIFIER_HEAD(netdev_chain); ++ ++/* ++ * Device drivers call our routines to queue packets here. We empty the ++ * queue in the local softnet handler. 
++ */ ++ ++DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); ++EXPORT_PER_CPU_SYMBOL(softnet_data); ++ ++#ifdef CONFIG_LOCKDEP ++/* ++ * register_netdevice() inits txq->_xmit_lock and sets lockdep class ++ * according to dev->type ++ */ ++static const unsigned short netdev_lock_type[] = { ++ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ++ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ++ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ++ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ++ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ++ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ++ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ++ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ++ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ++ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ++ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ++ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ++ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ++ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ++ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; ++ ++static const char *const netdev_lock_name[] = { ++ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", ++ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", ++ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", ++ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", ++ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", ++ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", ++ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", ++ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", ++ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", ++ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", ++ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", ++ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", ++ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", ++ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", ++ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; ++ ++static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++ ++static inline unsigned short netdev_lock_pos(unsigned short dev_type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) ++ if (netdev_lock_type[i] == dev_type) ++ return i; ++ /* the last key is used by default */ ++ return ARRAY_SIZE(netdev_lock_type) - 1; ++} ++ ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev_type); ++ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], ++ netdev_lock_name[i]); ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev->type); ++ lockdep_set_class_and_name(&dev->addr_list_lock, ++ &netdev_addr_lock_key[i], ++ netdev_lock_name[i]); ++} ++#else ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++} ++#endif ++ ++/******************************************************************************* ++ * ++ * Protocol management and registration routines ++ * ++ 
*******************************************************************************/ ++ ++ ++/* ++ * Add a protocol ID to the list. Now that the input handler is ++ * smarter we can dispense with all the messy stuff that used to be ++ * here. ++ * ++ * BEWARE!!! Protocol handlers, mangling input packets, ++ * MUST BE last in hash buckets and checking protocol handlers ++ * MUST start from promiscuous ptype_all chain in net_bh. ++ * It is true now, do not change it. ++ * Explanation follows: if protocol handler, mangling packet, will ++ * be the first on list, it is not able to sense, that packet ++ * is cloned and should be copied-on-write, so that it will ++ * change it and subsequent readers will get broken packet. ++ * --ANK (980803) ++ */ ++ ++static inline struct list_head *ptype_head(const struct packet_type *pt) ++{ ++ if (pt->type == htons(ETH_P_ALL)) ++ return pt->dev ? &pt->dev->ptype_all : &ptype_all; ++ else ++ return pt->dev ? &pt->dev->ptype_specific : ++ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; ++} ++ ++/** ++ * dev_add_pack - add packet handler ++ * @pt: packet type declaration ++ * ++ * Add a protocol handler to the networking stack. The passed &packet_type ++ * is linked into kernel lists and may not be freed until it has been ++ * removed from the kernel lists. ++ * ++ * This call does not sleep therefore it can not ++ * guarantee all CPU's that are in middle of receiving packets ++ * will see the new packet type (until the next received packet). ++ */ ++ ++void dev_add_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ ++ spin_lock(&ptype_lock); ++ list_add_rcu(&pt->list, head); ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(dev_add_pack); ++ ++/** ++ * __dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * The packet type might still be in use by receivers ++ * and must not be freed until after all the CPU's have gone ++ * through a quiescent state. ++ */ ++void __dev_remove_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ struct packet_type *pt1; ++ ++ spin_lock(&ptype_lock); ++ ++ list_for_each_entry(pt1, head, list) { ++ if (pt == pt1) { ++ list_del_rcu(&pt->list); ++ goto out; ++ } ++ } ++ ++ pr_warn("dev_remove_pack: %p not found\n", pt); ++out: ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(__dev_remove_pack); ++ ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * This call sleeps to guarantee that no CPU is looking at the packet ++ * type after return. 
++ */ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ __dev_remove_pack(pt); ++ ++ synchronize_net(); ++} ++EXPORT_SYMBOL(dev_remove_pack); ++ ++ ++/******************************************************************************* ++ * ++ * Device Interface Subroutines ++ * ++ *******************************************************************************/ ++ ++/** ++ * dev_get_iflink - get 'iflink' value of a interface ++ * @dev: targeted interface ++ * ++ * Indicates the ifindex the interface is linked to. ++ * Physical interfaces have the same 'ifindex' and 'iflink' values. ++ */ ++ ++int dev_get_iflink(const struct net_device *dev) ++{ ++ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) ++ return dev->netdev_ops->ndo_get_iflink(dev); ++ ++ return dev->ifindex; ++} ++EXPORT_SYMBOL(dev_get_iflink); ++ ++/** ++ * dev_fill_metadata_dst - Retrieve tunnel egress information. ++ * @dev: targeted interface ++ * @skb: The packet. ++ * ++ * For better visibility of tunnel traffic OVS needs to retrieve ++ * egress tunnel information for a packet. Following API allows ++ * user to get this info. ++ */ ++int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) ++{ ++ struct ip_tunnel_info *info; ++ ++ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) ++ return -EINVAL; ++ ++ info = skb_tunnel_info_unclone(skb); ++ if (!info) ++ return -ENOMEM; ++ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) ++ return -EINVAL; ++ ++ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); ++} ++EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); ++ ++static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) ++{ ++ int k = stack->num_paths++; ++ ++ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) ++ return NULL; ++ ++ return &stack->path[k]; ++} ++ ++int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, ++ struct net_device_path_stack *stack) ++{ ++ const struct net_device *last_dev; ++ struct net_device_path_ctx ctx = { ++ .dev = dev, ++ }; ++ struct net_device_path *path; ++ int ret = 0; ++ ++ memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); ++ stack->num_paths = 0; ++ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { ++ last_dev = ctx.dev; ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ ++ memset(path, 0, sizeof(struct net_device_path)); ++ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); ++ if (ret < 0) ++ return -1; ++ ++ if (WARN_ON_ONCE(last_dev == ctx.dev)) ++ return -1; ++ } ++ ++ if (!ctx.dev) ++ return ret; ++ ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ path->type = DEV_PATH_ETHERNET; ++ path->dev = ctx.dev; ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(dev_fill_forward_path); ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++struct net_device *__dev_get_by_name(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup(net, name); ++ return node_name ? 
node_name->dev : NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_name); ++ ++/** ++ * dev_get_by_name_rcu - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. ++ * If the name is found a pointer to the device is returned. ++ * If the name is not found then %NULL is returned. ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup_rcu(net, name); ++ return node_name ? node_name->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_name_rcu); ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. ++ */ ++ ++struct net_device *dev_get_by_name(struct net *net, const char *name) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_name_rcu(net, name); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_name); ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_index); ++ ++/** ++ * dev_get_by_index_rcu - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry_rcu(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_get_by_index_rcu); ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. 
++ */ ++ ++struct net_device *dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_index_rcu(net, ifindex); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_index); ++ ++/** ++ * dev_get_by_napi_id - find a device by napi_id ++ * @napi_id: ID of the NAPI struct ++ * ++ * Search for an interface by NAPI ID. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not had ++ * its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_napi_id(unsigned int napi_id) ++{ ++ struct napi_struct *napi; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ if (napi_id < MIN_NAPI_ID) ++ return NULL; ++ ++ napi = napi_by_id(napi_id); ++ ++ return napi ? napi->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_napi_id); ++ ++/** ++ * netdev_get_name - get a netdevice name, knowing its ifindex. ++ * @net: network namespace ++ * @name: a pointer to the buffer where the name will be stored. ++ * @ifindex: the ifindex of the interface to get the name from. ++ */ ++int netdev_get_name(struct net *net, char *name, int ifindex) ++{ ++ struct net_device *dev; ++ int ret; ++ ++ down_read(&devnet_rename_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_index_rcu(net, ifindex); ++ if (!dev) { ++ ret = -ENODEV; ++ goto out; ++ } ++ ++ strcpy(name, dev->name); ++ ++ ret = 0; ++out: ++ rcu_read_unlock(); ++ up_read(&devnet_rename_sem); ++ return ret; ++} ++ ++/** ++ * dev_getbyhwaddr_rcu - find a device by its hardware address ++ * @net: the applicable net namespace ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. ++ * The caller must hold RCU or RTNL. ++ * The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ */ ++ ++struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, ++ const char *ha) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type && ++ !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_getbyhwaddr_rcu); ++ ++struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) ++{ ++ struct net_device *dev, *ret = NULL; ++ ++ rcu_read_lock(); ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type) { ++ dev_hold(dev); ++ ret = dev; ++ break; ++ } ++ rcu_read_unlock(); ++ return ret; ++} ++EXPORT_SYMBOL(dev_getfirstbyhwtype); ++ ++/** ++ * __dev_get_by_flags - find any device with given flags ++ * @net: the applicable net namespace ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. Must be called inside ++ * rtnl_lock(), and result refcount is unchanged. 
++ */ ++ ++struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, ++ unsigned short mask) ++{ ++ struct net_device *dev, *ret; ++ ++ ASSERT_RTNL(); ++ ++ ret = NULL; ++ for_each_netdev(net, dev) { ++ if (((dev->flags ^ if_flags) & mask) == 0) { ++ ret = dev; ++ break; ++ } ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__dev_get_by_flags); ++ ++/** ++ * dev_valid_name - check if name is okay for network device ++ * @name: name string ++ * ++ * Network device names need to be valid file names to ++ * allow sysfs to work. We also disallow any kind of ++ * whitespace. ++ */ ++bool dev_valid_name(const char *name) ++{ ++ if (*name == '\0') ++ return false; ++ if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) ++ return false; ++ if (!strcmp(name, ".") || !strcmp(name, "..")) ++ return false; ++ ++ while (*name) { ++ if (*name == '/' || *name == ':' || isspace(*name)) ++ return false; ++ name++; ++ } ++ return true; ++} ++EXPORT_SYMBOL(dev_valid_name); ++ ++/** ++ * __dev_alloc_name - allocate a name for a device ++ * @net: network namespace to allocate the device name in ++ * @name: name format string ++ * @buf: scratch buffer and result name string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++static int __dev_alloc_name(struct net *net, const char *name, char *buf) ++{ ++ int i = 0; ++ const char *p; ++ const int max_netdevices = 8*PAGE_SIZE; ++ unsigned long *inuse; ++ struct net_device *d; ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ p = strchr(name, '%'); ++ if (p) { ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters. ++ */ ++ if (p[1] != 'd' || strchr(p + 2, '%')) ++ return -EINVAL; ++ ++ /* Use one page as a bit array of possible slots */ ++ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); ++ if (!inuse) ++ return -ENOMEM; ++ ++ for_each_netdev(net, d) { ++ struct netdev_name_node *name_node; ++ list_for_each_entry(name_node, &d->name_node->list, list) { ++ if (!sscanf(name_node->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, name_node->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ if (!sscanf(d->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, d->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ ++ i = find_first_zero_bit(inuse, max_netdevices); ++ free_page((unsigned long) inuse); ++ } ++ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!netdev_name_in_use(net, buf)) ++ return i; ++ ++ /* It is possible to run out of possible slots ++ * when the name is long and there isn't enough space left ++ * for the digits, or if all bits are used. 
++ */ ++ return -ENFILE; ++} ++ ++static int dev_alloc_name_ns(struct net *net, ++ struct net_device *dev, ++ const char *name) ++{ ++ char buf[IFNAMSIZ]; ++ int ret; ++ ++ BUG_ON(!net); ++ ret = __dev_alloc_name(net, name, buf); ++ if (ret >= 0) ++ strlcpy(dev->name, buf, IFNAMSIZ); ++ return ret; ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ return dev_alloc_name_ns(dev_net(dev), dev, name); ++} ++EXPORT_SYMBOL(dev_alloc_name); ++ ++static int dev_get_valid_name(struct net *net, struct net_device *dev, ++ const char *name) ++{ ++ BUG_ON(!net); ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ if (strchr(name, '%')) ++ return dev_alloc_name_ns(net, dev, name); ++ else if (netdev_name_in_use(net, name)) ++ return -EEXIST; ++ else if (dev->name != name) ++ strlcpy(dev->name, name, IFNAMSIZ); ++ ++ return 0; ++} ++ ++/** ++ * dev_change_name - change name of a device ++ * @dev: device ++ * @newname: name (or format string) must be at least IFNAMSIZ ++ * ++ * Change name of a device, can pass format strings "eth%d". ++ * for wildcarding. ++ */ ++int dev_change_name(struct net_device *dev, const char *newname) ++{ ++ unsigned char old_assign_type; ++ char oldname[IFNAMSIZ]; ++ int err = 0; ++ int ret; ++ struct net *net; ++ ++ ASSERT_RTNL(); ++ BUG_ON(!dev_net(dev)); ++ ++ net = dev_net(dev); ++ ++ /* Some auto-enslaved devices e.g. failover slaves are ++ * special, as userspace might rename the device after ++ * the interface had been brought up and running since ++ * the point kernel initiated auto-enslavement. Allow ++ * live name change even when these slave devices are ++ * up and running. ++ * ++ * Typically, users of these auto-enslaving devices ++ * don't actually care about slave name change, as ++ * they are supposed to operate on master interface ++ * directly. 
++ */ ++ if (dev->flags & IFF_UP && ++ likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) ++ return -EBUSY; ++ ++ down_write(&devnet_rename_sem); ++ ++ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { ++ up_write(&devnet_rename_sem); ++ return 0; ++ } ++ ++ memcpy(oldname, dev->name, IFNAMSIZ); ++ ++ err = dev_get_valid_name(net, dev, newname); ++ if (err < 0) { ++ up_write(&devnet_rename_sem); ++ return err; ++ } ++ ++ if (oldname[0] && !strchr(oldname, '%')) ++ netdev_info(dev, "renamed from %s\n", oldname); ++ ++ old_assign_type = dev->name_assign_type; ++ dev->name_assign_type = NET_NAME_RENAMED; ++ ++rollback: ++ ret = device_rename(&dev->dev, dev->name); ++ if (ret) { ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ up_write(&devnet_rename_sem); ++ return ret; ++ } ++ ++ up_write(&devnet_rename_sem); ++ ++ netdev_adjacent_rename_links(dev, oldname); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_del(dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ synchronize_rcu(); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_add(net, dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ++ ret = notifier_to_errno(ret); ++ ++ if (ret) { ++ /* err >= 0 after dev_alloc_name() or stores the first errno */ ++ if (err >= 0) { ++ err = ret; ++ down_write(&devnet_rename_sem); ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ memcpy(oldname, newname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ old_assign_type = NET_NAME_RENAMED; ++ goto rollback; ++ } else { ++ netdev_err(dev, "name change rollback failed: %d\n", ++ ret); ++ } ++ } ++ ++ return err; ++} ++ ++/** ++ * dev_set_alias - change ifalias of a device ++ * @dev: device ++ * @alias: name up to IFALIASZ ++ * @len: limit of bytes to copy from info ++ * ++ * Set ifalias for a device, ++ */ ++int dev_set_alias(struct net_device *dev, const char *alias, size_t len) ++{ ++ struct dev_ifalias *new_alias = NULL; ++ ++ if (len >= IFALIASZ) ++ return -EINVAL; ++ ++ if (len) { ++ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); ++ if (!new_alias) ++ return -ENOMEM; ++ ++ memcpy(new_alias->ifalias, alias, len); ++ new_alias->ifalias[len] = 0; ++ } ++ ++ mutex_lock(&ifalias_mutex); ++ new_alias = rcu_replace_pointer(dev->ifalias, new_alias, ++ mutex_is_locked(&ifalias_mutex)); ++ mutex_unlock(&ifalias_mutex); ++ ++ if (new_alias) ++ kfree_rcu(new_alias, rcuhead); ++ ++ return len; ++} ++EXPORT_SYMBOL(dev_set_alias); ++ ++/** ++ * dev_get_alias - get ifalias of a device ++ * @dev: device ++ * @name: buffer to store name of ifalias ++ * @len: size of buffer ++ * ++ * get ifalias for a device. Caller must make sure dev cannot go ++ * away, e.g. rcu read lock or own a reference count to device. ++ */ ++int dev_get_alias(const struct net_device *dev, char *name, size_t len) ++{ ++ const struct dev_ifalias *alias; ++ int ret = 0; ++ ++ rcu_read_lock(); ++ alias = rcu_dereference(dev->ifalias); ++ if (alias) ++ ret = snprintf(name, len, "%s", alias->ifalias); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/** ++ * netdev_features_change - device changes features ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed features. 
++ */ ++void netdev_features_change(struct net_device *dev) ++{ ++ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); ++} ++EXPORT_SYMBOL(netdev_features_change); ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ struct netdev_notifier_change_info change_info = { ++ .info.dev = dev, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, ++ &change_info.info); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); ++ } ++} ++EXPORT_SYMBOL(netdev_state_change); ++ ++/** ++ * __netdev_notify_peers - notify network peers about existence of @dev, ++ * to be called when rtnl lock is already held. ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void __netdev_notify_peers(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); ++ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); ++} ++EXPORT_SYMBOL(__netdev_notify_peers); ++ ++/** ++ * netdev_notify_peers - notify network peers about existence of @dev ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void netdev_notify_peers(struct net_device *dev) ++{ ++ rtnl_lock(); ++ __netdev_notify_peers(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(netdev_notify_peers); ++ ++static int napi_threaded_poll(void *data); ++ ++static int napi_kthread_create(struct napi_struct *n) ++{ ++ int err = 0; ++ ++ /* Create and wake up the kthread once to put it in ++ * TASK_INTERRUPTIBLE mode to avoid the blocked task ++ * warning and work with loadavg. ++ */ ++ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", ++ n->dev->name, n->napi_id); ++ if (IS_ERR(n->thread)) { ++ err = PTR_ERR(n->thread); ++ pr_err("kthread_run failed with err %d\n", err); ++ n->thread = NULL; ++ } ++ ++ return err; ++} ++ ++static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int ret; ++ ++ ASSERT_RTNL(); ++ dev_addr_check(dev); ++ ++ if (!netif_device_present(dev)) { ++ /* may be detached because parent is runtime-suspended */ ++ if (dev->dev.parent) ++ pm_runtime_resume(dev->dev.parent); ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ } ++ ++ /* Block netpoll from trying to do any rx path servicing. 
++ * If we don't do this there is a chance ndo_poll_controller ++ * or ndo_poll may be running while we open the device ++ */ ++ netpoll_poll_disable(dev); ++ ++ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ if (ops->ndo_validate_addr) ++ ret = ops->ndo_validate_addr(dev); ++ ++ if (!ret && ops->ndo_open) ++ ret = ops->ndo_open(dev); ++ ++ netpoll_poll_enable(dev); ++ ++ if (ret) ++ clear_bit(__LINK_STATE_START, &dev->state); ++ else { ++ dev->flags |= IFF_UP; ++ dev_set_rx_mode(dev); ++ dev_activate(dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ } ++ ++ return ret; ++} ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * @extack: netlink extended ack ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ if (dev->flags & IFF_UP) ++ return 0; ++ ++ ret = __dev_open(dev, extack); ++ if (ret < 0) ++ return ret; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ ++ return ret; ++} ++EXPORT_SYMBOL(dev_open); ++ ++static void __dev_close_many(struct list_head *head) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ might_sleep(); ++ ++ list_for_each_entry(dev, head, close_list) { ++ /* Temporarily disable netpoll until the interface is down */ ++ netpoll_poll_disable(dev); ++ ++ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, it ++ * can be even on different cpu. So just clear netif_running(). ++ * ++ * dev->stop() will invoke napi_disable() on all of it's ++ * napi_struct instances on this device. ++ */ ++ smp_mb__after_atomic(); /* Commit netif_running(). */ ++ } ++ ++ dev_deactivate_many(head); ++ ++ list_for_each_entry(dev, head, close_list) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ if (ops->ndo_stop) ++ ops->ndo_stop(dev); ++ ++ dev->flags &= ~IFF_UP; ++ netpoll_poll_enable(dev); ++ } ++} ++ ++static void __dev_close(struct net_device *dev) ++{ ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ __dev_close_many(&single); ++ list_del(&single); ++} ++ ++void dev_close_many(struct list_head *head, bool unlink) ++{ ++ struct net_device *dev, *tmp; ++ ++ /* Remove the devices that don't need to be closed */ ++ list_for_each_entry_safe(dev, tmp, head, close_list) ++ if (!(dev->flags & IFF_UP)) ++ list_del_init(&dev->close_list); ++ ++ __dev_close_many(head); ++ ++ list_for_each_entry_safe(dev, tmp, head, close_list) { ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ if (unlink) ++ list_del_init(&dev->close_list); ++ } ++} ++EXPORT_SYMBOL(dev_close_many); ++ ++/** ++ * dev_close - shutdown an interface. 
++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++void dev_close(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ dev_close_many(&single, true); ++ list_del(&single); ++ } ++} ++EXPORT_SYMBOL(dev_close); ++ ++ ++/** ++ * dev_disable_lro - disable Large Receive Offload on a device ++ * @dev: device ++ * ++ * Disable Large Receive Offload (LRO) on a net device. Must be ++ * called under RTNL. This is needed if received packets may be ++ * forwarded to another interface. ++ */ ++void dev_disable_lro(struct net_device *dev) ++{ ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ ++ dev->wanted_features &= ~NETIF_F_LRO; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_LRO)) ++ netdev_WARN(dev, "failed to disable LRO!\n"); ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) ++ dev_disable_lro(lower_dev); ++} ++EXPORT_SYMBOL(dev_disable_lro); ++ ++/** ++ * dev_disable_gro_hw - disable HW Generic Receive Offload on a device ++ * @dev: device ++ * ++ * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be ++ * called under RTNL. This is needed if Generic XDP is installed on ++ * the device. ++ */ ++static void dev_disable_gro_hw(struct net_device *dev) ++{ ++ dev->wanted_features &= ~NETIF_F_GRO_HW; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_GRO_HW)) ++ netdev_WARN(dev, "failed to disable GRO_HW!\n"); ++} ++ ++const char *netdev_cmd_to_name(enum netdev_cmd cmd) ++{ ++#define N(val) \ ++ case NETDEV_##val: \ ++ return "NETDEV_" __stringify(val); ++ switch (cmd) { ++ N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) ++ N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) ++ N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) ++ N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) ++ N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) ++ N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) ++ N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) ++ N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) ++ N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) ++ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) ++ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) ++ } ++#undef N ++ return "UNKNOWN_NETDEV_EVENT"; ++} ++EXPORT_SYMBOL_GPL(netdev_cmd_to_name); ++ ++static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, ++ struct net_device *dev) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ }; ++ ++ return nb->notifier_call(nb, val, &info); ++} ++ ++static int call_netdevice_register_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ int err; ++ ++ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ if (!(dev->flags & IFF_UP)) ++ return 0; ++ ++ call_netdevice_notifier(nb, NETDEV_UP, dev); ++ return 0; ++} ++ ++static void call_netdevice_unregister_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ call_netdevice_notifier(nb, NETDEV_GOING_DOWN, ++ dev); ++ call_netdevice_notifier(nb, NETDEV_DOWN, dev); ++ } ++ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 
++} ++ ++static int call_netdevice_register_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ int err; ++ ++ for_each_netdev(net, dev) { ++ err = call_netdevice_register_notifiers(nb, dev); ++ if (err) ++ goto rollback; ++ } ++ return 0; ++ ++rollback: ++ for_each_netdev_continue_reverse(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++ return err; ++} ++ ++static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++} ++ ++static int dev_boot_phase = 1; ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_register(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ if (dev_boot_phase) ++ goto unlock; ++ for_each_net(net) { ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err) ++ goto rollback; ++ } ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++ ++rollback: ++ for_each_net_continue_reverse(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++ raw_notifier_chain_unregister(&netdev_chain, nb); ++ goto unlock; ++} ++EXPORT_SYMBOL(register_netdevice_notifier); ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_unregister(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ ++ for_each_net(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier); ++ ++static int __register_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb, ++ bool ignore_call_fail) ++{ ++ int err; ++ ++ err = raw_notifier_chain_register(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ if (dev_boot_phase) ++ return 0; ++ ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err && !ignore_call_fail) ++ goto chain_unregister; ++ ++ return 0; ++ ++chain_unregister: ++ raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ return err; ++} ++ ++static int __unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ err = raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ ++ call_netdevice_unregister_net_notifiers(nb, net); ++ return 0; ++} ++ ++/** ++ * register_netdevice_notifier_net - register a per-netns network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(net, nb, false); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_net); ++ ++/** ++ * unregister_netdevice_notifier_net - unregister a per-netns ++ * network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __unregister_netdevice_notifier_net(net, nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_net); ++ ++int register_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(dev_net(dev), nb, false); ++ if (!err) { ++ nn->nb = nb; ++ list_add(&nn->list, &dev->net_notifier_list); ++ } ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_dev_net); ++ ++int unregister_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ list_del(&nn->list); ++ err = __unregister_netdevice_notifier_net(dev_net(dev), nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); ++ ++static void move_netdevice_notifiers_dev_net(struct net_device *dev, ++ struct net *net) ++{ ++ struct netdev_net_notifier *nn; ++ ++ list_for_each_entry(nn, &dev->net_notifier_list, list) { ++ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); ++ __register_netdevice_notifier_net(net, nn->nb, true); ++ } ++} ++ ++/** ++ * call_netdevice_notifiers_info - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @info: notifier information data ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* Run per-netns notifier block chain first, then run the global one. ++ * Hopefully, one day, the global one is going to be removed after ++ * all notifier block registrators get converted to be per-netns. ++ */ ++ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); ++ if (ret & NOTIFY_STOP_MASK) ++ return ret; ++ return raw_notifier_call_chain(&netdev_chain, val, info); ++} ++ ++/** ++ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks ++ * for and rollback on error ++ * @val_up: value passed unmodified to notifier function ++ * @val_down: value passed unmodified to the notifier function when ++ * recovering from an error on @val_up ++ * @info: notifier information data ++ * ++ * Call all per-netns network notifier blocks, but not notifier blocks on ++ * the global notifier chain. Parameters and return value are as for ++ * raw_notifier_call_chain_robust(). 
++ */ ++ ++static int ++call_netdevice_notifiers_info_robust(unsigned long val_up, ++ unsigned long val_down, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ ++ ASSERT_RTNL(); ++ ++ return raw_notifier_call_chain_robust(&net->netdev_chain, ++ val_up, val_down, info); ++} ++ ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ .extack = extack, ++ }; ++ ++ return call_netdevice_notifiers_info(val, &info); ++} ++ ++/** ++ * call_netdevice_notifiers - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ++{ ++ return call_netdevice_notifiers_extack(val, dev, NULL); ++} ++EXPORT_SYMBOL(call_netdevice_notifiers); ++ ++/** ++ * call_netdevice_notifiers_mtu - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * @arg: additional u32 argument passed to the notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++static int call_netdevice_notifiers_mtu(unsigned long val, ++ struct net_device *dev, u32 arg) ++{ ++ struct netdev_notifier_info_ext info = { ++ .info.dev = dev, ++ .ext.mtu = arg, ++ }; ++ ++ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); ++ ++ return call_netdevice_notifiers_info(val, &info.info); ++} ++ ++#ifdef CONFIG_NET_INGRESS ++static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); ++ ++void net_inc_ingress_queue(void) ++{ ++ static_branch_inc(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_ingress_queue); ++ ++void net_dec_ingress_queue(void) ++{ ++ static_branch_dec(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_ingress_queue); ++#endif ++ ++#ifdef CONFIG_NET_EGRESS ++static DEFINE_STATIC_KEY_FALSE(egress_needed_key); ++ ++void net_inc_egress_queue(void) ++{ ++ static_branch_inc(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_egress_queue); ++ ++void net_dec_egress_queue(void) ++{ ++ static_branch_dec(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_egress_queue); ++#endif ++ ++DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); ++EXPORT_SYMBOL(netstamp_needed_key); ++#ifdef CONFIG_JUMP_LABEL ++static atomic_t netstamp_needed_deferred; ++static atomic_t netstamp_wanted; ++static void netstamp_clear(struct work_struct *work) ++{ ++ int deferred = atomic_xchg(&netstamp_needed_deferred, 0); ++ int wanted; ++ ++ wanted = atomic_add_return(deferred, &netstamp_wanted); ++ if (wanted > 0) ++ static_branch_enable(&netstamp_needed_key); ++ else ++ static_branch_disable(&netstamp_needed_key); ++} ++static DECLARE_WORK(netstamp_work, netstamp_clear); ++#endif ++ ++void net_enable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 0) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) ++ return; ++ } ++ atomic_inc(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_inc(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_enable_timestamp); ++ 
++void net_disable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 1) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) ++ return; ++ } ++ atomic_dec(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_dec(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_disable_timestamp); ++ ++static inline void net_timestamp_set(struct sk_buff *skb) ++{ ++ skb->tstamp = 0; ++ skb->mono_delivery_time = 0; ++ if (static_branch_unlikely(&netstamp_needed_key)) ++ skb->tstamp = ktime_get_real(); ++} ++ ++#define net_timestamp_check(COND, SKB) \ ++ if (static_branch_unlikely(&netstamp_needed_key)) { \ ++ if ((COND) && !(SKB)->tstamp) \ ++ (SKB)->tstamp = ktime_get_real(); \ ++ } \ ++ ++bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) ++{ ++ return __is_skb_forwardable(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(is_skb_forwardable); ++ ++static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, ++ bool check_mtu) ++{ ++ int ret = ____dev_forward_skb(dev, skb, check_mtu); ++ ++ if (likely(!ret)) { ++ skb->protocol = eth_type_trans(skb, dev); ++ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); ++ } ++ ++ return ret; ++} ++ ++int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(__dev_forward_skb); ++ ++/** ++ * dev_forward_skb - loopback an skb to another netif ++ * ++ * @dev: destination network device ++ * @skb: buffer to forward ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped, but freed) ++ * ++ * dev_forward_skb can be used for injecting an skb from the ++ * start_xmit function of one device into the receive queue ++ * of another device. ++ * ++ * The receiving device may be in another namespace, so ++ * we have to clear all information in the skb that could ++ * impact namespace isolation. 
++ */ ++int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); ++} ++EXPORT_SYMBOL_GPL(dev_forward_skb); ++ ++int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); ++} ++ ++static inline int deliver_skb(struct sk_buff *skb, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ return -ENOMEM; ++ refcount_inc(&skb->users); ++ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++} ++ ++static inline void deliver_ptype_list_skb(struct sk_buff *skb, ++ struct packet_type **pt, ++ struct net_device *orig_dev, ++ __be16 type, ++ struct list_head *ptype_list) ++{ ++ struct packet_type *ptype, *pt_prev = *pt; ++ ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->type != type) ++ continue; ++ if (pt_prev) ++ deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ *pt = pt_prev; ++} ++ ++static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) ++{ ++ if (!ptype->af_packet_priv || !skb->sk) ++ return false; ++ ++ if (ptype->id_match) ++ return ptype->id_match(ptype, skb->sk); ++ else if ((struct sock *)ptype->af_packet_priv == skb->sk) ++ return true; ++ ++ return false; ++} ++ ++/** ++ * dev_nit_active - return true if any network interface taps are in use ++ * ++ * @dev: network device to check for the presence of taps ++ */ ++bool dev_nit_active(struct net_device *dev) ++{ ++ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); ++} ++EXPORT_SYMBOL_GPL(dev_nit_active); ++ ++/* ++ * Support routine. Sends outgoing frames to any network ++ * taps currently in use. ++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ struct sk_buff *skb2 = NULL; ++ struct packet_type *pt_prev = NULL; ++ struct list_head *ptype_list = &ptype_all; ++ ++ rcu_read_lock(); ++again: ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->ignore_outgoing) ++ continue; ++ ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if (skb_loop_sk(ptype, skb)) ++ continue; ++ ++ if (pt_prev) { ++ deliver_skb(skb2, pt_prev, skb->dev); ++ pt_prev = ptype; ++ continue; ++ } ++ ++ /* need to clone skb, done only once */ ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (!skb2) ++ goto out_unlock; ++ ++ net_timestamp_set(skb2); ++ ++ /* skb->nh should be correctly ++ * set by sender, so that the second statement is ++ * just protection against buggy protocols. 
++ */ ++ skb_reset_mac_header(skb2); ++ ++ if (skb_network_header(skb2) < skb2->data || ++ skb_network_header(skb2) > skb_tail_pointer(skb2)) { ++ net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ++ ntohs(skb2->protocol), ++ dev->name); ++ skb_reset_network_header(skb2); ++ } ++ ++ skb2->transport_header = skb2->network_header; ++ skb2->pkt_type = PACKET_OUTGOING; ++ pt_prev = ptype; ++ } ++ ++ if (ptype_list == &ptype_all) { ++ ptype_list = &dev->ptype_all; ++ goto again; ++ } ++out_unlock: ++ if (pt_prev) { ++ if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) ++ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); ++ else ++ kfree_skb(skb2); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); ++ ++/** ++ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change ++ * @dev: Network device ++ * @txq: number of queues available ++ * ++ * If real_num_tx_queues is changed the tc mappings may no longer be ++ * valid. To resolve this verify the tc mapping remains valid and if ++ * not NULL the mapping. With no priorities mapping to this ++ * offset/count pair it will no longer be used. In the worst case TC0 ++ * is invalid nothing can be done so disable priority mappings. If is ++ * expected that drivers will fix this mapping if they can before ++ * calling netif_set_real_num_tx_queues. ++ */ ++static void netif_setup_tc(struct net_device *dev, unsigned int txq) ++{ ++ int i; ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ ++ /* If TC0 is invalidated disable TC mapping */ ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); ++ dev->num_tc = 0; ++ return; ++ } ++ ++ /* Invalidated prio to tc mappings set to TC0 */ ++ for (i = 1; i < TC_BITMASK + 1; i++) { ++ int q = netdev_get_prio_tc_map(dev, i); ++ ++ tc = &dev->tc_to_txq[q]; ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", ++ i, q); ++ netdev_set_prio_tc_map(dev, i, 0); ++ } ++ } ++} ++ ++int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) ++{ ++ if (dev->num_tc) { ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ int i; ++ ++ /* walk through the TCs and see if it falls into any of them */ ++ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { ++ if ((txq - tc->offset) < tc->count) ++ return i; ++ } ++ ++ /* didn't find it, just return -1 to indicate no match */ ++ return -1; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_txq_to_tc); ++ ++#ifdef CONFIG_XPS ++static struct static_key xps_needed __read_mostly; ++static struct static_key xps_rxqs_needed __read_mostly; ++static DEFINE_MUTEX(xps_map_mutex); ++#define xmap_dereference(P) \ ++ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) ++ ++static bool remove_xps_queue(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *old_maps, int tci, u16 index) ++{ ++ struct xps_map *map = NULL; ++ int pos; ++ ++ if (dev_maps) ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ return false; ++ ++ for (pos = map->len; pos--;) { ++ if (map->queues[pos] != index) ++ continue; ++ ++ if (map->len > 1) { ++ map->queues[pos] = map->queues[--map->len]; ++ break; ++ } ++ ++ if (old_maps) ++ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool remove_xps_queue_cpu(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ int cpu, u16 offset, u16 count) ++{ ++ int num_tc = dev_maps->num_tc; ++ bool active = false; ++ int tci; ++ ++ for (tci = cpu * num_tc; num_tc--; tci++) { ++ int i, j; ++ ++ for (i = count, j = offset; i--; j++) { ++ if (!remove_xps_queue(dev_maps, NULL, tci, j)) ++ break; ++ } ++ ++ active |= i < 0; ++ } ++ ++ return active; ++} ++ ++static void reset_xps_maps(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ enum xps_map_type type) ++{ ++ static_key_slow_dec_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_dec_cpuslocked(&xps_rxqs_needed); ++ ++ RCU_INIT_POINTER(dev->xps_maps[type], NULL); ++ ++ kfree_rcu(dev_maps, rcu); ++} ++ ++static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, ++ u16 offset, u16 count) ++{ ++ struct xps_dev_maps *dev_maps; ++ bool active = false; ++ int i, j; ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (!dev_maps) ++ return; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) ++ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++ if (type == XPS_CPUS) { ++ for (i = offset + (count - 1); count--; i--) ++ netdev_queue_numa_node_write( ++ netdev_get_tx_queue(dev, i), NUMA_NO_NODE); ++ } ++} ++ ++static void netif_reset_xps_queues(struct net_device *dev, u16 offset, ++ u16 count) ++{ ++ if (!static_key_false(&xps_needed)) ++ return; ++ ++ cpus_read_lock(); ++ mutex_lock(&xps_map_mutex); ++ ++ if (static_key_false(&xps_rxqs_needed)) ++ clean_xps_maps(dev, XPS_RXQS, offset, count); ++ ++ clean_xps_maps(dev, XPS_CPUS, offset, count); ++ ++ mutex_unlock(&xps_map_mutex); ++ cpus_read_unlock(); ++} ++ ++static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) ++{ ++ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); ++} ++ ++static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, ++ u16 index, bool is_rxqs_map) ++{ ++ struct xps_map *new_map; ++ int alloc_len = 
XPS_MIN_MAP_ALLOC; ++ int i, pos; ++ ++ for (pos = 0; map && pos < map->len; pos++) { ++ if (map->queues[pos] != index) ++ continue; ++ return map; ++ } ++ ++ /* Need to add tx-queue to this CPU's/rx-queue's existing map */ ++ if (map) { ++ if (pos < map->alloc_len) ++ return map; ++ ++ alloc_len = map->alloc_len * 2; ++ } ++ ++ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's ++ * map ++ */ ++ if (is_rxqs_map) ++ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); ++ else ++ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, ++ cpu_to_node(attr_index)); ++ if (!new_map) ++ return NULL; ++ ++ for (i = 0; i < pos; i++) ++ new_map->queues[i] = map->queues[i]; ++ new_map->alloc_len = alloc_len; ++ new_map->len = pos; ++ ++ return new_map; ++} ++ ++/* Copy xps maps at a given index */ ++static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *new_dev_maps, int index, ++ int tc, bool skip_tc) ++{ ++ int i, tci = index * dev_maps->num_tc; ++ struct xps_map *map; ++ ++ /* copy maps belonging to foreign traffic classes */ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && skip_tc) ++ continue; ++ ++ /* fill in the new device map from the old device map */ ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++} ++ ++/* Must be called under cpus_read_lock */ ++int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, ++ u16 index, enum xps_map_type type) ++{ ++ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; ++ const unsigned long *online_mask = NULL; ++ bool active = false, copy = false; ++ int i, j, tci, numa_node_id = -2; ++ int maps_sz, num_tc = 1, tc = 0; ++ struct xps_map *map, *new_map; ++ unsigned int nr_ids; ++ ++ if (dev->num_tc) { ++ /* Do not allow XPS on subordinate device directly */ ++ num_tc = dev->num_tc; ++ if (num_tc < 0) ++ return -EINVAL; ++ ++ /* If queue belongs to subordinate dev use its map */ ++ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; ++ ++ tc = netdev_txq_to_tc(dev, index); ++ if (tc < 0) ++ return -EINVAL; ++ } ++ ++ mutex_lock(&xps_map_mutex); ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (type == XPS_RXQS) { ++ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); ++ nr_ids = dev->num_rx_queues; ++ } else { ++ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); ++ if (num_possible_cpus() > 1) ++ online_mask = cpumask_bits(cpu_online_mask); ++ nr_ids = nr_cpu_ids; ++ } ++ ++ if (maps_sz < L1_CACHE_BYTES) ++ maps_sz = L1_CACHE_BYTES; ++ ++ /* The old dev_maps could be larger or smaller than the one we're ++ * setting up now, as dev->num_tc or nr_ids could have been updated in ++ * between. We could try to be smart, but let's be safe instead and only ++ * copy foreign traffic classes if the two map sizes match. ++ */ ++ if (dev_maps && ++ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) ++ copy = true; ++ ++ /* allocate memory for queue storage */ ++ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), ++ j < nr_ids;) { ++ if (!new_dev_maps) { ++ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); ++ if (!new_dev_maps) { ++ mutex_unlock(&xps_map_mutex); ++ return -ENOMEM; ++ } ++ ++ new_dev_maps->nr_ids = nr_ids; ++ new_dev_maps->num_tc = num_tc; ++ } ++ ++ tci = j * num_tc + tc; ++ map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; ++ ++ map = expand_xps_map(map, j, index, type == XPS_RXQS); ++ if (!map) ++ goto error; ++ ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++ ++ if (!new_dev_maps) ++ goto out_no_new_maps; ++ ++ if (!dev_maps) { ++ /* Increment static keys at most once per type */ ++ static_key_slow_inc_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_inc_cpuslocked(&xps_rxqs_needed); ++ } ++ ++ for (j = 0; j < nr_ids; j++) { ++ bool skip_tc = false; ++ ++ tci = j * num_tc + tc; ++ if (netif_attr_test_mask(j, mask, nr_ids) && ++ netif_attr_test_online(j, online_mask, nr_ids)) { ++ /* add tx-queue to CPU/rx-queue maps */ ++ int pos = 0; ++ ++ skip_tc = true; ++ ++ map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ while ((pos < map->len) && (map->queues[pos] != index)) ++ pos++; ++ ++ if (pos == map->len) ++ map->queues[map->len++] = index; ++#ifdef CONFIG_NUMA ++ if (type == XPS_CPUS) { ++ if (numa_node_id == -2) ++ numa_node_id = cpu_to_node(j); ++ else if (numa_node_id != cpu_to_node(j)) ++ numa_node_id = -1; ++ } ++#endif ++ } ++ ++ if (copy) ++ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, ++ skip_tc); ++ } ++ ++ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); ++ ++ /* Cleanup old maps */ ++ if (!dev_maps) ++ goto out_no_old_maps; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ continue; ++ ++ if (copy) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ if (map == new_map) ++ continue; ++ } ++ ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ } ++ } ++ ++ old_dev_maps = dev_maps; ++ ++out_no_old_maps: ++ dev_maps = new_dev_maps; ++ active = true; ++ ++out_no_new_maps: ++ if (type == XPS_CPUS) ++ /* update Tx queue numa node */ ++ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), ++ (numa_node_id >= 0) ? ++ numa_node_id : NUMA_NO_NODE); ++ ++ if (!dev_maps) ++ goto out_no_maps; ++ ++ /* removes tx-queue from unused CPUs/rx-queues */ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ tci = j * dev_maps->num_tc; ++ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && ++ netif_attr_test_mask(j, mask, dev_maps->nr_ids) && ++ netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) ++ continue; ++ ++ active |= remove_xps_queue(dev_maps, ++ copy ? old_dev_maps : NULL, ++ tci, index); ++ } ++ } ++ ++ if (old_dev_maps) ++ kfree_rcu(old_dev_maps, rcu); ++ ++ /* free map if not active */ ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++out_no_maps: ++ mutex_unlock(&xps_map_mutex); ++ ++ return 0; ++error: ++ /* remove any maps that we added */ ++ for (j = 0; j < nr_ids; j++) { ++ for (i = num_tc, tci = j * num_tc; i--; tci++) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ map = copy ? 
++ xmap_dereference(dev_maps->attr_map[tci]) : ++ NULL; ++ if (new_map && new_map != map) ++ kfree(new_map); ++ } ++ } ++ ++ mutex_unlock(&xps_map_mutex); ++ ++ kfree(new_dev_maps); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL_GPL(__netif_set_xps_queue); ++ ++int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, ++ u16 index) ++{ ++ int ret; ++ ++ cpus_read_lock(); ++ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); ++ cpus_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_set_xps_queue); ++ ++#endif ++static void netdev_unbind_all_sb_channels(struct net_device *dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++ /* Unbind any subordinate channels */ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev) ++ netdev_unbind_sb_channel(dev, txq->sb_dev); ++ } ++} ++ ++void netdev_reset_tc(struct net_device *dev) ++{ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ /* Reset TC configuration of device */ ++ dev->num_tc = 0; ++ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); ++ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); ++} ++EXPORT_SYMBOL(netdev_reset_tc); ++ ++int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) ++{ ++ if (tc >= dev->num_tc) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues(dev, offset, count); ++#endif ++ dev->tc_to_txq[tc].count = count; ++ dev->tc_to_txq[tc].offset = offset; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_tc_queue); ++ ++int netdev_set_num_tc(struct net_device *dev, u8 num_tc) ++{ ++ if (num_tc > TC_MAX_QUEUE) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ dev->num_tc = num_tc; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_num_tc); ++ ++void netdev_unbind_sb_channel(struct net_device *dev, ++ struct net_device *sb_dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(sb_dev, 0); ++#endif ++ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); ++ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); ++ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev == sb_dev) ++ txq->sb_dev = NULL; ++ } ++} ++EXPORT_SYMBOL(netdev_unbind_sb_channel); ++ ++int netdev_bind_sb_channel_queue(struct net_device *dev, ++ struct net_device *sb_dev, ++ u8 tc, u16 count, u16 offset) ++{ ++ /* Make certain the sb_dev and dev are already configured */ ++ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) ++ return -EINVAL; ++ ++ /* We cannot hand out queues we don't have */ ++ if ((offset + count) > dev->real_num_tx_queues) ++ return -EINVAL; ++ ++ /* Record the mapping */ ++ sb_dev->tc_to_txq[tc].count = count; ++ sb_dev->tc_to_txq[tc].offset = offset; ++ ++ /* Provide a way for Tx queue to find the tc_to_txq map or ++ * XPS map for itself. ++ */ ++ while (count--) ++ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_bind_sb_channel_queue); ++ ++int netdev_set_sb_channel(struct net_device *dev, u16 channel) ++{ ++ /* Do not use a multiqueue device to represent a subordinate channel */ ++ if (netif_is_multiqueue(dev)) ++ return -ENODEV; ++ ++ /* We allow channels 1 - 32767 to be used for subordinate channels. ++ * Channel 0 is meant to be "native" mode and used only to represent ++ * the main root device. 
We allow writing 0 to reset the device back ++ * to normal mode after being used as a subordinate channel. ++ */ ++ if (channel > S16_MAX) ++ return -EINVAL; ++ ++ dev->num_tc = -channel; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_sb_channel); ++ ++/* ++ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues ++ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. ++ */ ++int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) ++{ ++ bool disabling; ++ int rc; ++ ++ disabling = txq < dev->real_num_tx_queues; ++ ++ if (txq < 1 || txq > dev->num_tx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED || ++ dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ ++ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, ++ txq); ++ if (rc) ++ return rc; ++ ++ if (dev->num_tc) ++ netif_setup_tc(dev, txq); ++ ++ dev_qdisc_change_real_num_tx(dev, txq); ++ ++ dev->real_num_tx_queues = txq; ++ ++ if (disabling) { ++ synchronize_net(); ++ qdisc_reset_all_tx_gt(dev, txq); ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, txq); ++#endif ++ } ++ } else { ++ dev->real_num_tx_queues = txq; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_tx_queues); ++ ++#ifdef CONFIG_SYSFS ++/** ++ * netif_set_real_num_rx_queues - set actual number of RX queues used ++ * @dev: Network device ++ * @rxq: Actual number of RX queues ++ * ++ * This must be called either with the rtnl_lock held or before ++ * registration of the net device. Returns 0 on success, or a ++ * negative error code. If called before registration, it always ++ * succeeds. ++ */ ++int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) ++{ ++ int rc; ++ ++ if (rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED) { ++ ASSERT_RTNL(); ++ ++ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, ++ rxq); ++ if (rc) ++ return rc; ++ } ++ ++ dev->real_num_rx_queues = rxq; ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_rx_queues); ++#endif ++ ++/** ++ * netif_set_real_num_queues - set actual number of RX and TX queues used ++ * @dev: Network device ++ * @txq: Actual number of TX queues ++ * @rxq: Actual number of RX queues ++ * ++ * Set the real number of both TX and RX queues. ++ * Does nothing if the number of queues is already correct. ++ */ ++int netif_set_real_num_queues(struct net_device *dev, ++ unsigned int txq, unsigned int rxq) ++{ ++ unsigned int old_rxq = dev->real_num_rx_queues; ++ int err; ++ ++ if (txq < 1 || txq > dev->num_tx_queues || ++ rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ /* Start from increases, so the error path only does decreases - ++ * decreases can't fail. 
++ */ ++ if (rxq > dev->real_num_rx_queues) { ++ err = netif_set_real_num_rx_queues(dev, rxq); ++ if (err) ++ return err; ++ } ++ if (txq > dev->real_num_tx_queues) { ++ err = netif_set_real_num_tx_queues(dev, txq); ++ if (err) ++ goto undo_rx; ++ } ++ if (rxq < dev->real_num_rx_queues) ++ WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); ++ if (txq < dev->real_num_tx_queues) ++ WARN_ON(netif_set_real_num_tx_queues(dev, txq)); ++ ++ return 0; ++undo_rx: ++ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); ++ return err; ++} ++EXPORT_SYMBOL(netif_set_real_num_queues); ++ ++/** ++ * netif_set_tso_max_size() - set the max size of TSO frames supported ++ * @dev: netdev to update ++ * @size: max skb->len of a TSO frame ++ * ++ * Set the limit on the size of TSO super-frames the device can handle. ++ * Unless explicitly set the stack will assume the value of ++ * %GSO_LEGACY_MAX_SIZE. ++ */ ++void netif_set_tso_max_size(struct net_device *dev, unsigned int size) ++{ ++ dev->tso_max_size = min(GSO_MAX_SIZE, size); ++ if (size < READ_ONCE(dev->gso_max_size)) ++ netif_set_gso_max_size(dev, size); ++} ++EXPORT_SYMBOL(netif_set_tso_max_size); ++ ++/** ++ * netif_set_tso_max_segs() - set the max number of segs supported for TSO ++ * @dev: netdev to update ++ * @segs: max number of TCP segments ++ * ++ * Set the limit on the number of TCP segments the device can generate from ++ * a single TSO super-frame. ++ * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. ++ */ ++void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) ++{ ++ dev->tso_max_segs = segs; ++ if (segs < READ_ONCE(dev->gso_max_segs)) ++ netif_set_gso_max_segs(dev, segs); ++} ++EXPORT_SYMBOL(netif_set_tso_max_segs); ++ ++/** ++ * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper ++ * @to: netdev to update ++ * @from: netdev from which to copy the limits ++ */ ++void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) ++{ ++ netif_set_tso_max_size(to, from->tso_max_size); ++ netif_set_tso_max_segs(to, from->tso_max_segs); ++} ++EXPORT_SYMBOL(netif_inherit_tso_max); ++ ++/** ++ * netif_get_num_default_rss_queues - default number of RSS queues ++ * ++ * Default value is the number of physical cores if there are only 1 or 2, or ++ * divided by 2 if there are more. ++ */ ++int netif_get_num_default_rss_queues(void) ++{ ++ cpumask_var_t cpus; ++ int cpu, count = 0; ++ ++ if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) ++ return 1; ++ ++ cpumask_copy(cpus, cpu_online_mask); ++ for_each_cpu(cpu, cpus) { ++ ++count; ++ cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); ++ } ++ free_cpumask_var(cpus); ++ ++ return count > 2 ? 
DIV_ROUND_UP(count, 2) : count; ++} ++EXPORT_SYMBOL(netif_get_num_default_rss_queues); ++ ++static void __netif_reschedule(struct Qdisc *q) ++{ ++ struct softnet_data *sd; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ sd = this_cpu_ptr(&softnet_data); ++ q->next_sched = NULL; ++ *sd->output_queue_tailp = q; ++ sd->output_queue_tailp = &q->next_sched; ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++ ++void __netif_schedule(struct Qdisc *q) ++{ ++ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) ++ __netif_reschedule(q); ++} ++EXPORT_SYMBOL(__netif_schedule); ++ ++struct dev_kfree_skb_cb { ++ enum skb_free_reason reason; ++}; ++ ++static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) ++{ ++ return (struct dev_kfree_skb_cb *)skb->cb; ++} ++ ++void netif_schedule_queue(struct netdev_queue *txq) ++{ ++ rcu_read_lock(); ++ if (!netif_xmit_stopped(txq)) { ++ struct Qdisc *q = rcu_dereference(txq->qdisc); ++ ++ __netif_schedule(q); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(netif_schedule_queue); ++ ++void netif_tx_wake_queue(struct netdev_queue *dev_queue) ++{ ++ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { ++ struct Qdisc *q; ++ ++ rcu_read_lock(); ++ q = rcu_dereference(dev_queue->qdisc); ++ __netif_schedule(q); ++ rcu_read_unlock(); ++ } ++} ++EXPORT_SYMBOL(netif_tx_wake_queue); ++ ++void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ unsigned long flags; ++ ++ if (unlikely(!skb)) ++ return; ++ ++ if (likely(refcount_read(&skb->users) == 1)) { ++ smp_rmb(); ++ refcount_set(&skb->users, 0); ++ } else if (likely(!refcount_dec_and_test(&skb->users))) { ++ return; ++ } ++ get_kfree_skb_cb(skb)->reason = reason; ++ local_irq_save(flags); ++ skb->next = __this_cpu_read(softnet_data.completion_queue); ++ __this_cpu_write(softnet_data.completion_queue, skb); ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_irq); ++ ++void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ if (in_hardirq() || irqs_disabled()) ++ __dev_kfree_skb_irq(skb, reason); ++ else ++ dev_kfree_skb(skb); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_any); ++ ++ ++/** ++ * netif_device_detach - mark device as removed ++ * @dev: network device ++ * ++ * Mark device as removed from system and therefore no longer available. ++ */ ++void netif_device_detach(struct net_device *dev) ++{ ++ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_stop_all_queues(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_detach); ++ ++/** ++ * netif_device_attach - mark device as attached ++ * @dev: network device ++ * ++ * Mark device as attached from system and restart if needed. ++ */ ++void netif_device_attach(struct net_device *dev) ++{ ++ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_wake_all_queues(dev); ++ __netdev_watchdog_up(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_attach); ++ ++/* ++ * Returns a Tx hash based on the given packet descriptor a Tx queues' number ++ * to be used as a distribution range. 
++ */ ++static u16 skb_tx_hash(const struct net_device *dev, ++ const struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++ u32 hash; ++ u16 qoffset = 0; ++ u16 qcount = dev->real_num_tx_queues; ++ ++ if (dev->num_tc) { ++ u8 tc = netdev_get_prio_tc_map(dev, skb->priority); ++ ++ qoffset = sb_dev->tc_to_txq[tc].offset; ++ qcount = sb_dev->tc_to_txq[tc].count; ++ if (unlikely(!qcount)) { ++ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", ++ sb_dev->name, qoffset, tc); ++ qoffset = 0; ++ qcount = dev->real_num_tx_queues; ++ } ++ } ++ ++ if (skb_rx_queue_recorded(skb)) { ++ hash = skb_get_rx_queue(skb); ++ if (hash >= qoffset) ++ hash -= qoffset; ++ while (unlikely(hash >= qcount)) ++ hash -= qcount; ++ return hash + qoffset; ++ } ++ ++ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; ++} ++ ++static void skb_warn_bad_offload(const struct sk_buff *skb) ++{ ++ static const netdev_features_t null_features; ++ struct net_device *dev = skb->dev; ++ const char *name = ""; ++ ++ if (!net_ratelimit()) ++ return; ++ ++ if (dev) { ++ if (dev->dev.parent) ++ name = dev_driver_string(dev->dev.parent); ++ else ++ name = netdev_name(dev); ++ } ++ skb_dump(KERN_WARNING, skb, false); ++ WARN(1, "%s: caps=(%pNF, %pNF)\n", ++ name, dev ? &dev->features : &null_features, ++ skb->sk ? &skb->sk->sk_route_caps : &null_features); ++} ++ ++/* ++ * Invalidate hardware checksum when packet is to be mangled, and ++ * complete checksum manually on outgoing path. ++ */ ++int skb_checksum_help(struct sk_buff *skb) ++{ ++ __wsum csum; ++ int ret = 0, offset; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) ++ goto out_set_summed; ++ ++ if (unlikely(skb_is_gso(skb))) { ++ skb_warn_bad_offload(skb); ++ return -EINVAL; ++ } ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. ++ */ ++ if (skb_has_shared_frag(skb)) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ ++ offset = skb_checksum_start_offset(skb); ++ ret = -EINVAL; ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ csum = skb_checksum(skb, offset, skb->len - offset, 0); ++ ++ offset += skb->csum_offset; ++ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); ++ if (ret) ++ goto out; ++ ++ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; ++out_set_summed: ++ skb->ip_summed = CHECKSUM_NONE; ++out: ++ return ret; ++} ++EXPORT_SYMBOL(skb_checksum_help); ++ ++int skb_crc32c_csum_help(struct sk_buff *skb) ++{ ++ __le32 crc32c_csum; ++ int ret = 0, offset, start; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ goto out; ++ ++ if (unlikely(skb_is_gso(skb))) ++ goto out; ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. 
++ */ ++ if (unlikely(skb_has_shared_frag(skb))) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ start = skb_checksum_start_offset(skb); ++ offset = start + offsetof(struct sctphdr, checksum); ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ ret = skb_ensure_writable(skb, offset + sizeof(__le32)); ++ if (ret) ++ goto out; ++ ++ crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, ++ skb->len - start, ~(__u32)0, ++ crc32c_csum_stub)); ++ *(__le32 *)(skb->data + offset) = crc32c_csum; ++ skb->ip_summed = CHECKSUM_NONE; ++ skb->csum_not_inet = 0; ++out: ++ return ret; ++} ++ ++__be16 skb_network_protocol(struct sk_buff *skb, int *depth) ++{ ++ __be16 type = skb->protocol; ++ ++ /* Tunnel gso handlers can set protocol to ethernet. */ ++ if (type == htons(ETH_P_TEB)) { ++ struct ethhdr *eth; ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) ++ return 0; ++ ++ eth = (struct ethhdr *)skb->data; ++ type = eth->h_proto; ++ } ++ ++ return __vlan_get_protocol(skb, type, depth); ++} ++ ++/* openvswitch calls this on rx path, so we need a different check. ++ */ ++static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) ++{ ++ if (tx_path) ++ return skb->ip_summed != CHECKSUM_PARTIAL && ++ skb->ip_summed != CHECKSUM_UNNECESSARY; ++ ++ return skb->ip_summed == CHECKSUM_NONE; ++} ++ ++/** ++ * __skb_gso_segment - Perform segmentation on skb. ++ * @skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * @tx_path: whether it is called in TX path ++ * ++ * This function segments the given skb and returns a list of segments. ++ * ++ * It may return NULL if the skb requires no segmentation. This is ++ * only possible when GSO is used for verifying header integrity. ++ * ++ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. ++ */ ++struct sk_buff *__skb_gso_segment(struct sk_buff *skb, ++ netdev_features_t features, bool tx_path) ++{ ++ struct sk_buff *segs; ++ ++ if (unlikely(skb_needs_check(skb, tx_path))) { ++ int err; ++ ++ /* We're going to init ->check field in TCP or UDP header */ ++ err = skb_cow_head(skb, 0); ++ if (err < 0) ++ return ERR_PTR(err); ++ } ++ ++ /* Only report GSO partial support if it will enable us to ++ * support segmentation on this frame without needing additional ++ * work. ++ */ ++ if (features & NETIF_F_GSO_PARTIAL) { ++ netdev_features_t partial_features = NETIF_F_GSO_ROBUST; ++ struct net_device *dev = skb->dev; ++ ++ partial_features |= dev->features & dev->gso_partial_features; ++ if (!skb_gso_ok(skb, features | partial_features)) ++ features &= ~NETIF_F_GSO_PARTIAL; ++ } ++ ++ BUILD_BUG_ON(SKB_GSO_CB_OFFSET + ++ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); ++ ++ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); ++ SKB_GSO_CB(skb)->encap_level = 0; ++ ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ segs = skb_mac_gso_segment(skb, features); ++ ++ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) ++ skb_warn_bad_offload(skb); ++ ++ return segs; ++} ++EXPORT_SYMBOL(__skb_gso_segment); ++ ++/* Take action when hardware reception checksum errors are detected. 
*/ ++#ifdef CONFIG_BUG ++static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ netdev_err(dev, "hw csum failure\n"); ++ skb_dump(KERN_ERR, skb, true); ++ dump_stack(); ++} ++ ++void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); ++} ++EXPORT_SYMBOL(netdev_rx_csum_fault); ++#endif ++ ++/* XXX: check that highmem exists at all on the given machine. */ ++static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) ++{ ++#ifdef CONFIG_HIGHMEM ++ int i; ++ ++ if (!(dev->features & NETIF_F_HIGHDMA)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ if (PageHighMem(skb_frag_page(frag))) ++ return 1; ++ } ++ } ++#endif ++ return 0; ++} ++ ++/* If MPLS offload request, verify we are testing hardware MPLS features ++ * instead of standard features for the netdev. ++ */ ++#if IS_ENABLED(CONFIG_NET_MPLS_GSO) ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ if (eth_p_mpls(type)) ++ features &= skb->dev->mpls_features; ++ ++ return features; ++} ++#else ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ return features; ++} ++#endif ++ ++static netdev_features_t harmonize_features(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ __be16 type; ++ ++ type = skb_network_protocol(skb, NULL); ++ features = net_mpls_features(skb, features, type); ++ ++ if (skb->ip_summed != CHECKSUM_NONE && ++ !can_checksum_protocol(features, type)) { ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ } ++ if (illegal_highdma(skb->dev, skb)) ++ features &= ~NETIF_F_SG; ++ ++ return features; ++} ++ ++netdev_features_t passthru_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return features; ++} ++EXPORT_SYMBOL(passthru_features_check); ++ ++static netdev_features_t dflt_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return vlan_features_check(skb, features); ++} ++ ++static netdev_features_t gso_features_check(const struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ u16 gso_segs = skb_shinfo(skb)->gso_segs; ++ ++ if (gso_segs > READ_ONCE(dev->gso_max_segs)) ++ return features & ~NETIF_F_GSO_MASK; ++ ++ if (!skb_shinfo(skb)->gso_type) { ++ skb_warn_bad_offload(skb); ++ return features & ~NETIF_F_GSO_MASK; ++ } ++ ++ /* Support for GSO partial features requires software ++ * intervention before we can actually process the packets ++ * so we need to strip support for any partial features now ++ * and we can pull them back in after we have partially ++ * segmented the frame. ++ */ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) ++ features &= ~dev->gso_partial_features; ++ ++ /* Make sure to clear the IPv4 ID mangling feature if the ++ * IPv4 header has the potential to be fragmented. ++ */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { ++ struct iphdr *iph = skb->encapsulation ? 
++ inner_ip_hdr(skb) : ip_hdr(skb); ++ ++ if (!(iph->frag_off & htons(IP_DF))) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ } ++ ++ return features; ++} ++ ++netdev_features_t netif_skb_features(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ netdev_features_t features = dev->features; ++ ++ if (skb_is_gso(skb)) ++ features = gso_features_check(skb, dev, features); ++ ++ /* If encapsulation offload request, verify we are testing ++ * hardware encapsulation features instead of standard ++ * features for the netdev ++ */ ++ if (skb->encapsulation) ++ features &= dev->hw_enc_features; ++ ++ if (skb_vlan_tagged(skb)) ++ features = netdev_intersect_features(features, ++ dev->vlan_features | ++ NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_STAG_TX); ++ ++ if (dev->netdev_ops->ndo_features_check) ++ features &= dev->netdev_ops->ndo_features_check(skb, dev, ++ features); ++ else ++ features &= dflt_features_check(skb, dev, features); ++ ++ return harmonize_features(skb, features); ++} ++EXPORT_SYMBOL(netif_skb_features); ++ ++static int xmit_one(struct sk_buff *skb, struct net_device *dev, ++ struct netdev_queue *txq, bool more) ++{ ++ unsigned int len; ++ int rc; ++ ++ if (dev_nit_active(dev)) ++ dev_queue_xmit_nit(skb, dev); ++ ++ len = skb->len; ++ trace_net_dev_start_xmit(skb, dev); ++ rc = netdev_start_xmit(skb, dev, txq, more); ++ trace_net_dev_xmit(skb, rc, dev, len); ++ ++ return rc; ++} ++ ++struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, ++ struct netdev_queue *txq, int *ret) ++{ ++ struct sk_buff *skb = first; ++ int rc = NETDEV_TX_OK; ++ ++ while (skb) { ++ struct sk_buff *next = skb->next; ++ ++ skb_mark_not_on_list(skb); ++ rc = xmit_one(skb, dev, txq, next != NULL); ++ if (unlikely(!dev_xmit_complete(rc))) { ++ skb->next = next; ++ goto out; ++ } ++ ++ skb = next; ++ if (netif_tx_queue_stopped(txq) && skb) { ++ rc = NETDEV_TX_BUSY; ++ break; ++ } ++ } ++ ++out: ++ *ret = rc; ++ return skb; ++} ++ ++static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (skb_vlan_tag_present(skb) && ++ !vlan_hw_offload_capable(features, skb->vlan_proto)) ++ skb = __vlan_hwaccel_push_inside(skb); ++ return skb; ++} ++ ++int skb_csum_hwoffload_help(struct sk_buff *skb, ++ const netdev_features_t features) ++{ ++ if (unlikely(skb_csum_is_sctp(skb))) ++ return !!(features & NETIF_F_SCTP_CRC) ? 
0 : ++ skb_crc32c_csum_help(skb); ++ ++ if (features & NETIF_F_HW_CSUM) ++ return 0; ++ ++ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { ++ switch (skb->csum_offset) { ++ case offsetof(struct tcphdr, check): ++ case offsetof(struct udphdr, check): ++ return 0; ++ } ++ } ++ ++ return skb_checksum_help(skb); ++} ++EXPORT_SYMBOL(skb_csum_hwoffload_help); ++ ++static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ netdev_features_t features; ++ ++ features = netif_skb_features(skb); ++ skb = validate_xmit_vlan(skb, features); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ skb = sk_validate_xmit_skb(skb, dev); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ if (netif_needs_gso(skb, features)) { ++ struct sk_buff *segs; ++ ++ segs = skb_gso_segment(skb, features); ++ if (IS_ERR(segs)) { ++ goto out_kfree_skb; ++ } else if (segs) { ++ consume_skb(skb); ++ skb = segs; ++ } ++ } else { ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto out_kfree_skb; ++ ++ /* If packet is not checksummed and device does not ++ * support checksumming for this protocol, complete ++ * checksumming here. ++ */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb->encapsulation) ++ skb_set_inner_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ else ++ skb_set_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ if (skb_csum_hwoffload_help(skb, features)) ++ goto out_kfree_skb; ++ } ++ } ++ ++ skb = validate_xmit_xfrm(skb, features, again); ++ ++ return skb; ++ ++out_kfree_skb: ++ kfree_skb(skb); ++out_null: ++ dev_core_stats_tx_dropped_inc(dev); ++ return NULL; ++} ++ ++struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ struct sk_buff *next, *head = NULL, *tail; ++ ++ for (; skb != NULL; skb = next) { ++ next = skb->next; ++ skb_mark_not_on_list(skb); ++ ++ /* in case skb wont be segmented, point to itself */ ++ skb->prev = skb; ++ ++ skb = validate_xmit_skb(skb, dev, again); ++ if (!skb) ++ continue; ++ ++ if (!head) ++ head = skb; ++ else ++ tail->next = skb; ++ /* If skb was segmented, skb->prev points to ++ * the last segment. If not, it still contains skb. 
++ */ ++ tail = skb->prev; ++ } ++ return head; ++} ++EXPORT_SYMBOL_GPL(validate_xmit_skb_list); ++ ++static void qdisc_pkt_len_init(struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ ++ /* To get more precise estimation of bytes sent on wire, ++ * we add to pkt_len the headers size of all segments ++ */ ++ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { ++ unsigned int hdr_len; ++ u16 gso_segs = shinfo->gso_segs; ++ ++ /* mac layer + network layer */ ++ hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ /* + transport layer */ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ const struct tcphdr *th; ++ struct tcphdr _tcphdr; ++ ++ th = skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_tcphdr), &_tcphdr); ++ if (likely(th)) ++ hdr_len += __tcp_hdrlen(th); ++ } else { ++ struct udphdr _udphdr; ++ ++ if (skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_udphdr), &_udphdr)) ++ hdr_len += sizeof(struct udphdr); ++ } ++ ++ if (shinfo->gso_type & SKB_GSO_DODGY) ++ gso_segs = DIV_ROUND_UP(skb->len - hdr_len, ++ shinfo->gso_size); ++ ++ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; ++ } ++} ++ ++static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, ++ struct sk_buff **to_free, ++ struct netdev_queue *txq) ++{ ++ int rc; ++ ++ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; ++ if (rc == NET_XMIT_SUCCESS) ++ trace_qdisc_enqueue(q, txq, skb); ++ return rc; ++} ++ ++static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, ++ struct net_device *dev, ++ struct netdev_queue *txq) ++{ ++ spinlock_t *root_lock = qdisc_lock(q); ++ struct sk_buff *to_free = NULL; ++ bool contended; ++ int rc; ++ ++ qdisc_calculate_pkt_len(skb, q); ++ ++ if (q->flags & TCQ_F_NOLOCK) { ++ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && ++ qdisc_run_begin(q)) { ++ /* Retest nolock_qdisc_is_empty() within the protection ++ * of q->seqlock to protect from racing with requeuing. ++ */ ++ if (unlikely(!nolock_qdisc_is_empty(q))) { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ ++ goto no_lock_out; ++ } ++ ++ qdisc_bstats_cpu_update(q, skb); ++ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && ++ !nolock_qdisc_is_empty(q)) ++ __qdisc_run(q); ++ ++ qdisc_run_end(q); ++ return NET_XMIT_SUCCESS; ++ } ++ ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ qdisc_run(q); ++ ++no_lock_out: ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, ++ SKB_DROP_REASON_QDISC_DROP); ++ return rc; ++ } ++ ++ /* ++ * Heuristic to force contended enqueues to serialize on a ++ * separate lock before trying to get qdisc main lock. ++ * This permits qdisc->running owner to get the lock more ++ * often and dequeue packets faster. ++ * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit ++ * and then other tasks will only enqueue packets. The packets will be ++ * sent after the qdisc owner is scheduled again. To prevent this ++ * scenario the task always serialize on the lock. 
++ */ ++ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); ++ if (unlikely(contended)) ++ spin_lock(&q->busylock); ++ ++ spin_lock(root_lock); ++ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { ++ __qdisc_drop(skb, &to_free); ++ rc = NET_XMIT_DROP; ++ } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && ++ qdisc_run_begin(q)) { ++ /* ++ * This is a work-conserving queue; there are no old skbs ++ * waiting to be sent out; and the qdisc is not running - ++ * xmit the skb directly. ++ */ ++ ++ qdisc_bstats_update(q, skb); ++ ++ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ } ++ ++ qdisc_run_end(q); ++ rc = NET_XMIT_SUCCESS; ++ } else { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ if (qdisc_run_begin(q)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ } ++ } ++ spin_unlock(root_lock); ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); ++ if (unlikely(contended)) ++ spin_unlock(&q->busylock); ++ return rc; ++} ++ ++#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) ++static void skb_update_prio(struct sk_buff *skb) ++{ ++ const struct netprio_map *map; ++ const struct sock *sk; ++ unsigned int prioidx; ++ ++ if (skb->priority) ++ return; ++ map = rcu_dereference_bh(skb->dev->priomap); ++ if (!map) ++ return; ++ sk = skb_to_full_sk(skb); ++ if (!sk) ++ return; ++ ++ prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); ++ ++ if (prioidx < map->priomap_len) ++ skb->priority = map->priomap[prioidx]; ++} ++#else ++#define skb_update_prio(skb) ++#endif ++ ++/** ++ * dev_loopback_xmit - loop back @skb ++ * @net: network namespace this loopback is happening in ++ * @sk: sk needed to be a netfilter okfn ++ * @skb: buffer to transmit ++ */ ++int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) ++{ ++ skb_reset_mac_header(skb); ++ __skb_pull(skb, skb_network_offset(skb)); ++ skb->pkt_type = PACKET_LOOPBACK; ++ if (skb->ip_summed == CHECKSUM_NONE) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); ++ skb_dst_force(skb); ++ netif_rx(skb); ++ return 0; ++} ++EXPORT_SYMBOL(dev_loopback_xmit); ++ ++#ifdef CONFIG_NET_EGRESS ++static struct sk_buff * ++sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); ++ struct tcf_result cl_res; ++ ++ if (!miniq) ++ return skb; ++ ++ /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ *ret = NET_XMIT_DROP; ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ *ret = NET_XMIT_SUCCESS; ++ consume_skb(skb); ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* No need to push/pop skb's mac_header here on egress! 
*/ ++ skb_do_redirect(skb); ++ *ret = NET_XMIT_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ ++ return skb; ++} ++ ++static struct netdev_queue * ++netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) ++{ ++ int qm = skb_get_queue_mapping(skb); ++ ++ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); ++} ++ ++static bool netdev_xmit_txqueue_skipped(void) ++{ ++ return __this_cpu_read(softnet_data.xmit.skip_txqueue); ++} ++ ++void netdev_xmit_skip_txqueue(bool skip) ++{ ++ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); ++} ++EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); ++#endif /* CONFIG_NET_EGRESS */ ++ ++#ifdef CONFIG_XPS ++static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, ++ struct xps_dev_maps *dev_maps, unsigned int tci) ++{ ++ int tc = netdev_get_prio_tc_map(dev, skb->priority); ++ struct xps_map *map; ++ int queue_index = -1; ++ ++ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) ++ return queue_index; ++ ++ tci *= dev_maps->num_tc; ++ tci += tc; ++ ++ map = rcu_dereference(dev_maps->attr_map[tci]); ++ if (map) { ++ if (map->len == 1) ++ queue_index = map->queues[0]; ++ else ++ queue_index = map->queues[reciprocal_scale( ++ skb_get_hash(skb), map->len)]; ++ if (unlikely(queue_index >= dev->real_num_tx_queues)) ++ queue_index = -1; ++ } ++ return queue_index; ++} ++#endif ++ ++static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++#ifdef CONFIG_XPS ++ struct xps_dev_maps *dev_maps; ++ struct sock *sk = skb->sk; ++ int queue_index = -1; ++ ++ if (!static_key_false(&xps_needed)) ++ return -1; ++ ++ rcu_read_lock(); ++ if (!static_key_false(&xps_rxqs_needed)) ++ goto get_cpus_map; ++ ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); ++ if (dev_maps) { ++ int tci = sk_rx_queue_get(sk); ++ ++ if (tci >= 0) ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ ++get_cpus_map: ++ if (queue_index < 0) { ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); ++ if (dev_maps) { ++ unsigned int tci = skb->sender_cpu - 1; ++ ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return queue_index; ++#else ++ return -1; ++#endif ++} ++ ++u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return 0; ++} ++EXPORT_SYMBOL(dev_pick_tx_zero); ++ ++u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; ++} ++EXPORT_SYMBOL(dev_pick_tx_cpu_id); ++ ++u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ struct sock *sk = skb->sk; ++ int queue_index = sk_tx_queue_get(sk); ++ ++ sb_dev = sb_dev ? 
: dev; ++ ++ if (queue_index < 0 || skb->ooo_okay || ++ queue_index >= dev->real_num_tx_queues) { ++ int new_index = get_xps_queue(dev, sb_dev, skb); ++ ++ if (new_index < 0) ++ new_index = skb_tx_hash(dev, sb_dev, skb); ++ ++ if (queue_index != new_index && sk && ++ sk_fullsock(sk) && ++ rcu_access_pointer(sk->sk_dst_cache)) ++ sk_tx_queue_set(sk, new_index); ++ ++ queue_index = new_index; ++ } ++ ++ return queue_index; ++} ++EXPORT_SYMBOL(netdev_pick_tx); ++ ++struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, ++ struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ int queue_index = 0; ++ ++#ifdef CONFIG_XPS ++ u32 sender_cpu = skb->sender_cpu - 1; ++ ++ if (sender_cpu >= (u32)NR_CPUS) ++ skb->sender_cpu = raw_smp_processor_id() + 1; ++#endif ++ ++ if (dev->real_num_tx_queues != 1) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_select_queue) ++ queue_index = ops->ndo_select_queue(dev, skb, sb_dev); ++ else ++ queue_index = netdev_pick_tx(dev, skb, sb_dev); ++ ++ queue_index = netdev_cap_txqueue(dev, queue_index); ++ } ++ ++ skb_set_queue_mapping(skb, queue_index); ++ return netdev_get_tx_queue(dev, queue_index); ++} ++ ++/** ++ * __dev_queue_xmit() - transmit a buffer ++ * @skb: buffer to transmit ++ * @sb_dev: suboordinate device used for L2 forwarding offload ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling ++ * this function. The function can be called from an interrupt. ++ * ++ * When calling this method, interrupts MUST be enabled. This is because ++ * the BH enable code must have IRQs enabled so that it will not deadlock. ++ * ++ * Regardless of the return value, the skb is consumed, so it is currently ++ * difficult to retry a send to this method. (You can bump the ref count ++ * before sending to hold a reference for retry if you are careful.) ++ * ++ * Return: ++ * * 0 - buffer successfully transmitted ++ * * positive qdisc return code - NET_XMIT_DROP etc. ++ * * negative errno - other errors ++ */ ++int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq = NULL; ++ struct Qdisc *q; ++ int rc = -ENOMEM; ++ bool again = false; ++ ++ skb_reset_mac_header(skb); ++ skb_assert_len(skb); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) ++ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); ++ ++ /* Disable soft irqs for various locks below. Also ++ * stops preemption for RCU. ++ */ ++ rcu_read_lock_bh(); ++ ++ skb_update_prio(skb); ++ ++ qdisc_pkt_len_init(skb); ++#ifdef CONFIG_NET_CLS_ACT ++ skb->tc_at_ingress = 0; ++#endif ++#ifdef CONFIG_NET_EGRESS ++ if (static_branch_unlikely(&egress_needed_key)) { ++ if (nf_hook_egress_active()) { ++ skb = nf_hook_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ } ++ ++ netdev_xmit_skip_txqueue(false); ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ nf_skip_egress(skb, false); ++ ++ if (netdev_xmit_txqueue_skipped()) ++ txq = netdev_tx_queue_mapping(dev, skb); ++ } ++#endif ++ /* If device/qdisc don't need skb->dst, release it right now while ++ * its hot in this cpu cache. 
++ */ ++ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) ++ skb_dst_drop(skb); ++ else ++ skb_dst_force(skb); ++ ++ if (!txq) ++ txq = netdev_core_pick_tx(dev, skb, sb_dev); ++ ++ q = rcu_dereference_bh(txq->qdisc); ++ ++ trace_net_dev_queue(skb); ++ if (q->enqueue) { ++ rc = __dev_xmit_skb(skb, q, dev, txq); ++ goto out; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ * loopback, all the sorts of tunnels... ++ ++ * Really, it is unlikely that netif_tx_lock protection is necessary ++ * here. (f.e. loopback and IP tunnels are clean ignoring statistics ++ * counters.) ++ * However, it is possible, that they rely on protection ++ * made by us here. ++ ++ * Check this and shot the lock. It is not prone from deadlocks. ++ *Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags & IFF_UP) { ++ int cpu = smp_processor_id(); /* ok because BHs are off */ ++ ++ /* Other cpus might concurrently change txq->xmit_lock_owner ++ * to -1 or to their cpu id, but not to our id. ++ */ ++ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { ++ if (dev_xmit_recursion()) ++ goto recursion_alert; ++ ++ skb = validate_xmit_skb(skb, dev, &again); ++ if (!skb) ++ goto out; ++ ++ HARD_TX_LOCK(dev, txq, cpu); ++ ++ if (!netif_xmit_stopped(txq)) { ++ dev_xmit_recursion_inc(); ++ skb = dev_hard_start_xmit(skb, dev, txq, &rc); ++ dev_xmit_recursion_dec(); ++ if (dev_xmit_complete(rc)) { ++ HARD_TX_UNLOCK(dev, txq); ++ goto out; ++ } ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ net_crit_ratelimited("Virtual device %s asks to queue packet!\n", ++ dev->name); ++ } else { ++ /* Recursion is detected! It is possible, ++ * unfortunately ++ */ ++recursion_alert: ++ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", ++ dev->name); ++ } ++ } ++ ++ rc = -ENETDOWN; ++ rcu_read_unlock_bh(); ++ ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return rc; ++out: ++ rcu_read_unlock_bh(); ++ return rc; ++} ++EXPORT_SYMBOL(__dev_queue_xmit); ++ ++int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) ++{ ++ struct net_device *dev = skb->dev; ++ struct sk_buff *orig_skb = skb; ++ struct netdev_queue *txq; ++ int ret = NETDEV_TX_BUSY; ++ bool again = false; ++ ++ if (unlikely(!netif_running(dev) || ++ !netif_carrier_ok(dev))) ++ goto drop; ++ ++ skb = validate_xmit_skb_list(skb, dev, &again); ++ if (skb != orig_skb) ++ goto drop; ++ ++ skb_set_queue_mapping(skb, queue_id); ++ txq = skb_get_tx_queue(dev, skb); ++ ++ local_bh_disable(); ++ ++ dev_xmit_recursion_inc(); ++ HARD_TX_LOCK(dev, txq, smp_processor_id()); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) ++ ret = netdev_start_xmit(skb, dev, txq, false); ++ HARD_TX_UNLOCK(dev, txq); ++ dev_xmit_recursion_dec(); ++ ++ local_bh_enable(); ++ return ret; ++drop: ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return NET_XMIT_DROP; ++} ++EXPORT_SYMBOL(__dev_direct_xmit); ++ ++/************************************************************************* ++ * Receiver routines ++ *************************************************************************/ ++ ++int netdev_max_backlog __read_mostly = 1000; ++EXPORT_SYMBOL(netdev_max_backlog); ++ ++int netdev_tstamp_prequeue __read_mostly = 1; ++unsigned int sysctl_skb_defer_max __read_mostly = 64; ++int netdev_budget __read_mostly = 300; ++/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ ++unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; ++int weight_p __read_mostly = 64; /* old backlog weight */ ++int dev_weight_rx_bias 
__read_mostly = 1; /* bias for backlog weight */ ++int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ ++int dev_rx_weight __read_mostly = 64; ++int dev_tx_weight __read_mostly = 64; ++ ++/* Called with irq disabled */ ++static inline void ____napi_schedule(struct softnet_data *sd, ++ struct napi_struct *napi) ++{ ++ struct task_struct *thread; ++ ++ lockdep_assert_irqs_disabled(); ++ ++ if (test_bit(NAPI_STATE_THREADED, &napi->state)) { ++ /* Paired with smp_mb__before_atomic() in ++ * napi_enable()/dev_set_threaded(). ++ * Use READ_ONCE() to guarantee a complete ++ * read on napi->thread. Only call ++ * wake_up_process() when it's not NULL. ++ */ ++ thread = READ_ONCE(napi->thread); ++ if (thread) { ++ /* Avoid doing set_bit() if the thread is in ++ * INTERRUPTIBLE state, cause napi_thread_wait() ++ * makes sure to proceed with napi polling ++ * if the thread is explicitly woken from here. ++ */ ++ if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) ++ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); ++ wake_up_process(thread); ++ return; ++ } ++ } ++ ++ list_add_tail(&napi->poll_list, &sd->poll_list); ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++} ++ ++#ifdef CONFIG_RPS ++ ++/* One global table that all flow-based protocols share. */ ++struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; ++EXPORT_SYMBOL(rps_sock_flow_table); ++u32 rps_cpu_mask __read_mostly; ++EXPORT_SYMBOL(rps_cpu_mask); ++ ++struct static_key_false rps_needed __read_mostly; ++EXPORT_SYMBOL(rps_needed); ++struct static_key_false rfs_needed __read_mostly; ++EXPORT_SYMBOL(rfs_needed); ++ ++static struct rps_dev_flow * ++set_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow *rflow, u16 next_cpu) ++{ ++ if (next_cpu < nr_cpu_ids) { ++#ifdef CONFIG_RFS_ACCEL ++ struct netdev_rx_queue *rxqueue; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *old_rflow; ++ u32 flow_id; ++ u16 rxq_index; ++ int rc; ++ ++ /* Should we steer this flow to a different hardware queue? */ ++ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || ++ !(dev->features & NETIF_F_NTUPLE)) ++ goto out; ++ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); ++ if (rxq_index == skb_get_rx_queue(skb)) ++ goto out; ++ ++ rxqueue = dev->_rx + rxq_index; ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (!flow_table) ++ goto out; ++ flow_id = skb_get_hash(skb) & flow_table->mask; ++ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, ++ rxq_index, flow_id); ++ if (rc < 0) ++ goto out; ++ old_rflow = rflow; ++ rflow = &flow_table->flows[flow_id]; ++ rflow->filter = rc; ++ if (old_rflow->filter == rflow->filter) ++ old_rflow->filter = RPS_NO_FILTER; ++ out: ++#endif ++ rflow->last_qtail = ++ per_cpu(softnet_data, next_cpu).input_queue_head; ++ } ++ ++ rflow->cpu = next_cpu; ++ return rflow; ++} ++ ++/* ++ * get_rps_cpu is called from netif_receive_skb and returns the target ++ * CPU from the RPS map of the receiving queue for a given skb. ++ * rcu_read_lock must be held on entry. 
++ */ ++static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow **rflowp) ++{ ++ const struct rps_sock_flow_table *sock_flow_table; ++ struct netdev_rx_queue *rxqueue = dev->_rx; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_map *map; ++ int cpu = -1; ++ u32 tcpu; ++ u32 hash; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ goto done; ++ } ++ rxqueue += index; ++ } ++ ++ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ ++ ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ map = rcu_dereference(rxqueue->rps_map); ++ if (!flow_table && !map) ++ goto done; ++ ++ skb_reset_network_header(skb); ++ hash = skb_get_hash(skb); ++ if (!hash) ++ goto done; ++ ++ sock_flow_table = rcu_dereference(rps_sock_flow_table); ++ if (flow_table && sock_flow_table) { ++ struct rps_dev_flow *rflow; ++ u32 next_cpu; ++ u32 ident; ++ ++ /* First check into global flow table if there is a match */ ++ ident = sock_flow_table->ents[hash & sock_flow_table->mask]; ++ if ((ident ^ hash) & ~rps_cpu_mask) ++ goto try_rps; ++ ++ next_cpu = ident & rps_cpu_mask; ++ ++ /* OK, now we know there is a match, ++ * we can look at the local (per receive queue) flow table ++ */ ++ rflow = &flow_table->flows[hash & flow_table->mask]; ++ tcpu = rflow->cpu; ++ ++ /* ++ * If the desired CPU (where last recvmsg was done) is ++ * different from current CPU (one in the rx-queue flow ++ * table entry), switch if one of the following holds: ++ * - Current CPU is unset (>= nr_cpu_ids). ++ * - Current CPU is offline. ++ * - The current CPU's queue tail has advanced beyond the ++ * last packet that was enqueued using this table entry. ++ * This guarantees that all previous packets for the flow ++ * have been dequeued, thus preserving in order delivery. ++ */ ++ if (unlikely(tcpu != next_cpu) && ++ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ++ ((int)(per_cpu(softnet_data, tcpu).input_queue_head - ++ rflow->last_qtail)) >= 0)) { ++ tcpu = next_cpu; ++ rflow = set_rps_cpu(dev, skb, rflow, next_cpu); ++ } ++ ++ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { ++ *rflowp = rflow; ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++try_rps: ++ ++ if (map) { ++ tcpu = map->cpus[reciprocal_scale(hash, map->len)]; ++ if (cpu_online(tcpu)) { ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++done: ++ return cpu; ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++ ++/** ++ * rps_may_expire_flow - check whether an RFS hardware filter may be removed ++ * @dev: Device on which the filter was set ++ * @rxq_index: RX queue index ++ * @flow_id: Flow ID passed to ndo_rx_flow_steer() ++ * @filter_id: Filter ID returned by ndo_rx_flow_steer() ++ * ++ * Drivers that implement ndo_rx_flow_steer() should periodically call ++ * this function for each installed filter and remove the filters for ++ * which it returns %true. 
++ */ ++bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id) ++{ ++ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *rflow; ++ bool expire = true; ++ unsigned int cpu; ++ ++ rcu_read_lock(); ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (flow_table && flow_id <= flow_table->mask) { ++ rflow = &flow_table->flows[flow_id]; ++ cpu = READ_ONCE(rflow->cpu); ++ if (rflow->filter == filter_id && cpu < nr_cpu_ids && ++ ((int)(per_cpu(softnet_data, cpu).input_queue_head - ++ rflow->last_qtail) < ++ (int)(10 * flow_table->mask))) ++ expire = false; ++ } ++ rcu_read_unlock(); ++ return expire; ++} ++EXPORT_SYMBOL(rps_may_expire_flow); ++ ++#endif /* CONFIG_RFS_ACCEL */ ++ ++/* Called from hardirq (IPI) context */ ++static void rps_trigger_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ ____napi_schedule(sd, &sd->backlog); ++ sd->received_rps++; ++} ++ ++#endif /* CONFIG_RPS */ ++ ++/* Called from hardirq (IPI) context */ ++static void trigger_rx_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ smp_store_release(&sd->defer_ipi_scheduled, 0); ++} ++ ++/* ++ * Check if this softnet_data structure is another cpu one ++ * If yes, queue it to our IPI list and return 1 ++ * If no, return 0 ++ */ ++static int napi_schedule_rps(struct softnet_data *sd) ++{ ++ struct softnet_data *mysd = this_cpu_ptr(&softnet_data); ++ ++#ifdef CONFIG_RPS ++ if (sd != mysd) { ++ sd->rps_ipi_next = mysd->rps_ipi_list; ++ mysd->rps_ipi_list = sd; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ return 1; ++ } ++#endif /* CONFIG_RPS */ ++ __napi_schedule_irqoff(&mysd->backlog); ++ return 0; ++} ++ ++#ifdef CONFIG_NET_FLOW_LIMIT ++int netdev_flow_limit_table_len __read_mostly = (1 << 12); ++#endif ++ ++static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) ++{ ++#ifdef CONFIG_NET_FLOW_LIMIT ++ struct sd_flow_limit *fl; ++ struct softnet_data *sd; ++ unsigned int old_flow, new_flow; ++ ++ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) ++ return false; ++ ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rcu_read_lock(); ++ fl = rcu_dereference(sd->flow_limit); ++ if (fl) { ++ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); ++ old_flow = fl->history[fl->history_head]; ++ fl->history[fl->history_head] = new_flow; ++ ++ fl->history_head++; ++ fl->history_head &= FLOW_LIMIT_HISTORY - 1; ++ ++ if (likely(fl->buckets[old_flow])) ++ fl->buckets[old_flow]--; ++ ++ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { ++ fl->count++; ++ rcu_read_unlock(); ++ return true; ++ } ++ } ++ rcu_read_unlock(); ++#endif ++ return false; ++} ++ ++/* ++ * enqueue_to_backlog is called to queue an skb to a per CPU backlog ++ * queue (may be a remote CPU queue). 
++ */ ++static int enqueue_to_backlog(struct sk_buff *skb, int cpu, ++ unsigned int *qtail) ++{ ++ enum skb_drop_reason reason; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int qlen; ++ ++ reason = SKB_DROP_REASON_NOT_SPECIFIED; ++ sd = &per_cpu(softnet_data, cpu); ++ ++ rps_lock_irqsave(sd, &flags); ++ if (!netif_running(skb->dev)) ++ goto drop; ++ qlen = skb_queue_len(&sd->input_pkt_queue); ++ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { ++ if (qlen) { ++enqueue: ++ __skb_queue_tail(&sd->input_pkt_queue, skb); ++ input_queue_tail_incr_save(sd, qtail); ++ rps_unlock_irq_restore(sd, &flags); ++ return NET_RX_SUCCESS; ++ } ++ ++ /* Schedule NAPI for backlog device ++ * We can use non atomic operation since we own the queue lock ++ */ ++ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) ++ napi_schedule_rps(sd); ++ goto enqueue; ++ } ++ reason = SKB_DROP_REASON_CPU_BACKLOG; ++ ++drop: ++ sd->dropped++; ++ rps_unlock_irq_restore(sd, &flags); ++ ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ kfree_skb_reason(skb, reason); ++ return NET_RX_DROP; ++} ++ ++static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_rx_queue *rxqueue; ++ ++ rxqueue = dev->_rx; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ ++ return rxqueue; /* Return first rxqueue */ ++ } ++ rxqueue += index; ++ } ++ return rxqueue; ++} ++ ++u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ void *orig_data, *orig_data_end, *hard_start; ++ struct netdev_rx_queue *rxqueue; ++ bool orig_bcast, orig_host; ++ u32 mac_len, frame_sz; ++ __be16 orig_eth_type; ++ struct ethhdr *eth; ++ u32 metalen, act; ++ int off; ++ ++ /* The XDP program wants to see the packet starting at the MAC ++ * header. 
++ */ ++ mac_len = skb->data - skb_mac_header(skb); ++ hard_start = skb->data - skb_headroom(skb); ++ ++ /* SKB "head" area always have tailroom for skb_shared_info */ ++ frame_sz = (void *)skb_end_pointer(skb) - hard_start; ++ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ rxqueue = netif_get_rxqueue(skb); ++ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); ++ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, ++ skb_headlen(skb) + mac_len, true); ++ ++ orig_data_end = xdp->data_end; ++ orig_data = xdp->data; ++ eth = (struct ethhdr *)xdp->data; ++ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); ++ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); ++ orig_eth_type = eth->h_proto; ++ ++ act = bpf_prog_run_xdp(xdp_prog, xdp); ++ ++ /* check if bpf_xdp_adjust_head was used */ ++ off = xdp->data - orig_data; ++ if (off) { ++ if (off > 0) ++ __skb_pull(skb, off); ++ else if (off < 0) ++ __skb_push(skb, -off); ++ ++ skb->mac_header += off; ++ skb_reset_network_header(skb); ++ } ++ ++ /* check if bpf_xdp_adjust_tail was used */ ++ off = xdp->data_end - orig_data_end; ++ if (off != 0) { ++ skb_set_tail_pointer(skb, xdp->data_end - xdp->data); ++ skb->len += off; /* positive on grow, negative on shrink */ ++ } ++ ++ /* check if XDP changed eth hdr such SKB needs update */ ++ eth = (struct ethhdr *)xdp->data; ++ if ((orig_eth_type != eth->h_proto) || ++ (orig_host != ether_addr_equal_64bits(eth->h_dest, ++ skb->dev->dev_addr)) || ++ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { ++ __skb_push(skb, ETH_HLEN); ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ } ++ ++ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull ++ * before calling us again on redirect path. We do not call do_redirect ++ * as we leave that up to the caller. ++ * ++ * Caller is responsible for managing lifetime of skb (i.e. calling ++ * kfree_skb in response to actions it cannot handle/XDP_DROP). ++ */ ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ __skb_push(skb, mac_len); ++ break; ++ case XDP_PASS: ++ metalen = xdp->data - xdp->data_meta; ++ if (metalen) ++ skb_metadata_set(skb, metalen); ++ break; ++ } ++ ++ return act; ++} ++ ++static u32 netif_receive_generic_xdp(struct sk_buff *skb, ++ struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ u32 act = XDP_DROP; ++ ++ /* Reinjected packets coming from act_mirred or similar should ++ * not get XDP generic processing. ++ */ ++ if (skb_is_redirected(skb)) ++ return XDP_PASS; ++ ++ /* XDP packets must be linear and must have sufficient headroom ++ * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also ++ * native XDP provides, thus we need to do it here as well. ++ */ ++ if (skb_cloned(skb) || skb_is_nonlinear(skb) || ++ skb_headroom(skb) < XDP_PACKET_HEADROOM) { ++ int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); ++ int troom = skb->tail + skb->data_len - skb->end; ++ ++ /* In case we have to go down the path and also linearize, ++ * then lets do the pskb_expand_head() work just once here. ++ */ ++ if (pskb_expand_head(skb, ++ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, ++ troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC)) ++ goto do_drop; ++ if (skb_linearize(skb)) ++ goto do_drop; ++ } ++ ++ act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ case XDP_PASS: ++ break; ++ default: ++ bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_DROP: ++ do_drop: ++ kfree_skb(skb); ++ break; ++ } ++ ++ return act; ++} ++ ++/* When doing generic XDP we have to bypass the qdisc layer and the ++ * network taps in order to match in-driver-XDP behavior. This also means ++ * that XDP packets are able to starve other packets going through a qdisc, ++ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX ++ * queues, so they do not have this starvation issue. ++ */ ++void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq; ++ bool free_skb = true; ++ int cpu, rc; ++ ++ txq = netdev_core_pick_tx(dev, skb, NULL); ++ cpu = smp_processor_id(); ++ HARD_TX_LOCK(dev, txq, cpu); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) { ++ rc = netdev_start_xmit(skb, dev, txq, 0); ++ if (dev_xmit_complete(rc)) ++ free_skb = false; ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ if (free_skb) { ++ trace_xdp_exception(dev, xdp_prog, XDP_TX); ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb(skb); ++ } ++} ++ ++static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); ++ ++int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) ++{ ++ if (xdp_prog) { ++ struct xdp_buff xdp; ++ u32 act; ++ int err; ++ ++ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); ++ if (act != XDP_PASS) { ++ switch (act) { ++ case XDP_REDIRECT: ++ err = xdp_do_generic_redirect(skb->dev, skb, ++ &xdp, xdp_prog); ++ if (err) ++ goto out_redir; ++ break; ++ case XDP_TX: ++ generic_xdp_tx(skb, xdp_prog); ++ break; ++ } ++ return XDP_DROP; ++ } ++ } ++ return XDP_PASS; ++out_redir: ++ kfree_skb_reason(skb, SKB_DROP_REASON_XDP); ++ return XDP_DROP; ++} ++EXPORT_SYMBOL_GPL(do_xdp_generic); ++ ++static int netif_rx_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_rx(skb); ++ ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu; ++ ++ rcu_read_lock(); ++ ++ cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ if (cpu < 0) ++ cpu = smp_processor_id(); ++ ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ ++ rcu_read_unlock(); ++ } else ++#endif ++ { ++ unsigned int qtail; ++ ++ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); ++ } ++ return ret; ++} ++ ++/** ++ * __netif_rx - Slightly optimized version of netif_rx ++ * @skb: buffer to post ++ * ++ * This behaves as netif_rx except that it does not disable bottom halves. ++ * As a result this function may only be invoked from the interrupt context ++ * (either hard or soft interrupt). 
++ */ ++int __netif_rx(struct sk_buff *skb) ++{ ++ int ret; ++ ++ lockdep_assert_once(hardirq_count() | softirq_count()); ++ ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ return ret; ++} ++EXPORT_SYMBOL(__netif_rx); ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process via the backlog NAPI device. It ++ * always succeeds. The buffer may be dropped during processing for ++ * congestion control or by the protocol layers. ++ * The network buffer is passed via the backlog NAPI device. Modern NIC ++ * driver should use NAPI and GRO. ++ * This function can used from interrupt and from process context. The ++ * caller from process context must not disable interrupts before invoking ++ * this function. ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ */ ++int netif_rx(struct sk_buff *skb) ++{ ++ bool need_bh_off = !(hardirq_count() | softirq_count()); ++ int ret; ++ ++ if (need_bh_off) ++ local_bh_disable(); ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ if (need_bh_off) ++ local_bh_enable(); ++ return ret; ++} ++EXPORT_SYMBOL(netif_rx); ++ ++static __latent_entropy void net_tx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ ++ if (sd->completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = sd->completion_queue; ++ sd->completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist) { ++ struct sk_buff *skb = clist; ++ ++ clist = clist->next; ++ ++ WARN_ON(refcount_read(&skb->users)); ++ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) ++ trace_consume_skb(skb); ++ else ++ trace_kfree_skb(skb, net_tx_action, ++ SKB_DROP_REASON_NOT_SPECIFIED); ++ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) ++ __kfree_skb(skb); ++ else ++ __kfree_skb_defer(skb); ++ } ++ } ++ ++ if (sd->output_queue) { ++ struct Qdisc *head; ++ ++ local_irq_disable(); ++ head = sd->output_queue; ++ sd->output_queue = NULL; ++ sd->output_queue_tailp = &sd->output_queue; ++ local_irq_enable(); ++ ++ rcu_read_lock(); ++ ++ while (head) { ++ struct Qdisc *q = head; ++ spinlock_t *root_lock = NULL; ++ ++ head = head->next_sched; ++ ++ /* We need to make sure head->next_sched is read ++ * before clearing __QDISC_STATE_SCHED ++ */ ++ smp_mb__before_atomic(); ++ ++ if (!(q->flags & TCQ_F_NOLOCK)) { ++ root_lock = qdisc_lock(q); ++ spin_lock(root_lock); ++ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, ++ &q->state))) { ++ /* There is a synchronize_net() between ++ * STATE_DEACTIVATED flag being set and ++ * qdisc_reset()/some_qdisc_is_busy() in ++ * dev_deactivate(), so we can safely bail out ++ * early here to avoid data race between ++ * qdisc_deactivate() and some_qdisc_is_busy() ++ * for lockless qdisc. 
++ */ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ continue; ++ } ++ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ qdisc_run(q); ++ if (root_lock) ++ spin_unlock(root_lock); ++ } ++ ++ rcu_read_unlock(); ++ } ++ ++ xfrm_dev_backlog(sd); ++} ++ ++#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) ++/* This hook is defined here for ATM LANE */ ++int (*br_fdb_test_addr_hook)(struct net_device *dev, ++ unsigned char *addr) __read_mostly; ++EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); ++#endif ++ ++static inline struct sk_buff * ++sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, ++ struct net_device *orig_dev, bool *another) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); ++ struct tcf_result cl_res; ++ ++ /* If there's at least one ingress present somewhere (so ++ * we get here via enabled static key), remaining devices ++ * that are not configured with an ingress qdisc will bail ++ * out here. ++ */ ++ if (!miniq) ++ return skb; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ skb->tc_at_ingress = 1; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); ++ *ret = NET_RX_DROP; ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ consume_skb(skb); ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* skb_mac_header check was done by cls/act_bpf, so ++ * we can safely push the L2 header back before ++ * redirecting to another netdev ++ */ ++ __skb_push(skb, skb->mac_len); ++ if (skb_do_redirect(skb) == -EAGAIN) { ++ __skb_pull(skb, skb->mac_len); ++ *another = true; ++ break; ++ } ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_CONSUMED: ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ return skb; ++} ++ ++/** ++ * netdev_is_rx_handler_busy - check if receive handler is registered ++ * @dev: device to check ++ * ++ * Check if a receive handler is already registered for a given device. ++ * Return true if there one. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++bool netdev_is_rx_handler_busy(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ return dev && rtnl_dereference(dev->rx_handler); ++} ++EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); ++ ++/** ++ * netdev_rx_handler_register - register receive handler ++ * @dev: device to register a handler for ++ * @rx_handler: receive handler to register ++ * @rx_handler_data: data pointer that is used by rx handler ++ * ++ * Register a receive handler for a device. This handler will then be ++ * called from __netif_receive_skb. A negative errno code is returned ++ * on a failure. ++ * ++ * The caller must hold the rtnl_mutex. ++ * ++ * For a general description of rx_handler, see enum rx_handler_result. 
++ */ ++int netdev_rx_handler_register(struct net_device *dev, ++ rx_handler_func_t *rx_handler, ++ void *rx_handler_data) ++{ ++ if (netdev_is_rx_handler_busy(dev)) ++ return -EBUSY; ++ ++ if (dev->priv_flags & IFF_NO_RX_HANDLER) ++ return -EINVAL; ++ ++ /* Note: rx_handler_data must be set before rx_handler */ ++ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); ++ rcu_assign_pointer(dev->rx_handler, rx_handler); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_register); ++ ++/** ++ * netdev_rx_handler_unregister - unregister receive handler ++ * @dev: device to unregister a handler from ++ * ++ * Unregister a receive handler from a device. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++void netdev_rx_handler_unregister(struct net_device *dev) ++{ ++ ++ ASSERT_RTNL(); ++ RCU_INIT_POINTER(dev->rx_handler, NULL); ++ /* a reader seeing a non NULL rx_handler in a rcu_read_lock() ++ * section has a guarantee to see a non NULL rx_handler_data ++ * as well. ++ */ ++ synchronize_net(); ++ RCU_INIT_POINTER(dev->rx_handler_data, NULL); ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); ++ ++/* ++ * Limit the use of PFMEMALLOC reserves to those protocols that implement ++ * the special handling of PFMEMALLOC skbs. ++ */ ++static bool skb_pfmemalloc_protocol(struct sk_buff *skb) ++{ ++ switch (skb->protocol) { ++ case htons(ETH_P_ARP): ++ case htons(ETH_P_IP): ++ case htons(ETH_P_IPV6): ++ case htons(ETH_P_8021Q): ++ case htons(ETH_P_8021AD): ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, ++ int *ret, struct net_device *orig_dev) ++{ ++ if (nf_hook_ingress_active(skb)) { ++ int ingress_retval; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ rcu_read_lock(); ++ ingress_retval = nf_hook_ingress(skb); ++ rcu_read_unlock(); ++ return ingress_retval; ++ } ++ return 0; ++} ++ ++static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, ++ struct packet_type **ppt_prev) ++{ ++ struct packet_type *ptype, *pt_prev; ++ rx_handler_func_t *rx_handler; ++ struct sk_buff *skb = *pskb; ++ struct net_device *orig_dev; ++ bool deliver_exact = false; ++ int ret = NET_RX_DROP; ++ __be16 type; ++ ++ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_receive_skb(skb); ++ ++ orig_dev = skb->dev; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ pt_prev = NULL; ++ ++another_round: ++ skb->skb_iif = skb->dev->ifindex; ++ ++ __this_cpu_inc(softnet_data.processed); ++ ++ if (static_branch_unlikely(&generic_xdp_needed_key)) { ++ int ret2; ++ ++ migrate_disable(); ++ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); ++ migrate_enable(); ++ ++ if (ret2 != XDP_PASS) { ++ ret = NET_RX_DROP; ++ goto out; ++ } ++ } ++ ++ if (eth_type_vlan(skb->protocol)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ } ++ ++ if (skb_skip_tc_classify(skb)) ++ goto skip_classify; ++ ++ if (pfmemalloc) ++ goto skip_taps; ++ ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++skip_taps: ++#ifdef CONFIG_NET_INGRESS ++ if 
(static_branch_unlikely(&ingress_needed_key)) { ++ bool another = false; ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, ++ &another); ++ if (another) ++ goto another_round; ++ if (!skb) ++ goto out; ++ ++ nf_skip_egress(skb, false); ++ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) ++ goto out; ++ } ++#endif ++ skb_reset_redirect(skb); ++skip_classify: ++ if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) ++ goto drop; ++ ++ if (skb_vlan_tag_present(skb)) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ if (vlan_do_receive(&skb)) ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ } ++ ++ rx_handler = rcu_dereference(skb->dev->rx_handler); ++ if (rx_handler) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ switch (rx_handler(&skb)) { ++ case RX_HANDLER_CONSUMED: ++ ret = NET_RX_SUCCESS; ++ goto out; ++ case RX_HANDLER_ANOTHER: ++ goto another_round; ++ case RX_HANDLER_EXACT: ++ deliver_exact = true; ++ break; ++ case RX_HANDLER_PASS: ++ break; ++ default: ++ BUG(); ++ } ++ } ++ ++ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { ++check_vlan_id: ++ if (skb_vlan_tag_get_id(skb)) { ++ /* Vlan id is non 0 and vlan_do_receive() above couldn't ++ * find vlan device. ++ */ ++ skb->pkt_type = PACKET_OTHERHOST; ++ } else if (eth_type_vlan(skb->protocol)) { ++ /* Outer header is 802.1P with vlan 0, inner header is ++ * 802.1Q or 802.1AD and vlan_do_receive() above could ++ * not find vlan dev for vlan id 0. ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ if (vlan_do_receive(&skb)) ++ /* After stripping off 802.1P header with vlan 0 ++ * vlan dev is found for inner header. ++ */ ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ else ++ /* We have stripped outer 802.1P vlan 0 header. ++ * But could not find vlan dev. ++ * check again for vlan id to set OTHERHOST. ++ */ ++ goto check_vlan_id; ++ } ++ /* Note: we might in the future use prio bits ++ * and set skb->priority like in vlan_do_receive() ++ * For the time being, just ignore Priority Code Point ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ type = skb->protocol; ++ ++ /* deliver only exact match when indicated */ ++ if (likely(!deliver_exact)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &ptype_base[ntohs(type) & ++ PTYPE_HASH_MASK]); ++ } ++ ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &orig_dev->ptype_specific); ++ ++ if (unlikely(skb->dev != orig_dev)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &skb->dev->ptype_specific); ++ } ++ ++ if (pt_prev) { ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ goto drop; ++ *ppt_prev = pt_prev; ++ } else { ++drop: ++ if (!deliver_exact) ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ else ++ dev_core_stats_rx_nohandler_inc(skb->dev); ++ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++out: ++ /* The invariant here is that if *ppt_prev is not NULL ++ * then skb should also be non-NULL. ++ * ++ * Apparently *ppt_prev assignment above holds this invariant due to ++ * skb dereferencing near it. 
++ */ ++ *pskb = skb; ++ return ret; ++} ++ ++static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) ++{ ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ int ret; ++ ++ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (pt_prev) ++ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, ++ skb->dev, pt_prev, orig_dev); ++ return ret; ++} ++ ++/** ++ * netif_receive_skb_core - special purpose version of netif_receive_skb ++ * @skb: buffer to process ++ * ++ * More direct receive version of netif_receive_skb(). It should ++ * only be used by callers that have a need to skip RPS and Generic XDP. ++ * Caller must also take care of handling if ``(page_is_)pfmemalloc``. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb_core(struct sk_buff *skb) ++{ ++ int ret; ++ ++ rcu_read_lock(); ++ ret = __netif_receive_skb_one_core(skb, false); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb_core); ++ ++static inline void __netif_receive_skb_list_ptype(struct list_head *head, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ struct sk_buff *skb, *next; ++ ++ if (!pt_prev) ++ return; ++ if (list_empty(head)) ++ return; ++ if (pt_prev->list_func != NULL) ++ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ++ ip_list_rcv, head, pt_prev, orig_dev); ++ else ++ list_for_each_entry_safe(skb, next, head, list) { ++ skb_list_del_init(skb); ++ pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++ } ++} ++ ++static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) ++{ ++ /* Fast-path assumptions: ++ * - There is no RX handler. ++ * - Only one packet_type matches. ++ * If either of these fails, we will end up doing some per-packet ++ * processing in-line, then handling the 'last ptype' for the whole ++ * sublist. This can't cause out-of-order delivery to any single ptype, ++ * because the 'last ptype' must be constant across the sublist, and all ++ * other ptypes are handled per-packet. 
++ */ ++ /* Current (common) ptype of sublist */ ++ struct packet_type *pt_curr = NULL; ++ /* Current (common) orig_dev of sublist */ ++ struct net_device *od_curr = NULL; ++ struct list_head sublist; ++ struct sk_buff *skb, *next; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ ++ skb_list_del_init(skb); ++ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (!pt_prev) ++ continue; ++ if (pt_curr != pt_prev || od_curr != orig_dev) { ++ /* dispatch old sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++ /* start new sublist */ ++ INIT_LIST_HEAD(&sublist); ++ pt_curr = pt_prev; ++ od_curr = orig_dev; ++ } ++ list_add_tail(&skb->list, &sublist); ++ } ++ ++ /* dispatch final sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++} ++ ++static int __netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { ++ unsigned int noreclaim_flag; ++ ++ /* ++ * PFMEMALLOC skbs are special, they should ++ * - be delivered to SOCK_MEMALLOC sockets only ++ * - stay away from userspace ++ * - have bounded memory usage ++ * ++ * Use PF_MEMALLOC as this saves us from propagating the allocation ++ * context down to all allocation sites. ++ */ ++ noreclaim_flag = memalloc_noreclaim_save(); ++ ret = __netif_receive_skb_one_core(skb, true); ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } else ++ ret = __netif_receive_skb_one_core(skb, false); ++ ++ return ret; ++} ++ ++static void __netif_receive_skb_list(struct list_head *head) ++{ ++ unsigned long noreclaim_flag = 0; ++ struct sk_buff *skb, *next; ++ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
*/ ++ ++ list_for_each_entry_safe(skb, next, head, list) { ++ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { ++ struct list_head sublist; ++ ++ /* Handle the previous sublist */ ++ list_cut_before(&sublist, head, &skb->list); ++ if (!list_empty(&sublist)) ++ __netif_receive_skb_list_core(&sublist, pfmemalloc); ++ pfmemalloc = !pfmemalloc; ++ /* See comments in __netif_receive_skb */ ++ if (pfmemalloc) ++ noreclaim_flag = memalloc_noreclaim_save(); ++ else ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } ++ } ++ /* Handle the remaining sublist */ ++ if (!list_empty(head)) ++ __netif_receive_skb_list_core(head, pfmemalloc); ++ /* Restore pflags */ ++ if (pfmemalloc) ++ memalloc_noreclaim_restore(noreclaim_flag); ++} ++ ++static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); ++ struct bpf_prog *new = xdp->prog; ++ int ret = 0; ++ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ rcu_assign_pointer(dev->xdp_prog, new); ++ if (old) ++ bpf_prog_put(old); ++ ++ if (old && !new) { ++ static_branch_dec(&generic_xdp_needed_key); ++ } else if (new && !old) { ++ static_branch_inc(&generic_xdp_needed_key); ++ dev_disable_lro(dev); ++ dev_disable_gro_hw(dev); ++ } ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int netif_receive_skb_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ if (skb_defer_rx_timestamp(skb)) ++ return NET_RX_SUCCESS; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ rcu_read_unlock(); ++ return ret; ++ } ++ } ++#endif ++ ret = __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ return ret; ++} ++ ++void netif_receive_skb_list_internal(struct list_head *head) ++{ ++ struct sk_buff *skb, *next; ++ struct list_head sublist; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ skb_list_del_init(skb); ++ if (!skb_defer_rx_timestamp(skb)) ++ list_add_tail(&skb->list, &sublist); ++ } ++ list_splice_init(&sublist, head); ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ /* Will be handled, remove from list */ ++ skb_list_del_init(skb); ++ enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ } ++ } ++ } ++#endif ++ __netif_receive_skb_list(head); ++ rcu_read_unlock(); ++} ++ ++/** ++ * netif_receive_skb - process receive buffer from network ++ * @skb: buffer to process ++ * ++ * netif_receive_skb() is the main receive data processing function. ++ * It always succeeds. The buffer may be dropped during processing ++ * for congestion control or by the protocol layers. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. 
++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ trace_netif_receive_skb_entry(skb); ++ ++ ret = netif_receive_skb_internal(skb); ++ trace_netif_receive_skb_exit(ret); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb); ++ ++/** ++ * netif_receive_skb_list - process many receive buffers from network ++ * @head: list of skbs to process. ++ * ++ * Since return value of netif_receive_skb() is normally ignored, and ++ * wouldn't be meaningful for a list, this function returns void. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ */ ++void netif_receive_skb_list(struct list_head *head) ++{ ++ struct sk_buff *skb; ++ ++ if (list_empty(head)) ++ return; ++ if (trace_netif_receive_skb_list_entry_enabled()) { ++ list_for_each_entry(skb, head, list) ++ trace_netif_receive_skb_list_entry(skb); ++ } ++ netif_receive_skb_list_internal(head); ++ trace_netif_receive_skb_list_exit(0); ++} ++EXPORT_SYMBOL(netif_receive_skb_list); ++ ++static DEFINE_PER_CPU(struct work_struct, flush_works); ++ ++/* Network device is going away, flush any packets still pending */ ++static void flush_backlog(struct work_struct *work) ++{ ++ struct sk_buff *skb, *tmp; ++ struct softnet_data *sd; ++ ++ local_bh_disable(); ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rps_lock_irq_disable(sd); ++ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->input_pkt_queue); ++ dev_kfree_skb_irq(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ rps_unlock_irq_enable(sd); ++ ++ skb_queue_walk_safe(&sd->process_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->process_queue); ++ kfree_skb(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ local_bh_enable(); ++} ++ ++static bool flush_required(int cpu) ++{ ++#if IS_ENABLED(CONFIG_RPS) ++ struct softnet_data *sd = &per_cpu(softnet_data, cpu); ++ bool do_flush; ++ ++ rps_lock_irq_disable(sd); ++ ++ /* as insertion into process_queue happens with the rps lock held, ++ * process_queue access may race only with dequeue ++ */ ++ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || ++ !skb_queue_empty_lockless(&sd->process_queue); ++ rps_unlock_irq_enable(sd); ++ ++ return do_flush; ++#endif ++ /* without RPS we can't safely check input_pkt_queue: during a ++ * concurrent remote skb_queue_splice() we can detect as empty both ++ * input_pkt_queue and process_queue even if the latter could end-up ++ * containing a lot of packets. 
++ */ ++ return true; ++} ++ ++static void flush_all_backlogs(void) ++{ ++ static cpumask_t flush_cpus; ++ unsigned int cpu; ++ ++ /* since we are under rtnl lock protection we can use static data ++ * for the cpumask and avoid allocating on stack the possibly ++ * large mask ++ */ ++ ASSERT_RTNL(); ++ ++ cpus_read_lock(); ++ ++ cpumask_clear(&flush_cpus); ++ for_each_online_cpu(cpu) { ++ if (flush_required(cpu)) { ++ queue_work_on(cpu, system_highpri_wq, ++ per_cpu_ptr(&flush_works, cpu)); ++ cpumask_set_cpu(cpu, &flush_cpus); ++ } ++ } ++ ++ /* we can have in flight packet[s] on the cpus we are not flushing, ++ * synchronize_net() in unregister_netdevice_many() will take care of ++ * them ++ */ ++ for_each_cpu(cpu, &flush_cpus) ++ flush_work(per_cpu_ptr(&flush_works, cpu)); ++ ++ cpus_read_unlock(); ++} ++ ++static void net_rps_send_ipi(struct softnet_data *remsd) ++{ ++#ifdef CONFIG_RPS ++ while (remsd) { ++ struct softnet_data *next = remsd->rps_ipi_next; ++ ++ if (cpu_online(remsd->cpu)) ++ smp_call_function_single_async(remsd->cpu, &remsd->csd); ++ remsd = next; ++ } ++#endif ++} ++ ++/* ++ * net_rps_action_and_irq_enable sends any pending IPI's for rps. ++ * Note: called with local irq disabled, but exits with local irq enabled. ++ */ ++static void net_rps_action_and_irq_enable(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ struct softnet_data *remsd = sd->rps_ipi_list; ++ ++ if (remsd) { ++ sd->rps_ipi_list = NULL; ++ ++ local_irq_enable(); ++ ++ /* Send pending IPI's to kick RPS processing on remote cpus. */ ++ net_rps_send_ipi(remsd); ++ } else ++#endif ++ local_irq_enable(); ++} ++ ++static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ return sd->rps_ipi_list != NULL; ++#else ++ return false; ++#endif ++} ++ ++static int process_backlog(struct napi_struct *napi, int quota) ++{ ++ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); ++ bool again = true; ++ int work = 0; ++ ++ /* Check if we have pending ipi, its better to send them now, ++ * not waiting net_rx_action() end. ++ */ ++ if (sd_has_rps_ipi_waiting(sd)) { ++ local_irq_disable(); ++ net_rps_action_and_irq_enable(sd); ++ } ++ ++ napi->weight = READ_ONCE(dev_rx_weight); ++ while (again) { ++ struct sk_buff *skb; ++ ++ while ((skb = __skb_dequeue(&sd->process_queue))) { ++ rcu_read_lock(); ++ __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ input_queue_head_incr(sd); ++ if (++work >= quota) ++ return work; ++ ++ } ++ ++ rps_lock_irq_disable(sd); ++ if (skb_queue_empty(&sd->input_pkt_queue)) { ++ /* ++ * Inline a custom version of __napi_complete(). ++ * only current cpu owns and manipulates this napi, ++ * and NAPI_STATE_SCHED is the only possible flag set ++ * on backlog. ++ * We can use a plain write instead of clear_bit(), ++ * and we dont need an smp_mb() memory barrier. ++ */ ++ napi->state = 0; ++ again = false; ++ } else { ++ skb_queue_splice_tail_init(&sd->input_pkt_queue, ++ &sd->process_queue); ++ } ++ rps_unlock_irq_enable(sd); ++ } ++ ++ return work; ++} ++ ++/** ++ * __napi_schedule - schedule for receive ++ * @n: entry to schedule ++ * ++ * The entry's receive function will be scheduled to run. ++ * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
++ */ ++void __napi_schedule(struct napi_struct *n) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__napi_schedule); ++ ++/** ++ * napi_schedule_prep - check if napi can be scheduled ++ * @n: napi context ++ * ++ * Test if NAPI routine is already running, and if not mark ++ * it as running. This is used as a condition variable to ++ * insure only one NAPI poll instance runs. We also make ++ * sure there is no pending NAPI disable. ++ */ ++bool napi_schedule_prep(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ if (unlikely(val & NAPIF_STATE_DISABLE)) ++ return false; ++ new = val | NAPIF_STATE_SCHED; ++ ++ /* Sets STATE_MISSED bit if STATE_SCHED was already set ++ * This was suggested by Alexander Duyck, as compiler ++ * emits better code than : ++ * if (val & NAPIF_STATE_SCHED) ++ * new |= NAPIF_STATE_MISSED; ++ */ ++ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * ++ NAPIF_STATE_MISSED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ return !(val & NAPIF_STATE_SCHED); ++} ++EXPORT_SYMBOL(napi_schedule_prep); ++ ++/** ++ * __napi_schedule_irqoff - schedule for receive ++ * @n: entry to schedule ++ * ++ * Variant of __napi_schedule() assuming hard irqs are masked. ++ * ++ * On PREEMPT_RT enabled kernels this maps to __napi_schedule() ++ * because the interrupt disabled assumption might not be true ++ * due to force-threaded interrupts and spinlock substitution. ++ */ ++void __napi_schedule_irqoff(struct napi_struct *n) ++{ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ else ++ __napi_schedule(n); ++} ++EXPORT_SYMBOL(__napi_schedule_irqoff); ++ ++bool napi_complete_done(struct napi_struct *n, int work_done) ++{ ++ unsigned long flags, val, new, timeout = 0; ++ bool ret = true; ++ ++ /* ++ * 1) Don't let napi dequeue from the cpu poll list ++ * just in case its running on a different cpu. ++ * 2) If we are busy polling, do nothing here, we have ++ * the guarantee we will be called later. ++ */ ++ if (unlikely(n->state & (NAPIF_STATE_NPSVC | ++ NAPIF_STATE_IN_BUSY_POLL))) ++ return false; ++ ++ if (work_done) { ++ if (n->gro_bitmask) ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); ++ } ++ if (n->defer_hard_irqs_count > 0) { ++ n->defer_hard_irqs_count--; ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ if (timeout) ++ ret = false; ++ } ++ if (n->gro_bitmask) { ++ /* When the NAPI instance uses a timeout and keeps postponing ++ * it, we need to bound somehow the time packets are kept in ++ * the GRO layer ++ */ ++ napi_gro_flush(n, !!timeout); ++ } ++ ++ gro_normal_list(n); ++ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ /* If n->poll_list is not empty, we need to mask irqs */ ++ local_irq_save(flags); ++ list_del_init(&n->poll_list); ++ local_irq_restore(flags); ++ } ++ ++ do { ++ val = READ_ONCE(n->state); ++ ++ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); ++ ++ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | ++ NAPIF_STATE_SCHED_THREADED | ++ NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ /* If STATE_MISSED was set, leave STATE_SCHED set, ++ * because we will call napi->poll() one more time. ++ * This C code was suggested by Alexander Duyck to help gcc. 
++ */ ++ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * ++ NAPIF_STATE_SCHED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ if (unlikely(val & NAPIF_STATE_MISSED)) { ++ __napi_schedule(n); ++ return false; ++ } ++ ++ if (timeout) ++ hrtimer_start(&n->timer, ns_to_ktime(timeout), ++ HRTIMER_MODE_REL_PINNED); ++ return ret; ++} ++EXPORT_SYMBOL(napi_complete_done); ++ ++/* must be called under rcu_read_lock(), as we dont take a reference */ ++static struct napi_struct *napi_by_id(unsigned int napi_id) ++{ ++ unsigned int hash = napi_id % HASH_SIZE(napi_hash); ++ struct napi_struct *napi; ++ ++ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) ++ if (napi->napi_id == napi_id) ++ return napi; ++ ++ return NULL; ++} ++ ++#if defined(CONFIG_NET_RX_BUSY_POLL) ++ ++static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) ++{ ++ if (!skip_schedule) { ++ gro_normal_list(napi); ++ __napi_schedule(napi); ++ return; ++ } ++ ++ if (napi->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(napi, HZ >= 1000); ++ } ++ ++ gro_normal_list(napi); ++ clear_bit(NAPI_STATE_SCHED, &napi->state); ++} ++ ++static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, ++ u16 budget) ++{ ++ bool skip_schedule = false; ++ unsigned long timeout; ++ int rc; ++ ++ /* Busy polling means there is a high chance device driver hard irq ++ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was ++ * set in napi_schedule_prep(). ++ * Since we are about to call napi->poll() once more, we can safely ++ * clear NAPI_STATE_MISSED. ++ * ++ * Note: x86 could use a single "lock and ..." instruction ++ * to perform these two clear_bit() ++ */ ++ clear_bit(NAPI_STATE_MISSED, &napi->state); ++ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); ++ ++ local_bh_disable(); ++ ++ if (prefer_busy_poll) { ++ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); ++ timeout = READ_ONCE(napi->dev->gro_flush_timeout); ++ if (napi->defer_hard_irqs_count && timeout) { ++ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); ++ skip_schedule = true; ++ } ++ } ++ ++ /* All we really want here is to re-enable device interrupts. ++ * Ideally, a new ndo_busy_poll_stop() could avoid another round. ++ */ ++ rc = napi->poll(napi, budget); ++ /* We can't gro_normal_list() here, because napi->poll() might have ++ * rearmed the napi (napi_complete_done()) in which case it could ++ * already be running on another CPU. ++ */ ++ trace_napi_poll(napi, rc, budget); ++ netpoll_poll_unlock(have_poll_lock); ++ if (rc == budget) ++ __busy_poll_stop(napi, skip_schedule); ++ local_bh_enable(); ++} ++ ++void napi_busy_loop(unsigned int napi_id, ++ bool (*loop_end)(void *, unsigned long), ++ void *loop_end_arg, bool prefer_busy_poll, u16 budget) ++{ ++ unsigned long start_time = loop_end ? busy_loop_current_time() : 0; ++ int (*napi_poll)(struct napi_struct *napi, int budget); ++ void *have_poll_lock = NULL; ++ struct napi_struct *napi; ++ ++restart: ++ napi_poll = NULL; ++ ++ rcu_read_lock(); ++ ++ napi = napi_by_id(napi_id); ++ if (!napi) ++ goto out; ++ ++ preempt_disable(); ++ for (;;) { ++ int work = 0; ++ ++ local_bh_disable(); ++ if (!napi_poll) { ++ unsigned long val = READ_ONCE(napi->state); ++ ++ /* If multiple threads are competing for this napi, ++ * we avoid dirtying napi->state as much as we can. 
++ */ ++ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | ++ NAPIF_STATE_IN_BUSY_POLL)) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ if (cmpxchg(&napi->state, val, ++ val | NAPIF_STATE_IN_BUSY_POLL | ++ NAPIF_STATE_SCHED) != val) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ have_poll_lock = netpoll_poll_lock(napi); ++ napi_poll = napi->poll; ++ } ++ work = napi_poll(napi, budget); ++ trace_napi_poll(napi, work, budget); ++ gro_normal_list(napi); ++count: ++ if (work > 0) ++ __NET_ADD_STATS(dev_net(napi->dev), ++ LINUX_MIB_BUSYPOLLRXPACKETS, work); ++ local_bh_enable(); ++ ++ if (!loop_end || loop_end(loop_end_arg, start_time)) ++ break; ++ ++ if (unlikely(need_resched())) { ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++ rcu_read_unlock(); ++ cond_resched(); ++ if (loop_end(loop_end_arg, start_time)) ++ return; ++ goto restart; ++ } ++ cpu_relax(); ++ } ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++out: ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(napi_busy_loop); ++ ++#endif /* CONFIG_NET_RX_BUSY_POLL */ ++ ++static void napi_hash_add(struct napi_struct *napi) ++{ ++ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) ++ return; ++ ++ spin_lock(&napi_hash_lock); ++ ++ /* 0..NR_CPUS range is reserved for sender_cpu use */ ++ do { ++ if (unlikely(++napi_gen_id < MIN_NAPI_ID)) ++ napi_gen_id = MIN_NAPI_ID; ++ } while (napi_by_id(napi_gen_id)); ++ napi->napi_id = napi_gen_id; ++ ++ hlist_add_head_rcu(&napi->napi_hash_node, ++ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++/* Warning : caller is responsible to make sure rcu grace period ++ * is respected before freeing memory containing @napi ++ */ ++static void napi_hash_del(struct napi_struct *napi) ++{ ++ spin_lock(&napi_hash_lock); ++ ++ hlist_del_init_rcu(&napi->napi_hash_node); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) ++{ ++ struct napi_struct *napi; ++ ++ napi = container_of(timer, struct napi_struct, timer); ++ ++ /* Note : we use a relaxed variant of napi_schedule_prep() not setting ++ * NAPI_STATE_MISSED, since we do not react to a device IRQ. ++ */ ++ if (!napi_disable_pending(napi) && ++ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { ++ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ __napi_schedule_irqoff(napi); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void init_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ INIT_LIST_HEAD(&napi->gro_hash[i].list); ++ napi->gro_hash[i].count = 0; ++ } ++ napi->gro_bitmask = 0; ++} ++ ++int dev_set_threaded(struct net_device *dev, bool threaded) ++{ ++ struct napi_struct *napi; ++ int err = 0; ++ ++ if (dev->threaded == threaded) ++ return 0; ++ ++ if (threaded) { ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (!napi->thread) { ++ err = napi_kthread_create(napi); ++ if (err) { ++ threaded = false; ++ break; ++ } ++ } ++ } ++ } ++ ++ dev->threaded = threaded; ++ ++ /* Make sure kthread is created before THREADED bit ++ * is set. ++ */ ++ smp_mb__before_atomic(); ++ ++ /* Setting/unsetting threaded mode on a napi might not immediately ++ * take effect, if the current napi instance is actively being ++ * polled. 
In this case, the switch between threaded mode and ++ * softirq mode will happen in the next round of napi_schedule(). ++ * This should not cause hiccups/stalls to the live traffic. ++ */ ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (threaded) ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ else ++ clear_bit(NAPI_STATE_THREADED, &napi->state); ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_set_threaded); ++ ++/* Double check that napi_get_frags() allocates skbs with ++ * skb->head being backed by slab, not a page fragment. ++ * This is to make sure bug fixed in 3226b158e67c ++ * ("net: avoid 32 x truesize under-estimation for tiny skbs") ++ * does not accidentally come back. ++ */ ++static void napi_get_frags_check(struct napi_struct *napi) ++{ ++ struct sk_buff *skb; ++ ++ local_bh_disable(); ++ skb = napi_get_frags(napi); ++ WARN_ON_ONCE(skb && skb->head_frag); ++ napi_free_frags(napi); ++ local_bh_enable(); ++} ++ ++void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), int weight) ++{ ++ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) ++ return; ++ ++ INIT_LIST_HEAD(&napi->poll_list); ++ INIT_HLIST_NODE(&napi->napi_hash_node); ++ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); ++ napi->timer.function = napi_watchdog; ++ init_gro_hash(napi); ++ napi->skb = NULL; ++ INIT_LIST_HEAD(&napi->rx_list); ++ napi->rx_count = 0; ++ napi->poll = poll; ++ if (weight > NAPI_POLL_WEIGHT) ++ netdev_err_once(dev, "%s() called with weight %d\n", __func__, ++ weight); ++ napi->weight = weight; ++ napi->dev = dev; ++#ifdef CONFIG_NETPOLL ++ napi->poll_owner = -1; ++#endif ++ set_bit(NAPI_STATE_SCHED, &napi->state); ++ set_bit(NAPI_STATE_NPSVC, &napi->state); ++ list_add_rcu(&napi->dev_list, &dev->napi_list); ++ napi_hash_add(napi); ++ napi_get_frags_check(napi); ++ /* Create kthread for this napi if dev->threaded is set. ++ * Clear dev->threaded if kthread creation failed so that ++ * threaded mode will not be enabled in napi_enable(). ++ */ ++ if (dev->threaded && napi_kthread_create(napi)) ++ dev->threaded = 0; ++} ++EXPORT_SYMBOL(netif_napi_add_weight); ++ ++void napi_disable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ might_sleep(); ++ set_bit(NAPI_STATE_DISABLE, &n->state); ++ ++ for ( ; ; ) { ++ val = READ_ONCE(n->state); ++ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { ++ usleep_range(20, 200); ++ continue; ++ } ++ ++ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; ++ new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ if (cmpxchg(&n->state, val, new) == val) ++ break; ++ } ++ ++ hrtimer_cancel(&n->timer); ++ ++ clear_bit(NAPI_STATE_DISABLE, &n->state); ++} ++EXPORT_SYMBOL(napi_disable); ++ ++/** ++ * napi_enable - enable NAPI scheduling ++ * @n: NAPI context ++ * ++ * Resume NAPI from being scheduled on this context. ++ * Must be paired with napi_disable. 
++ */ ++void napi_enable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); ++ ++ new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); ++ if (n->dev->threaded && n->thread) ++ new |= NAPIF_STATE_THREADED; ++ } while (cmpxchg(&n->state, val, new) != val); ++} ++EXPORT_SYMBOL(napi_enable); ++ ++static void flush_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ struct sk_buff *skb, *n; ++ ++ list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) ++ kfree_skb(skb); ++ napi->gro_hash[i].count = 0; ++ } ++} ++ ++/* Must be called in process context */ ++void __netif_napi_del(struct napi_struct *napi) ++{ ++ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) ++ return; ++ ++ napi_hash_del(napi); ++ list_del_rcu(&napi->dev_list); ++ napi_free_frags(napi); ++ ++ flush_gro_hash(napi); ++ napi->gro_bitmask = 0; ++ ++ if (napi->thread) { ++ kthread_stop(napi->thread); ++ napi->thread = NULL; ++ } ++} ++EXPORT_SYMBOL(__netif_napi_del); ++ ++static int __napi_poll(struct napi_struct *n, bool *repoll) ++{ ++ int work, weight; ++ ++ weight = n->weight; ++ ++ /* This NAPI_STATE_SCHED test is for avoiding a race ++ * with netpoll's poll_napi(). Only the entity which ++ * obtains the lock and sees NAPI_STATE_SCHED set will ++ * actually make the ->poll() call. Therefore we avoid ++ * accidentally calling ->poll() when NAPI is not scheduled. ++ */ ++ work = 0; ++ if (test_bit(NAPI_STATE_SCHED, &n->state)) { ++ work = n->poll(n, weight); ++ trace_napi_poll(n, work, weight); ++ } ++ ++ if (unlikely(work > weight)) ++ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", ++ n->poll, work, weight); ++ ++ if (likely(work < weight)) ++ return work; ++ ++ /* Drivers must not modify the NAPI state if they ++ * consume the entire weight. In such cases this code ++ * still "owns" the NAPI instance and therefore can ++ * move the instance around on the list at-will. ++ */ ++ if (unlikely(napi_disable_pending(n))) { ++ napi_complete(n); ++ return work; ++ } ++ ++ /* The NAPI context has more processing work, but busy-polling ++ * is preferred. Exit early. ++ */ ++ if (napi_prefer_busy_poll(n)) { ++ if (napi_complete_done(n, work)) { ++ /* If timeout is not set, we need to make sure ++ * that the NAPI is re-scheduled. ++ */ ++ napi_schedule(n); ++ } ++ return work; ++ } ++ ++ if (n->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(n, HZ >= 1000); ++ } ++ ++ gro_normal_list(n); ++ ++ /* Some drivers may have called napi_schedule ++ * prior to exhausting their budget. ++ */ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ pr_warn_once("%s: Budget exhausted after napi rescheduled\n", ++ n->dev ? 
n->dev->name : "backlog"); ++ return work; ++ } ++ ++ *repoll = true; ++ ++ return work; ++} ++ ++static int napi_poll(struct napi_struct *n, struct list_head *repoll) ++{ ++ bool do_repoll = false; ++ void *have; ++ int work; ++ ++ list_del_init(&n->poll_list); ++ ++ have = netpoll_poll_lock(n); ++ ++ work = __napi_poll(n, &do_repoll); ++ ++ if (do_repoll) ++ list_add_tail(&n->poll_list, repoll); ++ ++ netpoll_poll_unlock(have); ++ ++ return work; ++} ++ ++static int napi_thread_wait(struct napi_struct *napi) ++{ ++ bool woken = false; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!kthread_should_stop()) { ++ /* Testing SCHED_THREADED bit here to make sure the current ++ * kthread owns this napi and could poll on this napi. ++ * Testing SCHED bit is not enough because SCHED bit might be ++ * set by some other busy poll thread or by napi_disable(). ++ */ ++ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { ++ WARN_ON(!list_empty(&napi->poll_list)); ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ schedule(); ++ /* woken being true indicates this thread owns this napi. */ ++ woken = true; ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ return -1; ++} ++ ++static int napi_threaded_poll(void *data) ++{ ++ struct napi_struct *napi = data; ++ void *have; ++ ++ while (!napi_thread_wait(napi)) { ++ for (;;) { ++ bool repoll = false; ++ ++ local_bh_disable(); ++ ++ have = netpoll_poll_lock(napi); ++ __napi_poll(napi, &repoll); ++ netpoll_poll_unlock(have); ++ ++ local_bh_enable(); ++ ++ if (!repoll) ++ break; ++ ++ cond_resched(); ++ } ++ } ++ return 0; ++} ++ ++static void skb_defer_free_flush(struct softnet_data *sd) ++{ ++ struct sk_buff *skb, *next; ++ unsigned long flags; ++ ++ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ ++ if (!READ_ONCE(sd->defer_list)) ++ return; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ skb = sd->defer_list; ++ sd->defer_list = NULL; ++ sd->defer_count = 0; ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ while (skb != NULL) { ++ next = skb->next; ++ napi_consume_skb(skb, 1); ++ skb = next; ++ } ++} ++ ++static __latent_entropy void net_rx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ unsigned long time_limit = jiffies + ++ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); ++ int budget = READ_ONCE(netdev_budget); ++ LIST_HEAD(list); ++ LIST_HEAD(repoll); ++ ++ local_irq_disable(); ++ list_splice_init(&sd->poll_list, &list); ++ local_irq_enable(); ++ ++ for (;;) { ++ struct napi_struct *n; ++ ++ skb_defer_free_flush(sd); ++ ++ if (list_empty(&list)) { ++ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) ++ goto end; ++ break; ++ } ++ ++ n = list_first_entry(&list, struct napi_struct, poll_list); ++ budget -= napi_poll(n, &repoll); ++ ++ /* If softirq window is exhausted then punt. ++ * Allow this to run for 2 jiffies since which will allow ++ * an average latency of 1.5/HZ. 
++ */ ++ if (unlikely(budget <= 0 || ++ time_after_eq(jiffies, time_limit))) { ++ sd->time_squeeze++; ++ break; ++ } ++ } ++ ++ local_irq_disable(); ++ ++ list_splice_tail_init(&sd->poll_list, &list); ++ list_splice_tail(&repoll, &list); ++ list_splice(&list, &sd->poll_list); ++ if (!list_empty(&sd->poll_list)) ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ ++ net_rps_action_and_irq_enable(sd); ++end:; ++} ++ ++struct netdev_adjacent { ++ struct net_device *dev; ++ netdevice_tracker dev_tracker; ++ ++ /* upper master flag, there can only be one master device per list */ ++ bool master; ++ ++ /* lookup ignore flag */ ++ bool ignore; ++ ++ /* counter for the number of times this device was added to us */ ++ u16 ref_nr; ++ ++ /* private field for the users */ ++ void *private; ++ ++ struct list_head list; ++ struct rcu_head rcu; ++}; ++ ++static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, ++ struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ list_for_each_entry(adj, adj_list, list) { ++ if (adj->dev == adj_dev) ++ return adj; ++ } ++ return NULL; ++} ++ ++static int ____netdev_has_upper_dev(struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *dev = (struct net_device *)priv->data; ++ ++ return upper_dev == dev; ++} ++ ++/** ++ * netdev_has_upper_dev - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks only immediate upper device, ++ * not through a complete stack of devices. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev); ++ ++/** ++ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks the entire upper device chain. ++ * The caller must hold rcu lock. ++ */ ++ ++bool netdev_has_upper_dev_all_rcu(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); ++ ++/** ++ * netdev_has_any_upper_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to an upper device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_any_upper_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.upper); ++} ++EXPORT_SYMBOL(netdev_has_any_upper_dev); ++ ++/** ++ * netdev_master_upper_dev_get - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RTNL lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get); ++ ++static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master) && !upper->ignore) ++ return upper->dev; ++ return NULL; ++} ++ ++/** ++ * netdev_has_any_lower_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to a lower device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++static bool netdev_has_any_lower_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.lower); ++} ++ ++void *netdev_adjacent_get_private(struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = list_entry(adj_list, struct netdev_adjacent, list); ++ ++ return adj->private; ++} ++EXPORT_SYMBOL(netdev_adjacent_get_private); ++ ++/** ++ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next device from the dev's upper list, starting from iter ++ * position. The caller must hold RCU read lock. ++ */ ++struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); ++ ++static struct net_device *__netdev_next_upper_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ *ignore = upper->ignore; ++ ++ return upper->dev; ++} ++ ++static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++ ++static int __netdev_walk_all_upper_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = __netdev_next_upper_dev(now, &iter, &ignore); ++ if (!udev) ++ break; ++ if 
(ignore) ++ continue; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++int netdev_walk_all_upper_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = netdev_next_upper_dev_rcu(now, &iter); ++ if (!udev) ++ break; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); ++ ++static bool __netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++ ++/** ++ * netdev_lower_get_next_private - Get the next ->private from the ++ * lower neighbour list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold either hold the ++ * RTNL lock or its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. ++ */ ++void *netdev_lower_get_next_private(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private); ++ ++/** ++ * netdev_lower_get_next_private_rcu - Get the next ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_next_private_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); ++ ++/** ++ * netdev_lower_get_next - Get the next device from the lower neighbour ++ * list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RTNL lock or ++ * its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. 
++ */ ++void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_lower_get_next); ++ ++static struct net_device *netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++ ++static struct net_device *__netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ *ignore = lower->ignore; ++ ++ return lower->dev; ++} ++ ++int netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); ++ ++static int __netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = __netdev_next_lower_dev(now, &iter, &ignore); ++ if (!ldev) ++ break; ++ if (ignore) ++ continue; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_next_lower_dev_rcu); ++ ++static u8 __netdev_upper_depth(struct net_device *dev) ++{ ++ struct net_device *udev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.upper, ++ udev = __netdev_next_upper_dev(dev, &iter, &ignore); ++ udev; ++ 
udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < udev->upper_level) ++ max_depth = udev->upper_level; ++ } ++ ++ return max_depth; ++} ++ ++static u8 __netdev_lower_depth(struct net_device *dev) ++{ ++ struct net_device *ldev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.lower, ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ++ ldev; ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < ldev->lower_level) ++ max_depth = ldev->lower_level; ++ } ++ ++ return max_depth; ++} ++ ++static int __netdev_update_upper_level(struct net_device *dev, ++ struct netdev_nested_priv *__unused) ++{ ++ dev->upper_level = __netdev_upper_depth(dev) + 1; ++ return 0; ++} ++ ++#ifdef CONFIG_LOCKDEP ++static LIST_HEAD(net_unlink_list); ++ ++static void net_unlink_todo(struct net_device *dev) ++{ ++ if (list_empty(&dev->unlink_list)) ++ list_add_tail(&dev->unlink_list, &net_unlink_list); ++} ++#endif ++ ++static int __netdev_update_lower_level(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ dev->lower_level = __netdev_lower_depth(dev) + 1; ++ ++#ifdef CONFIG_LOCKDEP ++ if (!priv) ++ return 0; ++ ++ if (priv->flags & NESTED_SYNC_IMM) ++ dev->nested_level = dev->lower_level - 1; ++ if (priv->flags & NESTED_SYNC_TODO) ++ net_unlink_todo(dev); ++#endif ++ return 0; ++} ++ ++int netdev_walk_all_lower_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev_rcu(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); ++ ++/** ++ * netdev_lower_get_first_private_rcu - Get the first ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * ++ * Gets the first netdev_adjacent->private from the dev's lower neighbour ++ * list. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_first_private_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_first_or_null_rcu(&dev->adj_list.lower, ++ struct netdev_adjacent, list); ++ if (lower) ++ return lower->private; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); ++ ++/** ++ * netdev_master_upper_dev_get_rcu - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RCU read lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_first_or_null_rcu(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (upper && likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); ++ ++static int netdev_adjacent_sysfs_add(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", adj_dev->name); ++ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), ++ linkname); ++} ++static void netdev_adjacent_sysfs_del(struct net_device *dev, ++ char *name, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", name); ++ sysfs_remove_link(&(dev->dev.kobj), linkname); ++} ++ ++static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ return (dev_list == &dev->adj_list.upper || ++ dev_list == &dev->adj_list.lower) && ++ net_eq(dev_net(dev), dev_net(adj_dev)); ++} ++ ++static int __netdev_adjacent_dev_insert(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list, ++ void *private, bool master) ++{ ++ struct netdev_adjacent *adj; ++ int ret; ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (adj) { ++ adj->ref_nr += 1; ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", ++ dev->name, adj_dev->name, adj->ref_nr); ++ ++ return 0; ++ } ++ ++ adj = kmalloc(sizeof(*adj), GFP_KERNEL); ++ if (!adj) ++ return -ENOMEM; ++ ++ adj->dev = adj_dev; ++ adj->master = master; ++ adj->ref_nr = 1; ++ adj->private = private; ++ adj->ignore = false; ++ netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); ++ ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", ++ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ++ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); ++ if (ret) ++ goto free_adj; ++ } ++ ++ /* Ensure that master link is always the first item in list. 
*/ ++ if (master) { ++ ret = sysfs_create_link(&(dev->dev.kobj), ++ &(adj_dev->dev.kobj), "master"); ++ if (ret) ++ goto remove_symlinks; ++ ++ list_add_rcu(&adj->list, dev_list); ++ } else { ++ list_add_tail_rcu(&adj->list, dev_list); ++ } ++ ++ return 0; ++ ++remove_symlinks: ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++free_adj: ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree(adj); ++ ++ return ret; ++} ++ ++static void __netdev_adjacent_dev_remove(struct net_device *dev, ++ struct net_device *adj_dev, ++ u16 ref_nr, ++ struct list_head *dev_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", ++ dev->name, adj_dev->name, ref_nr); ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (!adj) { ++ pr_err("Adjacency does not exist for device %s from %s\n", ++ dev->name, adj_dev->name); ++ WARN_ON(1); ++ return; ++ } ++ ++ if (adj->ref_nr > ref_nr) { ++ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", ++ dev->name, adj_dev->name, ref_nr, ++ adj->ref_nr - ref_nr); ++ adj->ref_nr -= ref_nr; ++ return; ++ } ++ ++ if (adj->master) ++ sysfs_remove_link(&(dev->dev.kobj), "master"); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++ ++ list_del_rcu(&adj->list); ++ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", ++ adj_dev->name, dev->name, adj_dev->name); ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree_rcu(adj, rcu); ++} ++ ++static int __netdev_adjacent_dev_link_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct list_head *up_list, ++ struct list_head *down_list, ++ void *private, bool master) ++{ ++ int ret; ++ ++ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, ++ private, master); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, ++ private, false); ++ if (ret) { ++ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ u16 ref_nr, ++ struct list_head *up_list, ++ struct list_head *down_list) ++{ ++ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); ++ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); ++} ++ ++static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *private, bool master) ++{ ++ return __netdev_adjacent_dev_link_lists(dev, upper_dev, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower, ++ private, master); ++} ++ ++static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower); ++} ++ ++static int __netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, bool master, ++ void *upper_priv, void *upper_info, ++ struct netdev_nested_priv *priv, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ .extack = extack, ++ }, ++ .upper_dev = upper_dev, ++ .master = master, ++ .linking = true, ++ .upper_info = upper_info, ++ }; ++ struct net_device *master_dev; ++ int ret = 0; ++ ++ ASSERT_RTNL(); ++ ++ if (dev == upper_dev) ++ return -EBUSY; ++ ++ /* To prevent 
loops, check if dev is not upper device to upper_dev. */ ++ if (__netdev_has_upper_dev(upper_dev, dev)) ++ return -EBUSY; ++ ++ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) ++ return -EMLINK; ++ ++ if (!master) { ++ if (__netdev_has_upper_dev(dev, upper_dev)) ++ return -EEXIST; ++ } else { ++ master_dev = __netdev_master_upper_dev_get(dev); ++ if (master_dev) ++ return master_dev == upper_dev ? -EEXIST : -EBUSY; ++ } ++ ++ ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, ++ master); ++ if (ret) ++ return ret; ++ ++ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto rollback; ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++ ++ return 0; ++ ++rollback: ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ return ret; ++} ++ ++/** ++ * netdev_upper_dev_link - Add a link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. On a failure a negative errno code is returned. ++ * On success the reference counts are adjusted and the function ++ * returns zero. ++ */ ++int netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, false, ++ NULL, NULL, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_upper_dev_link); ++ ++/** ++ * netdev_master_upper_dev_link - Add a master link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @upper_priv: upper device private ++ * @upper_info: upper info to be passed down via notifier ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. In this case, only ++ * one master upper device can be linked, although other non-master devices ++ * might be linked as well. The caller must hold the RTNL lock. ++ * On a failure a negative errno code is returned. On success the reference ++ * counts are adjusted and the function returns zero. 
++ */ ++int netdev_master_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *upper_priv, void *upper_info, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, true, ++ upper_priv, upper_info, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_link); ++ ++static void __netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .upper_dev = upper_dev, ++ .linking = false, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; ++ ++ call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++} ++ ++/** ++ * netdev_upper_dev_unlink - Removes a link to upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * ++ * Removes a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. ++ */ ++void netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ __netdev_upper_dev_unlink(dev, upper_dev, &priv); ++} ++EXPORT_SYMBOL(netdev_upper_dev_unlink); ++ ++static void __netdev_adjacent_dev_set(struct net_device *upper_dev, ++ struct net_device *lower_dev, ++ bool val) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); ++ if (adj) ++ adj->ignore = val; ++ ++ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); ++ if (adj) ++ adj->ignore = val; ++} ++ ++static void netdev_adjacent_dev_disable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, true); ++} ++ ++static void netdev_adjacent_dev_enable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, false); ++} ++ ++int netdev_adjacent_change_prepare(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ int err; ++ ++ if (!new_dev) ++ return 0; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_disable(dev, old_dev); ++ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, ++ extack); ++ if (err) { ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_adjacent_change_prepare); ++ ++void netdev_adjacent_change_commit(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ if (!new_dev || !old_dev) ++ return; ++ ++ if (new_dev == old_dev) ++ return; ++ ++ 
netdev_adjacent_dev_enable(dev, old_dev); ++ __netdev_upper_dev_unlink(old_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_commit); ++ ++void netdev_adjacent_change_abort(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ ++ if (!new_dev) ++ return; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ ++ __netdev_upper_dev_unlink(new_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_abort); ++ ++/** ++ * netdev_bonding_info_change - Dispatch event about slave change ++ * @dev: device ++ * @bonding_info: info to dispatch ++ * ++ * Send NETDEV_BONDING_INFO to netdev notifiers with info. ++ * The caller must hold the RTNL lock. ++ */ ++void netdev_bonding_info_change(struct net_device *dev, ++ struct netdev_bonding_info *bonding_info) ++{ ++ struct netdev_notifier_bonding_info info = { ++ .info.dev = dev, ++ }; ++ ++ memcpy(&info.bonding_info, bonding_info, ++ sizeof(struct netdev_bonding_info)); ++ call_netdevice_notifiers_info(NETDEV_BONDING_INFO, ++ &info.info); ++} ++EXPORT_SYMBOL(netdev_bonding_info_change); ++ ++static int netdev_offload_xstats_enable_l3(struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ int err; ++ int rc; ++ ++ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), ++ GFP_KERNEL); ++ if (!dev->offload_xstats_l3) ++ return -ENOMEM; ++ ++ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, ++ NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ err = notifier_to_errno(rc); ++ if (err) ++ goto free_stats; ++ ++ return 0; ++ ++free_stats: ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++ return err; ++} ++ ++int netdev_offload_xstats_enable(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return netdev_offload_xstats_enable_l3(dev, extack); ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enable); ++ ++static void netdev_offload_xstats_disable_l3(struct net_device *dev) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++} ++ ++int netdev_offload_xstats_disable(struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ if (!netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ netdev_offload_xstats_disable_l3(dev); ++ return 0; ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_disable); ++ ++static void netdev_offload_xstats_disable_all(struct net_device *dev) ++{ ++ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); ++} ++ ++static struct rtnl_hw_stats64 * ++netdev_offload_xstats_get_ptr(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return dev->offload_xstats_l3; ++ } ++ ++ 
WARN_ON(1); ++ return NULL; ++} ++ ++bool netdev_offload_xstats_enabled(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ return netdev_offload_xstats_get_ptr(dev, type); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enabled); ++ ++struct netdev_notifier_offload_xstats_ru { ++ bool used; ++}; ++ ++struct netdev_notifier_offload_xstats_rd { ++ struct rtnl_hw_stats64 stats; ++ bool used; ++}; ++ ++static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, ++ const struct rtnl_hw_stats64 *src) ++{ ++ dest->rx_packets += src->rx_packets; ++ dest->tx_packets += src->tx_packets; ++ dest->rx_bytes += src->rx_bytes; ++ dest->tx_bytes += src->tx_bytes; ++ dest->rx_errors += src->rx_errors; ++ dest->tx_errors += src->tx_errors; ++ dest->rx_dropped += src->rx_dropped; ++ dest->tx_dropped += src->tx_dropped; ++ dest->multicast += src->multicast; ++} ++ ++static int netdev_offload_xstats_get_used(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_ru report_used = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_used = &report_used, ++ }; ++ int rc; ++ ++ WARN_ON(!netdev_offload_xstats_enabled(dev, type)); ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, ++ &info.info); ++ *p_used = report_used.used; ++ return notifier_to_errno(rc); ++} ++ ++static int netdev_offload_xstats_get_stats(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_rd report_delta = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_delta = &report_delta, ++ }; ++ struct rtnl_hw_stats64 *stats; ++ int rc; ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return -EINVAL; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, ++ &info.info); ++ ++ /* Cache whatever we got, even if there was an error, otherwise the ++ * successful stats retrievals would get lost. 
++ */ ++ netdev_hw_stats64_add(stats, &report_delta.stats); ++ ++ if (p_stats) ++ *p_stats = *stats; ++ *p_used = report_delta.used; ++ ++ return notifier_to_errno(rc); ++} ++ ++int netdev_offload_xstats_get(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (p_stats) ++ return netdev_offload_xstats_get_stats(dev, type, p_stats, ++ p_used, extack); ++ else ++ return netdev_offload_xstats_get_used(dev, type, p_used, ++ extack); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_get); ++ ++void ++netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, ++ const struct rtnl_hw_stats64 *stats) ++{ ++ report_delta->used = true; ++ netdev_hw_stats64_add(&report_delta->stats, stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_delta); ++ ++void ++netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) ++{ ++ report_used->used = true; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_used); ++ ++void netdev_offload_xstats_push_delta(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ const struct rtnl_hw_stats64 *p_stats) ++{ ++ struct rtnl_hw_stats64 *stats; ++ ++ ASSERT_RTNL(); ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return; ++ ++ netdev_hw_stats64_add(stats, p_stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_push_delta); ++ ++/** ++ * netdev_get_xmit_slave - Get the xmit slave of master device ++ * @dev: device ++ * @skb: The packet ++ * @all_slaves: assume all the slaves are active ++ * ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ * %NULL is returned if no slave is found. ++ */ ++ ++struct net_device *netdev_get_xmit_slave(struct net_device *dev, ++ struct sk_buff *skb, ++ bool all_slaves) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_xmit_slave) ++ return NULL; ++ return ops->ndo_get_xmit_slave(dev, skb, all_slaves); ++} ++EXPORT_SYMBOL(netdev_get_xmit_slave); ++ ++static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_sk_get_lower_dev) ++ return NULL; ++ return ops->ndo_sk_get_lower_dev(dev, sk); ++} ++ ++/** ++ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket ++ * @dev: device ++ * @sk: the socket ++ * ++ * %NULL is returned if no lower device is found. 
++ */ ++ ++struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ struct net_device *lower; ++ ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ while (lower) { ++ dev = lower; ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ } ++ ++ return dev; ++} ++EXPORT_SYMBOL(netdev_sk_get_lowest_dev); ++ ++static void netdev_adjacent_add_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.lower); ++ } ++} ++ ++static void netdev_adjacent_del_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.lower); ++ } ++} ++ ++void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ } ++} ++ ++void *netdev_lower_dev_get_private(struct net_device *dev, ++ struct net_device *lower_dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ if (!lower_dev) ++ return NULL; ++ lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); ++ if (!lower) ++ return NULL; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_dev_get_private); ++ ++ ++/** ++ * netdev_lower_state_changed - Dispatch event about lower device state change ++ * @lower_dev: device ++ * @lower_state_info: state to dispatch ++ * ++ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. ++ * The caller must hold the RTNL lock. 
++ */ ++void netdev_lower_state_changed(struct net_device *lower_dev, ++ void *lower_state_info) ++{ ++ struct netdev_notifier_changelowerstate_info changelowerstate_info = { ++ .info.dev = lower_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ changelowerstate_info.lower_state_info = lower_state_info; ++ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, ++ &changelowerstate_info.info); ++} ++EXPORT_SYMBOL(netdev_lower_state_changed); ++ ++static void dev_change_rx_flags(struct net_device *dev, int flags) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_rx_flags) ++ ops->ndo_change_rx_flags(dev, flags); ++} ++ ++static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags; ++ kuid_t uid; ++ kgid_t gid; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_PROMISC; ++ dev->promiscuity += inc; ++ if (dev->promiscuity == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch promisc and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_PROMISC; ++ else { ++ dev->promiscuity -= inc; ++ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags != old_flags) { ++ pr_info("device %s %s promiscuous mode\n", ++ dev->name, ++ dev->flags & IFF_PROMISC ? "entered" : "left"); ++ if (audit_enabled) { ++ current_uid_gid(&uid, &gid); ++ audit_log(audit_context(), GFP_ATOMIC, ++ AUDIT_ANOM_PROMISCUOUS, ++ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", ++ dev->name, (dev->flags & IFF_PROMISC), ++ (old_flags & IFF_PROMISC), ++ from_kuid(&init_user_ns, audit_get_loginuid(current)), ++ from_kuid(&init_user_ns, uid), ++ from_kgid(&init_user_ns, gid), ++ audit_get_sessionid(current)); ++ } ++ ++ dev_change_rx_flags(dev, IFF_PROMISC); ++ } ++ if (notify) ++ __dev_notify_flags(dev, old_flags, IFF_PROMISC); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promiscuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative inc ++ * value is used to drop promiscuity on the device. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++int dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned int old_flags = dev->flags; ++ int err; ++ ++ err = __dev_set_promiscuity(dev, inc, true); ++ if (err < 0) ++ return err; ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_promiscuity); ++ ++static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_ALLMULTI; ++ dev->allmulti += inc; ++ if (dev->allmulti == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch allmulti and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ else { ++ dev->allmulti -= inc; ++ netdev_warn(dev, "allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags ^ old_flags) { ++ dev_change_rx_flags(dev, IFF_ALLMULTI); ++ dev_set_rx_mode(dev); ++ if (notify) ++ __dev_notify_flags(dev, old_flags, ++ dev->gflags ^ old_gflags); ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove reception of all multicast frames to a device. While the ++ * count in the device remains above zero the interface remains listening ++ * to all interfaces. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++ ++int dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ return __dev_set_allmulti(dev, inc, true); ++} ++EXPORT_SYMBOL(dev_set_allmulti); ++ ++/* ++ * Upload unicast and multicast address lists to device and ++ * configure RX filtering. When the device doesn't support unicast ++ * filtering it is put in promiscuous mode while unicast addresses ++ * are present. ++ */ ++void __dev_set_rx_mode(struct net_device *dev) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* dev_open will call this function so the list will stay sane. */ ++ if (!(dev->flags&IFF_UP)) ++ return; ++ ++ if (!netif_device_present(dev)) ++ return; ++ ++ if (!(dev->priv_flags & IFF_UNICAST_FLT)) { ++ /* Unicast addresses changes may only happen under the rtnl, ++ * therefore calling __dev_set_promiscuity here is safe. ++ */ ++ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { ++ __dev_set_promiscuity(dev, 1, false); ++ dev->uc_promisc = true; ++ } else if (netdev_uc_empty(dev) && dev->uc_promisc) { ++ __dev_set_promiscuity(dev, -1, false); ++ dev->uc_promisc = false; ++ } ++ } ++ ++ if (ops->ndo_set_rx_mode) ++ ops->ndo_set_rx_mode(dev); ++} ++ ++void dev_set_rx_mode(struct net_device *dev) ++{ ++ netif_addr_lock_bh(dev); ++ __dev_set_rx_mode(dev); ++ netif_addr_unlock_bh(dev); ++} ++ ++/** ++ * dev_get_flags - get flags reported to userspace ++ * @dev: device ++ * ++ * Get the combination of flag bits exported through APIs to userspace. ++ */ ++unsigned int dev_get_flags(const struct net_device *dev) ++{ ++ unsigned int flags; ++ ++ flags = (dev->flags & ~(IFF_PROMISC | ++ IFF_ALLMULTI | ++ IFF_RUNNING | ++ IFF_LOWER_UP | ++ IFF_DORMANT)) | ++ (dev->gflags & (IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ if (netif_running(dev)) { ++ if (netif_oper_up(dev)) ++ flags |= IFF_RUNNING; ++ if (netif_carrier_ok(dev)) ++ flags |= IFF_LOWER_UP; ++ if (netif_dormant(dev)) ++ flags |= IFF_DORMANT; ++ } ++ ++ return flags; ++} ++EXPORT_SYMBOL(dev_get_flags); ++ ++int __dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ unsigned int old_flags = dev->flags; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* ++ * Set the flags on our device. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | ++ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | ++ IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ if ((old_flags ^ flags) & IFF_MULTICAST) ++ dev_change_rx_flags(dev, IFF_MULTICAST); ++ ++ dev_set_rx_mode(dev); ++ ++ /* ++ * Have we downed the interface. 
We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it. ++ */ ++ ++ ret = 0; ++ if ((old_flags ^ flags) & IFF_UP) { ++ if (old_flags & IFF_UP) ++ __dev_close(dev); ++ else ++ ret = __dev_open(dev, extack); ++ } ++ ++ if ((flags ^ dev->gflags) & IFF_PROMISC) { ++ int inc = (flags & IFF_PROMISC) ? 1 : -1; ++ unsigned int old_flags = dev->flags; ++ ++ dev->gflags ^= IFF_PROMISC; ++ ++ if (__dev_set_promiscuity(dev, inc, false) >= 0) ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ } ++ ++ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI ++ * is important. Some (broken) drivers set IFF_PROMISC, when ++ * IFF_ALLMULTI is requested not asking us and not reporting. ++ */ ++ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { ++ int inc = (flags & IFF_ALLMULTI) ? 1 : -1; ++ ++ dev->gflags ^= IFF_ALLMULTI; ++ __dev_set_allmulti(dev, inc, false); ++ } ++ ++ return ret; ++} ++ ++void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, ++ unsigned int gchanges) ++{ ++ unsigned int changes = dev->flags ^ old_flags; ++ ++ if (gchanges) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); ++ ++ if (changes & IFF_UP) { ++ if (dev->flags & IFF_UP) ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ else ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ } ++ ++ if (dev->flags & IFF_UP && ++ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { ++ struct netdev_notifier_change_info change_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .flags_changed = changes, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); ++ } ++} ++ ++/** ++ * dev_change_flags - change device settings ++ * @dev: device ++ * @flags: device state flags ++ * @extack: netlink extended ack ++ * ++ * Change settings on device based state flags. The flags are ++ * in the userspace exported format. ++ */ ++int dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ret = __dev_change_flags(dev, flags, extack); ++ if (ret < 0) ++ return ret; ++ ++ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); ++ __dev_notify_flags(dev, old_flags, changes); ++ return ret; ++} ++EXPORT_SYMBOL(dev_change_flags); ++ ++int __dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_mtu) ++ return ops->ndo_change_mtu(dev, new_mtu); ++ ++ /* Pairs with all the lockless reads of dev->mtu in the stack */ ++ WRITE_ONCE(dev->mtu, new_mtu); ++ return 0; ++} ++EXPORT_SYMBOL(__dev_set_mtu); ++ ++int dev_validate_mtu(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ /* MTU must be positive, and in range */ ++ if (new_mtu < 0 || new_mtu < dev->min_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu less than device minimum"); ++ return -EINVAL; ++ } ++ ++ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_mtu_ext - Change maximum transfer unit ++ * @dev: device ++ * @new_mtu: new transfer unit ++ * @extack: netlink extended ack ++ * ++ * Change the maximum transfer size of the network device. 
++ */ ++int dev_set_mtu_ext(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ int err, orig_mtu; ++ ++ if (new_mtu == dev->mtu) ++ return 0; ++ ++ err = dev_validate_mtu(dev, new_mtu, extack); ++ if (err) ++ return err; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ orig_mtu = dev->mtu; ++ err = __dev_set_mtu(dev, new_mtu); ++ ++ if (!err) { ++ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ orig_mtu); ++ err = notifier_to_errno(err); ++ if (err) { ++ /* setting mtu back and notifying everyone again, ++ * so that they have a chance to revert changes. ++ */ ++ __dev_set_mtu(dev, orig_mtu); ++ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ new_mtu); ++ } ++ } ++ return err; ++} ++ ++int dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct netlink_ext_ack extack; ++ int err; ++ ++ memset(&extack, 0, sizeof(extack)); ++ err = dev_set_mtu_ext(dev, new_mtu, &extack); ++ if (err && extack._msg) ++ net_err_ratelimited("%s: %s\n", dev->name, extack._msg); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_mtu); ++ ++/** ++ * dev_change_tx_queue_len - Change TX queue length of a netdevice ++ * @dev: device ++ * @new_len: new tx queue length ++ */ ++int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) ++{ ++ unsigned int orig_len = dev->tx_queue_len; ++ int res; ++ ++ if (new_len != (unsigned int)new_len) ++ return -ERANGE; ++ ++ if (new_len != orig_len) { ++ dev->tx_queue_len = new_len; ++ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); ++ res = notifier_to_errno(res); ++ if (res) ++ goto err_rollback; ++ res = dev_qdisc_change_tx_queue_len(dev); ++ if (res) ++ goto err_rollback; ++ } ++ ++ return 0; ++ ++err_rollback: ++ netdev_err(dev, "refused to change device tx_queue_len\n"); ++ dev->tx_queue_len = orig_len; ++ return res; ++} ++ ++/** ++ * dev_set_group - Change group this device belongs to ++ * @dev: device ++ * @new_group: group this device should belong to ++ */ ++void dev_set_group(struct net_device *dev, int new_group) ++{ ++ dev->group = new_group; ++} ++ ++/** ++ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. 
++ * @dev: device ++ * @addr: new address ++ * @extack: netlink extended ack ++ */ ++int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_pre_changeaddr_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .dev_addr = addr, ++ }; ++ int rc; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); ++ return notifier_to_errno(rc); ++} ++EXPORT_SYMBOL(dev_pre_changeaddr_notify); ++ ++/** ++ * dev_set_mac_address - Change Media Access Control Address ++ * @dev: device ++ * @sa: new address ++ * @extack: netlink extended ack ++ * ++ * Change the hardware (MAC) address of the device ++ */ ++int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int err; ++ ++ if (!ops->ndo_set_mac_address) ++ return -EOPNOTSUPP; ++ if (sa->sa_family != dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); ++ if (err) ++ return err; ++ err = ops->ndo_set_mac_address(dev, sa); ++ if (err) ++ return err; ++ dev->addr_assign_type = NET_ADDR_SET; ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ return 0; ++} ++EXPORT_SYMBOL(dev_set_mac_address); ++ ++static DECLARE_RWSEM(dev_addr_sem); ++ ++int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ down_write(&dev_addr_sem); ++ ret = dev_set_mac_address(dev, sa, extack); ++ up_write(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_set_mac_address_user); ++ ++int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) ++{ ++ size_t size = sizeof(sa->sa_data); ++ struct net_device *dev; ++ int ret = 0; ++ ++ down_read(&dev_addr_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_name_rcu(net, dev_name); ++ if (!dev) { ++ ret = -ENODEV; ++ goto unlock; ++ } ++ if (!dev->addr_len) ++ memset(sa->sa_data, 0, size); ++ else ++ memcpy(sa->sa_data, dev->dev_addr, ++ min_t(size_t, size, dev->addr_len)); ++ sa->sa_family = dev->type; ++ ++unlock: ++ rcu_read_unlock(); ++ up_read(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_get_mac_address); ++ ++/** ++ * dev_change_carrier - Change device carrier ++ * @dev: device ++ * @new_carrier: new value ++ * ++ * Change device carrier ++ */ ++int dev_change_carrier(struct net_device *dev, bool new_carrier) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_change_carrier) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return ops->ndo_change_carrier(dev, new_carrier); ++} ++ ++/** ++ * dev_get_phys_port_id - Get device physical port ID ++ * @dev: device ++ * @ppid: port ID ++ * ++ * Get device physical port ID ++ */ ++int dev_get_phys_port_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_phys_port_id) ++ return -EOPNOTSUPP; ++ return ops->ndo_get_phys_port_id(dev, ppid); ++} ++ ++/** ++ * dev_get_phys_port_name - Get device physical port name ++ * @dev: device ++ * @name: port name ++ * @len: limit of bytes to copy to name ++ * ++ * Get device physical port name ++ */ ++int dev_get_phys_port_name(struct net_device *dev, ++ char *name, size_t len) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ 
int err; ++ ++ if (ops->ndo_get_phys_port_name) { ++ err = ops->ndo_get_phys_port_name(dev, name, len); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ return devlink_compat_phys_port_name_get(dev, name, len); ++} ++ ++/** ++ * dev_get_port_parent_id - Get the device's port parent identifier ++ * @dev: network device ++ * @ppid: pointer to a storage for the port's parent identifier ++ * @recurse: allow/disallow recursion to lower devices ++ * ++ * Get the devices's port parent identifier ++ */ ++int dev_get_port_parent_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid, ++ bool recurse) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ struct netdev_phys_item_id first = { }; ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ int err; ++ ++ if (ops->ndo_get_port_parent_id) { ++ err = ops->ndo_get_port_parent_id(dev, ppid); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ err = devlink_compat_switch_id_get(dev, ppid); ++ if (!recurse || err != -EOPNOTSUPP) ++ return err; ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) { ++ err = dev_get_port_parent_id(lower_dev, ppid, true); ++ if (err) ++ break; ++ if (!first.id_len) ++ first = *ppid; ++ else if (memcmp(&first, ppid, sizeof(*ppid))) ++ return -EOPNOTSUPP; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_get_port_parent_id); ++ ++/** ++ * netdev_port_same_parent_id - Indicate if two network devices have ++ * the same port parent identifier ++ * @a: first network device ++ * @b: second network device ++ */ ++bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) ++{ ++ struct netdev_phys_item_id a_id = { }; ++ struct netdev_phys_item_id b_id = { }; ++ ++ if (dev_get_port_parent_id(a, &a_id, true) || ++ dev_get_port_parent_id(b, &b_id, true)) ++ return false; ++ ++ return netdev_phys_item_id_same(&a_id, &b_id); ++} ++EXPORT_SYMBOL(netdev_port_same_parent_id); ++ ++/** ++ * dev_change_proto_down - set carrier according to proto_down. ++ * ++ * @dev: device ++ * @proto_down: new value ++ */ ++int dev_change_proto_down(struct net_device *dev, bool proto_down) ++{ ++ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ if (proto_down) ++ netif_carrier_off(dev); ++ else ++ netif_carrier_on(dev); ++ dev->proto_down = proto_down; ++ return 0; ++} ++ ++/** ++ * dev_change_proto_down_reason - proto down reason ++ * ++ * @dev: device ++ * @mask: proto down mask ++ * @value: proto down value ++ */ ++void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, ++ u32 value) ++{ ++ int b; ++ ++ if (!mask) { ++ dev->proto_down_reason = value; ++ } else { ++ for_each_set_bit(b, &mask, 32) { ++ if (value & (1 << b)) ++ dev->proto_down_reason |= BIT(b); ++ else ++ dev->proto_down_reason &= ~BIT(b); ++ } ++ } ++} ++ ++struct bpf_xdp_link { ++ struct bpf_link link; ++ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ ++ int flags; ++}; ++ ++static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) ++{ ++ if (flags & XDP_FLAGS_HW_MODE) ++ return XDP_MODE_HW; ++ if (flags & XDP_FLAGS_DRV_MODE) ++ return XDP_MODE_DRV; ++ if (flags & XDP_FLAGS_SKB_MODE) ++ return XDP_MODE_SKB; ++ return dev->netdev_ops->ndo_bpf ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++} ++ ++static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ switch (mode) { ++ case XDP_MODE_SKB: ++ return generic_xdp_install; ++ case XDP_MODE_DRV: ++ case XDP_MODE_HW: ++ return dev->netdev_ops->ndo_bpf; ++ default: ++ return NULL; ++ } ++} ++ ++static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ return dev->xdp_state[mode].link; ++} ++ ++static struct bpf_prog *dev_xdp_prog(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ struct bpf_xdp_link *link = dev_xdp_link(dev, mode); ++ ++ if (link) ++ return link->link.prog; ++ return dev->xdp_state[mode].prog; ++} ++ ++u8 dev_xdp_prog_count(struct net_device *dev) ++{ ++ u8 count = 0; ++ int i; ++ ++ for (i = 0; i < __MAX_XDP_MODE; i++) ++ if (dev->xdp_state[i].prog || dev->xdp_state[i].link) ++ count++; ++ return count; ++} ++EXPORT_SYMBOL_GPL(dev_xdp_prog_count); ++ ++u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ struct bpf_prog *prog = dev_xdp_prog(dev, mode); ++ ++ return prog ? prog->aux->id : 0; ++} ++ ++static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_xdp_link *link) ++{ ++ dev->xdp_state[mode].link = link; ++ dev->xdp_state[mode].prog = NULL; ++} ++ ++static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_prog *prog) ++{ ++ dev->xdp_state[mode].link = NULL; ++ dev->xdp_state[mode].prog = prog; ++} ++ ++static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, ++ bpf_op_t bpf_op, struct netlink_ext_ack *extack, ++ u32 flags, struct bpf_prog *prog) ++{ ++ struct netdev_bpf xdp; ++ int err; ++ ++ memset(&xdp, 0, sizeof(xdp)); ++ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; ++ xdp.extack = extack; ++ xdp.flags = flags; ++ xdp.prog = prog; ++ ++ /* Drivers assume refcnt is already incremented (i.e, prog pointer is ++ * "moved" into driver), so they don't increment it on their own, but ++ * they do decrement refcnt when program is detached or replaced. ++ * Given net_device also owns link/prog, we need to bump refcnt here ++ * to prevent drivers from underflowing it. 
++ */ ++ if (prog) ++ bpf_prog_inc(prog); ++ err = bpf_op(dev, &xdp); ++ if (err) { ++ if (prog) ++ bpf_prog_put(prog); ++ return err; ++ } ++ ++ if (mode != XDP_MODE_HW) ++ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); ++ ++ return 0; ++} ++ ++static void dev_xdp_uninstall(struct net_device *dev) ++{ ++ struct bpf_xdp_link *link; ++ struct bpf_prog *prog; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { ++ prog = dev_xdp_prog(dev, mode); ++ if (!prog) ++ continue; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) ++ continue; ++ ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ ++ /* auto-detach link from net device */ ++ link = dev_xdp_link(dev, mode); ++ if (link) ++ link->dev = NULL; ++ else ++ bpf_prog_put(prog); ++ ++ dev_xdp_set_link(dev, mode, NULL); ++ } ++} ++ ++static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog, u32 flags) ++{ ++ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); ++ struct bpf_prog *cur_prog; ++ struct net_device *upper; ++ struct list_head *iter; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ /* either link or prog attachment, never both */ ++ if (link && (new_prog || old_prog)) ++ return -EINVAL; ++ /* link supports only XDP mode flags */ ++ if (link && (flags & ~XDP_FLAGS_MODES)) { ++ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); ++ return -EINVAL; ++ } ++ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ ++ if (num_modes > 1) { ++ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); ++ return -EINVAL; ++ } ++ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ ++ if (!num_modes && dev_xdp_prog_count(dev) > 1) { ++ NL_SET_ERR_MSG(extack, ++ "More than one program loaded, unset mode is ambiguous"); ++ return -EINVAL; ++ } ++ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ ++ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { ++ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); ++ return -EINVAL; ++ } ++ ++ mode = dev_xdp_mode(dev, flags); ++ /* can't replace attached link */ ++ if (dev_xdp_link(dev, mode)) { ++ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); ++ return -EBUSY; ++ } ++ ++ /* don't allow if an upper device already has a program */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) { ++ if (dev_xdp_prog_count(upper) > 0) { ++ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); ++ return -EEXIST; ++ } ++ } ++ ++ cur_prog = dev_xdp_prog(dev, mode); ++ /* can't replace attached prog with link */ ++ if (link && cur_prog) { ++ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); ++ return -EBUSY; ++ } ++ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { ++ NL_SET_ERR_MSG(extack, "Active program does not match expected"); ++ return -EEXIST; ++ } ++ ++ /* put effective new program into new_prog */ ++ if (link) ++ new_prog = link->link.prog; ++ ++ if (new_prog) { ++ bool offload = mode == XDP_MODE_HW; ++ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ++ ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++ ++ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { ++ NL_SET_ERR_MSG(extack, "XDP program already attached"); ++ return -EBUSY; ++ } ++ if (!offload && dev_xdp_prog(dev, other_mode)) { ++ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); ++ return -EEXIST; ++ } ++ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { ++ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ } ++ ++ /* don't call drivers if the effective program didn't change */ ++ if (new_prog != cur_prog) { ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) { ++ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); ++ return -EOPNOTSUPP; ++ } ++ ++ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); ++ if (err) ++ return err; ++ } ++ ++ if (link) ++ dev_xdp_set_link(dev, mode, link); ++ else ++ dev_xdp_set_prog(dev, mode, new_prog); ++ if (cur_prog) ++ bpf_prog_put(cur_prog); ++ ++ return 0; ++} ++ ++static int dev_xdp_attach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); ++} ++ ++static int dev_xdp_detach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ mode = dev_xdp_mode(dev, link->flags); ++ if (dev_xdp_link(dev, mode) != link) ++ return -EINVAL; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ dev_xdp_set_link(dev, mode, NULL); ++ return 0; ++} ++ ++static void bpf_xdp_link_release(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ rtnl_lock(); ++ ++ /* if racing with net_device's tear down, xdp_link->dev might be ++ * already NULL, in which case link was already auto-detached ++ */ ++ if (xdp_link->dev) { ++ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); ++ xdp_link->dev = NULL; ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int bpf_xdp_link_detach(struct bpf_link *link) ++{ ++ bpf_xdp_link_release(link); ++ return 0; ++} ++ ++static void bpf_xdp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ kfree(xdp_link); ++} ++ ++static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ seq_printf(seq, "ifindex:\t%u\n", ifindex); ++} ++ ++static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ info->xdp.ifindex = ifindex; ++ return 0; ++} ++ ++static int bpf_xdp_link_update(struct 
bpf_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err = 0; ++ ++ rtnl_lock(); ++ ++ /* link might have been auto-released already, so fail */ ++ if (!xdp_link->dev) { ++ err = -ENOLINK; ++ goto out_unlock; ++ } ++ ++ if (old_prog && link->prog != old_prog) { ++ err = -EPERM; ++ goto out_unlock; ++ } ++ old_prog = link->prog; ++ if (old_prog->type != new_prog->type || ++ old_prog->expected_attach_type != new_prog->expected_attach_type) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ ++ if (old_prog == new_prog) { ++ /* no-op, don't disturb drivers */ ++ bpf_prog_put(new_prog); ++ goto out_unlock; ++ } ++ ++ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); ++ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); ++ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, ++ xdp_link->flags, new_prog); ++ if (err) ++ goto out_unlock; ++ ++ old_prog = xchg(&link->prog, new_prog); ++ bpf_prog_put(old_prog); ++ ++out_unlock: ++ rtnl_unlock(); ++ return err; ++} ++ ++static const struct bpf_link_ops bpf_xdp_link_lops = { ++ .release = bpf_xdp_link_release, ++ .dealloc = bpf_xdp_link_dealloc, ++ .detach = bpf_xdp_link_detach, ++ .show_fdinfo = bpf_xdp_link_show_fdinfo, ++ .fill_link_info = bpf_xdp_link_fill_link_info, ++ .update_prog = bpf_xdp_link_update, ++}; ++ ++int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct net *net = current->nsproxy->net_ns; ++ struct bpf_link_primer link_primer; ++ struct bpf_xdp_link *link; ++ struct net_device *dev; ++ int err, fd; ++ ++ rtnl_lock(); ++ dev = dev_get_by_index(net, attr->link_create.target_ifindex); ++ if (!dev) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto unlock; ++ } ++ ++ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); ++ link->dev = dev; ++ link->flags = attr->link_create.flags; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto unlock; ++ } ++ ++ err = dev_xdp_attach_link(dev, NULL, link); ++ rtnl_unlock(); ++ ++ if (err) { ++ link->dev = NULL; ++ bpf_link_cleanup(&link_primer); ++ goto out_put_dev; ++ } ++ ++ fd = bpf_link_settle(&link_primer); ++ /* link itself doesn't hold dev's refcnt to not complicate shutdown */ ++ dev_put(dev); ++ return fd; ++ ++unlock: ++ rtnl_unlock(); ++ ++out_put_dev: ++ dev_put(dev); ++ return err; ++} ++ ++/** ++ * dev_change_xdp_fd - set or clear a bpf program for a device rx path ++ * @dev: device ++ * @extack: netlink extended ack ++ * @fd: new program fd or negative value to clear ++ * @expected_fd: old program fd that userspace expects to replace or clear ++ * @flags: xdp-related flags ++ * ++ * Set or clear a bpf program for a device ++ */ ++int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, ++ int fd, int expected_fd, u32 flags) ++{ ++ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); ++ struct bpf_prog *new_prog = NULL, *old_prog = NULL; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ if (fd >= 0) { ++ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(new_prog)) ++ return PTR_ERR(new_prog); ++ } ++ ++ if (expected_fd >= 0) { ++ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(old_prog)) { ++ err = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto 
err_out; ++ } ++ } ++ ++ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); ++ ++err_out: ++ if (err && new_prog) ++ bpf_prog_put(new_prog); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ return err; ++} ++ ++/** ++ * dev_new_index - allocate an ifindex ++ * @net: the applicable net namespace ++ * ++ * Returns a suitable unique value for a new device interface ++ * number. The caller must hold the rtnl semaphore or the ++ * dev_base_lock to be sure it remains unique. ++ */ ++static int dev_new_index(struct net *net) ++{ ++ int ifindex = net->ifindex; ++ ++ for (;;) { ++ if (++ifindex <= 0) ++ ifindex = 1; ++ if (!__dev_get_by_index(net, ifindex)) ++ return net->ifindex = ifindex; ++ } ++} ++ ++/* Delayed registration/unregisteration */ ++LIST_HEAD(net_todo_list); ++DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); ++ ++static void net_set_todo(struct net_device *dev) ++{ ++ list_add_tail(&dev->todo_list, &net_todo_list); ++ atomic_inc(&dev_net(dev)->dev_unreg_count); ++} ++ ++static netdev_features_t netdev_sync_upper_features(struct net_device *lower, ++ struct net_device *upper, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(upper->wanted_features & feature) ++ && (features & feature)) { ++ netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", ++ &feature, upper->name); ++ features &= ~feature; ++ } ++ } ++ ++ return features; ++} ++ ++static void netdev_sync_lower_features(struct net_device *upper, ++ struct net_device *lower, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(features & feature) && (lower->features & feature)) { ++ netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", ++ &feature, lower->name); ++ lower->wanted_features &= ~feature; ++ __netdev_update_features(lower); ++ ++ if (unlikely(lower->features & feature)) ++ netdev_WARN(upper, "failed to disable %pNF on %s!\n", ++ &feature, lower->name); ++ else ++ netdev_features_change(lower); ++ } ++ } ++} ++ ++static netdev_features_t netdev_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ /* Fix illegal checksum combinations */ ++ if ((features & NETIF_F_HW_CSUM) && ++ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { ++ netdev_warn(dev, "mixed HW and IP checksum settings.\n"); ++ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); ++ } ++ ++ /* TSO requires that SG is present as well. 
*/ ++ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); ++ features &= ~NETIF_F_ALL_TSO; ++ } ++ ++ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IP_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO; ++ features &= ~NETIF_F_TSO_ECN; ++ } ++ ++ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IPV6_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO6; ++ } ++ ++ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ ++ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ ++ /* TSO ECN requires that TSO is present as well. */ ++ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) ++ features &= ~NETIF_F_TSO_ECN; ++ ++ /* Software GSO depends on SG. */ ++ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); ++ features &= ~NETIF_F_GSO; ++ } ++ ++ /* GSO partial features require GSO partial be set */ ++ if ((features & dev->gso_partial_features) && ++ !(features & NETIF_F_GSO_PARTIAL)) { ++ netdev_dbg(dev, ++ "Dropping partially supported GSO features since no GSO partial.\n"); ++ features &= ~dev->gso_partial_features; ++ } ++ ++ if (!(features & NETIF_F_RXCSUM)) { ++ /* NETIF_F_GRO_HW implies doing RXCSUM since every packet ++ * successfully merged by hardware must also have the ++ * checksum verified by hardware. If the user does not ++ * want to enable RXCSUM, logically, we should disable GRO_HW. ++ */ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ /* LRO/HW-GRO features cannot be combined with RX-FCS */ ++ if (features & NETIF_F_RXFCS) { ++ if (features & NETIF_F_LRO) { ++ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { ++ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_HW_TLS_TX) { ++ bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == ++ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++ bool hw_csum = features & NETIF_F_HW_CSUM; ++ ++ if (!ip_csum && !hw_csum) { ++ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_TX; ++ } ++ } ++ ++ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { ++ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_RX; ++ } ++ ++ return features; ++} ++ ++int __netdev_update_features(struct net_device *dev) ++{ ++ struct net_device *upper, *lower; ++ netdev_features_t features; ++ struct list_head *iter; ++ int err = -1; ++ ++ ASSERT_RTNL(); ++ ++ features = netdev_get_wanted_features(dev); ++ ++ if (dev->netdev_ops->ndo_fix_features) ++ features = dev->netdev_ops->ndo_fix_features(dev, features); ++ ++ /* driver might be less strict about feature dependencies */ ++ features = netdev_fix_features(dev, features); ++ ++ /* some features can't 
be enabled if they're off on an upper device */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) ++ features = netdev_sync_upper_features(dev, upper, features); ++ ++ if (dev->features == features) ++ goto sync_lower; ++ ++ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", ++ &dev->features, &features); ++ ++ if (dev->netdev_ops->ndo_set_features) ++ err = dev->netdev_ops->ndo_set_features(dev, features); ++ else ++ err = 0; ++ ++ if (unlikely(err < 0)) { ++ netdev_err(dev, ++ "set_features() failed (%d); wanted %pNF, left %pNF\n", ++ err, &features, &dev->features); ++ /* return non-0 since some features might have changed and ++ * it's better to fire a spurious notification than miss it ++ */ ++ return -1; ++ } ++ ++sync_lower: ++ /* some features must be disabled on lower devices when disabled ++ * on an upper device (think: bonding master or bridge) ++ */ ++ netdev_for_each_lower_dev(dev, lower, iter) ++ netdev_sync_lower_features(dev, lower, features); ++ ++ if (!err) { ++ netdev_features_t diff = features ^ dev->features; ++ ++ if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ /* udp_tunnel_{get,drop}_rx_info both need ++ * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the ++ * device, or they won't do anything. ++ * Thus we need to update dev->features ++ * *before* calling udp_tunnel_get_rx_info, ++ * but *after* calling udp_tunnel_drop_rx_info. ++ */ ++ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ dev->features = features; ++ udp_tunnel_get_rx_info(dev); ++ } else { ++ udp_tunnel_drop_rx_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_ctag_filter_info(dev); ++ } else { ++ vlan_drop_rx_ctag_filter_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_STAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_stag_filter_info(dev); ++ } else { ++ vlan_drop_rx_stag_filter_info(dev); ++ } ++ } ++ ++ dev->features = features; ++ } ++ ++ return err < 0 ? 0 : 1; ++} ++ ++/** ++ * netdev_update_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications if it ++ * has changed. Should be called after driver or hardware dependent ++ * conditions might have changed that influence the features. ++ */ ++void netdev_update_features(struct net_device *dev) ++{ ++ if (__netdev_update_features(dev)) ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_update_features); ++ ++/** ++ * netdev_change_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications even ++ * if they have not changed. Should be called instead of ++ * netdev_update_features() if also dev->vlan_features might ++ * have changed to allow the changes to be propagated to stacked ++ * VLAN devices. ++ */ ++void netdev_change_features(struct net_device *dev) ++{ ++ __netdev_update_features(dev); ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_change_features); ++ ++/** ++ * netif_stacked_transfer_operstate - transfer operstate ++ * @rootdev: the root or lower level device to transfer state from ++ * @dev: the device to transfer operstate to ++ * ++ * Transfer operational state from root to device. This is normally ++ * called when a stacking relationship exists between the root ++ * device and the device(a leaf device). 
++ */ ++void netif_stacked_transfer_operstate(const struct net_device *rootdev, ++ struct net_device *dev) ++{ ++ if (rootdev->operstate == IF_OPER_DORMANT) ++ netif_dormant_on(dev); ++ else ++ netif_dormant_off(dev); ++ ++ if (rootdev->operstate == IF_OPER_TESTING) ++ netif_testing_on(dev); ++ else ++ netif_testing_off(dev); ++ ++ if (netif_carrier_ok(rootdev)) ++ netif_carrier_on(dev); ++ else ++ netif_carrier_off(dev); ++} ++EXPORT_SYMBOL(netif_stacked_transfer_operstate); ++ ++static int netif_alloc_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ struct netdev_rx_queue *rx; ++ size_t sz = count * sizeof(*rx); ++ int err = 0; ++ ++ BUG_ON(count < 1); ++ ++ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!rx) ++ return -ENOMEM; ++ ++ dev->_rx = rx; ++ ++ for (i = 0; i < count; i++) { ++ rx[i].dev = dev; ++ ++ /* XDP RX-queue setup */ ++ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); ++ if (err < 0) ++ goto err_rxq_info; ++ } ++ return 0; ++ ++err_rxq_info: ++ /* Rollback successful reg's and free other resources */ ++ while (i--) ++ xdp_rxq_info_unreg(&rx[i].xdp_rxq); ++ kvfree(dev->_rx); ++ dev->_rx = NULL; ++ return err; ++} ++ ++static void netif_free_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ ++ /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ ++ if (!dev->_rx) ++ return; ++ ++ for (i = 0; i < count; i++) ++ xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); ++ ++ kvfree(dev->_rx); ++} ++ ++static void netdev_init_one_queue(struct net_device *dev, ++ struct netdev_queue *queue, void *_unused) ++{ ++ /* Initialize queue lock */ ++ spin_lock_init(&queue->_xmit_lock); ++ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); ++ queue->xmit_lock_owner = -1; ++ netdev_queue_numa_node_write(queue, NUMA_NO_NODE); ++ queue->dev = dev; ++#ifdef CONFIG_BQL ++ dql_init(&queue->dql, HZ); ++#endif ++} ++ ++static void netif_free_tx_queues(struct net_device *dev) ++{ ++ kvfree(dev->_tx); ++} ++ ++static int netif_alloc_netdev_queues(struct net_device *dev) ++{ ++ unsigned int count = dev->num_tx_queues; ++ struct netdev_queue *tx; ++ size_t sz = count * sizeof(*tx); ++ ++ if (count < 1 || count > 0xffff) ++ return -EINVAL; ++ ++ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!tx) ++ return -ENOMEM; ++ ++ dev->_tx = tx; ++ ++ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); ++ spin_lock_init(&dev->tx_global_lock); ++ ++ return 0; ++} ++ ++void netif_tx_stop_all_queues(struct net_device *dev) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < dev->num_tx_queues; i++) { ++ struct netdev_queue *txq = netdev_get_tx_queue(dev, i); ++ ++ netif_tx_stop_queue(txq); ++ } ++} ++EXPORT_SYMBOL(netif_tx_stop_all_queues); ++ ++/** ++ * register_netdevice() - register a network device ++ * @dev: device to register ++ * ++ * Take a prepared network device structure and make it externally accessible. ++ * A %NETDEV_REGISTER message is sent to the netdev notifier chain. ++ * Callers must hold the rtnl lock - you may want register_netdev() ++ * instead of this. ++ */ ++int register_netdevice(struct net_device *dev) ++{ ++ int ret; ++ struct net *net = dev_net(dev); ++ ++ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < ++ NETDEV_FEATURE_COUNT); ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ might_sleep(); ++ ++ /* When net_device's are persistent, this will be fatal. 
*/ ++ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); ++ BUG_ON(!net); ++ ++ ret = ethtool_check_ops(dev->ethtool_ops); ++ if (ret) ++ return ret; ++ ++ spin_lock_init(&dev->addr_list_lock); ++ netdev_set_addr_lockdep_class(dev); ++ ++ ret = dev_get_valid_name(net, dev, dev->name); ++ if (ret < 0) ++ goto out; ++ ++ ret = -ENOMEM; ++ dev->name_node = netdev_name_node_head_alloc(dev); ++ if (!dev->name_node) ++ goto out; ++ ++ /* Init, if this function is available */ ++ if (dev->netdev_ops->ndo_init) { ++ ret = dev->netdev_ops->ndo_init(dev); ++ if (ret) { ++ if (ret > 0) ++ ret = -EIO; ++ goto err_free_name; ++ } ++ } ++ ++ if (((dev->hw_features | dev->features) & ++ NETIF_F_HW_VLAN_CTAG_FILTER) && ++ (!dev->netdev_ops->ndo_vlan_rx_add_vid || ++ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { ++ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ++ ret = -EINVAL; ++ goto err_uninit; ++ } ++ ++ ret = -EBUSY; ++ if (!dev->ifindex) ++ dev->ifindex = dev_new_index(net); ++ else if (__dev_get_by_index(net, dev->ifindex)) ++ goto err_uninit; ++ ++ /* Transfer changeable features to wanted_features and enable ++ * software offloads (GSO and GRO). ++ */ ++ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); ++ dev->features |= NETIF_F_SOFT_FEATURES; ++ ++ if (dev->udp_tunnel_nic_info) { ++ dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ } ++ ++ dev->wanted_features = dev->features & dev->hw_features; ++ ++ if (!(dev->flags & IFF_LOOPBACK)) ++ dev->hw_features |= NETIF_F_NOCACHE_COPY; ++ ++ /* If IPv4 TCP segmentation offload is supported we should also ++ * allow the device to enable segmenting the frame with the option ++ * of ignoring a static IP ID value. This doesn't enable the ++ * feature itself but allows the user to enable it later. ++ */ ++ if (dev->hw_features & NETIF_F_TSO) ++ dev->hw_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->vlan_features & NETIF_F_TSO) ++ dev->vlan_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->mpls_features & NETIF_F_TSO) ++ dev->mpls_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->hw_enc_features & NETIF_F_TSO) ++ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; ++ ++ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. ++ */ ++ dev->vlan_features |= NETIF_F_HIGHDMA; ++ ++ /* Make NETIF_F_SG inheritable to tunnel devices. ++ */ ++ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; ++ ++ /* Make NETIF_F_SG inheritable to MPLS. ++ */ ++ dev->mpls_features |= NETIF_F_SG; ++ ++ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto err_uninit; ++ ++ ret = netdev_register_kobject(dev); ++ write_lock(&dev_base_lock); ++ dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; ++ write_unlock(&dev_base_lock); ++ if (ret) ++ goto err_uninit; ++ ++ __netdev_update_features(dev); ++ ++ /* ++ * Default initial state at registry is that the ++ * device is present. ++ */ ++ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ ++ linkwatch_init_dev(dev); ++ ++ dev_init_scheduler(dev); ++ ++ netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); ++ list_netdevice(dev); ++ ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ ++ /* If the device has permanent device address, driver should ++ * set dev_addr and also addr_assign_type should be set to ++ * NET_ADDR_PERM (default value). 
++ */ ++ if (dev->addr_assign_type == NET_ADDR_PERM) ++ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); ++ ++ /* Notify protocols, that a new device appeared. */ ++ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) { ++ /* Expect explicit free_netdev() on failure */ ++ dev->needs_free_netdev = false; ++ unregister_netdevice_queue(dev, NULL); ++ goto out; ++ } ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++out: ++ return ret; ++ ++err_uninit: ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++err_free_name: ++ netdev_name_node_free(dev->name_node); ++ goto out; ++} ++EXPORT_SYMBOL(register_netdevice); ++ ++/** ++ * init_dummy_netdev - init a dummy network device for NAPI ++ * @dev: device to init ++ * ++ * This takes a network device structure and initialize the minimum ++ * amount of fields so it can be used to schedule NAPI polls without ++ * registering a full blown interface. This is to be used by drivers ++ * that need to tie several hardware interfaces to a single NAPI ++ * poll scheduler due to HW limitations. ++ */ ++int init_dummy_netdev(struct net_device *dev) ++{ ++ /* Clear everything. Note we don't initialize spinlocks ++ * are they aren't supposed to be taken by any of the ++ * NAPI code and this dummy netdev is supposed to be ++ * only ever used for NAPI polls ++ */ ++ memset(dev, 0, sizeof(struct net_device)); ++ ++ /* make sure we BUG if trying to hit standard ++ * register/unregister code path ++ */ ++ dev->reg_state = NETREG_DUMMY; ++ ++ /* NAPI wants this */ ++ INIT_LIST_HEAD(&dev->napi_list); ++ ++ /* a dummy interface is started by default */ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ /* napi_busy_loop stats accounting wants this */ ++ dev_net_set(dev, &init_net); ++ ++ /* Note : We dont allocate pcpu_refcnt for dummy devices, ++ * because users of this 'device' dont need to change ++ * its refcount. ++ */ ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(init_dummy_netdev); ++ ++ ++/** ++ * register_netdev - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * This is a wrapper around register_netdevice that takes the rtnl semaphore ++ * and expands the device name if you passed a format string to ++ * alloc_netdev. 
++ */ ++int register_netdev(struct net_device *dev) ++{ ++ int err; ++ ++ if (rtnl_lock_killable()) ++ return -EINTR; ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdev); ++ ++int netdev_refcnt_read(const struct net_device *dev) ++{ ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ int i, refcnt = 0; ++ ++ for_each_possible_cpu(i) ++ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); ++ return refcnt; ++#else ++ return refcount_read(&dev->dev_refcnt); ++#endif ++} ++EXPORT_SYMBOL(netdev_refcnt_read); ++ ++int netdev_unregister_timeout_secs __read_mostly = 10; ++ ++#define WAIT_REFS_MIN_MSECS 1 ++#define WAIT_REFS_MAX_MSECS 250 ++/** ++ * netdev_wait_allrefs_any - wait until all references are gone. ++ * @list: list of net_devices to wait on ++ * ++ * This is called when unregistering network devices. ++ * ++ * Any protocol or device that holds a reference should register ++ * for netdevice notification, and cleanup and put back the ++ * reference if they receive an UNREGISTER event. ++ * We can get stuck here if buggy protocols don't correctly ++ * call dev_put. ++ */ ++static struct net_device *netdev_wait_allrefs_any(struct list_head *list) ++{ ++ unsigned long rebroadcast_time, warning_time; ++ struct net_device *dev; ++ int wait = 0; ++ ++ rebroadcast_time = warning_time = jiffies; ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ while (true) { ++ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { ++ rtnl_lock(); ++ ++ /* Rebroadcast unregister notification */ ++ list_for_each_entry(dev, list, todo_list) ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ __rtnl_unlock(); ++ rcu_barrier(); ++ rtnl_lock(); ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, ++ &dev->state)) { ++ /* We must not have linkwatch events ++ * pending on unregister. If this ++ * happens, we simply run the queue ++ * unscheduled, resulting in a noop ++ * for this device. ++ */ ++ linkwatch_run_queue(); ++ break; ++ } ++ ++ __rtnl_unlock(); ++ ++ rebroadcast_time = jiffies; ++ } ++ ++ if (!wait) { ++ rcu_barrier(); ++ wait = WAIT_REFS_MIN_MSECS; ++ } else { ++ msleep(wait); ++ wait = min(wait << 1, WAIT_REFS_MAX_MSECS); ++ } ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ if (time_after(jiffies, warning_time + ++ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { ++ list_for_each_entry(dev, list, todo_list) { ++ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", ++ dev->name, netdev_refcnt_read(dev)); ++ ref_tracker_dir_print(&dev->refcnt_tracker, 10); ++ } ++ ++ warning_time = jiffies; ++ } ++ } ++} ++ ++/* The sequence is: ++ * ++ * rtnl_lock(); ++ * ... ++ * register_netdevice(x1); ++ * register_netdevice(x2); ++ * ... ++ * unregister_netdevice(y1); ++ * unregister_netdevice(y2); ++ * ... ++ * rtnl_unlock(); ++ * free_netdev(y1); ++ * free_netdev(y2); ++ * ++ * We are invoked by rtnl_unlock(). ++ * This allows us to deal with problems: ++ * 1) We can delete sysfs objects which invoke hotplug ++ * without deadlocking with linkwatch via keventd. ++ * 2) Since we run with the RTNL semaphore not held, we can sleep ++ * safely in order to wait for the netdev refcnt to drop to zero. ++ * ++ * We must not return until all unregister events added during ++ * the interval the lock was held have been completed. 
++ */ ++void netdev_run_todo(void) ++{ ++ struct net_device *dev, *tmp; ++ struct list_head list; ++#ifdef CONFIG_LOCKDEP ++ struct list_head unlink_list; ++ ++ list_replace_init(&net_unlink_list, &unlink_list); ++ ++ while (!list_empty(&unlink_list)) { ++ struct net_device *dev = list_first_entry(&unlink_list, ++ struct net_device, ++ unlink_list); ++ list_del_init(&dev->unlink_list); ++ dev->nested_level = dev->lower_level - 1; ++ } ++#endif ++ ++ /* Snapshot list, allow later requests */ ++ list_replace_init(&net_todo_list, &list); ++ ++ __rtnl_unlock(); ++ ++ /* Wait for rcu callbacks to finish before next phase */ ++ if (!list_empty(&list)) ++ rcu_barrier(); ++ ++ list_for_each_entry_safe(dev, tmp, &list, todo_list) { ++ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { ++ netdev_WARN(dev, "run_todo but not unregistering\n"); ++ list_del(&dev->todo_list); ++ continue; ++ } ++ ++ write_lock(&dev_base_lock); ++ dev->reg_state = NETREG_UNREGISTERED; ++ write_unlock(&dev_base_lock); ++ linkwatch_forget_dev(dev); ++ } ++ ++ while (!list_empty(&list)) { ++ dev = netdev_wait_allrefs_any(&list); ++ list_del(&dev->todo_list); ++ ++ /* paranoia */ ++ BUG_ON(netdev_refcnt_read(dev) != 1); ++ BUG_ON(!list_empty(&dev->ptype_all)); ++ BUG_ON(!list_empty(&dev->ptype_specific)); ++ WARN_ON(rcu_access_pointer(dev->ip_ptr)); ++ WARN_ON(rcu_access_pointer(dev->ip6_ptr)); ++#if IS_ENABLED(CONFIG_DECNET) ++ WARN_ON(dev->dn_ptr); ++#endif ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++ if (dev->needs_free_netdev) ++ free_netdev(dev); ++ ++ if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) ++ wake_up(&netdev_unregistering_wq); ++ ++ /* Free network device */ ++ kobject_put(&dev->dev.kobj); ++ } ++} ++ ++/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has ++ * all the same fields in the same order as net_device_stats, with only ++ * the type differing, but rtnl_link_stats64 may have additional fields ++ * at the end for newer counters. ++ */ ++void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, ++ const struct net_device_stats *netdev_stats) ++{ ++#if BITS_PER_LONG == 64 ++ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); ++ memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + sizeof(*netdev_stats), 0, ++ sizeof(*stats64) - sizeof(*netdev_stats)); ++#else ++ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); ++ const unsigned long *src = (const unsigned long *)netdev_stats; ++ u64 *dst = (u64 *)stats64; ++ ++ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); ++ for (i = 0; i < n; i++) ++ dst[i] = src[i]; ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + n * sizeof(u64), 0, ++ sizeof(*stats64) - n * sizeof(u64)); ++#endif ++} ++EXPORT_SYMBOL(netdev_stats_to_stats64); ++ ++struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) ++{ ++ struct net_device_core_stats __percpu *p; ++ ++ p = alloc_percpu_gfp(struct net_device_core_stats, ++ GFP_ATOMIC | __GFP_NOWARN); ++ ++ if (p && cmpxchg(&dev->core_stats, NULL, p)) ++ free_percpu(p); ++ ++ /* This READ_ONCE() pairs with the cmpxchg() above */ ++ return READ_ONCE(dev->core_stats); ++} ++EXPORT_SYMBOL(netdev_core_stats_alloc); ++ ++/** ++ * dev_get_stats - get network device statistics ++ * @dev: device to get statistics from ++ * @storage: place to store stats ++ * ++ * Get network statistics from device. Return @storage. 
++ * The device driver may provide its own method by setting ++ * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; ++ * otherwise the internal statistics structure is used. ++ */ ++struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ const struct net_device_core_stats __percpu *p; ++ ++ if (ops->ndo_get_stats64) { ++ memset(storage, 0, sizeof(*storage)); ++ ops->ndo_get_stats64(dev, storage); ++ } else if (ops->ndo_get_stats) { ++ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); ++ } else { ++ netdev_stats_to_stats64(storage, &dev->stats); ++ } ++ ++ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ ++ p = READ_ONCE(dev->core_stats); ++ if (p) { ++ const struct net_device_core_stats *core_stats; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ core_stats = per_cpu_ptr(p, i); ++ storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); ++ storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); ++ storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); ++ storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); ++ } ++ } ++ return storage; ++} ++EXPORT_SYMBOL(dev_get_stats); ++ ++/** ++ * dev_fetch_sw_netstats - get per-cpu network device statistics ++ * @s: place to store stats ++ * @netstats: per-cpu network stats to read from ++ * ++ * Read per-cpu network statistics and populate the related fields in @s. ++ */ ++void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, ++ const struct pcpu_sw_netstats __percpu *netstats) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ u64 rx_packets, rx_bytes, tx_packets, tx_bytes; ++ const struct pcpu_sw_netstats *stats; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(netstats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ rx_packets = u64_stats_read(&stats->rx_packets); ++ rx_bytes = u64_stats_read(&stats->rx_bytes); ++ tx_packets = u64_stats_read(&stats->tx_packets); ++ tx_bytes = u64_stats_read(&stats->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ s->rx_packets += rx_packets; ++ s->rx_bytes += rx_bytes; ++ s->tx_packets += tx_packets; ++ s->tx_bytes += tx_bytes; ++ } ++} ++EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); ++ ++/** ++ * dev_get_tstats64 - ndo_get_stats64 implementation ++ * @dev: device to get statistics from ++ * @s: place to store stats ++ * ++ * Populate @s from dev->stats and dev->tstats. Can be used as ++ * ndo_get_stats64() callback. 
++ */ ++void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) ++{ ++ netdev_stats_to_stats64(s, &dev->stats); ++ dev_fetch_sw_netstats(s, dev->tstats); ++} ++EXPORT_SYMBOL_GPL(dev_get_tstats64); ++ ++struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) ++{ ++ struct netdev_queue *queue = dev_ingress_queue(dev); ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (queue) ++ return queue; ++ queue = kzalloc(sizeof(*queue), GFP_KERNEL); ++ if (!queue) ++ return NULL; ++ netdev_init_one_queue(dev, queue, NULL); ++ RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); ++ queue->qdisc_sleeping = &noop_qdisc; ++ rcu_assign_pointer(dev->ingress_queue, queue); ++#endif ++ return queue; ++} ++ ++static const struct ethtool_ops default_ethtool_ops; ++ ++void netdev_set_default_ethtool_ops(struct net_device *dev, ++ const struct ethtool_ops *ops) ++{ ++ if (dev->ethtool_ops == &default_ethtool_ops) ++ dev->ethtool_ops = ops; ++} ++EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); ++ ++void netdev_freemem(struct net_device *dev) ++{ ++ char *addr = (char *)dev - dev->padded; ++ ++ kvfree(addr); ++} ++ ++/** ++ * alloc_netdev_mqs - allocate network device ++ * @sizeof_priv: size of private data to allocate space for ++ * @name: device name format string ++ * @name_assign_type: origin of device name ++ * @setup: callback to initialize device ++ * @txqs: the number of TX subqueues to allocate ++ * @rxqs: the number of RX subqueues to allocate ++ * ++ * Allocates a struct net_device with private data area for driver use ++ * and performs basic initialization. Also allocates subqueue structs ++ * for each queue on the device. ++ */ ++struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, ++ unsigned char name_assign_type, ++ void (*setup)(struct net_device *), ++ unsigned int txqs, unsigned int rxqs) ++{ ++ struct net_device *dev; ++ unsigned int alloc_size; ++ struct net_device *p; ++ ++ BUG_ON(strlen(name) >= sizeof(dev->name)); ++ ++ if (txqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); ++ return NULL; ++ } ++ ++ if (rxqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); ++ return NULL; ++ } ++ ++ alloc_size = sizeof(struct net_device); ++ if (sizeof_priv) { ++ /* ensure 32-byte alignment of private area */ ++ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); ++ alloc_size += sizeof_priv; ++ } ++ /* ensure 32-byte alignment of whole construct */ ++ alloc_size += NETDEV_ALIGN - 1; ++ ++ p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!p) ++ return NULL; ++ ++ dev = PTR_ALIGN(p, NETDEV_ALIGN); ++ dev->padded = (char *)dev - (char *)p; ++ ++ ref_tracker_dir_init(&dev->refcnt_tracker, 128); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ dev->pcpu_refcnt = alloc_percpu(int); ++ if (!dev->pcpu_refcnt) ++ goto free_dev; ++ __dev_hold(dev); ++#else ++ refcount_set(&dev->dev_refcnt, 1); ++#endif ++ ++ if (dev_addr_init(dev)) ++ goto free_pcpu; ++ ++ dev_mc_init(dev); ++ dev_uc_init(dev); ++ ++ dev_net_set(dev, &init_net); ++ ++ dev->gso_max_size = GSO_LEGACY_MAX_SIZE; ++ dev->gso_max_segs = GSO_MAX_SEGS; ++ dev->gro_max_size = GRO_LEGACY_MAX_SIZE; ++ dev->tso_max_size = TSO_LEGACY_MAX_SIZE; ++ dev->tso_max_segs = TSO_MAX_SEGS; ++ dev->upper_level = 1; ++ dev->lower_level = 1; ++#ifdef CONFIG_LOCKDEP ++ dev->nested_level = 0; ++ INIT_LIST_HEAD(&dev->unlink_list); ++#endif ++ ++ INIT_LIST_HEAD(&dev->napi_list); ++ INIT_LIST_HEAD(&dev->unreg_list); ++ INIT_LIST_HEAD(&dev->close_list); ++ 
INIT_LIST_HEAD(&dev->link_watch_list); ++ INIT_LIST_HEAD(&dev->adj_list.upper); ++ INIT_LIST_HEAD(&dev->adj_list.lower); ++ INIT_LIST_HEAD(&dev->ptype_all); ++ INIT_LIST_HEAD(&dev->ptype_specific); ++ INIT_LIST_HEAD(&dev->net_notifier_list); ++#ifdef CONFIG_NET_SCHED ++ hash_init(dev->qdisc_hash); ++#endif ++ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; ++ setup(dev); ++ ++ if (!dev->tx_queue_len) { ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; ++ } ++ ++ dev->num_tx_queues = txqs; ++ dev->real_num_tx_queues = txqs; ++ if (netif_alloc_netdev_queues(dev)) ++ goto free_all; ++ ++ dev->num_rx_queues = rxqs; ++ dev->real_num_rx_queues = rxqs; ++ if (netif_alloc_rx_queues(dev)) ++ goto free_all; ++ ++ strcpy(dev->name, name); ++ dev->name_assign_type = name_assign_type; ++ dev->group = INIT_NETDEV_GROUP; ++ if (!dev->ethtool_ops) ++ dev->ethtool_ops = &default_ethtool_ops; ++ ++ nf_hook_netdev_init(dev); ++ ++ return dev; ++ ++free_all: ++ free_netdev(dev); ++ return NULL; ++ ++free_pcpu: ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++free_dev: ++#endif ++ netdev_freemem(dev); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_netdev_mqs); ++ ++/** ++ * free_netdev - free network device ++ * @dev: device ++ * ++ * This function does the last stage of destroying an allocated device ++ * interface. The reference to the device object is released. If this ++ * is the last reference then it will be freed.Must be called in process ++ * context. ++ */ ++void free_netdev(struct net_device *dev) ++{ ++ struct napi_struct *p, *n; ++ ++ might_sleep(); ++ ++ /* When called immediately after register_netdevice() failed the unwind ++ * handling may still be dismantling the device. Handle that case by ++ * deferring the free. ++ */ ++ if (dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ dev->needs_free_netdev = true; ++ return; ++ } ++ ++ netif_free_tx_queues(dev); ++ netif_free_rx_queues(dev); ++ ++ kfree(rcu_dereference_protected(dev->ingress_queue, 1)); ++ ++ /* Flush device addresses */ ++ dev_addr_flush(dev); ++ ++ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) ++ netif_napi_del(p); ++ ++ ref_tracker_dir_exit(&dev->refcnt_tracker); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++ dev->pcpu_refcnt = NULL; ++#endif ++ free_percpu(dev->core_stats); ++ dev->core_stats = NULL; ++ free_percpu(dev->xdp_bulkq); ++ dev->xdp_bulkq = NULL; ++ ++ /* Compatibility with error handling in drivers */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ netdev_freemem(dev); ++ return; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_UNREGISTERED); ++ dev->reg_state = NETREG_RELEASED; ++ ++ /* will free via device release */ ++ put_device(&dev->dev); ++} ++EXPORT_SYMBOL(free_netdev); ++ ++/** ++ * synchronize_net - Synchronize with packet receive processing ++ * ++ * Wait for packets currently being received to be done. ++ * Does not block later packets from starting. ++ */ ++void synchronize_net(void) ++{ ++ might_sleep(); ++ if (rtnl_is_locked()) ++ synchronize_rcu_expedited(); ++ else ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(synchronize_net); ++ ++/** ++ * unregister_netdevice_queue - remove device from the kernel ++ * @dev: device ++ * @head: list ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * If head not NULL, device is queued to be unregistered later. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * unregister_netdev() instead of this. 
++ */ ++ ++void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) ++{ ++ ASSERT_RTNL(); ++ ++ if (head) { ++ list_move_tail(&dev->unreg_list, head); ++ } else { ++ LIST_HEAD(single); ++ ++ list_add(&dev->unreg_list, &single); ++ unregister_netdevice_many(&single); ++ } ++} ++EXPORT_SYMBOL(unregister_netdevice_queue); ++ ++/** ++ * unregister_netdevice_many - unregister many devices ++ * @head: list of devices ++ * ++ * Note: As most callers use a stack allocated list_head, ++ * we force a list_del() to make sure stack wont be corrupted later. ++ */ ++void unregister_netdevice_many(struct list_head *head) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(close_head); ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ if (list_empty(head)) ++ return; ++ ++ list_for_each_entry_safe(dev, tmp, head, unreg_list) { ++ /* Some devices call without registering ++ * for initialization unwind. Remove those ++ * devices and proceed with the remaining. ++ */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ pr_debug("unregister_netdevice: device %s/%p never was registered\n", ++ dev->name, dev); ++ ++ WARN_ON(1); ++ list_del(&dev->unreg_list); ++ continue; ++ } ++ dev->dismantle = true; ++ BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ } ++ ++ /* If device is running, close it first. */ ++ list_for_each_entry(dev, head, unreg_list) ++ list_add_tail(&dev->close_list, &close_head); ++ dev_close_many(&close_head, true); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ /* And unlink it from device chain. */ ++ write_lock(&dev_base_lock); ++ unlist_netdevice(dev, false); ++ dev->reg_state = NETREG_UNREGISTERING; ++ write_unlock(&dev_base_lock); ++ } ++ flush_all_backlogs(); ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ struct sk_buff *skb = NULL; ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ dev_xdp_uninstall(dev); ++ ++ netdev_offload_xstats_disable_all(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, ++ GFP_KERNEL, NULL, 0); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ netdev_name_node_alt_flush(dev); ++ netdev_name_node_free(dev->name_node); ++ ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ ++ if (skb) ++ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); ++ ++ /* Notifier chain MUST detach us all upper devices. */ ++ WARN_ON(netdev_has_any_upper_dev(dev)); ++ WARN_ON(netdev_has_any_lower_dev(dev)); ++ ++ /* Remove entries from kobject tree */ ++ netdev_unregister_kobject(dev); ++#ifdef CONFIG_XPS ++ /* Remove XPS queueing entries */ ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ } ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ netdev_put(dev, &dev->dev_registered_tracker); ++ net_set_todo(dev); ++ } ++ ++ list_del(head); ++} ++EXPORT_SYMBOL(unregister_netdevice_many); ++ ++/** ++ * unregister_netdev - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * ++ * This is just a wrapper for unregister_netdevice that takes ++ * the rtnl semaphore. In general you want to use this and not ++ * unregister_netdevice. 
++ */ ++void unregister_netdev(struct net_device *dev) ++{ ++ rtnl_lock(); ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(unregister_netdev); ++ ++/** ++ * __dev_change_net_namespace - move device to different nethost namespace ++ * @dev: device ++ * @net: network namespace ++ * @pat: If not NULL name pattern to try if the current device name ++ * is already taken in the destination network namespace. ++ * @new_ifindex: If not zero, specifies device index in the target ++ * namespace. ++ * ++ * This function shuts down a device interface and moves it ++ * to a new network namespace. On success 0 is returned, on ++ * a failure a netagive errno code is returned. ++ * ++ * Callers must hold the rtnl semaphore. ++ */ ++ ++int __dev_change_net_namespace(struct net_device *dev, struct net *net, ++ const char *pat, int new_ifindex) ++{ ++ struct net *net_old = dev_net(dev); ++ int err, new_nsid; ++ ++ ASSERT_RTNL(); ++ ++ /* Don't allow namespace local devices to be moved. */ ++ err = -EINVAL; ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ goto out; ++ ++ /* Ensure the device has been registrered */ ++ if (dev->reg_state != NETREG_REGISTERED) ++ goto out; ++ ++ /* Get out if there is nothing todo */ ++ err = 0; ++ if (net_eq(net_old, net)) ++ goto out; ++ ++ /* Pick the destination device name, and ensure ++ * we can use it in the destination network namespace. ++ */ ++ err = -EEXIST; ++ if (netdev_name_in_use(net, dev->name)) { ++ /* We get here if we can't use the current device name */ ++ if (!pat) ++ goto out; ++ err = dev_get_valid_name(net, dev, pat); ++ if (err < 0) ++ goto out; ++ } ++ ++ /* Check that new_ifindex isn't used yet. */ ++ err = -EBUSY; ++ if (new_ifindex && __dev_get_by_index(net, new_ifindex)) ++ goto out; ++ ++ /* ++ * And now a mini version of register_netdevice unregister_netdevice. ++ */ ++ ++ /* If device is running close it first. */ ++ dev_close(dev); ++ ++ /* And unlink it from device chain */ ++ unlist_netdevice(dev, true); ++ ++ synchronize_net(); ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ * ++ * Note that dev->reg_state stays at NETREG_REGISTERED. ++ * This is wanted because this way 8021q and macvlan know ++ * the device is just moving and can keep their slaves up. 
++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ rcu_barrier(); ++ ++ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); ++ /* If there is an ifindex conflict assign a new one */ ++ if (!new_ifindex) { ++ if (__dev_get_by_index(net, dev->ifindex)) ++ new_ifindex = dev_new_index(net); ++ else ++ new_ifindex = dev->ifindex; ++ } ++ ++ rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, ++ new_ifindex); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ /* Send a netdev-removed uevent to the old namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); ++ netdev_adjacent_del_links(dev); ++ ++ /* Move per-net netdevice notifiers that are following the netdevice */ ++ move_netdevice_notifiers_dev_net(dev, net); ++ ++ /* Actually switch the network namespace */ ++ dev_net_set(dev, net); ++ dev->ifindex = new_ifindex; ++ ++ /* Send a netdev-add uevent to the new namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); ++ netdev_adjacent_add_links(dev); ++ ++ /* Fixup kobjects */ ++ err = device_rename(&dev->dev, dev->name); ++ WARN_ON(err); ++ ++ /* Adapt owner in case owning user namespace of target network ++ * namespace is different from the original one. ++ */ ++ err = netdev_change_owner(dev, net_old, net); ++ WARN_ON(err); ++ ++ /* Add the device back in the hashes */ ++ list_netdevice(dev); ++ ++ /* Notify protocols, that a new device appeared. */ ++ call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++ synchronize_net(); ++ err = 0; ++out: ++ return err; ++} ++EXPORT_SYMBOL_GPL(__dev_change_net_namespace); ++ ++static int dev_cpu_dead(unsigned int oldcpu) ++{ ++ struct sk_buff **list_skb; ++ struct sk_buff *skb; ++ unsigned int cpu; ++ struct softnet_data *sd, *oldsd, *remsd = NULL; ++ ++ local_irq_disable(); ++ cpu = smp_processor_id(); ++ sd = &per_cpu(softnet_data, cpu); ++ oldsd = &per_cpu(softnet_data, oldcpu); ++ ++ /* Find end of our completion_queue. */ ++ list_skb = &sd->completion_queue; ++ while (*list_skb) ++ list_skb = &(*list_skb)->next; ++ /* Append completion queue from offline CPU. */ ++ *list_skb = oldsd->completion_queue; ++ oldsd->completion_queue = NULL; ++ ++ /* Append output queue from offline CPU. */ ++ if (oldsd->output_queue) { ++ *sd->output_queue_tailp = oldsd->output_queue; ++ sd->output_queue_tailp = oldsd->output_queue_tailp; ++ oldsd->output_queue = NULL; ++ oldsd->output_queue_tailp = &oldsd->output_queue; ++ } ++ /* Append NAPI poll list from offline CPU, with one exception : ++ * process_backlog() must be called by cpu owning percpu backlog. ++ * We properly handle process_queue & input_pkt_queue later. 
++ */ ++ while (!list_empty(&oldsd->poll_list)) { ++ struct napi_struct *napi = list_first_entry(&oldsd->poll_list, ++ struct napi_struct, ++ poll_list); ++ ++ list_del_init(&napi->poll_list); ++ if (napi->poll == process_backlog) ++ napi->state = 0; ++ else ++ ____napi_schedule(sd, napi); ++ } ++ ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_enable(); ++ ++#ifdef CONFIG_RPS ++ remsd = oldsd->rps_ipi_list; ++ oldsd->rps_ipi_list = NULL; ++#endif ++ /* send out pending IPI's on offline CPU */ ++ net_rps_send_ipi(remsd); ++ ++ /* Process offline CPU's input_pkt_queue */ ++ while ((skb = __skb_dequeue(&oldsd->process_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ ++ return 0; ++} ++ ++/** ++ * netdev_increment_features - increment feature set by one ++ * @all: current feature set ++ * @one: new feature set ++ * @mask: mask feature set ++ * ++ * Computes a new feature set after adding a device with feature set ++ * @one to the master device with current feature set @all. Will not ++ * enable anything that is off in @mask. Returns the new feature set. ++ */ ++netdev_features_t netdev_increment_features(netdev_features_t all, ++ netdev_features_t one, netdev_features_t mask) ++{ ++ if (mask & NETIF_F_HW_CSUM) ++ mask |= NETIF_F_CSUM_MASK; ++ mask |= NETIF_F_VLAN_CHALLENGED; ++ ++ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; ++ all &= one | ~NETIF_F_ALL_FOR_ALL; ++ ++ /* If one device supports hw checksumming, set for all. */ ++ if (all & NETIF_F_HW_CSUM) ++ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); ++ ++ return all; ++} ++EXPORT_SYMBOL(netdev_increment_features); ++ ++static struct hlist_head * __net_init netdev_create_hash(void) ++{ ++ int i; ++ struct hlist_head *hash; ++ ++ hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); ++ if (hash != NULL) ++ for (i = 0; i < NETDEV_HASHENTRIES; i++) ++ INIT_HLIST_HEAD(&hash[i]); ++ ++ return hash; ++} ++ ++/* Initialize per network namespace state */ ++static int __net_init netdev_init(struct net *net) ++{ ++ BUILD_BUG_ON(GRO_HASH_BUCKETS > ++ 8 * sizeof_field(struct napi_struct, gro_bitmask)); ++ ++ INIT_LIST_HEAD(&net->dev_base_head); ++ ++ net->dev_name_head = netdev_create_hash(); ++ if (net->dev_name_head == NULL) ++ goto err_name; ++ ++ net->dev_index_head = netdev_create_hash(); ++ if (net->dev_index_head == NULL) ++ goto err_idx; ++ ++ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); ++ ++ return 0; ++ ++err_idx: ++ kfree(net->dev_name_head); ++err_name: ++ return -ENOMEM; ++} ++ ++/** ++ * netdev_drivername - network driver for the device ++ * @dev: network device ++ * ++ * Determine network driver for device. 
++ */ ++const char *netdev_drivername(const struct net_device *dev) ++{ ++ const struct device_driver *driver; ++ const struct device *parent; ++ const char *empty = ""; ++ ++ parent = dev->dev.parent; ++ if (!parent) ++ return empty; ++ ++ driver = parent->driver; ++ if (driver && driver->name) ++ return driver->name; ++ return empty; ++} ++ ++static void __netdev_printk(const char *level, const struct net_device *dev, ++ struct va_format *vaf) ++{ ++ if (dev && dev->dev.parent) { ++ dev_printk_emit(level[1] - '0', ++ dev->dev.parent, ++ "%s %s %s%s: %pV", ++ dev_driver_string(dev->dev.parent), ++ dev_name(dev->dev.parent), ++ netdev_name(dev), netdev_reg_state(dev), ++ vaf); ++ } else if (dev) { ++ printk("%s%s%s: %pV", ++ level, netdev_name(dev), netdev_reg_state(dev), vaf); ++ } else { ++ printk("%s(NULL net_device): %pV", level, vaf); ++ } ++} ++ ++void netdev_printk(const char *level, const struct net_device *dev, ++ const char *format, ...) ++{ ++ struct va_format vaf; ++ va_list args; ++ ++ va_start(args, format); ++ ++ vaf.fmt = format; ++ vaf.va = &args; ++ ++ __netdev_printk(level, dev, &vaf); ++ ++ va_end(args); ++} ++EXPORT_SYMBOL(netdev_printk); ++ ++#define define_netdev_printk_level(func, level) \ ++void func(const struct net_device *dev, const char *fmt, ...) \ ++{ \ ++ struct va_format vaf; \ ++ va_list args; \ ++ \ ++ va_start(args, fmt); \ ++ \ ++ vaf.fmt = fmt; \ ++ vaf.va = &args; \ ++ \ ++ __netdev_printk(level, dev, &vaf); \ ++ \ ++ va_end(args); \ ++} \ ++EXPORT_SYMBOL(func); ++ ++define_netdev_printk_level(netdev_emerg, KERN_EMERG); ++define_netdev_printk_level(netdev_alert, KERN_ALERT); ++define_netdev_printk_level(netdev_crit, KERN_CRIT); ++define_netdev_printk_level(netdev_err, KERN_ERR); ++define_netdev_printk_level(netdev_warn, KERN_WARNING); ++define_netdev_printk_level(netdev_notice, KERN_NOTICE); ++define_netdev_printk_level(netdev_info, KERN_INFO); ++ ++static void __net_exit netdev_exit(struct net *net) ++{ ++ kfree(net->dev_name_head); ++ kfree(net->dev_index_head); ++ if (net != &init_net) ++ WARN_ON_ONCE(!list_empty(&net->dev_base_head)); ++} ++ ++static struct pernet_operations __net_initdata netdev_net_ops = { ++ .init = netdev_init, ++ .exit = netdev_exit, ++}; ++ ++static void __net_exit default_device_exit_net(struct net *net) ++{ ++ struct net_device *dev, *aux; ++ /* ++ * Push all migratable network devices back to the ++ * initial network namespace ++ */ ++ ASSERT_RTNL(); ++ for_each_netdev_safe(net, dev, aux) { ++ int err; ++ char fb_name[IFNAMSIZ]; ++ ++ /* Ignore unmoveable devices (i.e. loopback) */ ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ continue; ++ ++ /* Leave virtual devices for the generic cleanup */ ++ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) ++ continue; ++ ++ /* Push remaining network devices to init_net */ ++ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); ++ if (netdev_name_in_use(&init_net, fb_name)) ++ snprintf(fb_name, IFNAMSIZ, "dev%%d"); ++ err = dev_change_net_namespace(dev, &init_net, fb_name); ++ if (err) { ++ pr_emerg("%s: failed to move %s to init_net: %d\n", ++ __func__, dev->name, err); ++ BUG(); ++ } ++ } ++} ++ ++static void __net_exit default_device_exit_batch(struct list_head *net_list) ++{ ++ /* At exit all network devices most be removed from a network ++ * namespace. Do this in the reverse order of registration. ++ * Do this across as many network namespaces as possible to ++ * improve batching efficiency. 
++ */ ++ struct net_device *dev; ++ struct net *net; ++ LIST_HEAD(dev_kill_list); ++ ++ rtnl_lock(); ++ list_for_each_entry(net, net_list, exit_list) { ++ default_device_exit_net(net); ++ cond_resched(); ++ } ++ ++ list_for_each_entry(net, net_list, exit_list) { ++ for_each_netdev_reverse(net, dev) { ++ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) ++ dev->rtnl_link_ops->dellink(dev, &dev_kill_list); ++ else ++ unregister_netdevice_queue(dev, &dev_kill_list); ++ } ++ } ++ unregister_netdevice_many(&dev_kill_list); ++ rtnl_unlock(); ++} ++ ++static struct pernet_operations __net_initdata default_device_ops = { ++ .exit_batch = default_device_exit_batch, ++}; ++ ++/* ++ * Initialize the DEV module. At boot time this walks the device list and ++ * unhooks any devices that fail to initialise (normally hardware not ++ * present) and leaves us with a valid list of present and active devices. ++ * ++ */ ++ ++/* ++ * This is called single threaded during boot, so no need ++ * to take the rtnl semaphore. ++ */ ++static int __init net_dev_init(void) ++{ ++ int i, rc = -ENOMEM; ++ ++ BUG_ON(!dev_boot_phase); ++ ++ if (dev_proc_init()) ++ goto out; ++ ++ if (netdev_kobject_init()) ++ goto out; ++ ++ INIT_LIST_HEAD(&ptype_all); ++ for (i = 0; i < PTYPE_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ptype_base[i]); ++ ++ if (register_pernet_subsys(&netdev_net_ops)) ++ goto out; ++ ++ /* ++ * Initialise the packet receive queues. ++ */ ++ ++ for_each_possible_cpu(i) { ++ struct work_struct *flush = per_cpu_ptr(&flush_works, i); ++ struct softnet_data *sd = &per_cpu(softnet_data, i); ++ ++ INIT_WORK(flush, flush_backlog); ++ ++ skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init(&sd->process_queue); ++#ifdef CONFIG_XFRM_OFFLOAD ++ skb_queue_head_init(&sd->xfrm_backlog); ++#endif ++ INIT_LIST_HEAD(&sd->poll_list); ++ sd->output_queue_tailp = &sd->output_queue; ++#ifdef CONFIG_RPS ++ INIT_CSD(&sd->csd, rps_trigger_softirq, sd); ++ sd->cpu = i; ++#endif ++ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); ++ spin_lock_init(&sd->defer_lock); ++ ++ init_gro_hash(&sd->backlog); ++ sd->backlog.poll = process_backlog; ++ sd->backlog.weight = weight_p; ++ } ++ ++ dev_boot_phase = 0; ++ ++ /* The loopback device is special if any other network devices ++ * is present in a network namespace the loopback device must ++ * be present. Since we now dynamically allocate and free the ++ * loopback device ensure this invariant is maintained by ++ * keeping the loopback device as the first device on the ++ * list of network devices. Ensuring the loopback devices ++ * is the first device that appears and the last network device ++ * that disappears. 
++ */ ++ if (register_pernet_device(&loopback_net_ops)) ++ goto out; ++ ++ if (register_pernet_device(&default_device_ops)) ++ goto out; ++ ++ open_softirq(NET_TX_SOFTIRQ, net_tx_action); ++ open_softirq(NET_RX_SOFTIRQ, net_rx_action); ++ ++ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", ++ NULL, dev_cpu_dead); ++ WARN_ON(rc < 0); ++ rc = 0; ++out: ++ return rc; ++} ++ ++subsys_initcall(net_dev_init); +diff -rupN linux.orig/net/core/devlink.c linux/net/core/devlink.c +--- linux.orig/net/core/devlink.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/devlink.c 2022-12-04 10:40:26.732034003 -0500 +@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(stru cpu_stats = per_cpu_ptr(trap_stats, i); do { @@ -8713,11 +50605,10 @@ index b50bcc18b8d9e..cfa6a099457ae 100644 u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); -diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c -index 75501e1bdd25b..dfcaf61d972c7 100644 ---- a/net/core/drop_monitor.c -+++ b/net/core/drop_monitor.c -@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats) +diff -rupN linux.orig/net/core/drop_monitor.c linux/net/core/drop_monitor.c +--- linux.orig/net/core/drop_monitor.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/drop_monitor.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net u64 dropped; do { @@ -8729,7 +50620,7 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats) +@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct u64 dropped; do { @@ -8741,11 +50632,10 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c -index c8d137ef5980e..b71ccaec09914 100644 ---- a/net/core/gen_stats.c -+++ b/net/core/gen_stats.c -@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, +diff -rupN linux.orig/net/core/gen_stats.c linux/net/core/gen_stats.c +--- linux.orig/net/core/gen_stats.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/gen_stats.c 2022-12-04 10:40:26.732034003 -0500 +@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(str u64 bytes, packets; do { @@ -8758,7 +50648,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, +@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_st } do { if (running) @@ -8771,7 +50661,7 @@ index c8d137ef5980e..b71ccaec09914 100644 _bstats_update(bstats, bytes, packets); } -@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *r u64 bytes, packets; do { @@ -8784,7 +50674,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *r } do { if (running) @@ -8797,11 +50687,10 @@ index c8d137ef5980e..b71ccaec09914 100644 } static int -diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 417463da4fac7..505c72a9b1534 100644 ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -6555,6 +6555,11 @@ nodefer: __kfree_skb(skb); +diff -rupN linux.orig/net/core/skbuff.c 
linux/net/core/skbuff.c +--- linux.orig/net/core/skbuff.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/skbuff.c 2022-12-04 10:40:26.732034003 -0500 +@@ -6557,6 +6557,11 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ @@ -8814,11 +50703,6576 @@ index 417463da4fac7..505c72a9b1534 100644 +#endif + } } -diff --git a/net/dsa/slave.c b/net/dsa/slave.c -index 1291c2431d440..dcc550b871623 100644 ---- a/net/dsa/slave.c -+++ b/net/dsa/slave.c -@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/net/core/skbuff.c.orig linux/net/core/skbuff.c.orig +--- linux.orig/net/core/skbuff.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/skbuff.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,6562 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Routines having to do with the 'struct sk_buff' memory handlers. ++ * ++ * Authors: Alan Cox ++ * Florian La Roche ++ * ++ * Fixes: ++ * Alan Cox : Fixed the worst of the load ++ * balancer bugs. ++ * Dave Platt : Interrupt stacking fix. ++ * Richard Kooijman : Timestamp fixes. ++ * Alan Cox : Changed buffer format. ++ * Alan Cox : destructor hook for AF_UNIX etc. ++ * Linus Torvalds : Better skb_clone. ++ * Alan Cox : Added skb_copy. ++ * Alan Cox : Added all the changed routines Linus ++ * only put in the headers ++ * Ray VanTassle : Fixed --skb->lock in free ++ * Alan Cox : skb_copy copy arp field ++ * Andi Kleen : slabified it. ++ * Robert Olsson : Removed skb_head_pool ++ * ++ * NOTE: ++ * The __skb_ routines should be called with interrupts ++ * disabled, or you better be *real* sure that the operation is atomic ++ * with respect to whatever list is being frobbed (e.g. via lock_sock() ++ * or via disabling bottom half handlers, etc). ++ */ ++ ++/* ++ * The functions in this file will not compile correctly with gcc 2.4.x ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_NET_CLS_ACT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "sock_destructor.h" ++ ++struct kmem_cache *skbuff_head_cache __ro_after_init; ++static struct kmem_cache *skbuff_fclone_cache __ro_after_init; ++#ifdef CONFIG_SKB_EXTENSIONS ++static struct kmem_cache *skbuff_ext_cache __ro_after_init; ++#endif ++int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; ++EXPORT_SYMBOL(sysctl_max_skb_frags); ++ ++#undef FN ++#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, ++const char * const drop_reasons[] = { ++ DEFINE_DROP_REASON(FN, FN) ++}; ++EXPORT_SYMBOL(drop_reasons); ++ ++/** ++ * skb_panic - private function for out-of-line support ++ * @skb: buffer ++ * @sz: size ++ * @addr: address ++ * @msg: skb_over_panic or skb_under_panic ++ * ++ * Out-of-line support for skb_put() and skb_push(). ++ * Called via the wrapper skb_over_panic() or skb_under_panic(). ++ * Keep out of line to prevent kernel bloat. ++ * __builtin_return_address is not used because it is not always reliable. 
++ */ ++static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, ++ const char msg[]) ++{ ++ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", ++ msg, addr, skb->len, sz, skb->head, skb->data, ++ (unsigned long)skb->tail, (unsigned long)skb->end, ++ skb->dev ? skb->dev->name : ""); ++ BUG(); ++} ++ ++static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++#define NAPI_SKB_CACHE_SIZE 64 ++#define NAPI_SKB_CACHE_BULK 16 ++#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) ++ ++struct napi_alloc_cache { ++ struct page_frag_cache page; ++ unsigned int skb_count; ++ void *skb_cache[NAPI_SKB_CACHE_SIZE]; ++}; ++ ++static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); ++static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); ++ ++void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ ++ return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++} ++EXPORT_SYMBOL(__napi_alloc_frag_align); ++ ++void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ void *data; ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ if (in_hardirq() || irqs_disabled()) { ++ struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); ++ ++ data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); ++ } else { ++ struct napi_alloc_cache *nc; ++ ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++ local_bh_enable(); ++ } ++ return data; ++} ++EXPORT_SYMBOL(__netdev_alloc_frag_align); ++ ++static struct sk_buff *napi_skb_cache_get(void) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ struct sk_buff *skb; ++ ++ if (unlikely(!nc->skb_count)) { ++ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, ++ GFP_ATOMIC, ++ NAPI_SKB_CACHE_BULK, ++ nc->skb_cache); ++ if (unlikely(!nc->skb_count)) ++ return NULL; ++ } ++ ++ skb = nc->skb_cache[--nc->skb_count]; ++ kasan_unpoison_object_data(skbuff_head_cache, skb); ++ ++ return skb; ++} ++ ++/* Caller must provide SKB that is memset cleared */ ++static void __build_skb_around(struct sk_buff *skb, void *data, ++ unsigned int frag_size) ++{ ++ struct skb_shared_info *shinfo; ++ unsigned int size = frag_size ? : ksize(data); ++ ++ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ /* Assumes caller memset cleared SKB */ ++ skb->truesize = SKB_TRUESIZE(size); ++ refcount_set(&skb->users, 1); ++ skb->head = data; ++ skb->data = data; ++ skb_reset_tail_pointer(skb); ++ skb_set_end_offset(skb, size); ++ skb->mac_header = (typeof(skb->mac_header))~0U; ++ skb->transport_header = (typeof(skb->transport_header))~0U; ++ skb->alloc_cpu = raw_smp_processor_id(); ++ /* make sure we initialize shinfo sequentially */ ++ shinfo = skb_shinfo(skb); ++ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); ++ atomic_set(&shinfo->dataref, 1); ++ ++ skb_set_kcov_handle(skb, kcov_common_handle()); ++} ++ ++/** ++ * __build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Allocate a new &sk_buff. 
Caller provides space holding head and ++ * skb_shared_info. @data must have been allocated by kmalloc() only if ++ * @frag_size is 0, otherwise data should come from the page allocator ++ * or vmalloc() ++ * The return is the new skb buffer. ++ * On a failure the return is %NULL, and @data is not freed. ++ * Notes : ++ * Before IO, driver allocates only data buffer where NIC put incoming frame ++ * Driver should add room at head (NET_SKB_PAD) and ++ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) ++ * After IO, driver calls build_skb(), to allocate sk_buff and populate it ++ * before giving packet to stack. ++ * RX rings only contains data buffers, not full skbs. ++ */ ++struct sk_buff *__build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/* build_skb() is wrapper over __build_skb(), that specifically ++ * takes care of skb->head and skb->pfmemalloc ++ * This means that if @frag_size is not zero, then @data must be backed ++ * by a page fragment, not kmalloc() or vmalloc() ++ */ ++struct sk_buff *build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __build_skb(data, frag_size); ++ ++ if (skb && frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb); ++ ++/** ++ * build_skb_around - build a network buffer around provided skb ++ * @skb: sk_buff provide by caller, must be memset cleared ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ */ ++struct sk_buff *build_skb_around(struct sk_buff *skb, ++ void *data, unsigned int frag_size) ++{ ++ if (unlikely(!skb)) ++ return NULL; ++ ++ __build_skb_around(skb, data, frag_size); ++ ++ if (frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb_around); ++ ++/** ++ * __napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __build_skb() that uses NAPI percpu caches to obtain ++ * skbuff_head instead of inplace allocation. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. ++ */ ++static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = napi_skb_cache_get(); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/** ++ * napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __napi_build_skb() that takes care of skb->head_frag ++ * and skb->pfmemalloc when the data is a page or page fragment. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. 
++ */ ++struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __napi_build_skb(data, frag_size); ++ ++ if (likely(skb) && frag_size) { ++ skb->head_frag = 1; ++ skb_propagate_pfmemalloc(virt_to_head_page(data), skb); ++ } ++ ++ return skb; ++} ++EXPORT_SYMBOL(napi_build_skb); ++ ++/* ++ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells ++ * the caller if emergency pfmemalloc reserves are being used. If it is and ++ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves ++ * may be used. Otherwise, the packet data may be discarded until enough ++ * memory is free ++ */ ++static void *kmalloc_reserve(size_t size, gfp_t flags, int node, ++ bool *pfmemalloc) ++{ ++ void *obj; ++ bool ret_pfmemalloc = false; ++ ++ /* ++ * Try a regular allocation, when that fails and we're not entitled ++ * to the reserves, fail. ++ */ ++ obj = kmalloc_node_track_caller(size, ++ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, ++ node); ++ if (obj || !(gfp_pfmemalloc_allowed(flags))) ++ goto out; ++ ++ /* Try again but now we are using pfmemalloc reserves */ ++ ret_pfmemalloc = true; ++ obj = kmalloc_node_track_caller(size, flags, node); ++ ++out: ++ if (pfmemalloc) ++ *pfmemalloc = ret_pfmemalloc; ++ ++ return obj; ++} ++ ++/* Allocate a new skbuff. We do this ourselves so we can fill in a few ++ * 'private' fields and also do memory statistics to find all the ++ * [BEEP] leaks. ++ * ++ */ ++ ++/** ++ * __alloc_skb - allocate a network buffer ++ * @size: size to allocate ++ * @gfp_mask: allocation mask ++ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache ++ * instead of head cache and allocate a cloned (child) skb. ++ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for ++ * allocations in case the data is required for writeback ++ * @node: numa node to allocate memory on ++ * ++ * Allocate a new &sk_buff. The returned buffer has no headroom and a ++ * tail room of at least size bytes. The object has a reference count ++ * of one. The return is the buffer. On a failure the return is %NULL. ++ * ++ * Buffers may only be allocated from interrupts using a @gfp_mask of ++ * %GFP_ATOMIC. ++ */ ++struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, ++ int flags, int node) ++{ ++ struct kmem_cache *cache; ++ struct sk_buff *skb; ++ unsigned int osize; ++ bool pfmemalloc; ++ u8 *data; ++ ++ cache = (flags & SKB_ALLOC_FCLONE) ++ ? skbuff_fclone_cache : skbuff_head_cache; ++ ++ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ /* Get the HEAD */ ++ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && ++ likely(node == NUMA_NO_NODE || node == numa_mem_id())) ++ skb = napi_skb_cache_get(); ++ else ++ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); ++ if (unlikely(!skb)) ++ return NULL; ++ prefetchw(skb); ++ ++ /* We do our best to align skb_shared_info on a separate cache ++ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives ++ * aligned memory blocks, unless SLUB/SLAB debug is enabled. ++ * Both skb->head and skb_shared_info are cache line aligned. ++ */ ++ size = SKB_DATA_ALIGN(size); ++ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ++ if (unlikely(!data)) ++ goto nodata; ++ /* kmalloc(size) might give us more room than requested. ++ * Put skb_shared_info exactly at the end of allocated zone, ++ * to allow max possible filling before reallocation. 
++ */ ++ osize = ksize(data); ++ size = SKB_WITH_OVERHEAD(osize); ++ prefetchw(data + size); ++ ++ /* ++ * Only clear those fields we need to clear, not those that we will ++ * actually initialise below. Hence, don't put any more fields after ++ * the tail pointer in struct sk_buff! ++ */ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, osize); ++ skb->pfmemalloc = pfmemalloc; ++ ++ if (flags & SKB_ALLOC_FCLONE) { ++ struct sk_buff_fclones *fclones; ++ ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ skb->fclone = SKB_FCLONE_ORIG; ++ refcount_set(&fclones->fclone_ref, 1); ++ } ++ ++ return skb; ++ ++nodata: ++ kmem_cache_free(cache, skb); ++ return NULL; ++} ++EXPORT_SYMBOL(__alloc_skb); ++ ++/** ++ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device ++ * @dev: network device to receive on ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb ++ * ++ * Allocate a new &sk_buff and assign it a usage count of one. The ++ * buffer has NET_SKB_PAD headroom built in. Users should allocate ++ * the headroom they think they need without accounting for the ++ * built in space. The built in space is used for optimisations. ++ * ++ * %NULL is returned if there is no free memory. ++ */ ++struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct page_frag_cache *nc; ++ struct sk_buff *skb; ++ bool pfmemalloc; ++ void *data; ++ ++ len += NET_SKB_PAD; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ if (in_hardirq() || irqs_disabled()) { ++ nc = this_cpu_ptr(&netdev_alloc_cache); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ } else { ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache.page); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ local_bh_enable(); ++ } ++ ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD); ++ skb->dev = dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__netdev_alloc_skb); ++ ++/** ++ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance ++ * @napi: napi instance this buffer was allocated for ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages ++ * ++ * Allocate a new sk_buff for use in NAPI receive. This buffer will ++ * attempt to allocate the head from a special reserved region used ++ * only for NAPI Rx allocation. By doing this we can save several ++ * CPU cycles by avoiding having to disable and re-enable IRQs. ++ * ++ * %NULL is returned if there is no free memory. 
++ */ ++struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct napi_alloc_cache *nc; ++ struct sk_buff *skb; ++ void *data; ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ len += NET_SKB_PAD + NET_IP_ALIGN; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, ++ NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ data = page_frag_alloc(&nc->page, len, gfp_mask); ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __napi_build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (nc->page.pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb->dev = napi->dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__napi_alloc_skb); ++ ++void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, ++ int size, unsigned int truesize) ++{ ++ skb_fill_page_desc(skb, i, page, off, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_add_rx_frag); ++ ++void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, ++ unsigned int truesize) ++{ ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ skb_frag_size_add(frag, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_coalesce_rx_frag); ++ ++static void skb_drop_list(struct sk_buff **listp) ++{ ++ kfree_skb_list(*listp); ++ *listp = NULL; ++} ++ ++static inline void skb_drop_fraglist(struct sk_buff *skb) ++{ ++ skb_drop_list(&skb_shinfo(skb)->frag_list); ++} ++ ++static void skb_clone_fraglist(struct sk_buff *skb) ++{ ++ struct sk_buff *list; ++ ++ skb_walk_frags(skb, list) ++ skb_get(list); ++} ++ ++static void skb_free_head(struct sk_buff *skb) ++{ ++ unsigned char *head = skb->head; ++ ++ if (skb->head_frag) { ++ if (skb_pp_recycle(skb, head)) ++ return; ++ skb_free_frag(head); ++ } else { ++ kfree(head); ++ } ++} ++ ++static void skb_release_data(struct sk_buff *skb) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int i; ++ ++ if (skb->cloned && ++ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, ++ &shinfo->dataref)) ++ goto exit; ++ ++ if (skb_zcopy(skb)) { ++ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; ++ ++ skb_zcopy_clear(skb, true); ++ if (skip_unref) ++ goto free_head; ++ } ++ ++ for (i = 0; i < shinfo->nr_frags; i++) ++ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); ++ ++free_head: ++ if (shinfo->frag_list) ++ kfree_skb_list(shinfo->frag_list); ++ ++ skb_free_head(skb); ++exit: ++ /* When we clone an SKB we copy the reycling bit. The pp_recycle ++ * bit is only set on the head though, so in order to avoid races ++ * while trying to recycle fragments on __skb_frag_unref() we need ++ * to make one SKB responsible for triggering the recycle path. ++ * So disable the recycling bit if an SKB is cloned and we have ++ * additional references to the fragmented part of the SKB. 
++ * Eventually the last SKB will have the recycling bit set and it's ++ * dataref set to 0, which will trigger the recycling ++ */ ++ skb->pp_recycle = 0; ++} ++ ++/* ++ * Free an skbuff by memory without cleaning the state. ++ */ ++static void kfree_skbmem(struct sk_buff *skb) ++{ ++ struct sk_buff_fclones *fclones; ++ ++ switch (skb->fclone) { ++ case SKB_FCLONE_UNAVAILABLE: ++ kmem_cache_free(skbuff_head_cache, skb); ++ return; ++ ++ case SKB_FCLONE_ORIG: ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ /* We usually free the clone (TX completion) before original skb ++ * This test would have no chance to be true for the clone, ++ * while here, branch prediction will be good. ++ */ ++ if (refcount_read(&fclones->fclone_ref) == 1) ++ goto fastpath; ++ break; ++ ++ default: /* SKB_FCLONE_CLONE */ ++ fclones = container_of(skb, struct sk_buff_fclones, skb2); ++ break; ++ } ++ if (!refcount_dec_and_test(&fclones->fclone_ref)) ++ return; ++fastpath: ++ kmem_cache_free(skbuff_fclone_cache, fclones); ++} ++ ++void skb_release_head_state(struct sk_buff *skb) ++{ ++ skb_dst_drop(skb); ++ if (skb->destructor) { ++ DEBUG_NET_WARN_ON_ONCE(in_hardirq()); ++ skb->destructor(skb); ++ } ++#if IS_ENABLED(CONFIG_NF_CONNTRACK) ++ nf_conntrack_put(skb_nfct(skb)); ++#endif ++ skb_ext_put(skb); ++} ++ ++/* Free everything but the sk_buff shell. */ ++static void skb_release_all(struct sk_buff *skb) ++{ ++ skb_release_head_state(skb); ++ if (likely(skb->head)) ++ skb_release_data(skb); ++} ++ ++/** ++ * __kfree_skb - private function ++ * @skb: buffer ++ * ++ * Free an sk_buff. Release anything attached to the buffer. ++ * Clean the state. This is an internal helper function. Users should ++ * always call kfree_skb ++ */ ++ ++void __kfree_skb(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ kfree_skbmem(skb); ++} ++EXPORT_SYMBOL(__kfree_skb); ++ ++/** ++ * kfree_skb_reason - free an sk_buff with special reason ++ * @skb: buffer to free ++ * @reason: reason why this skb is dropped ++ * ++ * Drop a reference to the buffer and free it if the usage count has ++ * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' ++ * tracepoint. ++ */ ++void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); ++ ++ trace_kfree_skb(skb, __builtin_return_address(0), reason); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(kfree_skb_reason); ++ ++void kfree_skb_list_reason(struct sk_buff *segs, ++ enum skb_drop_reason reason) ++{ ++ while (segs) { ++ struct sk_buff *next = segs->next; ++ ++ kfree_skb_reason(segs, reason); ++ segs = next; ++ } ++} ++EXPORT_SYMBOL(kfree_skb_list_reason); ++ ++/* Dump skb information and contents. ++ * ++ * Must only be called from net_ratelimit()-ed paths. ++ * ++ * Dumps whole packets if full_pkt, only headers otherwise. 
++ */ ++void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) ++{ ++ struct skb_shared_info *sh = skb_shinfo(skb); ++ struct net_device *dev = skb->dev; ++ struct sock *sk = skb->sk; ++ struct sk_buff *list_skb; ++ bool has_mac, has_trans; ++ int headroom, tailroom; ++ int i, len, seg_len; ++ ++ if (full_pkt) ++ len = skb->len; ++ else ++ len = min_t(int, skb->len, MAX_HEADER + 128); ++ ++ headroom = skb_headroom(skb); ++ tailroom = skb_tailroom(skb); ++ ++ has_mac = skb_mac_header_was_set(skb); ++ has_trans = skb_transport_header_was_set(skb); ++ ++ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" ++ "mac=(%d,%d) net=(%d,%d) trans=%d\n" ++ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" ++ "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" ++ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", ++ level, skb->len, headroom, skb_headlen(skb), tailroom, ++ has_mac ? skb->mac_header : -1, ++ has_mac ? skb_mac_header_len(skb) : -1, ++ skb->network_header, ++ has_trans ? skb_network_header_len(skb) : -1, ++ has_trans ? skb->transport_header : -1, ++ sh->tx_flags, sh->nr_frags, ++ sh->gso_size, sh->gso_type, sh->gso_segs, ++ skb->csum, skb->ip_summed, skb->csum_complete_sw, ++ skb->csum_valid, skb->csum_level, ++ skb->hash, skb->sw_hash, skb->l4_hash, ++ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); ++ ++ if (dev) ++ printk("%sdev name=%s feat=%pNF\n", ++ level, dev->name, &dev->features); ++ if (sk) ++ printk("%ssk family=%hu type=%u proto=%u\n", ++ level, sk->sk_family, sk->sk_type, sk->sk_protocol); ++ ++ if (full_pkt && headroom) ++ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->head, headroom, false); ++ ++ seg_len = min_t(int, skb_headlen(skb), len); ++ if (seg_len) ++ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->data, seg_len, false); ++ len -= seg_len; ++ ++ if (full_pkt && tailroom) ++ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb_tail_pointer(skb), tailroom, false); ++ ++ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(frag, skb_frag_off(frag), ++ skb_frag_size(frag), p, p_off, p_len, ++ copied) { ++ seg_len = min_t(int, p_len, len); ++ vaddr = kmap_atomic(p); ++ print_hex_dump(level, "skb frag: ", ++ DUMP_PREFIX_OFFSET, ++ 16, 1, vaddr + p_off, seg_len, false); ++ kunmap_atomic(vaddr); ++ len -= seg_len; ++ if (!len) ++ break; ++ } ++ } ++ ++ if (full_pkt && skb_has_frag_list(skb)) { ++ printk("skb fraglist:\n"); ++ skb_walk_frags(skb, list_skb) ++ skb_dump(level, list_skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_dump); ++ ++/** ++ * skb_tx_error - report an sk_buff xmit error ++ * @skb: buffer that triggered an error ++ * ++ * Report xmit error if a device callback is tracking this skb. ++ * skb must be freed afterwards. 
++ */ ++void skb_tx_error(struct sk_buff *skb) ++{ ++ if (skb) { ++ skb_zcopy_downgrade_managed(skb); ++ skb_zcopy_clear(skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_tx_error); ++ ++#ifdef CONFIG_TRACEPOINTS ++/** ++ * consume_skb - free an skbuff ++ * @skb: buffer to free ++ * ++ * Drop a ref to the buffer and free it if the usage count has hit zero ++ * Functions identically to kfree_skb, but kfree_skb assumes that the frame ++ * is being dropped after a failure and notes that ++ */ ++void consume_skb(struct sk_buff *skb) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ trace_consume_skb(skb); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(consume_skb); ++#endif ++ ++/** ++ * __consume_stateless_skb - free an skbuff, assuming it is stateless ++ * @skb: buffer to free ++ * ++ * Alike consume_skb(), but this variant assumes that this is the last ++ * skb reference and all the head states have been already dropped ++ */ ++void __consume_stateless_skb(struct sk_buff *skb) ++{ ++ trace_consume_skb(skb); ++ skb_release_data(skb); ++ kfree_skbmem(skb); ++} ++ ++static void napi_skb_cache_put(struct sk_buff *skb) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ u32 i; ++ ++ kasan_poison_object_data(skbuff_head_cache, skb); ++ nc->skb_cache[nc->skb_count++] = skb; ++ ++ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { ++ for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) ++ kasan_unpoison_object_data(skbuff_head_cache, ++ nc->skb_cache[i]); ++ ++ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, ++ nc->skb_cache + NAPI_SKB_CACHE_HALF); ++ nc->skb_count = NAPI_SKB_CACHE_HALF; ++ } ++} ++ ++void __kfree_skb_defer(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++ ++void napi_skb_free_stolen_head(struct sk_buff *skb) ++{ ++ if (unlikely(skb->slow_gro)) { ++ nf_reset_ct(skb); ++ skb_dst_drop(skb); ++ skb_ext_put(skb); ++ skb_orphan(skb); ++ skb->slow_gro = 0; ++ } ++ napi_skb_cache_put(skb); ++} ++ ++void napi_consume_skb(struct sk_buff *skb, int budget) ++{ ++ /* Zero budget indicate non-NAPI context called us, like netpoll */ ++ if (unlikely(!budget)) { ++ dev_consume_skb_any(skb); ++ return; ++ } ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ ++ if (!skb_unref(skb)) ++ return; ++ ++ /* if reaching here SKB is ready to free */ ++ trace_consume_skb(skb); ++ ++ /* if SKB is a clone, don't handle this case */ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { ++ __kfree_skb(skb); ++ return; ++ } ++ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++EXPORT_SYMBOL(napi_consume_skb); ++ ++/* Make sure a field is contained by headers group */ ++#define CHECK_SKB_FIELD(field) \ ++ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ ++ offsetof(struct sk_buff, headers.field)); \ ++ ++static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ new->tstamp = old->tstamp; ++ /* We do not copy old->sk */ ++ new->dev = old->dev; ++ memcpy(new->cb, old->cb, sizeof(old->cb)); ++ skb_dst_copy(new, old); ++ __skb_ext_copy(new, old); ++ __nf_copy(new, old, false); ++ ++ /* Note : this field could be in the headers group. 
++ * It is not yet because we do not want to have a 16 bit hole ++ */ ++ new->queue_mapping = old->queue_mapping; ++ ++ memcpy(&new->headers, &old->headers, sizeof(new->headers)); ++ CHECK_SKB_FIELD(protocol); ++ CHECK_SKB_FIELD(csum); ++ CHECK_SKB_FIELD(hash); ++ CHECK_SKB_FIELD(priority); ++ CHECK_SKB_FIELD(skb_iif); ++ CHECK_SKB_FIELD(vlan_proto); ++ CHECK_SKB_FIELD(vlan_tci); ++ CHECK_SKB_FIELD(transport_header); ++ CHECK_SKB_FIELD(network_header); ++ CHECK_SKB_FIELD(mac_header); ++ CHECK_SKB_FIELD(inner_protocol); ++ CHECK_SKB_FIELD(inner_transport_header); ++ CHECK_SKB_FIELD(inner_network_header); ++ CHECK_SKB_FIELD(inner_mac_header); ++ CHECK_SKB_FIELD(mark); ++#ifdef CONFIG_NETWORK_SECMARK ++ CHECK_SKB_FIELD(secmark); ++#endif ++#ifdef CONFIG_NET_RX_BUSY_POLL ++ CHECK_SKB_FIELD(napi_id); ++#endif ++ CHECK_SKB_FIELD(alloc_cpu); ++#ifdef CONFIG_XPS ++ CHECK_SKB_FIELD(sender_cpu); ++#endif ++#ifdef CONFIG_NET_SCHED ++ CHECK_SKB_FIELD(tc_index); ++#endif ++ ++} ++ ++/* ++ * You should not add any new code to this function. Add it to ++ * __copy_skb_header above instead. ++ */ ++static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) ++{ ++#define C(x) n->x = skb->x ++ ++ n->next = n->prev = NULL; ++ n->sk = NULL; ++ __copy_skb_header(n, skb); ++ ++ C(len); ++ C(data_len); ++ C(mac_len); ++ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; ++ n->cloned = 1; ++ n->nohdr = 0; ++ n->peeked = 0; ++ C(pfmemalloc); ++ C(pp_recycle); ++ n->destructor = NULL; ++ C(tail); ++ C(end); ++ C(head); ++ C(head_frag); ++ C(data); ++ C(truesize); ++ refcount_set(&n->users, 1); ++ ++ atomic_inc(&(skb_shinfo(skb)->dataref)); ++ skb->cloned = 1; ++ ++ return n; ++#undef C ++} ++ ++/** ++ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg ++ * @first: first sk_buff of the msg ++ */ ++struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) ++{ ++ struct sk_buff *n; ++ ++ n = alloc_skb(0, GFP_ATOMIC); ++ if (!n) ++ return NULL; ++ ++ n->len = first->len; ++ n->data_len = first->len; ++ n->truesize = first->truesize; ++ ++ skb_shinfo(n)->frag_list = first; ++ ++ __copy_skb_header(n, first); ++ n->destructor = NULL; ++ ++ return n; ++} ++EXPORT_SYMBOL_GPL(alloc_skb_for_msg); ++ ++/** ++ * skb_morph - morph one skb into another ++ * @dst: the skb to receive the contents ++ * @src: the skb to supply the contents ++ * ++ * This is identical to skb_clone except that the target skb is ++ * supplied by the user. ++ * ++ * The target skb is returned upon exit. ++ */ ++struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) ++{ ++ skb_release_all(dst); ++ return __skb_clone(dst, src); ++} ++EXPORT_SYMBOL_GPL(skb_morph); ++ ++int mm_account_pinned_pages(struct mmpin *mmp, size_t size) ++{ ++ unsigned long max_pg, num_pg, new_pg, old_pg; ++ struct user_struct *user; ++ ++ if (capable(CAP_IPC_LOCK) || !size) ++ return 0; ++ ++ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ ++ max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; ++ user = mmp->user ? 
: current_user(); ++ ++ do { ++ old_pg = atomic_long_read(&user->locked_vm); ++ new_pg = old_pg + num_pg; ++ if (new_pg > max_pg) ++ return -ENOBUFS; ++ } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != ++ old_pg); ++ ++ if (!mmp->user) { ++ mmp->user = get_uid(user); ++ mmp->num_pg = num_pg; ++ } else { ++ mmp->num_pg += num_pg; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(mm_account_pinned_pages); ++ ++void mm_unaccount_pinned_pages(struct mmpin *mmp) ++{ ++ if (mmp->user) { ++ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); ++ free_uid(mmp->user); ++ } ++} ++EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); ++ ++static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) ++{ ++ struct ubuf_info *uarg; ++ struct sk_buff *skb; ++ ++ WARN_ON_ONCE(!in_task()); ++ ++ skb = sock_omalloc(sk, 0, GFP_KERNEL); ++ if (!skb) ++ return NULL; ++ ++ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); ++ uarg = (void *)skb->cb; ++ uarg->mmp.user = NULL; ++ ++ if (mm_account_pinned_pages(&uarg->mmp, size)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ uarg->callback = msg_zerocopy_callback; ++ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; ++ uarg->len = 1; ++ uarg->bytelen = size; ++ uarg->zerocopy = 1; ++ uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; ++ refcount_set(&uarg->refcnt, 1); ++ sock_hold(sk); ++ ++ return uarg; ++} ++ ++static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) ++{ ++ return container_of((void *)uarg, struct sk_buff, cb); ++} ++ ++struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, ++ struct ubuf_info *uarg) ++{ ++ if (uarg) { ++ const u32 byte_limit = 1 << 19; /* limit to a few TSO */ ++ u32 bytelen, next; ++ ++ /* there might be non MSG_ZEROCOPY users */ ++ if (uarg->callback != msg_zerocopy_callback) ++ return NULL; ++ ++ /* realloc only when socket is locked (TCP, UDP cork), ++ * so uarg->len and sk_zckey access is serialized ++ */ ++ if (!sock_owned_by_user(sk)) { ++ WARN_ON_ONCE(1); ++ return NULL; ++ } ++ ++ bytelen = uarg->bytelen + size; ++ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { ++ /* TCP can create new skb to attach new uarg */ ++ if (sk->sk_type == SOCK_STREAM) ++ goto new_alloc; ++ return NULL; ++ } ++ ++ next = (u32)atomic_read(&sk->sk_zckey); ++ if ((u32)(uarg->id + uarg->len) == next) { ++ if (mm_account_pinned_pages(&uarg->mmp, size)) ++ return NULL; ++ uarg->len++; ++ uarg->bytelen = bytelen; ++ atomic_set(&sk->sk_zckey, ++next); ++ ++ /* no extra ref when appending to datagram (MSG_MORE) */ ++ if (sk->sk_type == SOCK_STREAM) ++ net_zcopy_get(uarg); ++ ++ return uarg; ++ } ++ } ++ ++new_alloc: ++ return msg_zerocopy_alloc(sk, size); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); ++ ++static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) ++{ ++ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); ++ u32 old_lo, old_hi; ++ u64 sum_len; ++ ++ old_lo = serr->ee.ee_info; ++ old_hi = serr->ee.ee_data; ++ sum_len = old_hi - old_lo + 1ULL + len; ++ ++ if (sum_len >= (1ULL << 32)) ++ return false; ++ ++ if (lo != old_hi + 1) ++ return false; ++ ++ serr->ee.ee_data += len; ++ return true; ++} ++ ++static void __msg_zerocopy_callback(struct ubuf_info *uarg) ++{ ++ struct sk_buff *tail, *skb = skb_from_uarg(uarg); ++ struct sock_exterr_skb *serr; ++ struct sock *sk = skb->sk; ++ struct sk_buff_head *q; ++ unsigned long flags; ++ bool is_zerocopy; ++ u32 lo, hi; ++ u16 len; ++ ++ mm_unaccount_pinned_pages(&uarg->mmp); ++ ++ /* if !len, there was only 1 call, and 
it was aborted ++ * so do not queue a completion notification ++ */ ++ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) ++ goto release; ++ ++ len = uarg->len; ++ lo = uarg->id; ++ hi = uarg->id + len - 1; ++ is_zerocopy = uarg->zerocopy; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = 0; ++ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; ++ serr->ee.ee_data = hi; ++ serr->ee.ee_info = lo; ++ if (!is_zerocopy) ++ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; ++ ++ q = &sk->sk_error_queue; ++ spin_lock_irqsave(&q->lock, flags); ++ tail = skb_peek_tail(q); ++ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || ++ !skb_zerocopy_notify_extend(tail, lo, len)) { ++ __skb_queue_tail(q, skb); ++ skb = NULL; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ sk_error_report(sk); ++ ++release: ++ consume_skb(skb); ++ sock_put(sk); ++} ++ ++void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, ++ bool success) ++{ ++ uarg->zerocopy = uarg->zerocopy & success; ++ ++ if (refcount_dec_and_test(&uarg->refcnt)) ++ __msg_zerocopy_callback(uarg); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_callback); ++ ++void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) ++{ ++ struct sock *sk = skb_from_uarg(uarg)->sk; ++ ++ atomic_dec(&sk->sk_zckey); ++ uarg->len--; ++ ++ if (have_uref) ++ msg_zerocopy_callback(NULL, uarg, true); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); ++ ++int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, ++ struct msghdr *msg, int len, ++ struct ubuf_info *uarg) ++{ ++ struct ubuf_info *orig_uarg = skb_zcopy(skb); ++ int err, orig_len = skb->len; ++ ++ /* An skb can only point to one uarg. This edge case happens when ++ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. ++ */ ++ if (orig_uarg && uarg != orig_uarg) ++ return -EEXIST; ++ ++ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); ++ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { ++ struct sock *save_sk = skb->sk; ++ ++ /* Streams do not free skb on error. Reset to prev state. */ ++ iov_iter_revert(&msg->msg_iter, skb->len - orig_len); ++ skb->sk = sk; ++ ___pskb_trim(skb, orig_len); ++ skb->sk = save_sk; ++ return err; ++ } ++ ++ skb_zcopy_set(skb, uarg, NULL); ++ return skb->len - orig_len; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); ++ ++void __skb_zcopy_downgrade_managed(struct sk_buff *skb) ++{ ++ int i; ++ ++ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++} ++EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); ++ ++static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, ++ gfp_t gfp_mask) ++{ ++ if (skb_zcopy(orig)) { ++ if (skb_zcopy(nskb)) { ++ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ ++ if (!gfp_mask) { ++ WARN_ON_ONCE(1); ++ return -ENOMEM; ++ } ++ if (skb_uarg(nskb) == skb_uarg(orig)) ++ return 0; ++ if (skb_copy_ubufs(nskb, GFP_ATOMIC)) ++ return -EIO; ++ } ++ skb_zcopy_set(nskb, skb_uarg(orig), NULL); ++ } ++ return 0; ++} ++ ++/** ++ * skb_copy_ubufs - copy userspace skb frags buffers to kernel ++ * @skb: the skb to modify ++ * @gfp_mask: allocation priority ++ * ++ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. ++ * It will copy all frags into kernel and drop the reference ++ * to userspace pages. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ * ++ * Returns 0 on success or a negative error code on failure ++ * to allocate kernel memory to copy to. ++ */ ++int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int num_frags = skb_shinfo(skb)->nr_frags; ++ struct page *page, *head = NULL; ++ int i, new_frags; ++ u32 d_off; ++ ++ if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) ++ return -EINVAL; ++ ++ if (!num_frags) ++ goto release; ++ ++ new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ for (i = 0; i < new_frags; i++) { ++ page = alloc_page(gfp_mask); ++ if (!page) { ++ while (head) { ++ struct page *next = (struct page *)page_private(head); ++ put_page(head); ++ head = next; ++ } ++ return -ENOMEM; ++ } ++ set_page_private(page, (unsigned long)head); ++ head = page; ++ } ++ ++ page = head; ++ d_off = 0; ++ for (i = 0; i < num_frags; i++) { ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), ++ p, p_off, p_len, copied) { ++ u32 copy, done = 0; ++ vaddr = kmap_atomic(p); ++ ++ while (done < p_len) { ++ if (d_off == PAGE_SIZE) { ++ d_off = 0; ++ page = (struct page *)page_private(page); ++ } ++ copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); ++ memcpy(page_address(page) + d_off, ++ vaddr + p_off + done, copy); ++ done += copy; ++ d_off += copy; ++ } ++ kunmap_atomic(vaddr); ++ } ++ } ++ ++ /* skb frags release userspace buffers */ ++ for (i = 0; i < num_frags; i++) ++ skb_frag_unref(skb, i); ++ ++ /* skb frags point to kernel buffers */ ++ for (i = 0; i < new_frags - 1; i++) { ++ __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); ++ head = (struct page *)page_private(head); ++ } ++ __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); ++ skb_shinfo(skb)->nr_frags = new_frags; ++ ++release: ++ skb_zcopy_clear(skb, false); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_copy_ubufs); ++ ++/** ++ * skb_clone - duplicate an sk_buff ++ * @skb: buffer to clone ++ * @gfp_mask: allocation priority ++ * ++ * Duplicate an &sk_buff. The new one is not owned by a socket. Both ++ * copies share the same packet data but not structure. The new ++ * buffer has a reference count of 1. If the allocation fails the ++ * function returns %NULL otherwise the new buffer is returned. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ */ ++ ++struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ struct sk_buff_fclones *fclones = container_of(skb, ++ struct sk_buff_fclones, ++ skb1); ++ struct sk_buff *n; ++ ++ if (skb_orphan_frags(skb, gfp_mask)) ++ return NULL; ++ ++ if (skb->fclone == SKB_FCLONE_ORIG && ++ refcount_read(&fclones->fclone_ref) == 1) { ++ n = &fclones->skb2; ++ refcount_set(&fclones->fclone_ref, 2); ++ n->fclone = SKB_FCLONE_CLONE; ++ } else { ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); ++ if (!n) ++ return NULL; ++ ++ n->fclone = SKB_FCLONE_UNAVAILABLE; ++ } ++ ++ return __skb_clone(n, skb); ++} ++EXPORT_SYMBOL(skb_clone); ++ ++void skb_headers_offset_update(struct sk_buff *skb, int off) ++{ ++ /* Only adjust this if it actually is csum_start rather than csum */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ skb->csum_start += off; ++ /* {transport,network,mac}_header and tail are relative to skb->head */ ++ skb->transport_header += off; ++ skb->network_header += off; ++ if (skb_mac_header_was_set(skb)) ++ skb->mac_header += off; ++ skb->inner_transport_header += off; ++ skb->inner_network_header += off; ++ skb->inner_mac_header += off; ++} ++EXPORT_SYMBOL(skb_headers_offset_update); ++ ++void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ __copy_skb_header(new, old); ++ ++ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; ++ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; ++ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; ++} ++EXPORT_SYMBOL(skb_copy_header); ++ ++static inline int skb_alloc_rx_flag(const struct sk_buff *skb) ++{ ++ if (skb_pfmemalloc(skb)) ++ return SKB_ALLOC_RX; ++ return 0; ++} ++ ++/** ++ * skb_copy - create private copy of an sk_buff ++ * @skb: buffer to copy ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data. This is used when the ++ * caller wishes to modify the data and needs a private copy of the ++ * data to alter. Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * As by-product this function converts non-linear &sk_buff to linear ++ * one, so that &sk_buff becomes completely private and caller is allowed ++ * to modify all the data of returned buffer. This means that this ++ * function is not recommended for use in circumstances when only ++ * header is going to be modified. Use pskb_copy() instead. ++ */ ++ ++struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int headerlen = skb_headroom(skb); ++ unsigned int size = skb_end_offset(skb) + skb->data_len; ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, ++ skb_alloc_rx_flag(skb), NUMA_NO_NODE); ++ ++ if (!n) ++ return NULL; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headerlen); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); ++ ++ skb_copy_header(n, skb); ++ return n; ++} ++EXPORT_SYMBOL(skb_copy); ++ ++/** ++ * __pskb_copy_fclone - create copy of an sk_buff with private head. ++ * @skb: buffer to copy ++ * @headroom: headroom of new skb ++ * @gfp_mask: allocation priority ++ * @fclone: if true allocate the copy of the skb from the fclone ++ * cache instead of the head cache; it is recommended to set this ++ * to true for the cases where the copy will likely be cloned ++ * ++ * Make a copy of both an &sk_buff and part of its data, located ++ * in header. 
Fragmented data remain shared. This is used when ++ * the caller wishes to modify only header of &sk_buff and needs ++ * private copy of the header to alter. Returns %NULL on failure ++ * or the pointer to the buffer on success. ++ * The returned buffer has a reference count of 1. ++ */ ++ ++struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, ++ gfp_t gfp_mask, bool fclone) ++{ ++ unsigned int size = skb_headlen(skb) + headroom; ++ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); ++ ++ if (!n) ++ goto out; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headroom); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb_headlen(skb)); ++ /* Copy the bytes */ ++ skb_copy_from_linear_data(skb, n->data, n->len); ++ ++ n->truesize += skb->data_len; ++ n->data_len = skb->data_len; ++ n->len = skb->len; ++ ++ if (skb_shinfo(skb)->nr_frags) { ++ int i; ++ ++ if (skb_orphan_frags(skb, gfp_mask) || ++ skb_zerocopy_clone(n, skb, gfp_mask)) { ++ kfree_skb(n); ++ n = NULL; ++ goto out; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; ++ skb_frag_ref(skb, i); ++ } ++ skb_shinfo(n)->nr_frags = i; ++ } ++ ++ if (skb_has_frag_list(skb)) { ++ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; ++ skb_clone_fraglist(n); ++ } ++ ++ skb_copy_header(n, skb); ++out: ++ return n; ++} ++EXPORT_SYMBOL(__pskb_copy_fclone); ++ ++/** ++ * pskb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @nhead: room to add at head ++ * @ntail: room to add at tail ++ * @gfp_mask: allocation priority ++ * ++ * Expands (or creates identical copy, if @nhead and @ntail are zero) ++ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have ++ * reference count of 1. Returns zero in the case of success or error, ++ * if expansion failed. In the last case, &sk_buff is not changed. ++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, ++ gfp_t gfp_mask) ++{ ++ int i, osize = skb_end_offset(skb); ++ int size = osize + nhead + ntail; ++ long off; ++ u8 *data; ++ ++ BUG_ON(nhead < 0); ++ ++ BUG_ON(skb_shared(skb)); ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ goto nodata; ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy only real data... and, alas, header. This should be ++ * optimized for the cases when header is void. 
++ */ ++ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); ++ ++ /* ++ * if shinfo is shared we must drop the old head gracefully, but if it ++ * is not we can just drop the old head and let the existing refcount ++ * be since all we did is relocate the values ++ */ ++ if (skb_cloned(skb)) { ++ if (skb_orphan_frags(skb, gfp_mask)) ++ goto nofrags; ++ if (skb_zcopy(skb)) ++ refcount_inc(&skb_uarg(skb)->refcnt); ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ skb_release_data(skb); ++ } else { ++ skb_free_head(skb); ++ } ++ off = (data + nhead) - skb->head; ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data += off; ++ ++ skb_set_end_offset(skb, size); ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ off = nhead; ++#endif ++ skb->tail += off; ++ skb_headers_offset_update(skb, nhead); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ skb_metadata_clear(skb); ++ ++ /* It is not generally safe to change skb->truesize. ++ * For the moment, we really care of rx path, or ++ * when skb is orphaned (not attached to a socket). ++ */ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb->truesize += size - osize; ++ ++ return 0; ++ ++nofrags: ++ kfree(data); ++nodata: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(pskb_expand_head); ++ ++/* Make private copy of skb with writable head and some headroom */ ++ ++struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) ++{ ++ struct sk_buff *skb2; ++ int delta = headroom - skb_headroom(skb); ++ ++ if (delta <= 0) ++ skb2 = pskb_copy(skb, GFP_ATOMIC); ++ else { ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ++ GFP_ATOMIC)) { ++ kfree_skb(skb2); ++ skb2 = NULL; ++ } ++ } ++ return skb2; ++} ++EXPORT_SYMBOL(skb_realloc_headroom); ++ ++int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) ++{ ++ unsigned int saved_end_offset, saved_truesize; ++ struct skb_shared_info *shinfo; ++ int res; ++ ++ saved_end_offset = skb_end_offset(skb); ++ saved_truesize = skb->truesize; ++ ++ res = pskb_expand_head(skb, 0, 0, pri); ++ if (res) ++ return res; ++ ++ skb->truesize = saved_truesize; ++ ++ if (likely(skb_end_offset(skb) == saved_end_offset)) ++ return 0; ++ ++ shinfo = skb_shinfo(skb); ++ ++ /* We are about to change back skb->end, ++ * we need to move skb_shinfo() to its new location. ++ */ ++ memmove(skb->head + saved_end_offset, ++ shinfo, ++ offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); ++ ++ skb_set_end_offset(skb, saved_end_offset); ++ ++ return 0; ++} ++ ++/** ++ * skb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @headroom: needed headroom ++ * ++ * Unlike skb_realloc_headroom, this one does not allocate a new skb ++ * if possible; copies skb->sk to new skb as needed ++ * and frees original skb in case of failures. ++ * ++ * It expect increased headroom and generates warning otherwise. 
++ */ ++ ++struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) ++{ ++ int delta = headroom - skb_headroom(skb); ++ int osize = skb_end_offset(skb); ++ struct sock *sk = skb->sk; ++ ++ if (WARN_ONCE(delta <= 0, ++ "%s is expecting an increase in the headroom", __func__)) ++ return skb; ++ ++ delta = SKB_DATA_ALIGN(delta); ++ /* pskb_expand_head() might crash, if skb is shared. */ ++ if (skb_shared(skb) || !is_skb_wmem(skb)) { ++ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); ++ ++ if (unlikely(!nskb)) ++ goto fail; ++ ++ if (sk) ++ skb_set_owner_w(nskb, sk); ++ consume_skb(skb); ++ skb = nskb; ++ } ++ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) ++ goto fail; ++ ++ if (sk && is_skb_wmem(skb)) { ++ delta = skb_end_offset(skb) - osize; ++ refcount_add(delta, &sk->sk_wmem_alloc); ++ skb->truesize += delta; ++ } ++ return skb; ++ ++fail: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_expand_head); ++ ++/** ++ * skb_copy_expand - copy and expand sk_buff ++ * @skb: buffer to copy ++ * @newheadroom: new free bytes at head ++ * @newtailroom: new free bytes at tail ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data and while doing so ++ * allocate additional space. ++ * ++ * This is used when the caller wishes to modify the data and needs a ++ * private copy of the data to alter as well as more space for new fields. ++ * Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * You must pass %GFP_ATOMIC as the allocation priority if this function ++ * is called from an interrupt. ++ */ ++struct sk_buff *skb_copy_expand(const struct sk_buff *skb, ++ int newheadroom, int newtailroom, ++ gfp_t gfp_mask) ++{ ++ /* ++ * Allocate the copy buffer ++ */ ++ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, ++ gfp_mask, skb_alloc_rx_flag(skb), ++ NUMA_NO_NODE); ++ int oldheadroom = skb_headroom(skb); ++ int head_copy_len, head_copy_off; ++ ++ if (!n) ++ return NULL; ++ ++ skb_reserve(n, newheadroom); ++ ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ head_copy_len = oldheadroom; ++ head_copy_off = 0; ++ if (newheadroom <= head_copy_len) ++ head_copy_len = newheadroom; ++ else ++ head_copy_off = newheadroom - head_copy_len; ++ ++ /* Copy the linear header and data. */ ++ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, ++ skb->len + head_copy_len)); ++ ++ skb_copy_header(n, skb); ++ ++ skb_headers_offset_update(n, newheadroom - oldheadroom); ++ ++ return n; ++} ++EXPORT_SYMBOL(skb_copy_expand); ++ ++/** ++ * __skb_pad - zero pad the tail of an skb ++ * @skb: buffer to pad ++ * @pad: space to pad ++ * @free_on_error: free buffer on error ++ * ++ * Ensure that a buffer is followed by a padding area that is zero ++ * filled. Used by network drivers which may DMA or transfer data ++ * beyond the buffer end onto the wire. ++ * ++ * May return error in out of memory cases. The skb is freed on error ++ * if @free_on_error is true. ++ */ ++ ++int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) ++{ ++ int err; ++ int ntail; ++ ++ /* If the skbuff is non linear tailroom is always zero.. 
*/ ++ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { ++ memset(skb->data+skb->len, 0, pad); ++ return 0; ++ } ++ ++ ntail = skb->data_len + pad - (skb->end - skb->tail); ++ if (likely(skb_cloned(skb) || ntail > 0)) { ++ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto free_skb; ++ } ++ ++ /* FIXME: The use of this function with non-linear skb's really needs ++ * to be audited. ++ */ ++ err = skb_linearize(skb); ++ if (unlikely(err)) ++ goto free_skb; ++ ++ memset(skb->data + skb->len, 0, pad); ++ return 0; ++ ++free_skb: ++ if (free_on_error) ++ kfree_skb(skb); ++ return err; ++} ++EXPORT_SYMBOL(__skb_pad); ++ ++/** ++ * pskb_put - add data to the tail of a potentially fragmented buffer ++ * @skb: start of the buffer to use ++ * @tail: tail fragment of the buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the potentially ++ * fragmented buffer. @tail must be the last fragment of @skb -- or ++ * @skb itself. If this would exceed the total buffer size the kernel ++ * will panic. A pointer to the first byte of the extra data is ++ * returned. ++ */ ++ ++void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) ++{ ++ if (tail != skb) { ++ skb->data_len += len; ++ skb->len += len; ++ } ++ return skb_put(tail, len); ++} ++EXPORT_SYMBOL_GPL(pskb_put); ++ ++/** ++ * skb_put - add data to a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer. If this would ++ * exceed the total buffer size the kernel will panic. A pointer to the ++ * first byte of the extra data is returned. ++ */ ++void *skb_put(struct sk_buff *skb, unsigned int len) ++{ ++ void *tmp = skb_tail_pointer(skb); ++ SKB_LINEAR_ASSERT(skb); ++ skb->tail += len; ++ skb->len += len; ++ if (unlikely(skb->tail > skb->end)) ++ skb_over_panic(skb, len, __builtin_return_address(0)); ++ return tmp; ++} ++EXPORT_SYMBOL(skb_put); ++ ++/** ++ * skb_push - add data to the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer at the buffer ++ * start. If this would exceed the total buffer headroom the kernel will ++ * panic. A pointer to the first byte of the extra data is returned. ++ */ ++void *skb_push(struct sk_buff *skb, unsigned int len) ++{ ++ skb->data -= len; ++ skb->len += len; ++ if (unlikely(skb->data < skb->head)) ++ skb_under_panic(skb, len, __builtin_return_address(0)); ++ return skb->data; ++} ++EXPORT_SYMBOL(skb_push); ++ ++/** ++ * skb_pull - remove data from the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the next data in the buffer ++ * is returned. Once the data has been pulled future pushes will overwrite ++ * the old data. ++ */ ++void *skb_pull(struct sk_buff *skb, unsigned int len) ++{ ++ return skb_pull_inline(skb, len); ++} ++EXPORT_SYMBOL(skb_pull); ++ ++/** ++ * skb_pull_data - remove data from the start of a buffer returning its ++ * original position. ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the original data in the buffer ++ * is returned after checking if there is enough data to pull. Once the ++ * data has been pulled future pushes will overwrite the old data. 
++ */ ++void *skb_pull_data(struct sk_buff *skb, size_t len) ++{ ++ void *data = skb->data; ++ ++ if (skb->len < len) ++ return NULL; ++ ++ skb_pull(skb, len); ++ ++ return data; ++} ++EXPORT_SYMBOL(skb_pull_data); ++ ++/** ++ * skb_trim - remove end from a buffer ++ * @skb: buffer to alter ++ * @len: new length ++ * ++ * Cut the length of a buffer down by removing data from the tail. If ++ * the buffer is already under the length specified it is not modified. ++ * The skb must be linear. ++ */ ++void skb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->len > len) ++ __skb_trim(skb, len); ++} ++EXPORT_SYMBOL(skb_trim); ++ ++/* Trims skb to length len. It can change skb pointers. ++ */ ++ ++int ___pskb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ struct sk_buff **fragp; ++ struct sk_buff *frag; ++ int offset = skb_headlen(skb); ++ int nfrags = skb_shinfo(skb)->nr_frags; ++ int i; ++ int err; ++ ++ if (skb_cloned(skb) && ++ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) ++ return err; ++ ++ i = 0; ++ if (offset >= len) ++ goto drop_pages; ++ ++ for (; i < nfrags; i++) { ++ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); ++ ++drop_pages: ++ skb_shinfo(skb)->nr_frags = i; ++ ++ for (; i < nfrags; i++) ++ skb_frag_unref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_drop_fraglist(skb); ++ goto done; ++ } ++ ++ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); ++ fragp = &frag->next) { ++ int end = offset + frag->len; ++ ++ if (skb_shared(frag)) { ++ struct sk_buff *nfrag; ++ ++ nfrag = skb_clone(frag, GFP_ATOMIC); ++ if (unlikely(!nfrag)) ++ return -ENOMEM; ++ ++ nfrag->next = frag->next; ++ consume_skb(frag); ++ frag = nfrag; ++ *fragp = frag; ++ } ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ if (end > len && ++ unlikely((err = pskb_trim(frag, len - offset)))) ++ return err; ++ ++ if (frag->next) ++ skb_drop_list(&frag->next); ++ break; ++ } ++ ++done: ++ if (len > skb_headlen(skb)) { ++ skb->data_len -= skb->len - len; ++ skb->len = len; ++ } else { ++ skb->len = len; ++ skb->data_len = 0; ++ skb_set_tail_pointer(skb, len); ++ } ++ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb_condense(skb); ++ return 0; ++} ++EXPORT_SYMBOL(___pskb_trim); ++ ++/* Note : use pskb_trim_rcsum() instead of calling this directly ++ */ ++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ int delta = skb->len - len; ++ ++ skb->csum = csum_block_sub(skb->csum, ++ skb_checksum(skb, len, delta, 0), ++ len); ++ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; ++ int offset = skb_checksum_start_offset(skb) + skb->csum_offset; ++ ++ if (offset + sizeof(__sum16) > hdlen) ++ return -EINVAL; ++ } ++ return __pskb_trim(skb, len); ++} ++EXPORT_SYMBOL(pskb_trim_rcsum_slow); ++ ++/** ++ * __pskb_pull_tail - advance tail of skb header ++ * @skb: buffer to reallocate ++ * @delta: number of bytes to advance tail ++ * ++ * The function makes a sense only on a fragmented &sk_buff, ++ * it expands header moving its tail forward and copying necessary ++ * data from fragmented part. ++ * ++ * &sk_buff MUST have reference count of 1. ++ * ++ * Returns %NULL (and &sk_buff does not change) if pull failed ++ * or value of new tail of skb in the case of success. 
++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++/* Moves tail of skb head forward, copying data from fragmented part, ++ * when it is necessary. ++ * 1. It may fail due to malloc failure. ++ * 2. It may change skb pointers. ++ * ++ * It is pretty complicated. Luckily, it is called only in exceptional cases. ++ */ ++void *__pskb_pull_tail(struct sk_buff *skb, int delta) ++{ ++ /* If skb has not enough free space at tail, get new one ++ * plus 128 bytes for future expansions. If we have enough ++ * room at tail, reallocate without expansion only if skb is cloned. ++ */ ++ int i, k, eat = (skb->tail + delta) - skb->end; ++ ++ if (eat > 0 || skb_cloned(skb)) { ++ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, ++ GFP_ATOMIC)) ++ return NULL; ++ } ++ ++ BUG_ON(skb_copy_bits(skb, skb_headlen(skb), ++ skb_tail_pointer(skb), delta)); ++ ++ /* Optimization: no fragments, no reasons to preestimate ++ * size of pulled pages. Superb. ++ */ ++ if (!skb_has_frag_list(skb)) ++ goto pull_pages; ++ ++ /* Estimate size of pulled pages. */ ++ eat = delta; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size >= eat) ++ goto pull_pages; ++ eat -= size; ++ } ++ ++ /* If we need update frag list, we are in troubles. ++ * Certainly, it is possible to add an offset to skb data, ++ * but taking into account that pulling is expected to ++ * be very rare operation, it is worth to fight against ++ * further bloating skb head and crucify ourselves here instead. ++ * Pure masohism, indeed. 8)8) ++ */ ++ if (eat) { ++ struct sk_buff *list = skb_shinfo(skb)->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ ++ if (skb_shared(list)) { ++ /* Sucks! We need to fork list. :-( */ ++ clone = skb_clone(list, GFP_ATOMIC); ++ if (!clone) ++ return NULL; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without ++ * problems. */ ++ insp = list; ++ } ++ if (!pskb_pull(list, eat)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. */ ++ while ((list = skb_shinfo(skb)->frag_list) != insp) { ++ skb_shinfo(skb)->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ skb_shinfo(skb)->frag_list = clone; ++ } ++ } ++ /* Success! Now we may commit changes to skb data. 
*/ ++ ++pull_pages: ++ eat = delta; ++ k = 0; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size <= eat) { ++ skb_frag_unref(skb, i); ++ eat -= size; ++ } else { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; ++ ++ *frag = skb_shinfo(skb)->frags[i]; ++ if (eat) { ++ skb_frag_off_add(frag, eat); ++ skb_frag_size_sub(frag, eat); ++ if (!i) ++ goto end; ++ eat = 0; ++ } ++ k++; ++ } ++ } ++ skb_shinfo(skb)->nr_frags = k; ++ ++end: ++ skb->tail += delta; ++ skb->data_len -= delta; ++ ++ if (!skb->data_len) ++ skb_zcopy_clear(skb, false); ++ ++ return skb_tail_pointer(skb); ++} ++EXPORT_SYMBOL(__pskb_pull_tail); ++ ++/** ++ * skb_copy_bits - copy bits from skb to kernel buffer ++ * @skb: source skb ++ * @offset: offset in source ++ * @to: destination buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source skb to the ++ * destination buffer. ++ * ++ * CAUTION ! : ++ * If its prototype is ever changed, ++ * check arch/{*}/net/{*}.S files, ++ * since it is called from BPF assembly code. ++ */ ++int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ /* Copy header. */ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_from_linear_data_offset(skb, offset, to, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(f); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(f, ++ skb_frag_off(f) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(to + copied, vaddr + p_off, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_copy_bits(frag_iter, offset - start, to, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_copy_bits); ++ ++/* ++ * Callback from splice_to_pipe(), if we need to release some pages ++ * at the end of the spd in case we error'ed out in filling the pipe. 
++ */ ++static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) ++{ ++ put_page(spd->pages[i]); ++} ++ ++static struct page *linear_to_page(struct page *page, unsigned int *len, ++ unsigned int *offset, ++ struct sock *sk) ++{ ++ struct page_frag *pfrag = sk_page_frag(sk); ++ ++ if (!sk_page_frag_refill(sk, pfrag)) ++ return NULL; ++ ++ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); ++ ++ memcpy(page_address(pfrag->page) + pfrag->offset, ++ page_address(page) + *offset, *len); ++ *offset = pfrag->offset; ++ pfrag->offset += *len; ++ ++ return pfrag->page; ++} ++ ++static bool spd_can_coalesce(const struct splice_pipe_desc *spd, ++ struct page *page, ++ unsigned int offset) ++{ ++ return spd->nr_pages && ++ spd->pages[spd->nr_pages - 1] == page && ++ (spd->partial[spd->nr_pages - 1].offset + ++ spd->partial[spd->nr_pages - 1].len == offset); ++} ++ ++/* ++ * Fill page/offset/length into spd, if it can hold more pages. ++ */ ++static bool spd_fill_page(struct splice_pipe_desc *spd, ++ struct pipe_inode_info *pipe, struct page *page, ++ unsigned int *len, unsigned int offset, ++ bool linear, ++ struct sock *sk) ++{ ++ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) ++ return true; ++ ++ if (linear) { ++ page = linear_to_page(page, len, &offset, sk); ++ if (!page) ++ return true; ++ } ++ if (spd_can_coalesce(spd, page, offset)) { ++ spd->partial[spd->nr_pages - 1].len += *len; ++ return false; ++ } ++ get_page(page); ++ spd->pages[spd->nr_pages] = page; ++ spd->partial[spd->nr_pages].len = *len; ++ spd->partial[spd->nr_pages].offset = offset; ++ spd->nr_pages++; ++ ++ return false; ++} ++ ++static bool __splice_segment(struct page *page, unsigned int poff, ++ unsigned int plen, unsigned int *off, ++ unsigned int *len, ++ struct splice_pipe_desc *spd, bool linear, ++ struct sock *sk, ++ struct pipe_inode_info *pipe) ++{ ++ if (!*len) ++ return true; ++ ++ /* skip this segment if already processed */ ++ if (*off >= plen) { ++ *off -= plen; ++ return false; ++ } ++ ++ /* ignore any bits we already processed */ ++ poff += *off; ++ plen -= *off; ++ *off = 0; ++ ++ do { ++ unsigned int flen = min(*len, plen); ++ ++ if (spd_fill_page(spd, pipe, page, &flen, poff, ++ linear, sk)) ++ return true; ++ poff += flen; ++ plen -= flen; ++ *len -= flen; ++ } while (*len && plen); ++ ++ return false; ++} ++ ++/* ++ * Map linear and fragment data from the skb to spd. It reports true if the ++ * pipe is full or if we already spliced the requested length. ++ */ ++static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, ++ unsigned int *offset, unsigned int *len, ++ struct splice_pipe_desc *spd, struct sock *sk) ++{ ++ int seg; ++ struct sk_buff *iter; ++ ++ /* map the linear part : ++ * If skb->head_frag is set, this 'linear' part is backed by a ++ * fragment, and if the head is not shared with any clones then ++ * we can avoid a copy since we own the head portion of this page. 
++ */ ++ if (__splice_segment(virt_to_page(skb->data), ++ (unsigned long) skb->data & (PAGE_SIZE - 1), ++ skb_headlen(skb), ++ offset, len, spd, ++ skb_head_is_locked(skb), ++ sk, pipe)) ++ return true; ++ ++ /* ++ * then map the fragments ++ */ ++ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { ++ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; ++ ++ if (__splice_segment(skb_frag_page(f), ++ skb_frag_off(f), skb_frag_size(f), ++ offset, len, spd, false, sk, pipe)) ++ return true; ++ } ++ ++ skb_walk_frags(skb, iter) { ++ if (*offset >= iter->len) { ++ *offset -= iter->len; ++ continue; ++ } ++ /* __skb_splice_bits() only fails if the output has no room ++ * left, so no point in going over the frag_list for the error ++ * case. ++ */ ++ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Map data from the skb to a pipe. Should handle both the linear part, ++ * the fragments, and the frag list. ++ */ ++int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, ++ struct pipe_inode_info *pipe, unsigned int tlen, ++ unsigned int flags) ++{ ++ struct partial_page partial[MAX_SKB_FRAGS]; ++ struct page *pages[MAX_SKB_FRAGS]; ++ struct splice_pipe_desc spd = { ++ .pages = pages, ++ .partial = partial, ++ .nr_pages_max = MAX_SKB_FRAGS, ++ .ops = &nosteal_pipe_buf_ops, ++ .spd_release = sock_spd_release, ++ }; ++ int ret = 0; ++ ++ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); ++ ++ if (spd.nr_pages) ++ ret = splice_to_pipe(pipe, &spd); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(skb_splice_bits); ++ ++static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendmsg(sock, msg, vec, num, size); ++} ++ ++static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendpage(sock, page, offset, size, flags); ++} ++ ++typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size); ++typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags); ++static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, ++ int len, sendmsg_func sendmsg, sendpage_func sendpage) ++{ ++ unsigned int orig_len = len; ++ struct sk_buff *head = skb; ++ unsigned short fragidx; ++ int slen, ret; ++ ++do_frag_list: ++ ++ /* Deal with head data */ ++ while (offset < skb_headlen(skb) && len) { ++ struct kvec kv; ++ struct msghdr msg; ++ ++ slen = min_t(int, len, skb_headlen(skb) - offset); ++ kv.iov_base = skb->data + offset; ++ kv.iov_len = slen; ++ memset(&msg, 0, sizeof(msg)); ++ msg.msg_flags = MSG_DONTWAIT; ++ ++ ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, ++ sendmsg_unlocked, sk, &msg, &kv, 1, slen); ++ if (ret <= 0) ++ goto error; ++ ++ offset += ret; ++ len -= ret; ++ } ++ ++ /* All the data was skb head? 
*/ ++ if (!len) ++ goto out; ++ ++ /* Make offset relative to start of frags */ ++ offset -= skb_headlen(skb); ++ ++ /* Find where we are in frag list */ ++ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ if (offset < skb_frag_size(frag)) ++ break; ++ ++ offset -= skb_frag_size(frag); ++ } ++ ++ for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ slen = min_t(size_t, len, skb_frag_size(frag) - offset); ++ ++ while (slen) { ++ ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, ++ sendpage_unlocked, sk, ++ skb_frag_page(frag), ++ skb_frag_off(frag) + offset, ++ slen, MSG_DONTWAIT); ++ if (ret <= 0) ++ goto error; ++ ++ len -= ret; ++ offset += ret; ++ slen -= ret; ++ } ++ ++ offset = 0; ++ } ++ ++ if (len) { ++ /* Process any frag lists */ ++ ++ if (skb == head) { ++ if (skb_has_frag_list(skb)) { ++ skb = skb_shinfo(skb)->frag_list; ++ goto do_frag_list; ++ } ++ } else if (skb->next) { ++ skb = skb->next; ++ goto do_frag_list; ++ } ++ } ++ ++out: ++ return orig_len - len; ++ ++error: ++ return orig_len == len ? ret : orig_len - len; ++} ++ ++/* Send skb data on a socket. Socket must be locked. */ ++int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, ++ int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, ++ kernel_sendpage_locked); ++} ++EXPORT_SYMBOL_GPL(skb_send_sock_locked); ++ ++/* Send skb data on a socket. Socket must be unlocked. */ ++int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, ++ sendpage_unlocked); ++} ++ ++/** ++ * skb_store_bits - store bits from kernel buffer to skb ++ * @skb: destination buffer ++ * @offset: offset in destination ++ * @from: source buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source buffer to the ++ * destination skb. This function handles all the messy bits of ++ * traversing fragment lists and such. 
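++ *
++ * A short sketch of a typical call (illustrative only; "offset" is a
++ * hypothetical, caller-computed position inside the packet): rewriting
++ * a two-byte field of a possibly non-linear skb might look like
++ *
++ *	__be16 val = htons(ETH_P_IP);
++ *	int err = skb_store_bits(skb, offset, &val, sizeof(val));
++ *
++ *	if (err)
++ *		return err;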
++ */ ++ ++int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_to_linear_data_offset(skb, offset, from, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(vaddr + p_off, from + copied, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_store_bits(frag_iter, offset - start, ++ from, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_store_bits); ++ ++/* Checksum skb data. */ ++__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, ++ __wsum csum, const struct skb_checksum_ops *ops) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ ++ /* Checksum header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, ++ skb->data + offset, copy, csum); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = INDIRECT_CALL_1(ops->update, ++ csum_partial_ext, ++ vaddr + p_off, p_len, 0); ++ kunmap_atomic(vaddr); ++ csum = INDIRECT_CALL_1(ops->combine, ++ csum_block_add_ext, csum, ++ csum2, pos, p_len); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ __wsum csum2; ++ if (copy > len) ++ copy = len; ++ csum2 = __skb_checksum(frag_iter, offset - start, ++ copy, 0, ops); ++ csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, ++ csum, csum2, pos, copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ ++ return csum; ++} ++EXPORT_SYMBOL(__skb_checksum); ++ ++__wsum skb_checksum(const struct sk_buff *skb, int offset, ++ int len, __wsum csum) ++{ ++ const struct skb_checksum_ops ops = { ++ .update = csum_partial_ext, ++ .combine = csum_block_add_ext, ++ }; ++ ++ return __skb_checksum(skb, offset, len, csum, &ops); ++} ++EXPORT_SYMBOL(skb_checksum); ++ ++/* Both of above in one bottle. */ ++ ++__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, ++ u8 *to, int len) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ __wsum csum = 0; ++ ++ /* Copy header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = csum_partial_copy_nocheck(skb->data + offset, to, ++ copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = csum_partial_copy_nocheck(vaddr + p_off, ++ to + copied, ++ p_len); ++ kunmap_atomic(vaddr); ++ csum = csum_block_add(csum, csum2, pos); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ __wsum csum2; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ csum2 = skb_copy_and_csum_bits(frag_iter, ++ offset - start, ++ to, copy); ++ csum = csum_block_add(csum, csum2, pos); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return csum; ++} ++EXPORT_SYMBOL(skb_copy_and_csum_bits); ++ ++__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) ++{ ++ __sum16 sum; ++ ++ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); ++ /* See comments in __skb_checksum_complete(). */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ if (!skb_shared(skb)) ++ skb->csum_valid = !sum; ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete_head); ++ ++/* This function assumes skb->csum already holds pseudo header's checksum, ++ * which has been changed from the hardware checksum, for example, by ++ * __skb_checksum_validate_complete(). And, the original skb->csum must ++ * have been validated unsuccessfully for CHECKSUM_COMPLETE case. ++ * ++ * It returns non-zero if the recomputed checksum is still invalid, otherwise ++ * zero. The new checksum is stored back into skb->csum unless the skb is ++ * shared. ++ */ ++__sum16 __skb_checksum_complete(struct sk_buff *skb) ++{ ++ __wsum csum; ++ __sum16 sum; ++ ++ csum = skb_checksum(skb, 0, skb->len, 0); ++ ++ sum = csum_fold(csum_add(skb->csum, csum)); ++ /* This check is inverted, because we already knew the hardware ++ * checksum is invalid before calling this function. So, if the ++ * re-computed checksum is valid instead, then we have a mismatch ++ * between the original skb->csum and skb_checksum(). This means either ++ * the original hardware checksum is incorrect or we screw up skb->csum ++ * when moving skb->data around. 
++ */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ ++ if (!skb_shared(skb)) { ++ /* Save full packet checksum */ ++ skb->csum = csum; ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum_complete_sw = 1; ++ skb->csum_valid = !sum; ++ } ++ ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete); ++ ++static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, ++ int offset, int len) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static const struct skb_checksum_ops default_crc32c_ops = { ++ .update = warn_crc32c_csum_update, ++ .combine = warn_crc32c_csum_combine, ++}; ++ ++const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = ++ &default_crc32c_ops; ++EXPORT_SYMBOL(crc32c_csum_stub); ++ ++ /** ++ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() ++ * @from: source buffer ++ * ++ * Calculates the amount of linear headroom needed in the 'to' skb passed ++ * into skb_zerocopy(). ++ */ ++unsigned int ++skb_zerocopy_headlen(const struct sk_buff *from) ++{ ++ unsigned int hlen = 0; ++ ++ if (!from->head_frag || ++ skb_headlen(from) < L1_CACHE_BYTES || ++ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { ++ hlen = skb_headlen(from); ++ if (!hlen) ++ hlen = from->len; ++ } ++ ++ if (skb_has_frag_list(from)) ++ hlen = from->len; ++ ++ return hlen; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); ++ ++/** ++ * skb_zerocopy - Zero copy skb to skb ++ * @to: destination buffer ++ * @from: source buffer ++ * @len: number of bytes to copy from source buffer ++ * @hlen: size of linear headroom in destination buffer ++ * ++ * Copies up to `len` bytes from `from` to `to` by creating references ++ * to the frags in the source buffer. ++ * ++ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the ++ * headroom in the `to` buffer. 
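++ *
++ * As a sketch of the intended pairing (not the only valid usage, and
++ * with error handling left to the caller), the destination is typically
++ * sized from skb_zerocopy_headlen():
++ *
++ *	unsigned int hlen = skb_zerocopy_headlen(from);
++ *	struct sk_buff *to = alloc_skb(hlen, GFP_ATOMIC);
++ *
++ *	if (to)
++ *		err = skb_zerocopy(to, from, from->len, hlen);
++ *
++ * with the possible values of "err" listed below.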
++ * ++ * Return value: ++ * 0: everything is OK ++ * -ENOMEM: couldn't orphan frags of @from due to lack of memory ++ * -EFAULT: skb_copy_bits() found some problem with skb geometry ++ */ ++int ++skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) ++{ ++ int i, j = 0; ++ int plen = 0; /* length of skb->head fragment */ ++ int ret; ++ struct page *page; ++ unsigned int offset; ++ ++ BUG_ON(!from->head_frag && !hlen); ++ ++ /* dont bother with small payloads */ ++ if (len <= skb_tailroom(to)) ++ return skb_copy_bits(from, 0, skb_put(to, len), len); ++ ++ if (hlen) { ++ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); ++ if (unlikely(ret)) ++ return ret; ++ len -= hlen; ++ } else { ++ plen = min_t(int, skb_headlen(from), len); ++ if (plen) { ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ __skb_fill_page_desc(to, 0, page, offset, plen); ++ get_page(page); ++ j = 1; ++ len -= plen; ++ } ++ } ++ ++ skb_len_add(to, len + plen); ++ ++ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { ++ skb_tx_error(from); ++ return -ENOMEM; ++ } ++ skb_zerocopy_clone(to, from, GFP_ATOMIC); ++ ++ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { ++ int size; ++ ++ if (!len) ++ break; ++ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; ++ size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), ++ len); ++ skb_frag_size_set(&skb_shinfo(to)->frags[j], size); ++ len -= size; ++ skb_frag_ref(to, j); ++ j++; ++ } ++ skb_shinfo(to)->nr_frags = j; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy); ++ ++void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) ++{ ++ __wsum csum; ++ long csstart; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ csstart = skb_checksum_start_offset(skb); ++ else ++ csstart = skb_headlen(skb); ++ ++ BUG_ON(csstart > skb_headlen(skb)); ++ ++ skb_copy_from_linear_data(skb, to, csstart); ++ ++ csum = 0; ++ if (csstart != skb->len) ++ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, ++ skb->len - csstart); ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ long csstuff = csstart + skb->csum_offset; ++ ++ *((__sum16 *)(to + csstuff)) = csum_fold(csum); ++ } ++} ++EXPORT_SYMBOL(skb_copy_and_csum_dev); ++ ++/** ++ * skb_dequeue - remove from the head of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the head of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The head item is ++ * returned or %NULL if the list is empty. ++ */ ++ ++struct sk_buff *skb_dequeue(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue); ++ ++/** ++ * skb_dequeue_tail - remove from the tail of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the tail of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The tail item is ++ * returned or %NULL if the list is empty. 
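++ *
++ * A small illustrative sketch (with "q" a caller-owned &sk_buff_head and
++ * "MY_QUEUE_LIMIT" a hypothetical cap): dropping the newest entries when
++ * a private queue grows too long could be written as
++ *
++ *	while (skb_queue_len(&q) > MY_QUEUE_LIMIT)
++ *		kfree_skb(skb_dequeue_tail(&q));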
++ */ ++struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue_tail(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue_tail); ++ ++/** ++ * skb_queue_purge - empty a list ++ * @list: list to empty ++ * ++ * Delete all buffers on an &sk_buff list. Each buffer is removed from ++ * the list and one reference dropped. This function takes the list ++ * lock and is atomic with respect to other list locking functions. ++ */ ++void skb_queue_purge(struct sk_buff_head *list) ++{ ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(list)) != NULL) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL(skb_queue_purge); ++ ++/** ++ * skb_rbtree_purge - empty a skb rbtree ++ * @root: root of the rbtree to empty ++ * Return value: the sum of truesizes of all purged skbs. ++ * ++ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from ++ * the list and one reference dropped. This function does not take ++ * any lock. Synchronization should be handled by the caller (e.g., TCP ++ * out-of-order queue is protected by the socket lock). ++ */ ++unsigned int skb_rbtree_purge(struct rb_root *root) ++{ ++ struct rb_node *p = rb_first(root); ++ unsigned int sum = 0; ++ ++ while (p) { ++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); ++ ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, root); ++ sum += skb->truesize; ++ kfree_skb(skb); ++ } ++ return sum; ++} ++ ++/** ++ * skb_queue_head - queue a buffer at the list head ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the start of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_head(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_head); ++ ++/** ++ * skb_queue_tail - queue a buffer at the list tail ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the tail of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_tail(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_tail); ++ ++/** ++ * skb_unlink - remove a buffer from a list ++ * @skb: buffer to remove ++ * @list: list to use ++ * ++ * Remove a packet from a list. The list locks are taken and this ++ * function is atomic with respect to other list locked calls ++ * ++ * You must know what list the SKB is on. ++ */ ++void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_unlink(skb, list); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_unlink); ++ ++/** ++ * skb_append - append a buffer ++ * @old: buffer to insert after ++ * @newsk: buffer to insert ++ * @list: list to use ++ * ++ * Place a packet after a given packet in a list. 
The list locks are taken ++ * and this function is atomic with respect to other list locked calls. ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_after(list, old, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_append); ++ ++static inline void skb_split_inside_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, const int pos) ++{ ++ int i; ++ ++ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), ++ pos - len); ++ /* And move data appendix as is. */ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; ++ ++ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->data_len = skb->data_len; ++ skb1->len += skb1->data_len; ++ skb->data_len = 0; ++ skb->len = len; ++ skb_set_tail_pointer(skb, len); ++} ++ ++static inline void skb_split_no_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, int pos) ++{ ++ int i, k = 0; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->len = skb1->data_len = skb->len - len; ++ skb->len = len; ++ skb->data_len = len - pos; ++ ++ for (i = 0; i < nfrags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + size > len) { ++ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < len) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_ref(skb, i); ++ skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); ++ skb_shinfo(skb)->nr_frags++; ++ } ++ k++; ++ } else ++ skb_shinfo(skb)->nr_frags++; ++ pos += size; ++ } ++ skb_shinfo(skb1)->nr_frags = k; ++} ++ ++/** ++ * skb_split - Split fragmented skb to two parts at length len. ++ * @skb: the buffer to split ++ * @skb1: the buffer to receive the second part ++ * @len: new length for skb ++ */ ++void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) ++{ ++ int pos = skb_headlen(skb); ++ const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; ++ skb_zerocopy_clone(skb1, skb, 0); ++ if (len < pos) /* Split line is inside header. */ ++ skb_split_inside_header(skb, skb1, len, pos); ++ else /* Second chunk has no header, nothing to copy. */ ++ skb_split_no_header(skb, skb1, len, pos); ++} ++EXPORT_SYMBOL(skb_split); ++ ++/* Shifting from/to a cloned skb is a no-go. ++ * ++ * Caller cannot keep skb_shinfo related pointers past calling here! 
++ */ ++static int skb_prepare_for_shift(struct sk_buff *skb) ++{ ++ return skb_unclone_keeptruesize(skb, GFP_ATOMIC); ++} ++ ++/** ++ * skb_shift - Shifts paged data partially from skb to another ++ * @tgt: buffer into which tail data gets added ++ * @skb: buffer from which the paged data comes from ++ * @shiftlen: shift up to this many bytes ++ * ++ * Attempts to shift up to shiftlen worth of bytes, which may be less than ++ * the length of the skb, from skb to tgt. Returns number bytes shifted. ++ * It's up to caller to free skb if everything was shifted. ++ * ++ * If @tgt runs out of frags, the whole operation is aborted. ++ * ++ * Skb cannot include anything else but paged data while tgt is allowed ++ * to have non-paged data as well. ++ * ++ * TODO: full sized shift could be optimized but that would need ++ * specialized skb free'er to handle frags without up-to-date nr_frags. ++ */ ++int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) ++{ ++ int from, to, merge, todo; ++ skb_frag_t *fragfrom, *fragto; ++ ++ BUG_ON(shiftlen > skb->len); ++ ++ if (skb_headlen(skb)) ++ return 0; ++ if (skb_zcopy(tgt) || skb_zcopy(skb)) ++ return 0; ++ ++ todo = shiftlen; ++ from = 0; ++ to = skb_shinfo(tgt)->nr_frags; ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ ++ /* Actual merge is delayed until the point when we know we can ++ * commit all, so that we don't have to undo partial changes ++ */ ++ if (!to || ++ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), ++ skb_frag_off(fragfrom))) { ++ merge = -1; ++ } else { ++ merge = to - 1; ++ ++ todo -= skb_frag_size(fragfrom); ++ if (todo < 0) { ++ if (skb_prepare_for_shift(skb) || ++ skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ /* All previous frag pointers might be stale! */ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, shiftlen); ++ skb_frag_size_sub(fragfrom, shiftlen); ++ skb_frag_off_add(fragfrom, shiftlen); ++ ++ goto onlymerged; ++ } ++ ++ from++; ++ } ++ ++ /* Skip full, not-fitting skb to avoid expensive operations */ ++ if ((shiftlen == skb->len) && ++ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) ++ return 0; ++ ++ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { ++ if (to == MAX_SKB_FRAGS) ++ return 0; ++ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[to]; ++ ++ if (todo >= skb_frag_size(fragfrom)) { ++ *fragto = *fragfrom; ++ todo -= skb_frag_size(fragfrom); ++ from++; ++ to++; ++ ++ } else { ++ __skb_frag_ref(fragfrom); ++ skb_frag_page_copy(fragto, fragfrom); ++ skb_frag_off_copy(fragto, fragfrom); ++ skb_frag_size_set(fragto, todo); ++ ++ skb_frag_off_add(fragfrom, todo); ++ skb_frag_size_sub(fragfrom, todo); ++ todo = 0; ++ ++ to++; ++ break; ++ } ++ } ++ ++ /* Ready to "commit" this state change to tgt */ ++ skb_shinfo(tgt)->nr_frags = to; ++ ++ if (merge >= 0) { ++ fragfrom = &skb_shinfo(skb)->frags[0]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, skb_frag_size(fragfrom)); ++ __skb_frag_unref(fragfrom, skb->pp_recycle); ++ } ++ ++ /* Reposition in the original skb */ ++ to = 0; ++ while (from < skb_shinfo(skb)->nr_frags) ++ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; ++ skb_shinfo(skb)->nr_frags = to; ++ ++ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); ++ ++onlymerged: ++ /* Most likely the tgt won't ever need its checksum anymore, skb on ++ * the 
other hand might need it if it needs to be resent ++ */ ++ tgt->ip_summed = CHECKSUM_PARTIAL; ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ skb_len_add(skb, -shiftlen); ++ skb_len_add(tgt, shiftlen); ++ ++ return shiftlen; ++} ++ ++/** ++ * skb_prepare_seq_read - Prepare a sequential read of skb data ++ * @skb: the buffer to read ++ * @from: lower offset of data to be read ++ * @to: upper offset of data to be read ++ * @st: state variable ++ * ++ * Initializes the specified state variable. Must be called before ++ * invoking skb_seq_read() for the first time. ++ */ ++void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct skb_seq_state *st) ++{ ++ st->lower_offset = from; ++ st->upper_offset = to; ++ st->root_skb = st->cur_skb = skb; ++ st->frag_idx = st->stepped_offset = 0; ++ st->frag_data = NULL; ++ st->frag_off = 0; ++} ++EXPORT_SYMBOL(skb_prepare_seq_read); ++ ++/** ++ * skb_seq_read - Sequentially read skb data ++ * @consumed: number of bytes consumed by the caller so far ++ * @data: destination pointer for data to be returned ++ * @st: state variable ++ * ++ * Reads a block of skb data at @consumed relative to the ++ * lower offset specified to skb_prepare_seq_read(). Assigns ++ * the head of the data block to @data and returns the length ++ * of the block or 0 if the end of the skb data or the upper ++ * offset has been reached. ++ * ++ * The caller is not required to consume all of the data ++ * returned, i.e. @consumed is typically set to the number ++ * of bytes already consumed and the next call to ++ * skb_seq_read() will return the remaining part of the block. ++ * ++ * Note 1: The size of each block of data returned can be arbitrary, ++ * this limitation is the cost for zerocopy sequential ++ * reads of potentially non linear data. ++ * ++ * Note 2: Fragment lists within fragments are not implemented ++ * at the moment, state->root_skb could be replaced with ++ * a stack for this purpose. 
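++ *
++ * A sketch of the usual calling sequence ("handle()" is a hypothetical
++ * consumer; it returns false here to stop early, which is the case
++ * where skb_abort_seq_read() is required):
++ *
++ *	struct skb_seq_state st;
++ *	unsigned int consumed = 0, len;
++ *	const u8 *data;
++ *
++ *	skb_prepare_seq_read(skb, 0, skb->len, &st);
++ *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
++ *		if (!handle(data, len)) {
++ *			skb_abort_seq_read(&st);
++ *			break;
++ *		}
++ *		consumed += len;
++ *	}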
++ */ ++unsigned int skb_seq_read(unsigned int consumed, const u8 **data, ++ struct skb_seq_state *st) ++{ ++ unsigned int block_limit, abs_offset = consumed + st->lower_offset; ++ skb_frag_t *frag; ++ ++ if (unlikely(abs_offset >= st->upper_offset)) { ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ return 0; ++ } ++ ++next_skb: ++ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; ++ ++ if (abs_offset < block_limit && !st->frag_data) { ++ *data = st->cur_skb->data + (abs_offset - st->stepped_offset); ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_idx == 0 && !st->frag_data) ++ st->stepped_offset += skb_headlen(st->cur_skb); ++ ++ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { ++ unsigned int pg_idx, pg_off, pg_sz; ++ ++ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; ++ ++ pg_idx = 0; ++ pg_off = skb_frag_off(frag); ++ pg_sz = skb_frag_size(frag); ++ ++ if (skb_frag_must_loop(skb_frag_page(frag))) { ++ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; ++ pg_off = offset_in_page(pg_off + st->frag_off); ++ pg_sz = min_t(unsigned int, pg_sz - st->frag_off, ++ PAGE_SIZE - pg_off); ++ } ++ ++ block_limit = pg_sz + st->stepped_offset; ++ if (abs_offset < block_limit) { ++ if (!st->frag_data) ++ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); ++ ++ *data = (u8 *)st->frag_data + pg_off + ++ (abs_offset - st->stepped_offset); ++ ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ st->stepped_offset += pg_sz; ++ st->frag_off += pg_sz; ++ if (st->frag_off == skb_frag_size(frag)) { ++ st->frag_off = 0; ++ st->frag_idx++; ++ } ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { ++ st->cur_skb = skb_shinfo(st->root_skb)->frag_list; ++ st->frag_idx = 0; ++ goto next_skb; ++ } else if (st->cur_skb->next) { ++ st->cur_skb = st->cur_skb->next; ++ st->frag_idx = 0; ++ goto next_skb; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_seq_read); ++ ++/** ++ * skb_abort_seq_read - Abort a sequential read of skb data ++ * @st: state variable ++ * ++ * Must be called if skb_seq_read() was not called until it ++ * returned 0. ++ */ ++void skb_abort_seq_read(struct skb_seq_state *st) ++{ ++ if (st->frag_data) ++ kunmap_atomic(st->frag_data); ++} ++EXPORT_SYMBOL(skb_abort_seq_read); ++ ++#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) ++ ++static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, ++ struct ts_config *conf, ++ struct ts_state *state) ++{ ++ return skb_seq_read(offset, text, TS_SKB_CB(state)); ++} ++ ++static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) ++{ ++ skb_abort_seq_read(TS_SKB_CB(state)); ++} ++ ++/** ++ * skb_find_text - Find a text pattern in skb data ++ * @skb: the buffer to look in ++ * @from: search offset ++ * @to: search limit ++ * @config: textsearch configuration ++ * ++ * Finds a pattern in the skb data according to the specified ++ * textsearch configuration. Use textsearch_next() to retrieve ++ * subsequent occurrences of the pattern. Returns the offset ++ * to the first occurrence or UINT_MAX if no match was found. 
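++ *
++ * A brief sketch of one way to drive this (the pattern, its length and
++ * the "kmp" algorithm choice are only examples):
++ *
++ *	struct ts_config *conf;
++ *	unsigned int pos;
++ *
++ *	conf = textsearch_prepare("kmp", "needle", 6, GFP_KERNEL, TS_AUTOLOAD);
++ *	if (IS_ERR(conf))
++ *		return PTR_ERR(conf);
++ *	pos = skb_find_text(skb, 0, skb->len, conf);
++ *	textsearch_destroy(conf);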
++ */ ++unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct ts_config *config) ++{ ++ struct ts_state state; ++ unsigned int ret; ++ ++ BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); ++ ++ config->get_next_block = skb_ts_get_next_block; ++ config->finish = skb_ts_finish; ++ ++ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ++ ++ ret = textsearch_find(config, &state); ++ return (ret <= to - from ? ret : UINT_MAX); ++} ++EXPORT_SYMBOL(skb_find_text); ++ ++int skb_append_pagefrags(struct sk_buff *skb, struct page *page, ++ int offset, size_t size) ++{ ++ int i = skb_shinfo(skb)->nr_frags; ++ ++ if (skb_can_coalesce(skb, i, page, offset)) { ++ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); ++ } else if (i < MAX_SKB_FRAGS) { ++ skb_zcopy_downgrade_managed(skb); ++ get_page(page); ++ skb_fill_page_desc_noacc(skb, i, page, offset, size); ++ } else { ++ return -EMSGSIZE; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_append_pagefrags); ++ ++/** ++ * skb_pull_rcsum - pull skb and update receive checksum ++ * @skb: buffer to update ++ * @len: length of data pulled ++ * ++ * This function performs an skb_pull on the packet and updates ++ * the CHECKSUM_COMPLETE checksum. It should be used on ++ * receive path processing instead of skb_pull unless you know ++ * that the checksum difference is zero (e.g., a valid IP header) ++ * or you are setting ip_summed to CHECKSUM_NONE. ++ */ ++void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) ++{ ++ unsigned char *data = skb->data; ++ ++ BUG_ON(len > skb->len); ++ __skb_pull(skb, len); ++ skb_postpull_rcsum(skb, data, len); ++ return skb->data; ++} ++EXPORT_SYMBOL_GPL(skb_pull_rcsum); ++ ++static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) ++{ ++ skb_frag_t head_frag; ++ struct page *page; ++ ++ page = virt_to_head_page(frag_skb->head); ++ __skb_frag_set_page(&head_frag, page); ++ skb_frag_off_set(&head_frag, frag_skb->data - ++ (unsigned char *)page_address(page)); ++ skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); ++ return head_frag; ++} ++ ++struct sk_buff *skb_segment_list(struct sk_buff *skb, ++ netdev_features_t features, ++ unsigned int offset) ++{ ++ struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; ++ unsigned int tnl_hlen = skb_tnl_header_len(skb); ++ unsigned int delta_truesize = 0; ++ unsigned int delta_len = 0; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *nskb, *tmp; ++ int len_diff, err; ++ ++ skb_push(skb, -skb_network_offset(skb) + offset); ++ ++ skb_shinfo(skb)->frag_list = NULL; ++ ++ do { ++ nskb = list_skb; ++ list_skb = list_skb->next; ++ ++ err = 0; ++ delta_truesize += nskb->truesize; ++ if (skb_shared(nskb)) { ++ tmp = skb_clone(nskb, GFP_ATOMIC); ++ if (tmp) { ++ consume_skb(nskb); ++ nskb = tmp; ++ err = skb_unclone(nskb, GFP_ATOMIC); ++ } else { ++ err = -ENOMEM; ++ } ++ } ++ ++ if (!tail) ++ skb->next = nskb; ++ else ++ tail->next = nskb; ++ ++ if (unlikely(err)) { ++ nskb->next = list_skb; ++ goto err_linearize; ++ } ++ ++ tail = nskb; ++ ++ delta_len += nskb->len; ++ ++ skb_push(nskb, -skb_network_offset(nskb) + offset); ++ ++ skb_release_head_state(nskb); ++ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); ++ __copy_skb_header(nskb, skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); ++ nskb->transport_header += len_diff; ++ skb_copy_from_linear_data_offset(skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ offset + tnl_hlen); ++ ++ if 
(skb_needs_linearize(nskb, features) && ++ __skb_linearize(nskb)) ++ goto err_linearize; ++ ++ } while (list_skb); ++ ++ skb->truesize = skb->truesize - delta_truesize; ++ skb->data_len = skb->data_len - delta_len; ++ skb->len = skb->len - delta_len; ++ ++ skb_gso_reset(skb); ++ ++ skb->prev = tail; ++ ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto err_linearize; ++ ++ skb_get(skb); ++ ++ return skb; ++ ++err_linearize: ++ kfree_skb_list(skb->next); ++ skb->next = NULL; ++ return ERR_PTR(-ENOMEM); ++} ++EXPORT_SYMBOL_GPL(skb_segment_list); ++ ++/** ++ * skb_segment - Perform protocol segmentation on skb. ++ * @head_skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * ++ * This function performs segmentation on the given skb. It returns ++ * a pointer to the first in a list of new skbs for the segments. ++ * In case of error it returns ERR_PTR(err). ++ */ ++struct sk_buff *skb_segment(struct sk_buff *head_skb, ++ netdev_features_t features) ++{ ++ struct sk_buff *segs = NULL; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; ++ skb_frag_t *frag = skb_shinfo(head_skb)->frags; ++ unsigned int mss = skb_shinfo(head_skb)->gso_size; ++ unsigned int doffset = head_skb->data - skb_mac_header(head_skb); ++ struct sk_buff *frag_skb = head_skb; ++ unsigned int offset = doffset; ++ unsigned int tnl_hlen = skb_tnl_header_len(head_skb); ++ unsigned int partial_segs = 0; ++ unsigned int headroom; ++ unsigned int len = head_skb->len; ++ __be16 proto; ++ bool csum, sg; ++ int nfrags = skb_shinfo(head_skb)->nr_frags; ++ int err = -ENOMEM; ++ int i = 0; ++ int pos; ++ ++ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && ++ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { ++ struct sk_buff *check_skb; ++ ++ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { ++ if (skb_headlen(check_skb) && !check_skb->head_frag) { ++ /* gso_size is untrusted, and we have a frag_list with ++ * a linear non head_frag item. ++ * ++ * If head_skb's headlen does not fit requested gso_size, ++ * it means that the frag_list members do NOT terminate ++ * on exact gso_size boundaries. Hence we cannot perform ++ * skb_frag_t page sharing. Therefore we must fallback to ++ * copying the frag_list skbs; we do so by disabling SG. ++ */ ++ features &= ~NETIF_F_SG; ++ break; ++ } ++ } ++ } ++ ++ __skb_push(head_skb, doffset); ++ proto = skb_network_protocol(head_skb, NULL); ++ if (unlikely(!proto)) ++ return ERR_PTR(-EINVAL); ++ ++ sg = !!(features & NETIF_F_SG); ++ csum = !!can_checksum_protocol(features, proto); ++ ++ if (sg && csum && (mss != GSO_BY_FRAGS)) { ++ if (!(features & NETIF_F_GSO_PARTIAL)) { ++ struct sk_buff *iter; ++ unsigned int frag_len; ++ ++ if (!list_skb || ++ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) ++ goto normal; ++ ++ /* If we get here then all the required ++ * GSO features except frag_list are supported. ++ * Try to split the SKB to multiple GSO SKBs ++ * with no frag_list. ++ * Currently we can do that only when the buffers don't ++ * have a linear part and all the buffers except ++ * the last are of the same length. 
++ */ ++ frag_len = list_skb->len; ++ skb_walk_frags(head_skb, iter) { ++ if (frag_len != iter->len && iter->next) ++ goto normal; ++ if (skb_headlen(iter) && !iter->head_frag) ++ goto normal; ++ ++ len -= iter->len; ++ } ++ ++ if (len != frag_len) ++ goto normal; ++ } ++ ++ /* GSO partial only requires that we trim off any excess that ++ * doesn't fit into an MSS sized block, so take care of that ++ * now. ++ */ ++ partial_segs = len / mss; ++ if (partial_segs > 1) ++ mss *= partial_segs; ++ else ++ partial_segs = 0; ++ } ++ ++normal: ++ headroom = skb_headroom(head_skb); ++ pos = skb_headlen(head_skb); ++ ++ do { ++ struct sk_buff *nskb; ++ skb_frag_t *nskb_frag; ++ int hsize; ++ int size; ++ ++ if (unlikely(mss == GSO_BY_FRAGS)) { ++ len = list_skb->len; ++ } else { ++ len = head_skb->len - offset; ++ if (len > mss) ++ len = mss; ++ } ++ ++ hsize = skb_headlen(head_skb) - offset; ++ ++ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && ++ (skb_headlen(list_skb) == len || sg)) { ++ BUG_ON(skb_headlen(list_skb) > len); ++ ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ pos += skb_headlen(list_skb); ++ ++ while (pos < offset + len) { ++ BUG_ON(i >= nfrags); ++ ++ size = skb_frag_size(frag); ++ if (pos + size > offset + len) ++ break; ++ ++ i++; ++ pos += size; ++ frag++; ++ } ++ ++ nskb = skb_clone(list_skb, GFP_ATOMIC); ++ list_skb = list_skb->next; ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ if (unlikely(pskb_trim(nskb, len))) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ hsize = skb_end_offset(nskb); ++ if (skb_cow_head(nskb, doffset + headroom)) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ nskb->truesize += skb_end_offset(nskb) - hsize; ++ skb_release_head_state(nskb); ++ __skb_push(nskb, doffset); ++ } else { ++ if (hsize < 0) ++ hsize = 0; ++ if (hsize > len || !sg) ++ hsize = len; ++ ++ nskb = __alloc_skb(hsize + doffset + headroom, ++ GFP_ATOMIC, skb_alloc_rx_flag(head_skb), ++ NUMA_NO_NODE); ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, headroom); ++ __skb_put(nskb, doffset); ++ } ++ ++ if (segs) ++ tail->next = nskb; ++ else ++ segs = nskb; ++ tail = nskb; ++ ++ __copy_skb_header(nskb, head_skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); ++ skb_reset_mac_len(nskb); ++ ++ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ doffset + tnl_hlen); ++ ++ if (nskb->len == len + doffset) ++ goto perform_csum_check; ++ ++ if (!sg) { ++ if (!csum) { ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_copy_and_csum_bits(head_skb, offset, ++ skb_put(nskb, ++ len), ++ len); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } else { ++ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) ++ goto err; ++ } ++ continue; ++ } ++ ++ nskb_frag = skb_shinfo(nskb)->frags; ++ ++ skb_copy_from_linear_data_offset(head_skb, offset, ++ skb_put(nskb, hsize), hsize); ++ ++ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & ++ SKBFL_SHARED_FRAG; ++ ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) ++ goto err; ++ ++ while (pos < offset + len) { ++ if (i >= nfrags) { ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ if (!skb_headlen(list_skb)) { ++ BUG_ON(!nfrags); ++ } else { ++ BUG_ON(!list_skb->head_frag); ++ ++ /* to make room for head_frag. 
*/ ++ i--; ++ frag--; ++ } ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, ++ GFP_ATOMIC)) ++ goto err; ++ ++ list_skb = list_skb->next; ++ } ++ ++ if (unlikely(skb_shinfo(nskb)->nr_frags >= ++ MAX_SKB_FRAGS)) { ++ net_warn_ratelimited( ++ "skb_segment: too many frags: %u %u\n", ++ pos, mss); ++ err = -EINVAL; ++ goto err; ++ } ++ ++ *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; ++ __skb_frag_ref(nskb_frag); ++ size = skb_frag_size(nskb_frag); ++ ++ if (pos < offset) { ++ skb_frag_off_add(nskb_frag, offset - pos); ++ skb_frag_size_sub(nskb_frag, offset - pos); ++ } ++ ++ skb_shinfo(nskb)->nr_frags++; ++ ++ if (pos + size <= offset + len) { ++ i++; ++ frag++; ++ pos += size; ++ } else { ++ skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); ++ goto skip_fraglist; ++ } ++ ++ nskb_frag++; ++ } ++ ++skip_fraglist: ++ nskb->data_len = len - hsize; ++ nskb->len += nskb->data_len; ++ nskb->truesize += nskb->data_len; ++ ++perform_csum_check: ++ if (!csum) { ++ if (skb_has_shared_frag(nskb) && ++ __skb_linearize(nskb)) ++ goto err; ++ ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_checksum(nskb, doffset, ++ nskb->len - doffset, 0); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } ++ } while ((offset += len) < head_skb->len); ++ ++ /* Some callers want to get the end of the list. ++ * Put it in segs->prev to avoid walking the list. ++ * (see validate_xmit_skb_list() for example) ++ */ ++ segs->prev = tail; ++ ++ if (partial_segs) { ++ struct sk_buff *iter; ++ int type = skb_shinfo(head_skb)->gso_type; ++ unsigned short gso_size = skb_shinfo(head_skb)->gso_size; ++ ++ /* Update type to add partial and then remove dodgy if set */ ++ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; ++ type &= ~SKB_GSO_DODGY; ++ ++ /* Update GSO info and prepare to start updating headers on ++ * our way back down the stack of protocols. ++ */ ++ for (iter = segs; iter; iter = iter->next) { ++ skb_shinfo(iter)->gso_size = gso_size; ++ skb_shinfo(iter)->gso_segs = partial_segs; ++ skb_shinfo(iter)->gso_type = type; ++ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; ++ } ++ ++ if (tail->len - doffset <= gso_size) ++ skb_shinfo(tail)->gso_size = 0; ++ else if (tail != segs) ++ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); ++ } ++ ++ /* Following permits correct backpressure, for protocols ++ * using skb_set_owner_w(). ++ * Idea is to tranfert ownership from head_skb to last segment. 
++ */ ++ if (head_skb->destructor == sock_wfree) { ++ swap(tail->truesize, head_skb->truesize); ++ swap(tail->destructor, head_skb->destructor); ++ swap(tail->sk, head_skb->sk); ++ } ++ return segs; ++ ++err: ++ kfree_skb_list(segs); ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(skb_segment); ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++#define SKB_EXT_ALIGN_VALUE 8 ++#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) ++ ++static const u8 skb_ext_type_len[] = { ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), ++#endif ++#ifdef CONFIG_XFRM ++ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), ++#endif ++}; ++ ++static __always_inline unsigned int skb_ext_total_length(void) ++{ ++ return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ skb_ext_type_len[SKB_EXT_BRIDGE_NF] + ++#endif ++#ifdef CONFIG_XFRM ++ skb_ext_type_len[SKB_EXT_SEC_PATH] + ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ skb_ext_type_len[TC_SKB_EXT] + ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ skb_ext_type_len[SKB_EXT_MPTCP] + ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ skb_ext_type_len[SKB_EXT_MCTP] + ++#endif ++ 0; ++} ++ ++static void skb_extensions_init(void) ++{ ++ BUILD_BUG_ON(SKB_EXT_NUM >= 8); ++ BUILD_BUG_ON(skb_ext_total_length() > 255); ++ ++ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", ++ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++} ++#else ++static void skb_extensions_init(void) {} ++#endif ++ ++void __init skb_init(void) ++{ ++ skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", ++ sizeof(struct sk_buff), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ offsetof(struct sk_buff, cb), ++ sizeof_field(struct sk_buff, cb), ++ NULL); ++ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", ++ sizeof(struct sk_buff_fclones), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++ skb_extensions_init(); ++} ++ ++static int ++__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, ++ unsigned int recursion_level) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int elt = 0; ++ ++ if (unlikely(recursion_level >= 24)) ++ return -EMSGSIZE; ++ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ sg_set_buf(sg, skb->data + offset, copy); ++ elt++; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ sg_set_page(&sg[elt], skb_frag_page(frag), copy, ++ skb_frag_off(frag) + offset - start); ++ elt++; ++ if (!(len -= copy)) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end, ret; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) 
{ ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, ++ copy, recursion_level + 1); ++ if (unlikely(ret < 0)) ++ return ret; ++ elt += ret; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return elt; ++} ++ ++/** ++ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer ++ * @skb: Socket buffer containing the buffers to be mapped ++ * @sg: The scatter-gather list to map into ++ * @offset: The offset into the buffer's contents to start mapping ++ * @len: Length of buffer space to be mapped ++ * ++ * Fill the specified scatter-gather list with mappings/pointers into a ++ * region of the buffer space attached to a socket buffer. Returns either ++ * the number of scatterlist items used, or -EMSGSIZE if the contents ++ * could not fit. ++ */ ++int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) ++{ ++ int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); ++ ++ if (nsg <= 0) ++ return nsg; ++ ++ sg_mark_end(&sg[nsg - 1]); ++ ++ return nsg; ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec); ++ ++/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given ++ * sglist without mark the sg which contain last skb data as the end. ++ * So the caller can mannipulate sg list as will when padding new data after ++ * the first call without calling sg_unmark_end to expend sg list. ++ * ++ * Scenario to use skb_to_sgvec_nomark: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec_nomark(payload1) ++ * 3. skb_to_sgvec_nomark(payload2) ++ * ++ * This is equivalent to: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec(payload1) ++ * 3. sg_unmark_end ++ * 4. skb_to_sgvec(payload2) ++ * ++ * When mapping mutilple payload conditionally, skb_to_sgvec_nomark ++ * is more preferable. ++ */ ++int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, ++ int offset, int len) ++{ ++ return __skb_to_sgvec(skb, sg, offset, len, 0); ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); ++ ++ ++ ++/** ++ * skb_cow_data - Check that a socket buffer's data buffers are writable ++ * @skb: The socket buffer to check. ++ * @tailbits: Amount of trailing space to be added ++ * @trailer: Returned pointer to the skb where the @tailbits space begins ++ * ++ * Make sure that the data buffers attached to a socket buffer are ++ * writable. If they are not, private copies are made of the data buffers ++ * and the socket buffer is set to use these instead. ++ * ++ * If @tailbits is given, make sure that there is space to write @tailbits ++ * bytes of data beyond current end of socket buffer. @trailer will be ++ * set to point to the skb in which this space begins. ++ * ++ * The number of scatterlist elements required to completely map the ++ * COW'd and extended socket buffer will be returned. ++ */ ++int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) ++{ ++ int copyflag; ++ int elt; ++ struct sk_buff *skb1, **skb_p; ++ ++ /* If skb is cloned or its head is paged, reallocate ++ * head pulling out all the pages (pages are considered not writable ++ * at the moment even if they are anonymous). ++ */ ++ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && ++ !__pskb_pull_tail(skb, __skb_pagelen(skb))) ++ return -ENOMEM; ++ ++ /* Easy case. Most of packets will go this way. */ ++ if (!skb_has_frag_list(skb)) { ++ /* A little of trouble, not enough of space for trailer. 
++ * This should not happen, when stack is tuned to generate ++ * good frames. OK, on miss we reallocate and reserve even more ++ * space, 128 bytes is fair. */ ++ ++ if (skb_tailroom(skb) < tailbits && ++ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) ++ return -ENOMEM; ++ ++ /* Voila! */ ++ *trailer = skb; ++ return 1; ++ } ++ ++ /* Misery. We are in troubles, going to mincer fragments... */ ++ ++ elt = 1; ++ skb_p = &skb_shinfo(skb)->frag_list; ++ copyflag = 0; ++ ++ while ((skb1 = *skb_p) != NULL) { ++ int ntail = 0; ++ ++ /* The fragment is partially pulled by someone, ++ * this can happen on input. Copy it and everything ++ * after it. */ ++ ++ if (skb_shared(skb1)) ++ copyflag = 1; ++ ++ /* If the skb is the last, worry about trailer. */ ++ ++ if (skb1->next == NULL && tailbits) { ++ if (skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1) || ++ skb_tailroom(skb1) < tailbits) ++ ntail = tailbits + 128; ++ } ++ ++ if (copyflag || ++ skb_cloned(skb1) || ++ ntail || ++ skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1)) { ++ struct sk_buff *skb2; ++ ++ /* Fuck, we are miserable poor guys... */ ++ if (ntail == 0) ++ skb2 = skb_copy(skb1, GFP_ATOMIC); ++ else ++ skb2 = skb_copy_expand(skb1, ++ skb_headroom(skb1), ++ ntail, ++ GFP_ATOMIC); ++ if (unlikely(skb2 == NULL)) ++ return -ENOMEM; ++ ++ if (skb1->sk) ++ skb_set_owner_w(skb2, skb1->sk); ++ ++ /* Looking around. Are we still alive? ++ * OK, link new skb, drop old one */ ++ ++ skb2->next = skb1->next; ++ *skb_p = skb2; ++ kfree_skb(skb1); ++ skb1 = skb2; ++ } ++ elt++; ++ *trailer = skb1; ++ skb_p = &skb1->next; ++ } ++ ++ return elt; ++} ++EXPORT_SYMBOL_GPL(skb_cow_data); ++ ++static void sock_rmem_free(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ ++ atomic_sub(skb->truesize, &sk->sk_rmem_alloc); ++} ++ ++static void skb_set_err_queue(struct sk_buff *skb) ++{ ++ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. ++ * So, it is safe to (mis)use it to mark skbs on the error queue. 
++ */ ++ skb->pkt_type = PACKET_OUTGOING; ++ BUILD_BUG_ON(PACKET_OUTGOING == 0); ++} ++ ++/* ++ * Note: We dont mem charge error packets (no sk_forward_alloc changes) ++ */ ++int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= ++ (unsigned int)READ_ONCE(sk->sk_rcvbuf)) ++ return -ENOMEM; ++ ++ skb_orphan(skb); ++ skb->sk = sk; ++ skb->destructor = sock_rmem_free; ++ atomic_add(skb->truesize, &sk->sk_rmem_alloc); ++ skb_set_err_queue(skb); ++ ++ /* before exiting rcu section, make sure dst is refcounted */ ++ skb_dst_force(skb); ++ ++ skb_queue_tail(&sk->sk_error_queue, skb); ++ if (!sock_flag(sk, SOCK_DEAD)) ++ sk_error_report(sk); ++ return 0; ++} ++EXPORT_SYMBOL(sock_queue_err_skb); ++ ++static bool is_icmp_err_skb(const struct sk_buff *skb) ++{ ++ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || ++ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); ++} ++ ++struct sk_buff *sock_dequeue_err_skb(struct sock *sk) ++{ ++ struct sk_buff_head *q = &sk->sk_error_queue; ++ struct sk_buff *skb, *skb_next = NULL; ++ bool icmp_next = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ skb = __skb_dequeue(q); ++ if (skb && (skb_next = skb_peek(q))) { ++ icmp_next = is_icmp_err_skb(skb_next); ++ if (icmp_next) ++ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ if (is_icmp_err_skb(skb) && !icmp_next) ++ sk->sk_err = 0; ++ ++ if (skb_next) ++ sk_error_report(sk); ++ ++ return skb; ++} ++EXPORT_SYMBOL(sock_dequeue_err_skb); ++ ++/** ++ * skb_clone_sk - create clone of skb, and take reference to socket ++ * @skb: the skb to clone ++ * ++ * This function creates a clone of a buffer that holds a reference on ++ * sk_refcnt. Buffers created via this function are meant to be ++ * returned using sock_queue_err_skb, or free via kfree_skb. ++ * ++ * When passing buffers allocated with this function to sock_queue_err_skb ++ * it is necessary to wrap the call with sock_hold/sock_put in order to ++ * prevent the socket from being released prior to being enqueued on ++ * the sk_error_queue. ++ */ ++struct sk_buff *skb_clone_sk(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ struct sk_buff *clone; ++ ++ if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) ++ return NULL; ++ ++ clone = skb_clone(skb, GFP_ATOMIC); ++ if (!clone) { ++ sock_put(sk); ++ return NULL; ++ } ++ ++ clone->sk = sk; ++ clone->destructor = sock_efree; ++ ++ return clone; ++} ++EXPORT_SYMBOL(skb_clone_sk); ++ ++static void __skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct sock *sk, ++ int tstype, ++ bool opt_stats) ++{ ++ struct sock_exterr_skb *serr; ++ int err; ++ ++ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; ++ serr->ee.ee_info = tstype; ++ serr->opt_stats = opt_stats; ++ serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; ++ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { ++ serr->ee.ee_data = skb_shinfo(skb)->tskey; ++ if (sk_is_tcp(sk)) ++ serr->ee.ee_data -= atomic_read(&sk->sk_tskey); ++ } ++ ++ err = sock_queue_err_skb(sk, skb); ++ ++ if (err) ++ kfree_skb(skb); ++} ++ ++static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) ++{ ++ bool ret; ++ ++ if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) ++ return true; ++ ++ read_lock_bh(&sk->sk_callback_lock); ++ ret = sk->sk_socket && sk->sk_socket->file && ++ file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); ++ read_unlock_bh(&sk->sk_callback_lock); ++ return ret; ++} ++ ++void skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ struct sock *sk = skb->sk; ++ ++ if (!skb_may_tx_timestamp(sk, false)) ++ goto err; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. ++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ *skb_hwtstamps(skb) = *hwtstamps; ++ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); ++ sock_put(sk); ++ return; ++ } ++ ++err: ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); ++ ++void __skb_tstamp_tx(struct sk_buff *orig_skb, ++ const struct sk_buff *ack_skb, ++ struct skb_shared_hwtstamps *hwtstamps, ++ struct sock *sk, int tstype) ++{ ++ struct sk_buff *skb; ++ bool tsonly, opt_stats = false; ++ ++ if (!sk) ++ return; ++ ++ if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && ++ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) ++ return; ++ ++ tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; ++ if (!skb_may_tx_timestamp(sk, tsonly)) ++ return; ++ ++ if (tsonly) { ++#ifdef CONFIG_INET ++ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && ++ sk_is_tcp(sk)) { ++ skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ++ ack_skb); ++ opt_stats = true; ++ } else ++#endif ++ skb = alloc_skb(0, GFP_ATOMIC); ++ } else { ++ skb = skb_clone(orig_skb, GFP_ATOMIC); ++ } ++ if (!skb) ++ return; ++ ++ if (tsonly) { ++ skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & ++ SKBTX_ANY_TSTAMP; ++ skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; ++ } ++ ++ if (hwtstamps) ++ *skb_hwtstamps(skb) = *hwtstamps; ++ else ++ __net_timestamp(skb); ++ ++ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); ++} ++EXPORT_SYMBOL_GPL(__skb_tstamp_tx); ++ ++void skb_tstamp_tx(struct sk_buff *orig_skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, ++ SCM_TSTAMP_SND); ++} ++EXPORT_SYMBOL_GPL(skb_tstamp_tx); ++ ++void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) ++{ ++ struct sock *sk = skb->sk; ++ struct sock_exterr_skb *serr; ++ int err = 1; ++ ++ skb->wifi_acked_valid = 1; ++ skb->wifi_acked = acked; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. 
++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ err = sock_queue_err_skb(sk, skb); ++ sock_put(sk); ++ } ++ if (err) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); ++ ++/** ++ * skb_partial_csum_set - set up and verify partial csum values for packet ++ * @skb: the skb to set ++ * @start: the number of bytes after skb->data to start checksumming. ++ * @off: the offset from start to place the checksum. ++ * ++ * For untrusted partially-checksummed packets, we need to make sure the values ++ * for skb->csum_start and skb->csum_offset are valid so we don't oops. ++ * ++ * This function checks and sets those values and skb->ip_summed: if this ++ * returns false you should drop the packet. ++ */ ++bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) ++{ ++ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); ++ u32 csum_start = skb_headroom(skb) + (u32)start; ++ ++ if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { ++ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", ++ start, off, skb_headroom(skb), skb_headlen(skb)); ++ return false; ++ } ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum_start = csum_start; ++ skb->csum_offset = off; ++ skb_set_transport_header(skb, start); ++ return true; ++} ++EXPORT_SYMBOL_GPL(skb_partial_csum_set); ++ ++static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, ++ unsigned int max) ++{ ++ if (skb_headlen(skb) >= len) ++ return 0; ++ ++ /* If we need to pullup then pullup to the max, so we ++ * won't need to do it again. ++ */ ++ if (max > skb->len) ++ max = skb->len; ++ ++ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) ++ return -ENOMEM; ++ ++ if (skb_headlen(skb) < len) ++ return -EPROTO; ++ ++ return 0; ++} ++ ++#define MAX_TCP_HDR_LEN (15 * 4) ++ ++static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, ++ typeof(IPPROTO_IP) proto, ++ unsigned int off) ++{ ++ int err; ++ ++ switch (proto) { ++ case IPPROTO_TCP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), ++ off + MAX_TCP_HDR_LEN); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct tcphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; ++ ++ case IPPROTO_UDP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), ++ off + sizeof(struct udphdr)); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct udphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &udp_hdr(skb)->check; ++ } ++ ++ return ERR_PTR(-EPROTO); ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * maximally sized IP and TCP or UDP headers. 
++ */ ++#define MAX_IP_HDR_LEN 128 ++ ++static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) ++{ ++ unsigned int off; ++ bool fragment; ++ __sum16 *csum; ++ int err; ++ ++ fragment = false; ++ ++ err = skb_maybe_pull_tail(skb, ++ sizeof(struct iphdr), ++ MAX_IP_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ if (ip_is_fragment(ip_hdr(skb))) ++ fragment = true; ++ ++ off = ip_hdrlen(skb); ++ ++ err = -EPROTO; ++ ++ if (fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ++ ip_hdr(skb)->daddr, ++ skb->len - off, ++ ip_hdr(skb)->protocol, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * an IPv6 header, all options, and a maximal TCP or UDP header. ++ */ ++#define MAX_IPV6_HDR_LEN 256 ++ ++#define OPT_HDR(type, skb, off) \ ++ (type *)(skb_network_header(skb) + (off)) ++ ++static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ u8 nexthdr; ++ unsigned int off; ++ unsigned int len; ++ bool fragment; ++ bool done; ++ __sum16 *csum; ++ ++ fragment = false; ++ done = false; ++ ++ off = sizeof(struct ipv6hdr); ++ ++ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ nexthdr = ipv6_hdr(skb)->nexthdr; ++ ++ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); ++ while (off <= len && !done) { ++ switch (nexthdr) { ++ case IPPROTO_DSTOPTS: ++ case IPPROTO_HOPOPTS: ++ case IPPROTO_ROUTING: { ++ struct ipv6_opt_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ipv6_opt_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_optlen(hp); ++ break; ++ } ++ case IPPROTO_AH: { ++ struct ip_auth_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ip_auth_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ip_auth_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_authlen(hp); ++ break; ++ } ++ case IPPROTO_FRAGMENT: { ++ struct frag_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct frag_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct frag_hdr, skb, off); ++ ++ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) ++ fragment = true; ++ ++ nexthdr = hp->nexthdr; ++ off += sizeof(struct frag_hdr); ++ break; ++ } ++ default: ++ done = true; ++ break; ++ } ++ } ++ ++ err = -EPROTO; ++ ++ if (!done || fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, nexthdr, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, ++ &ipv6_hdr(skb)->daddr, ++ skb->len - off, nexthdr, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/** ++ * skb_checksum_setup - set up partial checksum offset ++ * @skb: the skb to set up ++ * @recalculate: if true the pseudo-header checksum will be recalculated ++ */ ++int skb_checksum_setup(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ ++ switch (skb->protocol) { ++ case htons(ETH_P_IP): ++ err = skb_checksum_setup_ipv4(skb, recalculate); ++ break; ++ ++ case htons(ETH_P_IPV6): ++ err = skb_checksum_setup_ipv6(skb, recalculate); ++ break; ++ ++ default: ++ err = -EPROTO; ++ break; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(skb_checksum_setup); ++ 
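For context, the skb_checksum_setup() helper carried verbatim in the hunk above is the routine paravirtualised receive paths use to sanity-check CHECKSUM_PARTIAL packets handed over by a frontend. A minimal usage sketch follows; the driver and function names are hypothetical and only skb_checksum_setup(), CHECKSUM_PARTIAL and net_warn_ratelimited() come from the kernel code in this patch:

/* Hypothetical caller, illustrative only -- not part of the patch above. */
static int demo_netfront_fixup_csum(struct sk_buff *skb)
{
	int err;

	/* Only partially checksummed packets need fixing up;
	 * skb_checksum_setup() validates csum_start/csum_offset and,
	 * with recalculate == true, rebuilds the pseudo-header checksum.
	 */
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	err = skb_checksum_setup(skb, true);
	if (err)
		net_warn_ratelimited("demo: skb_checksum_setup failed: %d\n",
				     err);
	return err;
}

Passing recalculate == false would skip rebuilding the pseudo-header checksum and only validate the offsets, which is the cheaper option when the caller trusts the checksum value it received.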
++/** ++ * skb_checksum_maybe_trim - maybe trims the given skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * ++ * Checks whether the given skb has data beyond the given transport length. ++ * If so, returns a cloned skb trimmed to this transport length. ++ * Otherwise returns the provided skb. Returns NULL in error cases ++ * (e.g. transport_len exceeds skb length or out-of-memory). ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, ++ unsigned int transport_len) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int len = skb_transport_offset(skb) + transport_len; ++ int ret; ++ ++ if (skb->len < len) ++ return NULL; ++ else if (skb->len == len) ++ return skb; ++ ++ skb_chk = skb_clone(skb, GFP_ATOMIC); ++ if (!skb_chk) ++ return NULL; ++ ++ ret = pskb_trim_rcsum(skb_chk, len); ++ if (ret) { ++ kfree_skb(skb_chk); ++ return NULL; ++ } ++ ++ return skb_chk; ++} ++ ++/** ++ * skb_checksum_trimmed - validate checksum of an skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * @skb_chkf: checksum function to use ++ * ++ * Applies the given checksum function skb_chkf to the provided skb. ++ * Returns a checked and maybe trimmed skb. Returns NULL on error. ++ * ++ * If the skb has data beyond the given transport length, then a ++ * trimmed & cloned skb is checked and returned. ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, ++ unsigned int transport_len, ++ __sum16(*skb_chkf)(struct sk_buff *skb)) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int offset = skb_transport_offset(skb); ++ __sum16 ret; ++ ++ skb_chk = skb_checksum_maybe_trim(skb, transport_len); ++ if (!skb_chk) ++ goto err; ++ ++ if (!pskb_may_pull(skb_chk, offset)) ++ goto err; ++ ++ skb_pull_rcsum(skb_chk, offset); ++ ret = skb_chkf(skb_chk); ++ skb_push_rcsum(skb_chk, offset); ++ ++ if (ret) ++ goto err; ++ ++ return skb_chk; ++ ++err: ++ if (skb_chk && skb_chk != skb) ++ kfree_skb(skb_chk); ++ ++ return NULL; ++ ++} ++EXPORT_SYMBOL(skb_checksum_trimmed); ++ ++void __skb_warn_lro_forwarding(const struct sk_buff *skb) ++{ ++ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", ++ skb->dev->name); ++} ++EXPORT_SYMBOL(__skb_warn_lro_forwarding); ++ ++void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) ++{ ++ if (head_stolen) { ++ skb_release_head_state(skb); ++ kmem_cache_free(skbuff_head_cache, skb); ++ } else { ++ __kfree_skb(skb); ++ } ++} ++EXPORT_SYMBOL(kfree_skb_partial); ++ ++/** ++ * skb_try_coalesce - try to merge skb to prior one ++ * @to: prior buffer ++ * @from: buffer to add ++ * @fragstolen: pointer to boolean ++ * @delta_truesize: how much more was allocated than was requested ++ */ ++bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, ++ bool *fragstolen, int *delta_truesize) ++{ ++ struct skb_shared_info *to_shinfo, *from_shinfo; ++ int i, delta, len = from->len; ++ ++ *fragstolen = false; ++ ++ if (skb_cloned(to)) ++ return false; ++ ++ /* In general, avoid mixing slab allocated and page_pool allocated ++ * pages within the same SKB. However when @to is not pp_recycle and ++ * @from is cloned, we can transition frag pages from page_pool to ++ * reference counted. 
++ * ++ * On the other hand, don't allow coalescing two pp_recycle SKBs if ++ * @from is cloned, in case the SKB is using page_pool fragment ++ * references (PP_FLAG_PAGE_FRAG). Since we only take full page ++ * references for cloned SKBs at the moment that would result in ++ * inconsistent reference counts. ++ */ ++ if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) ++ return false; ++ ++ if (len <= skb_tailroom(to)) { ++ if (len) ++ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); ++ *delta_truesize = 0; ++ return true; ++ } ++ ++ to_shinfo = skb_shinfo(to); ++ from_shinfo = skb_shinfo(from); ++ if (to_shinfo->frag_list || from_shinfo->frag_list) ++ return false; ++ if (skb_zcopy(to) || skb_zcopy(from)) ++ return false; ++ ++ if (skb_headlen(from) != 0) { ++ struct page *page; ++ unsigned int offset; ++ ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags >= MAX_SKB_FRAGS) ++ return false; ++ ++ if (skb_head_is_locked(from)) ++ return false; ++ ++ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ ++ skb_fill_page_desc(to, to_shinfo->nr_frags, ++ page, offset, skb_headlen(from)); ++ *fragstolen = true; ++ } else { ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags > MAX_SKB_FRAGS) ++ return false; ++ ++ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); ++ } ++ ++ WARN_ON_ONCE(delta < len); ++ ++ memcpy(to_shinfo->frags + to_shinfo->nr_frags, ++ from_shinfo->frags, ++ from_shinfo->nr_frags * sizeof(skb_frag_t)); ++ to_shinfo->nr_frags += from_shinfo->nr_frags; ++ ++ if (!skb_cloned(from)) ++ from_shinfo->nr_frags = 0; ++ ++ /* if the skb is not cloned this does nothing ++ * since we set nr_frags to 0. ++ */ ++ for (i = 0; i < from_shinfo->nr_frags; i++) ++ __skb_frag_ref(&from_shinfo->frags[i]); ++ ++ to->truesize += delta; ++ to->len += len; ++ to->data_len += len; ++ ++ *delta_truesize = delta; ++ return true; ++} ++EXPORT_SYMBOL(skb_try_coalesce); ++ ++/** ++ * skb_scrub_packet - scrub an skb ++ * ++ * @skb: buffer to clean ++ * @xnet: packet is crossing netns ++ * ++ * skb_scrub_packet can be used after encapsulating or decapsulting a packet ++ * into/from a tunnel. Some information have to be cleared during these ++ * operations. ++ * skb_scrub_packet can also be used to clean a skb before injecting it in ++ * another namespace (@xnet == true). We have to clear all information in the ++ * skb that could impact namespace isolation. ++ */ ++void skb_scrub_packet(struct sk_buff *skb, bool xnet) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->skb_iif = 0; ++ skb->ignore_df = 0; ++ skb_dst_drop(skb); ++ skb_ext_reset(skb); ++ nf_reset_ct(skb); ++ nf_reset_trace(skb); ++ ++#ifdef CONFIG_NET_SWITCHDEV ++ skb->offload_fwd_mark = 0; ++ skb->offload_l3_fwd_mark = 0; ++#endif ++ ++ if (!xnet) ++ return; ++ ++ ipvs_reset(skb); ++ skb->mark = 0; ++ skb_clear_tstamp(skb); ++} ++EXPORT_SYMBOL_GPL(skb_scrub_packet); ++ ++/** ++ * skb_gso_transport_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_transport_seglen is used to determine the real size of the ++ * individual segments, including Layer4 headers (TCP/UDP). ++ * ++ * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
++ */ ++static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ unsigned int thlen = 0; ++ ++ if (skb->encapsulation) { ++ thlen = skb_inner_transport_header(skb) - ++ skb_transport_header(skb); ++ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) ++ thlen += inner_tcp_hdrlen(skb); ++ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ thlen = tcp_hdrlen(skb); ++ } else if (unlikely(skb_is_gso_sctp(skb))) { ++ thlen = sizeof(struct sctphdr); ++ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { ++ thlen = sizeof(struct udphdr); ++ } ++ /* UFO sets gso_size to the size of the fragmentation ++ * payload, i.e. the size of the L4 (UDP) header is already ++ * accounted for. ++ */ ++ return thlen + shinfo->gso_size; ++} ++ ++/** ++ * skb_gso_network_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_network_seglen is used to determine the real size of the ++ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). ++ * ++ * The MAC/L2 header is not accounted for. ++ */ ++static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - ++ skb_network_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_mac_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_mac_seglen is used to determine the real size of the ++ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 ++ * headers (TCP/UDP). ++ */ ++static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS ++ * ++ * There are a couple of instances where we have a GSO skb, and we ++ * want to determine what size it would be after it is segmented. ++ * ++ * We might want to check: ++ * - L3+L4+payload size (e.g. IP forwarding) ++ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) ++ * ++ * This is a helper to do that correctly considering GSO_BY_FRAGS. ++ * ++ * @skb: GSO skb ++ * ++ * @seg_len: The segmented length (from skb_gso_*_seglen). In the ++ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. ++ * ++ * @max_len: The maximum permissible length. ++ * ++ * Returns true if the segmented length <= max length. ++ */ ++static inline bool skb_gso_size_check(const struct sk_buff *skb, ++ unsigned int seg_len, ++ unsigned int max_len) { ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ const struct sk_buff *iter; ++ ++ if (shinfo->gso_size != GSO_BY_FRAGS) ++ return seg_len <= max_len; ++ ++ /* Undo this so we can re-use header sizes */ ++ seg_len -= GSO_BY_FRAGS; ++ ++ skb_walk_frags(skb, iter) { ++ if (seg_len + skb_headlen(iter) > max_len) ++ return false; ++ } ++ ++ return true; ++} ++ ++/** ++ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? ++ * ++ * @skb: GSO skb ++ * @mtu: MTU to validate against ++ * ++ * skb_gso_validate_network_len validates if a given skb will fit a ++ * wanted MTU once split. It considers L3 headers, L4 headers, and the ++ * payload. 
++ */ ++bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) ++{ ++ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); ++ ++/** ++ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? ++ * ++ * @skb: GSO skb ++ * @len: length to validate against ++ * ++ * skb_gso_validate_mac_len validates if a given skb will fit a wanted ++ * length once split, including L2, L3 and L4 headers and the payload. ++ */ ++bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) ++{ ++ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); ++ ++static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) ++{ ++ int mac_len, meta_len; ++ void *meta; ++ ++ if (skb_cow(skb, skb_headroom(skb)) < 0) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ mac_len = skb->data - skb_mac_header(skb); ++ if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { ++ memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), ++ mac_len - VLAN_HLEN - ETH_TLEN); ++ } ++ ++ meta_len = skb_metadata_len(skb); ++ if (meta_len) { ++ meta = skb_metadata_end(skb) - meta_len; ++ memmove(meta + VLAN_HLEN, meta, meta_len); ++ } ++ ++ skb->mac_header += VLAN_HLEN; ++ return skb; ++} ++ ++struct sk_buff *skb_vlan_untag(struct sk_buff *skb) ++{ ++ struct vlan_hdr *vhdr; ++ u16 vlan_tci; ++ ++ if (unlikely(skb_vlan_tag_present(skb))) { ++ /* vlan_tci is already set-up so leave this for another time */ ++ return skb; ++ } ++ ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ goto err_free; ++ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ ++ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) ++ goto err_free; ++ ++ vhdr = (struct vlan_hdr *)skb->data; ++ vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); ++ ++ skb_pull_rcsum(skb, VLAN_HLEN); ++ vlan_set_encap_proto(skb, vhdr); ++ ++ skb = skb_reorder_vlan_header(skb); ++ if (unlikely(!skb)) ++ goto err_free; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return skb; ++ ++err_free: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_vlan_untag); ++ ++int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) ++{ ++ if (!pskb_may_pull(skb, write_len)) ++ return -ENOMEM; ++ ++ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) ++ return 0; ++ ++ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); ++} ++EXPORT_SYMBOL(skb_ensure_writable); ++ ++/* remove VLAN header from packet and update csum accordingly. 
++ * expects a non skb_vlan_tag_present skb with a vlan tag payload ++ */ ++int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) ++{ ++ struct vlan_hdr *vhdr; ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = skb_ensure_writable(skb, VLAN_ETH_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ ++ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); ++ *vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ ++ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); ++ __skb_pull(skb, VLAN_HLEN); ++ ++ vlan_set_encap_proto(skb, vhdr); ++ skb->mac_header += VLAN_HLEN; ++ ++ if (skb_network_offset(skb) < ETH_HLEN) ++ skb_set_network_header(skb, ETH_HLEN); ++ ++ skb_reset_mac_len(skb); ++ ++ return err; ++} ++EXPORT_SYMBOL(__skb_vlan_pop); ++ ++/* Pop a vlan tag either from hwaccel or from payload. ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_pop(struct sk_buff *skb) ++{ ++ u16 vlan_tci; ++ __be16 vlan_proto; ++ int err; ++ ++ if (likely(skb_vlan_tag_present(skb))) { ++ __vlan_hwaccel_clear_tag(skb); ++ } else { ++ if (unlikely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (err) ++ return err; ++ } ++ /* move next vlan tag to hw accel tag */ ++ if (likely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ vlan_proto = skb->protocol; ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (unlikely(err)) ++ return err; ++ ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_pop); ++ ++/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) ++{ ++ if (skb_vlan_tag_present(skb)) { ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = __vlan_insert_tag(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (err) ++ return err; ++ ++ skb->protocol = skb->vlan_proto; ++ skb->mac_len += VLAN_HLEN; ++ ++ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ } ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_push); ++ ++/** ++ * skb_eth_pop() - Drop the Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * ++ * Drop the Ethernet header of @skb. ++ * ++ * Expects that skb->data points to the mac header and that no VLAN tags are ++ * present. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_eth_pop(struct sk_buff *skb) ++{ ++ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || ++ skb_network_offset(skb) < ETH_HLEN) ++ return -EPROTO; ++ ++ skb_pull_rcsum(skb, ETH_HLEN); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_pop); ++ ++/** ++ * skb_eth_push() - Add a new Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * @dst: Destination MAC address of the new header ++ * @src: Source MAC address of the new header ++ * ++ * Prepend @skb with a new Ethernet header. ++ * ++ * Expects that skb->data points to the mac header, which must be empty. ++ * ++ * Returns 0 on success, -errno otherwise. 
++ */ ++int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, ++ const unsigned char *src) ++{ ++ struct ethhdr *eth; ++ int err; ++ ++ if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) ++ return -EPROTO; ++ ++ err = skb_cow_head(skb, sizeof(*eth)); ++ if (err < 0) ++ return err; ++ ++ skb_push(skb, sizeof(*eth)); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ eth = eth_hdr(skb); ++ ether_addr_copy(eth->h_dest, dst); ++ ether_addr_copy(eth->h_source, src); ++ eth->h_proto = skb->protocol; ++ ++ skb_postpush_rcsum(skb, eth, sizeof(*eth)); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_push); ++ ++/* Update the ethertype of hdr and the skb csum value if required. */ ++static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, ++ __be16 ethertype) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be16 diff[] = { ~hdr->h_proto, ethertype }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ hdr->h_proto = ethertype; ++} ++ ++/** ++ * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of ++ * the packet ++ * ++ * @skb: buffer ++ * @mpls_lse: MPLS label stack entry to push ++ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is ++ * ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, ++ int mac_len, bool ethernet) ++{ ++ struct mpls_shim_hdr *lse; ++ int err; ++ ++ if (unlikely(!eth_p_mpls(mpls_proto))) ++ return -EINVAL; ++ ++ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ ++ if (skb->encapsulation) ++ return -EINVAL; ++ ++ err = skb_cow_head(skb, MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (!skb->inner_protocol) { ++ skb_set_inner_network_header(skb, skb_network_offset(skb)); ++ skb_set_inner_protocol(skb, skb->protocol); ++ } ++ ++ skb_push(skb, MPLS_HLEN); ++ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ skb_reset_mac_len(skb); ++ ++ lse = mpls_hdr(skb); ++ lse->label_stack_entry = mpls_lse; ++ skb_postpush_rcsum(skb, lse, MPLS_HLEN); ++ ++ if (ethernet && mac_len >= ETH_HLEN) ++ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); ++ skb->protocol = mpls_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_push); ++ ++/** ++ * skb_mpls_pop() - pop the outermost MPLS header ++ * ++ * @skb: buffer ++ * @next_proto: ethertype of header after popped MPLS header ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the packet is ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, ++ bool ethernet) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return 0; ++ ++ err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); ++ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ ++ __skb_pull(skb, MPLS_HLEN); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ ++ if (ethernet && mac_len >= ETH_HLEN) { ++ struct ethhdr *hdr; ++ ++ /* use mpls_hdr() to get ethertype to account for VLANs. 
*/ ++ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); ++ skb_mod_eth_type(skb, hdr, next_proto); ++ } ++ skb->protocol = next_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_pop); ++ ++/** ++ * skb_mpls_update_lse() - modify outermost MPLS header and update csum ++ * ++ * @skb: buffer ++ * @mpls_lse: new MPLS label stack entry to update to ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ mpls_hdr(skb)->label_stack_entry = mpls_lse; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_update_lse); ++ ++/** ++ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header ++ * ++ * @skb: buffer ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_dec_ttl(struct sk_buff *skb) ++{ ++ u32 lse; ++ u8 ttl; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) ++ return -ENOMEM; ++ ++ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ++ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; ++ if (!--ttl) ++ return -EINVAL; ++ ++ lse &= ~MPLS_LS_TTL_MASK; ++ lse |= ttl << MPLS_LS_TTL_SHIFT; ++ ++ return skb_mpls_update_lse(skb, cpu_to_be32(lse)); ++} ++EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); ++ ++/** ++ * alloc_skb_with_frags - allocate skb with page frags ++ * ++ * @header_len: size of linear part ++ * @data_len: needed length in frags ++ * @max_page_order: max page order desired. ++ * @errcode: pointer to error code if any ++ * @gfp_mask: allocation mask ++ * ++ * This can be used to allocate a paged skb, given a maximal order for frags. ++ */ ++struct sk_buff *alloc_skb_with_frags(unsigned long header_len, ++ unsigned long data_len, ++ int max_page_order, ++ int *errcode, ++ gfp_t gfp_mask) ++{ ++ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; ++ unsigned long chunk; ++ struct sk_buff *skb; ++ struct page *page; ++ int i; ++ ++ *errcode = -EMSGSIZE; ++ /* Note this test could be relaxed, if we succeed to allocate ++ * high order pages... 
++ */ ++ if (npages > MAX_SKB_FRAGS) ++ return NULL; ++ ++ *errcode = -ENOBUFS; ++ skb = alloc_skb(header_len, gfp_mask); ++ if (!skb) ++ return NULL; ++ ++ skb->truesize += npages << PAGE_SHIFT; ++ ++ for (i = 0; npages > 0; i++) { ++ int order = max_page_order; ++ ++ while (order) { ++ if (npages >= 1 << order) { ++ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | ++ __GFP_COMP | ++ __GFP_NOWARN, ++ order); ++ if (page) ++ goto fill_page; ++ /* Do not retry other high order allocations */ ++ order = 1; ++ max_page_order = 0; ++ } ++ order--; ++ } ++ page = alloc_page(gfp_mask); ++ if (!page) ++ goto failure; ++fill_page: ++ chunk = min_t(unsigned long, data_len, ++ PAGE_SIZE << order); ++ skb_fill_page_desc(skb, i, page, 0, chunk); ++ data_len -= chunk; ++ npages -= 1 << order; ++ } ++ return skb; ++ ++failure: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_skb_with_frags); ++ ++/* carve out the first off bytes from skb when off < headlen */ ++static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, ++ const int headlen, gfp_t gfp_mask) ++{ ++ int i; ++ int size = skb_end_offset(skb); ++ int new_hlen = headlen - off; ++ u8 *data; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy real data, and all frags */ ++ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); ++ skb->len -= off; ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, ++ frags[skb_shinfo(skb)->nr_frags])); ++ if (skb_cloned(skb)) { ++ /* drop the old head gracefully */ ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ skb_release_data(skb); ++ } else { ++ /* we can reuse existing recount- all we did was ++ * relocate values ++ */ ++ skb_free_head(skb); ++ } ++ ++ skb->head = data; ++ skb->data = data; ++ skb->head_frag = 0; ++ skb_set_end_offset(skb, size); ++ skb_set_tail_pointer(skb, skb_headlen(skb)); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ return 0; ++} ++ ++static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); ++ ++/* carve out the first eat bytes from skb's frag_list. May recurse into ++ * pskb_carve() ++ */ ++static int pskb_carve_frag_list(struct sk_buff *skb, ++ struct skb_shared_info *shinfo, int eat, ++ gfp_t gfp_mask) ++{ ++ struct sk_buff *list = shinfo->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (!list) { ++ pr_err("Not enough bytes to eat. Want %d\n", eat); ++ return -EFAULT; ++ } ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ if (skb_shared(list)) { ++ clone = skb_clone(list, gfp_mask); ++ if (!clone) ++ return -ENOMEM; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without problems. */ ++ insp = list; ++ } ++ if (pskb_carve(list, eat, gfp_mask) < 0) { ++ kfree_skb(clone); ++ return -ENOMEM; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. 
*/ ++ while ((list = shinfo->frag_list) != insp) { ++ shinfo->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ shinfo->frag_list = clone; ++ } ++ return 0; ++} ++ ++/* carve off first len bytes from skb. Split line (off) is in the ++ * non-linear part of skb ++ */ ++static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, ++ int pos, gfp_t gfp_mask) ++{ ++ int i, k = 0; ++ int size = skb_end_offset(skb); ++ u8 *data; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ struct skb_shared_info *shinfo; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ shinfo = (struct skb_shared_info *)(data + size); ++ for (i = 0; i < nfrags; i++) { ++ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + fsize > off) { ++ shinfo->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < off) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_off_add(&shinfo->frags[0], off - pos); ++ skb_frag_size_sub(&shinfo->frags[0], off - pos); ++ } ++ skb_frag_ref(skb, i); ++ k++; ++ } ++ pos += fsize; ++ } ++ shinfo->nr_frags = k; ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ /* split line is in frag list */ ++ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { ++ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ ++ if (skb_has_frag_list(skb)) ++ kfree_skb_list(skb_shinfo(skb)->frag_list); ++ kfree(data); ++ return -ENOMEM; ++ } ++ skb_release_data(skb); ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data = data; ++ skb_set_end_offset(skb, size); ++ skb_reset_tail_pointer(skb); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ skb->len -= off; ++ skb->data_len = skb->len; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ return 0; ++} ++ ++/* remove len bytes from the beginning of the skb */ ++static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) ++{ ++ int headlen = skb_headlen(skb); ++ ++ if (len < headlen) ++ return pskb_carve_inside_header(skb, len, headlen, gfp); ++ else ++ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); ++} ++ ++/* Extract to_copy bytes starting at off from skb, and return this in ++ * a new skb ++ */ ++struct sk_buff *pskb_extract(struct sk_buff *skb, int off, ++ int to_copy, gfp_t gfp) ++{ ++ struct sk_buff *clone = skb_clone(skb, gfp); ++ ++ if (!clone) ++ return NULL; ++ ++ if (pskb_carve(clone, off, gfp) < 0 || ++ pskb_trim(clone, to_copy)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ return clone; ++} ++EXPORT_SYMBOL(pskb_extract); ++ ++/** ++ * skb_condense - try to get rid of fragments/frag_list if possible ++ * @skb: buffer ++ * ++ * Can be used to save memory before skb is added to a busy queue. 
++ * If packet has bytes in frags and enough tail room in skb->head, ++ * pull all of them, so that we can free the frags right now and adjust ++ * truesize. ++ * Notes: ++ * We do not reallocate skb->head thus can not fail. ++ * Caller must re-evaluate skb->truesize if needed. ++ */ ++void skb_condense(struct sk_buff *skb) ++{ ++ if (skb->data_len) { ++ if (skb->data_len > skb->end - skb->tail || ++ skb_cloned(skb)) ++ return; ++ ++ /* Nice, we can free page frag(s) right now */ ++ __pskb_pull_tail(skb, skb->data_len); ++ } ++ /* At this point, skb->truesize might be over estimated, ++ * because skb had a fragment, and fragments do not tell ++ * their truesize. ++ * When we pulled its content into skb->head, fragment ++ * was freed, but __pskb_pull_tail() could not possibly ++ * adjust skb->truesize, not knowing the frag truesize. ++ */ ++ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); ++} ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) ++{ ++ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); ++} ++ ++/** ++ * __skb_ext_alloc - allocate a new skb extensions storage ++ * ++ * @flags: See kmalloc(). ++ * ++ * Returns the newly allocated pointer. The pointer can later attached to a ++ * skb via __skb_ext_set(). ++ * Note: caller must handle the skb_ext as an opaque data. ++ */ ++struct skb_ext *__skb_ext_alloc(gfp_t flags) ++{ ++ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); ++ ++ if (new) { ++ memset(new->offset, 0, sizeof(new->offset)); ++ refcount_set(&new->refcnt, 1); ++ } ++ ++ return new; ++} ++ ++static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, ++ unsigned int old_active) ++{ ++ struct skb_ext *new; ++ ++ if (refcount_read(&old->refcnt) == 1) ++ return old; ++ ++ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ ++ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); ++ refcount_set(&new->refcnt, 1); ++ ++#ifdef CONFIG_XFRM ++ if (old_active & (1 << SKB_EXT_SEC_PATH)) { ++ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_hold(sp->xvec[i]); ++ } ++#endif ++ __skb_ext_put(old); ++ return new; ++} ++ ++/** ++ * __skb_ext_set - attach the specified extension storage to this skb ++ * @skb: buffer ++ * @id: extension id ++ * @ext: extension storage previously allocated via __skb_ext_alloc() ++ * ++ * Existing extensions, if any, are cleared. ++ * ++ * Returns the pointer to the extension. ++ */ ++void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, ++ struct skb_ext *ext) ++{ ++ unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); ++ ++ skb_ext_put(skb); ++ newlen = newoff + skb_ext_type_len[id]; ++ ext->chunks = newlen; ++ ext->offset[id] = newoff; ++ skb->extensions = ext; ++ skb->active_extensions = 1 << id; ++ return skb_ext_get_ptr(ext, id); ++} ++ ++/** ++ * skb_ext_add - allocate space for given extension, COW if needed ++ * @skb: buffer ++ * @id: extension to allocate space for ++ * ++ * Allocates enough space for the given extension. ++ * If the extension is already present, a pointer to that extension ++ * is returned. ++ * ++ * If the skb was cloned, COW applies and the returned memory can be ++ * modified without changing the extension space of clones buffers. ++ * ++ * Returns pointer to the extension or NULL on allocation failure. 
++ */ ++void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *new, *old = NULL; ++ unsigned int newlen, newoff; ++ ++ if (skb->active_extensions) { ++ old = skb->extensions; ++ ++ new = skb_ext_maybe_cow(old, skb->active_extensions); ++ if (!new) ++ return NULL; ++ ++ if (__skb_ext_exist(new, id)) ++ goto set_active; ++ ++ newoff = new->chunks; ++ } else { ++ newoff = SKB_EXT_CHUNKSIZEOF(*new); ++ ++ new = __skb_ext_alloc(GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ } ++ ++ newlen = newoff + skb_ext_type_len[id]; ++ new->chunks = newlen; ++ new->offset[id] = newoff; ++set_active: ++ skb->slow_gro = 1; ++ skb->extensions = new; ++ skb->active_extensions |= 1 << id; ++ return skb_ext_get_ptr(new, id); ++} ++EXPORT_SYMBOL(skb_ext_add); ++ ++#ifdef CONFIG_XFRM ++static void skb_ext_put_sp(struct sec_path *sp) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_put(sp->xvec[i]); ++} ++#endif ++ ++#ifdef CONFIG_MCTP_FLOWS ++static void skb_ext_put_mctp(struct mctp_flow *flow) ++{ ++ if (flow->key) ++ mctp_key_unref(flow->key); ++} ++#endif ++ ++void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *ext = skb->extensions; ++ ++ skb->active_extensions &= ~(1 << id); ++ if (skb->active_extensions == 0) { ++ skb->extensions = NULL; ++ __skb_ext_put(ext); ++#ifdef CONFIG_XFRM ++ } else if (id == SKB_EXT_SEC_PATH && ++ refcount_read(&ext->refcnt) == 1) { ++ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); ++ ++ skb_ext_put_sp(sp); ++ sp->len = 0; ++#endif ++ } ++} ++EXPORT_SYMBOL(__skb_ext_del); ++ ++void __skb_ext_put(struct skb_ext *ext) ++{ ++ /* If this is last clone, nothing can increment ++ * it after check passes. Avoids one atomic op. ++ */ ++ if (refcount_read(&ext->refcnt) == 1) ++ goto free_now; ++ ++ if (!refcount_dec_and_test(&ext->refcnt)) ++ return; ++free_now: ++#ifdef CONFIG_XFRM ++ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) ++ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); ++#endif ++#ifdef CONFIG_MCTP_FLOWS ++ if (__skb_ext_exist(ext, SKB_EXT_MCTP)) ++ skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); ++#endif ++ ++ kmem_cache_free(skbuff_ext_cache, ext); ++} ++EXPORT_SYMBOL(__skb_ext_put); ++#endif /* CONFIG_SKB_EXTENSIONS */ ++ ++/** ++ * skb_attempt_defer_free - queue skb for remote freeing ++ * @skb: buffer ++ * ++ * Put @skb in a per-cpu list, using the cpu which ++ * allocated the skb/pages to reduce false sharing ++ * and memory zone spinlock contention. ++ */ ++void skb_attempt_defer_free(struct sk_buff *skb) ++{ ++ int cpu = skb->alloc_cpu; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int defer_max; ++ bool kick; ++ ++ if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || ++ !cpu_online(cpu) || ++ cpu == raw_smp_processor_id()) { ++nodefer: __kfree_skb(skb); ++ return; ++ } ++ ++ sd = &per_cpu(softnet_data, cpu); ++ defer_max = READ_ONCE(sysctl_skb_defer_max); ++ if (READ_ONCE(sd->defer_count) >= defer_max) ++ goto nodefer; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ /* Send an IPI every time queue reaches half capacity. 
*/ ++ kick = sd->defer_count == (defer_max >> 1); ++ /* Paired with the READ_ONCE() few lines above */ ++ WRITE_ONCE(sd->defer_count, sd->defer_count + 1); ++ ++ skb->next = sd->defer_list; ++ /* Paired with READ_ONCE() in skb_defer_free_flush() */ ++ WRITE_ONCE(sd->defer_list, skb); ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU ++ * if we are unlucky enough (this seems very unlikely). ++ */ ++ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) ++ smp_call_function_single_async(cpu, &sd->defer_csd); ++} +diff -rupN linux.orig/net/dsa/slave.c linux/net/dsa/slave.c +--- linux.orig/net/dsa/slave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/dsa/slave.c 2022-12-04 10:40:26.732034003 -0500 +@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats( s = per_cpu_ptr(dev->tstats, i); do { @@ -8833,11 +57287,10 @@ index 1291c2431d440..dcc550b871623 100644 data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; -diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c -index 3ca0cc4678862..dbae0c79d5cfb 100644 ---- a/net/ipv4/af_inet.c -+++ b/net/ipv4/af_inet.c -@@ -1684,9 +1684,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, +diff -rupN linux.orig/net/ipv4/af_inet.c linux/net/ipv4/af_inet.c +--- linux.orig/net/ipv4/af_inet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv4/af_inet.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1686,9 +1686,9 @@ u64 snmp_get_cpu_field64(void __percpu * bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { @@ -8849,11 +57302,2095 @@ index 3ca0cc4678862..dbae0c79d5cfb 100644 return v; } -diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c -index b7de5e46fdd8f..f84da849819cc 100644 ---- a/net/ipv6/seg6_local.c -+++ b/net/ipv6/seg6_local.c -@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt) +diff -rupN linux.orig/net/ipv4/af_inet.c.orig linux/net/ipv4/af_inet.c.orig +--- linux.orig/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/ipv4/af_inet.c.orig 2022-12-04 10:40:18.732054506 -0500 +@@ -0,0 +1,2081 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * INET An implementation of the TCP/IP protocol suite for the LINUX ++ * operating system. INET is implemented using the BSD Socket ++ * interface as the means of communication with the user level. ++ * ++ * PF_INET protocol family socket handler. ++ * ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Florian La Roche, ++ * Alan Cox, ++ * ++ * Changes (see also sock.c) ++ * ++ * piggy, ++ * Karl Knutson : Socket protocol table ++ * A.N.Kuznetsov : Socket death error in accept(). ++ * John Richardson : Fix non blocking error in connect() ++ * so sockets that fail to connect ++ * don't return -EINPROGRESS. ++ * Alan Cox : Asynchronous I/O support ++ * Alan Cox : Keep correct socket pointer on sock ++ * structures ++ * when accept() ed ++ * Alan Cox : Semantics of SO_LINGER aren't state ++ * moved to close when you look carefully. ++ * With this fixed and the accept bug fixed ++ * some RPC stuff seems happier. ++ * Niibe Yutaka : 4.4BSD style write async I/O ++ * Alan Cox, ++ * Tony Gale : Fixed reuse semantics. ++ * Alan Cox : bind() shouldn't abort existing but dead ++ * sockets. Stops FTP netin:.. I hope. ++ * Alan Cox : bind() works correctly for RAW sockets. ++ * Note that FreeBSD at least was broken ++ * in this respect so be careful with ++ * compatibility tests... 
++ * Alan Cox : routing cache support ++ * Alan Cox : memzero the socket structure for ++ * compactness. ++ * Matt Day : nonblock connect error handler ++ * Alan Cox : Allow large numbers of pending sockets ++ * (eg for big web sites), but only if ++ * specifically application requested. ++ * Alan Cox : New buffering throughout IP. Used ++ * dumbly. ++ * Alan Cox : New buffering now used smartly. ++ * Alan Cox : BSD rather than common sense ++ * interpretation of listen. ++ * Germano Caronni : Assorted small races. ++ * Alan Cox : sendmsg/recvmsg basic support. ++ * Alan Cox : Only sendmsg/recvmsg now supported. ++ * Alan Cox : Locked down bind (see security list). ++ * Alan Cox : Loosened bind a little. ++ * Mike McLagan : ADD/DEL DLCI Ioctls ++ * Willy Konynenberg : Transparent proxying support. ++ * David S. Miller : New socket lookup architecture. ++ * Some other random speedups. ++ * Cyrus Durgin : Cleaned up file for kmod hacks. ++ * Andi Kleen : Fix inet_stream_connect TCP race. ++ */ ++ ++#define pr_fmt(fmt) "IPv4: " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_IP_MROUTE ++#include ++#endif ++#include ++#include ++ ++#include ++ ++/* The inetsw table contains everything that inet_create needs to ++ * build a new socket. ++ */ ++static struct list_head inetsw[SOCK_MAX]; ++static DEFINE_SPINLOCK(inetsw_lock); ++ ++/* New destruction routine */ ++ ++void inet_sock_destruct(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ ++ __skb_queue_purge(&sk->sk_receive_queue); ++ __skb_queue_purge(&sk->sk_error_queue); ++ ++ sk_mem_reclaim_final(sk); ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { ++ pr_err("Attempt to release TCP socket in state %d %p\n", ++ sk->sk_state, sk); ++ return; ++ } ++ if (!sock_flag(sk, SOCK_DEAD)) { ++ pr_err("Attempt to release alive inet socket %p\n", sk); ++ return; ++ } ++ ++ WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); ++ WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); ++ WARN_ON_ONCE(sk->sk_wmem_queued); ++ WARN_ON_ONCE(sk_forward_alloc_get(sk)); ++ ++ kfree(rcu_dereference_protected(inet->inet_opt, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); ++ sk_refcnt_debug_dec(sk); ++} ++EXPORT_SYMBOL(inet_sock_destruct); ++ ++/* ++ * The routines beyond this point handle the behaviour of an AF_INET ++ * socket object. Mostly it punts to the subprotocols of IP to do ++ * the work. ++ */ ++ ++/* ++ * Automatically bind an unbound socket. ++ */ ++ ++static int inet_autobind(struct sock *sk) ++{ ++ struct inet_sock *inet; ++ /* We may need to bind the socket. */ ++ lock_sock(sk); ++ inet = inet_sk(sk); ++ if (!inet->inet_num) { ++ if (sk->sk_prot->get_port(sk, 0)) { ++ release_sock(sk); ++ return -EAGAIN; ++ } ++ inet->inet_sport = htons(inet->inet_num); ++ } ++ release_sock(sk); ++ return 0; ++} ++ ++/* ++ * Move a socket into listening state. 
++ */ ++int inet_listen(struct socket *sock, int backlog) ++{ ++ struct sock *sk = sock->sk; ++ unsigned char old_state; ++ int err, tcp_fastopen; ++ ++ lock_sock(sk); ++ ++ err = -EINVAL; ++ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) ++ goto out; ++ ++ old_state = sk->sk_state; ++ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) ++ goto out; ++ ++ WRITE_ONCE(sk->sk_max_ack_backlog, backlog); ++ /* Really, if the socket is already in listen state ++ * we can only allow the backlog to be adjusted. ++ */ ++ if (old_state != TCP_LISTEN) { ++ /* Enable TFO w/o requiring TCP_FASTOPEN socket option. ++ * Note that only TCP sockets (SOCK_STREAM) will reach here. ++ * Also fastopen backlog may already been set via the option ++ * because the socket was in TCP_LISTEN state previously but ++ * was shutdown() rather than close(). ++ */ ++ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); ++ if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && ++ (tcp_fastopen & TFO_SERVER_ENABLE) && ++ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { ++ fastopen_queue_tune(sk, backlog); ++ tcp_fastopen_init_key_once(sock_net(sk)); ++ } ++ ++ err = inet_csk_listen_start(sk); ++ if (err) ++ goto out; ++ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); ++ } ++ err = 0; ++ ++out: ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_listen); ++ ++/* ++ * Create an inet socket. ++ */ ++ ++static int inet_create(struct net *net, struct socket *sock, int protocol, ++ int kern) ++{ ++ struct sock *sk; ++ struct inet_protosw *answer; ++ struct inet_sock *inet; ++ struct proto *answer_prot; ++ unsigned char answer_flags; ++ int try_loading_module = 0; ++ int err; ++ ++ if (protocol < 0 || protocol >= IPPROTO_MAX) ++ return -EINVAL; ++ ++ sock->state = SS_UNCONNECTED; ++ ++ /* Look for the requested type/protocol pair. */ ++lookup_protocol: ++ err = -ESOCKTNOSUPPORT; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { ++ ++ err = 0; ++ /* Check the non-wild match. */ ++ if (protocol == answer->protocol) { ++ if (protocol != IPPROTO_IP) ++ break; ++ } else { ++ /* Check for the two wild cases. */ ++ if (IPPROTO_IP == protocol) { ++ protocol = answer->protocol; ++ break; ++ } ++ if (IPPROTO_IP == answer->protocol) ++ break; ++ } ++ err = -EPROTONOSUPPORT; ++ } ++ ++ if (unlikely(err)) { ++ if (try_loading_module < 2) { ++ rcu_read_unlock(); ++ /* ++ * Be more specific, e.g. net-pf-2-proto-132-type-1 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) ++ */ ++ if (++try_loading_module == 1) ++ request_module("net-pf-%d-proto-%d-type-%d", ++ PF_INET, protocol, sock->type); ++ /* ++ * Fall back to generic, e.g. 
net-pf-2-proto-132 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP) ++ */ ++ else ++ request_module("net-pf-%d-proto-%d", ++ PF_INET, protocol); ++ goto lookup_protocol; ++ } else ++ goto out_rcu_unlock; ++ } ++ ++ err = -EPERM; ++ if (sock->type == SOCK_RAW && !kern && ++ !ns_capable(net->user_ns, CAP_NET_RAW)) ++ goto out_rcu_unlock; ++ ++ sock->ops = answer->ops; ++ answer_prot = answer->prot; ++ answer_flags = answer->flags; ++ rcu_read_unlock(); ++ ++ WARN_ON(!answer_prot->slab); ++ ++ err = -ENOMEM; ++ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); ++ if (!sk) ++ goto out; ++ ++ err = 0; ++ if (INET_PROTOSW_REUSE & answer_flags) ++ sk->sk_reuse = SK_CAN_REUSE; ++ ++ inet = inet_sk(sk); ++ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; ++ ++ inet->nodefrag = 0; ++ ++ if (SOCK_RAW == sock->type) { ++ inet->inet_num = protocol; ++ if (IPPROTO_RAW == protocol) ++ inet->hdrincl = 1; ++ } ++ ++ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) ++ inet->pmtudisc = IP_PMTUDISC_DONT; ++ else ++ inet->pmtudisc = IP_PMTUDISC_WANT; ++ ++ inet->inet_id = 0; ++ ++ sock_init_data(sock, sk); ++ ++ sk->sk_destruct = inet_sock_destruct; ++ sk->sk_protocol = protocol; ++ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; ++ ++ inet->uc_ttl = -1; ++ inet->mc_loop = 1; ++ inet->mc_ttl = 1; ++ inet->mc_all = 1; ++ inet->mc_index = 0; ++ inet->mc_list = NULL; ++ inet->rcv_tos = 0; ++ ++ sk_refcnt_debug_inc(sk); ++ ++ if (inet->inet_num) { ++ /* It assumes that any protocol which allows ++ * the user to assign a number at socket ++ * creation time automatically ++ * shares. ++ */ ++ inet->inet_sport = htons(inet->inet_num); ++ /* Add to protocol hash chains. */ ++ err = sk->sk_prot->hash(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (sk->sk_prot->init) { ++ err = sk->sk_prot->init(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (!kern) { ++ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++out: ++ return err; ++out_rcu_unlock: ++ rcu_read_unlock(); ++ goto out; ++} ++ ++ ++/* ++ * The peer socket should always be NULL (or else). When we call this ++ * function we are destroying the object and from then on nobody ++ * should refer to it. ++ */ ++int inet_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (sk) { ++ long timeout; ++ ++ if (!sk->sk_kern_sock) ++ BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); ++ ++ /* Applications forget to leave groups before exiting */ ++ ip_mc_drop_socket(sk); ++ ++ /* If linger is set, we don't return until the close ++ * is complete. Otherwise we return immediately. The ++ * actually closing is done the same either way. ++ * ++ * If the close is due to the process exiting, we never ++ * linger.. ++ */ ++ timeout = 0; ++ if (sock_flag(sk, SOCK_LINGER) && ++ !(current->flags & PF_EXITING)) ++ timeout = sk->sk_lingertime; ++ sk->sk_prot->close(sk, timeout); ++ sock->sk = NULL; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(inet_release); ++ ++int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sock *sk = sock->sk; ++ u32 flags = BIND_WITH_LOCK; ++ int err; ++ ++ /* If the socket has its own bind function then use it. (RAW) */ ++ if (sk->sk_prot->bind) { ++ return sk->sk_prot->bind(sk, uaddr, addr_len); ++ } ++ if (addr_len < sizeof(struct sockaddr_in)) ++ return -EINVAL; ++ ++ /* BPF prog is run before any checks are done so that if the prog ++ * changes context in a wrong way it will be caught. 
++ */ ++ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, ++ CGROUP_INET4_BIND, &flags); ++ if (err) ++ return err; ++ ++ return __inet_bind(sk, uaddr, addr_len, flags); ++} ++EXPORT_SYMBOL(inet_bind); ++ ++int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, ++ u32 flags) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; ++ struct inet_sock *inet = inet_sk(sk); ++ struct net *net = sock_net(sk); ++ unsigned short snum; ++ int chk_addr_ret; ++ u32 tb_id = RT_TABLE_LOCAL; ++ int err; ++ ++ if (addr->sin_family != AF_INET) { ++ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) ++ * only if s_addr is INADDR_ANY. ++ */ ++ err = -EAFNOSUPPORT; ++ if (addr->sin_family != AF_UNSPEC || ++ addr->sin_addr.s_addr != htonl(INADDR_ANY)) ++ goto out; ++ } ++ ++ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; ++ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ++ ++ /* Not specified by any standard per-se, however it breaks too ++ * many applications when removed. It is unfortunate since ++ * allowing applications to make a non-local bind solves ++ * several problems with systems using dynamic addressing. ++ * (ie. your servers still start up even if your ISDN link ++ * is temporarily down) ++ */ ++ err = -EADDRNOTAVAIL; ++ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, ++ chk_addr_ret)) ++ goto out; ++ ++ snum = ntohs(addr->sin_port); ++ err = -EACCES; ++ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && ++ snum && inet_port_requires_bind_service(net, snum) && ++ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) ++ goto out; ++ ++ /* We keep a pair of addresses. rcv_saddr is the one ++ * used by hash lookups, and saddr is used for transmit. ++ * ++ * In the BSD API these are the same except where it ++ * would be illegal to use them (multicast/broadcast) in ++ * which case the sending device address is used. ++ */ ++ if (flags & BIND_WITH_LOCK) ++ lock_sock(sk); ++ ++ /* Check these errors (active socket, double bind). */ ++ err = -EINVAL; ++ if (sk->sk_state != TCP_CLOSE || inet->inet_num) ++ goto out_release_sock; ++ ++ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) ++ inet->inet_saddr = 0; /* Use device */ ++ ++ /* Make sure we are allowed to bind here. 
*/ ++ if (snum || !(inet->bind_address_no_port || ++ (flags & BIND_FORCE_ADDRESS_NO_PORT))) { ++ if (sk->sk_prot->get_port(sk, snum)) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ err = -EADDRINUSE; ++ goto out_release_sock; ++ } ++ if (!(flags & BIND_FROM_BPF)) { ++ err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); ++ if (err) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ if (sk->sk_prot->put_port) ++ sk->sk_prot->put_port(sk); ++ goto out_release_sock; ++ } ++ } ++ } ++ ++ if (inet->inet_rcv_saddr) ++ sk->sk_userlocks |= SOCK_BINDADDR_LOCK; ++ if (snum) ++ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; ++ inet->inet_sport = htons(inet->inet_num); ++ inet->inet_daddr = 0; ++ inet->inet_dport = 0; ++ sk_dst_reset(sk); ++ err = 0; ++out_release_sock: ++ if (flags & BIND_WITH_LOCK) ++ release_sock(sk); ++out: ++ return err; ++} ++ ++int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ if (uaddr->sa_family == AF_UNSPEC) ++ return sk->sk_prot->disconnect(sk, flags); ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ return err; ++ } ++ ++ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) ++ return -EAGAIN; ++ return sk->sk_prot->connect(sk, uaddr, addr_len); ++} ++EXPORT_SYMBOL(inet_dgram_connect); ++ ++static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) ++{ ++ DEFINE_WAIT_FUNC(wait, woken_wake_function); ++ ++ add_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending += writebias; ++ ++ /* Basic assumption: if someone sets sk->sk_err, he _must_ ++ * change state of the socket from TCP_SYN_*. ++ * Connect() does not allow to get error notifications ++ * without closing the socket. ++ */ ++ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ release_sock(sk); ++ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ++ lock_sock(sk); ++ if (signal_pending(current) || !timeo) ++ break; ++ } ++ remove_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending -= writebias; ++ return timeo; ++} ++ ++/* ++ * Connect to a remote host. There is regrettably still a little ++ * TCP 'magic' in here. ++ */ ++int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags, int is_sendmsg) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ long timeo; ++ ++ /* ++ * uaddr can be NULL and addr_len can be 0 if: ++ * sk is a TCP fastopen active socket and ++ * TCP_FASTOPEN_CONNECT sockopt is set and ++ * we already have a valid cookie for this socket. ++ * In this case, user can call write() after connect(). ++ * write() will invoke tcp_sendmsg_fastopen() which calls ++ * __inet_stream_connect(). ++ */ ++ if (uaddr) { ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ ++ if (uaddr->sa_family == AF_UNSPEC) { ++ err = sk->sk_prot->disconnect(sk, flags); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ goto out; ++ } ++ } ++ ++ switch (sock->state) { ++ default: ++ err = -EINVAL; ++ goto out; ++ case SS_CONNECTED: ++ err = -EISCONN; ++ goto out; ++ case SS_CONNECTING: ++ if (inet_sk(sk)->defer_connect) ++ err = is_sendmsg ? 
-EINPROGRESS : -EISCONN; ++ else ++ err = -EALREADY; ++ /* Fall out of switch with err, set for this state */ ++ break; ++ case SS_UNCONNECTED: ++ err = -EISCONN; ++ if (sk->sk_state != TCP_CLOSE) ++ goto out; ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ goto out; ++ } ++ ++ err = sk->sk_prot->connect(sk, uaddr, addr_len); ++ if (err < 0) ++ goto out; ++ ++ sock->state = SS_CONNECTING; ++ ++ if (!err && inet_sk(sk)->defer_connect) ++ goto out; ++ ++ /* Just entered SS_CONNECTING state; the only ++ * difference is that return value in non-blocking ++ * case is EINPROGRESS, rather than EALREADY. ++ */ ++ err = -EINPROGRESS; ++ break; ++ } ++ ++ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); ++ ++ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ int writebias = (sk->sk_protocol == IPPROTO_TCP) && ++ tcp_sk(sk)->fastopen_req && ++ tcp_sk(sk)->fastopen_req->data ? 1 : 0; ++ ++ /* Error code is set above */ ++ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) ++ goto out; ++ ++ err = sock_intr_errno(timeo); ++ if (signal_pending(current)) ++ goto out; ++ } ++ ++ /* Connection was closed by RST, timeout, ICMP error ++ * or another process disconnected us. ++ */ ++ if (sk->sk_state == TCP_CLOSE) ++ goto sock_error; ++ ++ /* sk->sk_err may be not zero now, if RECVERR was ordered by user ++ * and error was received after socket entered established state. ++ * Hence, it is handled normally after connect() return successfully. ++ */ ++ ++ sock->state = SS_CONNECTED; ++ err = 0; ++out: ++ return err; ++ ++sock_error: ++ err = sock_error(sk) ? : -ECONNABORTED; ++ sock->state = SS_UNCONNECTED; ++ if (sk->sk_prot->disconnect(sk, flags)) ++ sock->state = SS_DISCONNECTING; ++ goto out; ++} ++EXPORT_SYMBOL(__inet_stream_connect); ++ ++int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ int err; ++ ++ lock_sock(sock->sk); ++ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); ++ release_sock(sock->sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_stream_connect); ++ ++/* ++ * Accept a pending connection. The TCP layer now gives BSD semantics. ++ */ ++ ++int inet_accept(struct socket *sock, struct socket *newsock, int flags, ++ bool kern) ++{ ++ struct sock *sk1 = sock->sk; ++ int err = -EINVAL; ++ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); ++ ++ if (!sk2) ++ goto do_err; ++ ++ lock_sock(sk2); ++ ++ sock_rps_record_flow(sk2); ++ WARN_ON(!((1 << sk2->sk_state) & ++ (TCPF_ESTABLISHED | TCPF_SYN_RECV | ++ TCPF_CLOSE_WAIT | TCPF_CLOSE))); ++ ++ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) ++ set_bit(SOCK_SUPPORT_ZC, &newsock->flags); ++ sock_graft(sk2, newsock); ++ ++ newsock->state = SS_CONNECTED; ++ err = 0; ++ release_sock(sk2); ++do_err: ++ return err; ++} ++EXPORT_SYMBOL(inet_accept); ++ ++/* ++ * This does both peername and sockname. 
++ */ ++int inet_getname(struct socket *sock, struct sockaddr *uaddr, ++ int peer) ++{ ++ struct sock *sk = sock->sk; ++ struct inet_sock *inet = inet_sk(sk); ++ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); ++ ++ sin->sin_family = AF_INET; ++ lock_sock(sk); ++ if (peer) { ++ if (!inet->inet_dport || ++ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && ++ peer == 1)) { ++ release_sock(sk); ++ return -ENOTCONN; ++ } ++ sin->sin_port = inet->inet_dport; ++ sin->sin_addr.s_addr = inet->inet_daddr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETPEERNAME); ++ } else { ++ __be32 addr = inet->inet_rcv_saddr; ++ if (!addr) ++ addr = inet->inet_saddr; ++ sin->sin_port = inet->inet_sport; ++ sin->sin_addr.s_addr = addr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETSOCKNAME); ++ } ++ release_sock(sk); ++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); ++ return sizeof(*sin); ++} ++EXPORT_SYMBOL(inet_getname); ++ ++int inet_send_prepare(struct sock *sk) ++{ ++ sock_rps_record_flow(sk); ++ ++ /* We may need to bind the socket. */ ++ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && ++ inet_autobind(sk)) ++ return -EAGAIN; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(inet_send_prepare); ++ ++int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, ++ sk, msg, size); ++} ++EXPORT_SYMBOL(inet_sendmsg); ++ ++ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ if (sk->sk_prot->sendpage) ++ return sk->sk_prot->sendpage(sk, page, offset, size, flags); ++ return sock_no_sendpage(sock, page, offset, size, flags); ++} ++EXPORT_SYMBOL(inet_sendpage); ++ ++INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, ++ size_t, int, int *)); ++int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ++ int flags) ++{ ++ struct sock *sk = sock->sk; ++ int addr_len = 0; ++ int err; ++ ++ if (likely(!(flags & MSG_ERRQUEUE))) ++ sock_rps_record_flow(sk); ++ ++ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, ++ sk, msg, size, flags, &addr_len); ++ if (err >= 0) ++ msg->msg_namelen = addr_len; ++ return err; ++} ++EXPORT_SYMBOL(inet_recvmsg); ++ ++int inet_shutdown(struct socket *sock, int how) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ ++ /* This should really check to make sure ++ * the socket is a TCP socket. (WHY AC...) ++ */ ++ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and ++ 1->2 bit 2 snds. ++ 2->3 */ ++ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ ++ return -EINVAL; ++ ++ lock_sock(sk); ++ if (sock->state == SS_CONNECTING) { ++ if ((1 << sk->sk_state) & ++ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) ++ sock->state = SS_DISCONNECTING; ++ else ++ sock->state = SS_CONNECTED; ++ } ++ ++ switch (sk->sk_state) { ++ case TCP_CLOSE: ++ err = -ENOTCONN; ++ /* Hack to wake up other listeners, who can poll for ++ EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ ++ fallthrough; ++ default: ++ sk->sk_shutdown |= how; ++ if (sk->sk_prot->shutdown) ++ sk->sk_prot->shutdown(sk, how); ++ break; ++ ++ /* Remaining two branches are temporary solution for missing ++ * close() in multithreaded environment. 
It is _not_ a good idea, ++ * but we have no choice until close() is repaired at VFS level. ++ */ ++ case TCP_LISTEN: ++ if (!(how & RCV_SHUTDOWN)) ++ break; ++ fallthrough; ++ case TCP_SYN_SENT: ++ err = sk->sk_prot->disconnect(sk, O_NONBLOCK); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ break; ++ } ++ ++ /* Wake up anyone sleeping in poll. */ ++ sk->sk_state_change(sk); ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_shutdown); ++ ++/* ++ * ioctl() calls you can issue on an INET socket. Most of these are ++ * device configuration and stuff and very rarely used. Some ioctls ++ * pass on to the socket itself. ++ * ++ * NOTE: I like the idea of a module for the config stuff. ie ifconfig ++ * loads the devconfigure module does its configuring and unloads it. ++ * There's a good 20K of config code hanging around the kernel. ++ */ ++ ++int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ struct net *net = sock_net(sk); ++ void __user *p = (void __user *)arg; ++ struct ifreq ifr; ++ struct rtentry rt; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ if (copy_from_user(&rt, p, sizeof(struct rtentry))) ++ return -EFAULT; ++ err = ip_rt_ioctl(net, cmd, &rt); ++ break; ++ case SIOCRTMSG: ++ err = -EINVAL; ++ break; ++ case SIOCDARP: ++ case SIOCGARP: ++ case SIOCSARP: ++ err = arp_ioctl(net, cmd, (void __user *)arg); ++ break; ++ case SIOCGIFADDR: ++ case SIOCGIFBRDADDR: ++ case SIOCGIFNETMASK: ++ case SIOCGIFDSTADDR: ++ case SIOCGIFPFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ if (!err && put_user_ifreq(&ifr, p)) ++ err = -EFAULT; ++ break; ++ ++ case SIOCSIFADDR: ++ case SIOCSIFBRDADDR: ++ case SIOCSIFNETMASK: ++ case SIOCSIFDSTADDR: ++ case SIOCSIFPFLAGS: ++ case SIOCSIFFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ break; ++ default: ++ if (sk->sk_prot->ioctl) ++ err = sk->sk_prot->ioctl(sk, cmd, arg); ++ else ++ err = -ENOIOCTLCMD; ++ break; ++ } ++ return err; ++} ++EXPORT_SYMBOL(inet_ioctl); ++ ++#ifdef CONFIG_COMPAT ++static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, ++ struct compat_rtentry __user *ur) ++{ ++ compat_uptr_t rtdev; ++ struct rtentry rt; ++ ++ if (copy_from_user(&rt.rt_dst, &ur->rt_dst, ++ 3 * sizeof(struct sockaddr)) || ++ get_user(rt.rt_flags, &ur->rt_flags) || ++ get_user(rt.rt_metric, &ur->rt_metric) || ++ get_user(rt.rt_mtu, &ur->rt_mtu) || ++ get_user(rt.rt_window, &ur->rt_window) || ++ get_user(rt.rt_irtt, &ur->rt_irtt) || ++ get_user(rtdev, &ur->rt_dev)) ++ return -EFAULT; ++ ++ rt.rt_dev = compat_ptr(rtdev); ++ return ip_rt_ioctl(sock_net(sk), cmd, &rt); ++} ++ ++static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ void __user *argp = compat_ptr(arg); ++ struct sock *sk = sock->sk; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ return inet_compat_routing_ioctl(sk, cmd, argp); ++ default: ++ if (!sk->sk_prot->compat_ioctl) ++ return -ENOIOCTLCMD; ++ return sk->sk_prot->compat_ioctl(sk, cmd, arg); ++ } ++} ++#endif /* CONFIG_COMPAT */ ++ ++const struct proto_ops inet_stream_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_stream_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = inet_accept, ++ .getname = inet_getname, ++ .poll = tcp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp 
= sock_gettstamp, ++ .listen = inet_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++#ifdef CONFIG_MMU ++ .mmap = tcp_mmap, ++#endif ++ .sendpage = inet_sendpage, ++ .splice_read = tcp_splice_read, ++ .read_sock = tcp_read_sock, ++ .read_skb = tcp_read_skb, ++ .sendmsg_locked = tcp_sendmsg_locked, ++ .sendpage_locked = tcp_sendpage_locked, ++ .peek_len = tcp_peek_len, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++ .set_rcvlowat = tcp_set_rcvlowat, ++}; ++EXPORT_SYMBOL(inet_stream_ops); ++ ++const struct proto_ops inet_dgram_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = udp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .read_skb = udp_read_skb, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++ .set_peek_off = sk_set_peek_off, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++EXPORT_SYMBOL(inet_dgram_ops); ++ ++/* ++ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without ++ * udp_poll ++ */ ++static const struct proto_ops inet_sockraw_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = datagram_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++ ++static const struct net_proto_family inet_family_ops = { ++ .family = PF_INET, ++ .create = inet_create, ++ .owner = THIS_MODULE, ++}; ++ ++/* Upon startup we insert all the elements in inetsw_array[] into ++ * the linked list inetsw. ++ */ ++static struct inet_protosw inetsw_array[] = ++{ ++ { ++ .type = SOCK_STREAM, ++ .protocol = IPPROTO_TCP, ++ .prot = &tcp_prot, ++ .ops = &inet_stream_ops, ++ .flags = INET_PROTOSW_PERMANENT | ++ INET_PROTOSW_ICSK, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_UDP, ++ .prot = &udp_prot, ++ .ops = &inet_dgram_ops, ++ .flags = INET_PROTOSW_PERMANENT, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_ICMP, ++ .prot = &ping_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ }, ++ ++ { ++ .type = SOCK_RAW, ++ .protocol = IPPROTO_IP, /* wild card */ ++ .prot = &raw_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ } ++}; ++ ++#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) ++ ++void inet_register_protosw(struct inet_protosw *p) ++{ ++ struct list_head *lh; ++ struct inet_protosw *answer; ++ int protocol = p->protocol; ++ struct list_head *last_perm; ++ ++ spin_lock_bh(&inetsw_lock); ++ ++ if (p->type >= SOCK_MAX) ++ goto out_illegal; ++ ++ /* If we are trying to override a permanent protocol, bail. 
*/ ++ last_perm = &inetsw[p->type]; ++ list_for_each(lh, &inetsw[p->type]) { ++ answer = list_entry(lh, struct inet_protosw, list); ++ /* Check only the non-wild match. */ ++ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) ++ break; ++ if (protocol == answer->protocol) ++ goto out_permanent; ++ last_perm = lh; ++ } ++ ++ /* Add the new entry after the last permanent entry if any, so that ++ * the new entry does not override a permanent entry when matched with ++ * a wild-card protocol. But it is allowed to override any existing ++ * non-permanent entry. This means that when we remove this entry, the ++ * system automatically returns to the old behavior. ++ */ ++ list_add_rcu(&p->list, last_perm); ++out: ++ spin_unlock_bh(&inetsw_lock); ++ ++ return; ++ ++out_permanent: ++ pr_err("Attempt to override permanent protocol %d\n", protocol); ++ goto out; ++ ++out_illegal: ++ pr_err("Ignoring attempt to register invalid socket type %d\n", ++ p->type); ++ goto out; ++} ++EXPORT_SYMBOL(inet_register_protosw); ++ ++void inet_unregister_protosw(struct inet_protosw *p) ++{ ++ if (INET_PROTOSW_PERMANENT & p->flags) { ++ pr_err("Attempt to unregister permanent protocol %d\n", ++ p->protocol); ++ } else { ++ spin_lock_bh(&inetsw_lock); ++ list_del_rcu(&p->list); ++ spin_unlock_bh(&inetsw_lock); ++ ++ synchronize_net(); ++ } ++} ++EXPORT_SYMBOL(inet_unregister_protosw); ++ ++static int inet_sk_reselect_saddr(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ __be32 old_saddr = inet->inet_saddr; ++ __be32 daddr = inet->inet_daddr; ++ struct flowi4 *fl4; ++ struct rtable *rt; ++ __be32 new_saddr; ++ struct ip_options_rcu *inet_opt; ++ ++ inet_opt = rcu_dereference_protected(inet->inet_opt, ++ lockdep_sock_is_held(sk)); ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ ++ /* Query new route. */ ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, ++ sk->sk_protocol, inet->inet_sport, ++ inet->inet_dport, sk); ++ if (IS_ERR(rt)) ++ return PTR_ERR(rt); ++ ++ sk_setup_caps(sk, &rt->dst); ++ ++ new_saddr = fl4->saddr; ++ ++ if (new_saddr == old_saddr) ++ return 0; ++ ++ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { ++ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", ++ __func__, &old_saddr, &new_saddr); ++ } ++ ++ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; ++ ++ /* ++ * XXX The only one ugly spot where we need to ++ * XXX really change the sockets identity after ++ * XXX it has entered the hashes. -DaveM ++ * ++ * Besides that, it does not check for connection ++ * uniqueness. Wait for troubles. ++ */ ++ return __sk_prot_rehash(sk); ++} ++ ++int inet_sk_rebuild_header(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); ++ __be32 daddr; ++ struct ip_options_rcu *inet_opt; ++ struct flowi4 *fl4; ++ int err; ++ ++ /* Route is OK, nothing to do. */ ++ if (rt) ++ return 0; ++ ++ /* Reroute. */ ++ rcu_read_lock(); ++ inet_opt = rcu_dereference(inet->inet_opt); ++ daddr = inet->inet_daddr; ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ rcu_read_unlock(); ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, ++ inet->inet_dport, inet->inet_sport, ++ sk->sk_protocol, RT_CONN_FLAGS(sk), ++ sk->sk_bound_dev_if); ++ if (!IS_ERR(rt)) { ++ err = 0; ++ sk_setup_caps(sk, &rt->dst); ++ } else { ++ err = PTR_ERR(rt); ++ ++ /* Routing failed... 
*/ ++ sk->sk_route_caps = 0; ++ /* ++ * Other protocols have to map its equivalent state to TCP_SYN_SENT. ++ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme ++ */ ++ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || ++ sk->sk_state != TCP_SYN_SENT || ++ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || ++ (err = inet_sk_reselect_saddr(sk)) != 0) ++ sk->sk_err_soft = -err; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(inet_sk_rebuild_header); ++ ++void inet_sk_set_state(struct sock *sk, int state) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, state); ++ sk->sk_state = state; ++} ++EXPORT_SYMBOL(inet_sk_set_state); ++ ++void inet_sk_state_store(struct sock *sk, int newstate) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, newstate); ++ smp_store_release(&sk->sk_state, newstate); ++} ++ ++struct sk_buff *inet_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ bool udpfrag = false, fixedid = false, gso_partial, encap; ++ struct sk_buff *segs = ERR_PTR(-EINVAL); ++ const struct net_offload *ops; ++ unsigned int offset = 0; ++ struct iphdr *iph; ++ int proto, tot_len; ++ int nhoff; ++ int ihl; ++ int id; ++ ++ skb_reset_network_header(skb); ++ nhoff = skb_network_header(skb) - skb_mac_header(skb); ++ if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) ++ goto out; ++ ++ iph = ip_hdr(skb); ++ ihl = iph->ihl * 4; ++ if (ihl < sizeof(*iph)) ++ goto out; ++ ++ id = ntohs(iph->id); ++ proto = iph->protocol; ++ ++ /* Warning: after this point, iph might be no longer valid */ ++ if (unlikely(!pskb_may_pull(skb, ihl))) ++ goto out; ++ __skb_pull(skb, ihl); ++ ++ encap = SKB_GSO_CB(skb)->encap_level > 0; ++ if (encap) ++ features &= skb->dev->hw_enc_features; ++ SKB_GSO_CB(skb)->encap_level += ihl; ++ ++ skb_reset_transport_header(skb); ++ ++ segs = ERR_PTR(-EPROTONOSUPPORT); ++ ++ if (!skb->encapsulation || encap) { ++ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ++ fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); ++ ++ /* fixed ID is invalid if DF bit is not set */ ++ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) ++ goto out; ++ } ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (likely(ops && ops->callbacks.gso_segment)) { ++ segs = ops->callbacks.gso_segment(skb, features); ++ if (!segs) ++ skb->network_header = skb_mac_header(skb) + nhoff - skb->head; ++ } ++ ++ if (IS_ERR_OR_NULL(segs)) ++ goto out; ++ ++ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); ++ ++ skb = segs; ++ do { ++ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); ++ if (udpfrag) { ++ iph->frag_off = htons(offset >> 3); ++ if (skb->next) ++ iph->frag_off |= htons(IP_MF); ++ offset += skb->len - nhoff - ihl; ++ tot_len = skb->len - nhoff; ++ } else if (skb_is_gso(skb)) { ++ if (!fixedid) { ++ iph->id = htons(id); ++ id += skb_shinfo(skb)->gso_segs; ++ } ++ ++ if (gso_partial) ++ tot_len = skb_shinfo(skb)->gso_size + ++ SKB_GSO_CB(skb)->data_offset + ++ skb->head - (unsigned char *)iph; ++ else ++ tot_len = skb->len - nhoff; ++ } else { ++ if (!fixedid) ++ iph->id = htons(id++); ++ tot_len = skb->len - nhoff; ++ } ++ iph->tot_len = htons(tot_len); ++ ip_send_check(iph); ++ if (encap) ++ skb_reset_inner_headers(skb); ++ skb->network_header = (u8 *)iph - skb->head; ++ skb_reset_mac_len(skb); ++ } while ((skb = skb->next)); ++ ++out: ++ return segs; ++} ++ ++static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) ++ return ERR_PTR(-EINVAL); ++ ++ 
return inet_gso_segment(skb, features); ++} ++ ++struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) ++{ ++ const struct net_offload *ops; ++ struct sk_buff *pp = NULL; ++ const struct iphdr *iph; ++ struct sk_buff *p; ++ unsigned int hlen; ++ unsigned int off; ++ unsigned int id; ++ int flush = 1; ++ int proto; ++ ++ off = skb_gro_offset(skb); ++ hlen = off + sizeof(*iph); ++ iph = skb_gro_header_fast(skb, off); ++ if (skb_gro_header_hard(skb, hlen)) { ++ iph = skb_gro_header_slow(skb, hlen, off); ++ if (unlikely(!iph)) ++ goto out; ++ } ++ ++ proto = iph->protocol; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (!ops || !ops->callbacks.gro_receive) ++ goto out; ++ ++ if (*(u8 *)iph != 0x45) ++ goto out; ++ ++ if (ip_is_fragment(iph)) ++ goto out; ++ ++ if (unlikely(ip_fast_csum((u8 *)iph, 5))) ++ goto out; ++ ++ id = ntohl(*(__be32 *)&iph->id); ++ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); ++ id >>= 16; ++ ++ list_for_each_entry(p, head, list) { ++ struct iphdr *iph2; ++ u16 flush_id; ++ ++ if (!NAPI_GRO_CB(p)->same_flow) ++ continue; ++ ++ iph2 = (struct iphdr *)(p->data + off); ++ /* The above works because, with the exception of the top ++ * (inner most) layer, we only aggregate pkts with the same ++ * hdr length so all the hdrs we'll need to verify will start ++ * at the same offset. ++ */ ++ if ((iph->protocol ^ iph2->protocol) | ++ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ++ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { ++ NAPI_GRO_CB(p)->same_flow = 0; ++ continue; ++ } ++ ++ /* All fields must match except length and checksum. */ ++ NAPI_GRO_CB(p)->flush |= ++ (iph->ttl ^ iph2->ttl) | ++ (iph->tos ^ iph2->tos) | ++ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); ++ ++ NAPI_GRO_CB(p)->flush |= flush; ++ ++ /* We need to store of the IP ID check to be included later ++ * when we can verify that this packet does in fact belong ++ * to a given flow. ++ */ ++ flush_id = (u16)(id - ntohs(iph2->id)); ++ ++ /* This bit of code makes it much easier for us to identify ++ * the cases where we are doing atomic vs non-atomic IP ID ++ * checks. Specifically an atomic check can return IP ID ++ * values 0 - 0xFFFF, while a non-atomic check can only ++ * return 0 or 0xFFFF. ++ */ ++ if (!NAPI_GRO_CB(p)->is_atomic || ++ !(iph->frag_off & htons(IP_DF))) { ++ flush_id ^= NAPI_GRO_CB(p)->count; ++ flush_id = flush_id ? 0xFFFF : 0; ++ } ++ ++ /* If the previous IP ID value was based on an atomic ++ * datagram we can overwrite the value and ignore it. ++ */ ++ if (NAPI_GRO_CB(skb)->is_atomic) ++ NAPI_GRO_CB(p)->flush_id = flush_id; ++ else ++ NAPI_GRO_CB(p)->flush_id |= flush_id; ++ } ++ ++ NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); ++ NAPI_GRO_CB(skb)->flush |= flush; ++ skb_set_network_header(skb, off); ++ /* The above will be needed by the transport layer if there is one ++ * immediately following this IP hdr. 
++ */ ++ ++ /* Note : No need to call skb_gro_postpull_rcsum() here, ++ * as we already checked checksum over ipv4 header was 0 ++ */ ++ skb_gro_pull(skb, sizeof(*iph)); ++ skb_set_transport_header(skb, skb_gro_offset(skb)); ++ ++ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ++ ops->callbacks.gro_receive, head, skb); ++ ++out: ++ skb_gro_flush_final(skb, pp, flush); ++ ++ return pp; ++} ++ ++static struct sk_buff *ipip_gro_receive(struct list_head *head, ++ struct sk_buff *skb) ++{ ++ if (NAPI_GRO_CB(skb)->encap_mark) { ++ NAPI_GRO_CB(skb)->flush = 1; ++ return NULL; ++ } ++ ++ NAPI_GRO_CB(skb)->encap_mark = 1; ++ ++ return inet_gro_receive(head, skb); ++} ++ ++#define SECONDS_PER_DAY 86400 ++ ++/* inet_current_timestamp - Return IP network timestamp ++ * ++ * Return milliseconds since midnight in network byte order. ++ */ ++__be32 inet_current_timestamp(void) ++{ ++ u32 secs; ++ u32 msecs; ++ struct timespec64 ts; ++ ++ ktime_get_real_ts64(&ts); ++ ++ /* Get secs since midnight. */ ++ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); ++ /* Convert to msecs. */ ++ msecs = secs * MSEC_PER_SEC; ++ /* Convert nsec to msec. */ ++ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; ++ ++ /* Convert to network byte order. */ ++ return htonl(msecs); ++} ++EXPORT_SYMBOL(inet_current_timestamp); ++ ++int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) ++{ ++ if (sk->sk_family == AF_INET) ++ return ip_recv_error(sk, msg, len, addr_len); ++#if IS_ENABLED(CONFIG_IPV6) ++ if (sk->sk_family == AF_INET6) ++ return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); ++#endif ++ return -EINVAL; ++} ++ ++int inet_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ __be16 newlen = htons(skb->len - nhoff); ++ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); ++ const struct net_offload *ops; ++ int proto = iph->protocol; ++ int err = -ENOSYS; ++ ++ if (skb->encapsulation) { ++ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); ++ skb_set_inner_network_header(skb, nhoff); ++ } ++ ++ csum_replace2(&iph->check, iph->tot_len, newlen); ++ iph->tot_len = newlen; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (WARN_ON(!ops || !ops->callbacks.gro_complete)) ++ goto out; ++ ++ /* Only need to add sizeof(*iph) to get to the next hdr below ++ * because any hdr with option will have been flushed in ++ * inet_gro_receive(). ++ */ ++ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, ++ tcp4_gro_complete, udp4_gro_complete, ++ skb, nhoff + sizeof(*iph)); ++ ++out: ++ return err; ++} ++ ++static int ipip_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ skb->encapsulation = 1; ++ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; ++ return inet_gro_complete(skb, nhoff); ++} ++ ++int inet_ctl_sock_create(struct sock **sk, unsigned short family, ++ unsigned short type, unsigned char protocol, ++ struct net *net) ++{ ++ struct socket *sock; ++ int rc = sock_create_kern(net, family, type, protocol, &sock); ++ ++ if (rc == 0) { ++ *sk = sock->sk; ++ (*sk)->sk_allocation = GFP_ATOMIC; ++ /* ++ * Unhash it so that IP input processing does not even see it, ++ * we do not wish this socket to see incoming packets. 
++ */ ++ (*sk)->sk_prot->unhash(*sk); ++ } ++ return rc; ++} ++EXPORT_SYMBOL_GPL(inet_ctl_sock_create); ++ ++unsigned long snmp_fold_field(void __percpu *mib, int offt) ++{ ++ unsigned long res = 0; ++ int i; ++ ++ for_each_possible_cpu(i) ++ res += snmp_get_cpu_field(mib, i, offt); ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field); ++ ++#if BITS_PER_LONG==32 ++ ++u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, ++ size_t syncp_offset) ++{ ++ void *bhptr; ++ struct u64_stats_sync *syncp; ++ u64 v; ++ unsigned int start; ++ ++ bhptr = per_cpu_ptr(mib, cpu); ++ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); ++ do { ++ start = u64_stats_fetch_begin_irq(syncp); ++ v = *(((u64 *)bhptr) + offt); ++ } while (u64_stats_fetch_retry_irq(syncp, start)); ++ ++ return v; ++} ++EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); ++ ++u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) ++{ ++ u64 res = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); ++ } ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field64); ++#endif ++ ++#ifdef CONFIG_IP_MULTICAST ++static const struct net_protocol igmp_protocol = { ++ .handler = igmp_rcv, ++}; ++#endif ++ ++static const struct net_protocol tcp_protocol = { ++ .handler = tcp_v4_rcv, ++ .err_handler = tcp_v4_err, ++ .no_policy = 1, ++ .icmp_strict_tag_validation = 1, ++}; ++ ++static const struct net_protocol udp_protocol = { ++ .handler = udp_rcv, ++ .err_handler = udp_err, ++ .no_policy = 1, ++}; ++ ++static const struct net_protocol icmp_protocol = { ++ .handler = icmp_rcv, ++ .err_handler = icmp_err, ++ .no_policy = 1, ++}; ++ ++static __net_init int ipv4_mib_init_net(struct net *net) ++{ ++ int i; ++ ++ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); ++ if (!net->mib.tcp_statistics) ++ goto err_tcp_mib; ++ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); ++ if (!net->mib.ip_statistics) ++ goto err_ip_mib; ++ ++ for_each_possible_cpu(i) { ++ struct ipstats_mib *af_inet_stats; ++ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); ++ u64_stats_init(&af_inet_stats->syncp); ++ } ++ ++ net->mib.net_statistics = alloc_percpu(struct linux_mib); ++ if (!net->mib.net_statistics) ++ goto err_net_mib; ++ net->mib.udp_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udp_statistics) ++ goto err_udp_mib; ++ net->mib.udplite_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udplite_statistics) ++ goto err_udplite_mib; ++ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); ++ if (!net->mib.icmp_statistics) ++ goto err_icmp_mib; ++ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), ++ GFP_KERNEL); ++ if (!net->mib.icmpmsg_statistics) ++ goto err_icmpmsg_mib; ++ ++ tcp_mib_init(net); ++ return 0; ++ ++err_icmpmsg_mib: ++ free_percpu(net->mib.icmp_statistics); ++err_icmp_mib: ++ free_percpu(net->mib.udplite_statistics); ++err_udplite_mib: ++ free_percpu(net->mib.udp_statistics); ++err_udp_mib: ++ free_percpu(net->mib.net_statistics); ++err_net_mib: ++ free_percpu(net->mib.ip_statistics); ++err_ip_mib: ++ free_percpu(net->mib.tcp_statistics); ++err_tcp_mib: ++ return -ENOMEM; ++} ++ ++static __net_exit void ipv4_mib_exit_net(struct net *net) ++{ ++ kfree(net->mib.icmpmsg_statistics); ++ free_percpu(net->mib.icmp_statistics); ++ free_percpu(net->mib.udplite_statistics); ++ free_percpu(net->mib.udp_statistics); ++ free_percpu(net->mib.net_statistics); ++ free_percpu(net->mib.ip_statistics); ++ 
free_percpu(net->mib.tcp_statistics); ++#ifdef CONFIG_MPTCP ++ /* allocated on demand, see mptcp_init_sock() */ ++ free_percpu(net->mib.mptcp_statistics); ++#endif ++} ++ ++static __net_initdata struct pernet_operations ipv4_mib_ops = { ++ .init = ipv4_mib_init_net, ++ .exit = ipv4_mib_exit_net, ++}; ++ ++static int __init init_ipv4_mibs(void) ++{ ++ return register_pernet_subsys(&ipv4_mib_ops); ++} ++ ++static __net_init int inet_init_net(struct net *net) ++{ ++ /* ++ * Set defaults for local port range ++ */ ++ seqlock_init(&net->ipv4.ip_local_ports.lock); ++ net->ipv4.ip_local_ports.range[0] = 32768; ++ net->ipv4.ip_local_ports.range[1] = 60999; ++ ++ seqlock_init(&net->ipv4.ping_group_range.lock); ++ /* ++ * Sane defaults - nobody may create ping sockets. ++ * Boot scripts should set this to distro-specific group. ++ */ ++ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); ++ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); ++ ++ /* Default values for sysctl-controlled parameters. ++ * We set them here, in case sysctl is not compiled. ++ */ ++ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; ++ net->ipv4.sysctl_ip_fwd_update_priority = 1; ++ net->ipv4.sysctl_ip_dynaddr = 0; ++ net->ipv4.sysctl_ip_early_demux = 1; ++ net->ipv4.sysctl_udp_early_demux = 1; ++ net->ipv4.sysctl_tcp_early_demux = 1; ++ net->ipv4.sysctl_nexthop_compat_mode = 1; ++#ifdef CONFIG_SYSCTL ++ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; ++#endif ++ ++ /* Some igmp sysctl, whose values are always used */ ++ net->ipv4.sysctl_igmp_max_memberships = 20; ++ net->ipv4.sysctl_igmp_max_msf = 10; ++ /* IGMP reports for link-local multicast groups are enabled by default */ ++ net->ipv4.sysctl_igmp_llm_reports = 1; ++ net->ipv4.sysctl_igmp_qrv = 2; ++ ++ net->ipv4.sysctl_fib_notify_on_flag_change = 0; ++ ++ return 0; ++} ++ ++static __net_initdata struct pernet_operations af_inet_ops = { ++ .init = inet_init_net, ++}; ++ ++static int __init init_inet_pernet_ops(void) ++{ ++ return register_pernet_subsys(&af_inet_ops); ++} ++ ++static int ipv4_proc_init(void); ++ ++/* ++ * IP protocol layer initialiser ++ */ ++ ++static struct packet_offload ip_packet_offload __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .callbacks = { ++ .gso_segment = inet_gso_segment, ++ .gro_receive = inet_gro_receive, ++ .gro_complete = inet_gro_complete, ++ }, ++}; ++ ++static const struct net_offload ipip_offload = { ++ .callbacks = { ++ .gso_segment = ipip_gso_segment, ++ .gro_receive = ipip_gro_receive, ++ .gro_complete = ipip_gro_complete, ++ }, ++}; ++ ++static int __init ipip_offload_init(void) ++{ ++ return inet_add_offload(&ipip_offload, IPPROTO_IPIP); ++} ++ ++static int __init ipv4_offload_init(void) ++{ ++ /* ++ * Add offloads ++ */ ++ if (udpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add UDP protocol offload\n", __func__); ++ if (tcpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add TCP protocol offload\n", __func__); ++ if (ipip_offload_init() < 0) ++ pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); ++ ++ dev_add_offload(&ip_packet_offload); ++ return 0; ++} ++ ++fs_initcall(ipv4_offload_init); ++ ++static struct packet_type ip_packet_type __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .func = ip_rcv, ++ .list_func = ip_list_rcv, ++}; ++ ++static int __init inet_init(void) ++{ ++ struct inet_protosw *q; ++ struct list_head *r; ++ int rc; ++ ++ sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); ++ ++ raw_hashinfo_init(&raw_v4_hashinfo); ++ ++ rc = proto_register(&tcp_prot, 1); ++ if 
(rc) ++ goto out; ++ ++ rc = proto_register(&udp_prot, 1); ++ if (rc) ++ goto out_unregister_tcp_proto; ++ ++ rc = proto_register(&raw_prot, 1); ++ if (rc) ++ goto out_unregister_udp_proto; ++ ++ rc = proto_register(&ping_prot, 1); ++ if (rc) ++ goto out_unregister_raw_proto; ++ ++ /* ++ * Tell SOCKET that we are alive... ++ */ ++ ++ (void)sock_register(&inet_family_ops); ++ ++#ifdef CONFIG_SYSCTL ++ ip_static_sysctl_init(); ++#endif ++ ++ /* ++ * Add all the base protocols. ++ */ ++ ++ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) ++ pr_crit("%s: Cannot add ICMP protocol\n", __func__); ++ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) ++ pr_crit("%s: Cannot add UDP protocol\n", __func__); ++ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) ++ pr_crit("%s: Cannot add TCP protocol\n", __func__); ++#ifdef CONFIG_IP_MULTICAST ++ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) ++ pr_crit("%s: Cannot add IGMP protocol\n", __func__); ++#endif ++ ++ /* Register the socket-side information for inet_create. */ ++ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) ++ INIT_LIST_HEAD(r); ++ ++ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) ++ inet_register_protosw(q); ++ ++ /* ++ * Set the ARP module up ++ */ ++ ++ arp_init(); ++ ++ /* ++ * Set the IP module up ++ */ ++ ++ ip_init(); ++ ++ /* Initialise per-cpu ipv4 mibs */ ++ if (init_ipv4_mibs()) ++ panic("%s: Cannot init ipv4 mibs\n", __func__); ++ ++ /* Setup TCP slab cache for open requests. */ ++ tcp_init(); ++ ++ /* Setup UDP memory threshold */ ++ udp_init(); ++ ++ /* Add UDP-Lite (RFC 3828) */ ++ udplite4_register(); ++ ++ raw_init(); ++ ++ ping_init(); ++ ++ /* ++ * Set the ICMP layer up ++ */ ++ ++ if (icmp_init() < 0) ++ panic("Failed to create the ICMP control socket.\n"); ++ ++ /* ++ * Initialise the multicast router ++ */ ++#if defined(CONFIG_IP_MROUTE) ++ if (ip_mr_init()) ++ pr_crit("%s: Cannot init ipv4 mroute\n", __func__); ++#endif ++ ++ if (init_inet_pernet_ops()) ++ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ++ ++ ipv4_proc_init(); ++ ++ ipfrag_init(); ++ ++ dev_add_pack(&ip_packet_type); ++ ++ ip_tunnel_core_init(); ++ ++ rc = 0; ++out: ++ return rc; ++out_unregister_raw_proto: ++ proto_unregister(&raw_prot); ++out_unregister_udp_proto: ++ proto_unregister(&udp_prot); ++out_unregister_tcp_proto: ++ proto_unregister(&tcp_prot); ++ goto out; ++} ++ ++fs_initcall(inet_init); ++ ++/* ------------------------------------------------------------------------ */ ++ ++#ifdef CONFIG_PROC_FS ++static int __init ipv4_proc_init(void) ++{ ++ int rc = 0; ++ ++ if (raw_proc_init()) ++ goto out_raw; ++ if (tcp4_proc_init()) ++ goto out_tcp; ++ if (udp4_proc_init()) ++ goto out_udp; ++ if (ping_proc_init()) ++ goto out_ping; ++ if (ip_misc_proc_init()) ++ goto out_misc; ++out: ++ return rc; ++out_misc: ++ ping_proc_exit(); ++out_ping: ++ udp4_proc_exit(); ++out_udp: ++ tcp4_proc_exit(); ++out_tcp: ++ raw_proc_exit(); ++out_raw: ++ rc = -ENOMEM; ++ goto out; ++} ++ ++#else /* CONFIG_PROC_FS */ ++static int __init ipv4_proc_init(void) ++{ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -rupN linux.orig/net/ipv6/seg6_local.c linux/net/ipv6/seg6_local.c +--- linux.orig/net/ipv6/seg6_local.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv6/seg6_local.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_bu pcounters = per_cpu_ptr(slwt->pcpu_counters, i); do { @@ -8869,11 +59406,10 @@ index b7de5e46fdd8f..f84da849819cc 100644 
counters.packets += packets; counters.bytes += bytes; -diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c -index 9d7b238a67372..965b9cb2ef3f2 100644 ---- a/net/mac80211/sta_info.c -+++ b/net/mac80211/sta_info.c -@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, +diff -rupN linux.orig/net/mac80211/sta_info.c linux/net/mac80211/sta_info.c +--- linux.orig/net/mac80211/sta_info.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mac80211/sta_info.c 2022-12-04 10:40:26.732034003 -0500 +@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu( u64 value; do { @@ -8885,7 +59421,7 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) +@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(st u64 value; do { @@ -8897,11 +59433,10 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c -index b52afe316dc41..35b5f806fdda1 100644 ---- a/net/mpls/af_mpls.c -+++ b/net/mpls/af_mpls.c -@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, +diff -rupN linux.orig/net/mpls/af_mpls.c linux/net/mpls/af_mpls.c +--- linux.orig/net/mpls/af_mpls.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mpls/af_mpls.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_d p = per_cpu_ptr(mdev->stats, i); do { @@ -8913,11 +59448,10 @@ index b52afe316dc41..35b5f806fdda1 100644 stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; -diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c -index efab2b06d3732..5a7349002508e 100644 ---- a/net/netfilter/ipvs/ip_vs_ctl.c -+++ b/net/netfilter/ipvs/ip_vs_ctl.c -@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +diff -rupN linux.orig/net/netfilter/ipvs/ip_vs_ctl.c linux/net/netfilter/ipvs/ip_vs_ctl.c +--- linux.orig/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-04 10:40:26.736033993 -0500 +@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struc u64 conns, inpkts, outpkts, inbytes, outbytes; do { @@ -8933,11 +59467,10 @@ index efab2b06d3732..5a7349002508e 100644 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, -diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c -index 63c70141b3e5d..cde0d9f0d838e 100644 ---- a/net/netfilter/nf_tables_api.c -+++ b/net/netfilter/nf_tables_api.c -@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +diff -rupN linux.orig/net/netfilter/nf_tables_api.c linux/net/netfilter/nf_tables_api.c +--- linux.orig/net/netfilter/nf_tables_api.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/nf_tables_api.c 2022-12-04 10:40:26.736033993 -0500 +@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { @@ -8950,11 +59483,10 @@ index 63c70141b3e5d..cde0d9f0d838e 100644 total.pkts += pkts; total.bytes += bytes; } -diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c -index 93c596e3b22b9..b05458c170484 100644 ---- a/net/openvswitch/datapath.c -+++ b/net/openvswitch/datapath.c -@@ -715,9 +715,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, +diff -rupN 
linux.orig/net/openvswitch/datapath.c linux/net/openvswitch/datapath.c +--- linux.orig/net/openvswitch/datapath.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/datapath.c 2022-12-04 10:40:26.736033993 -0500 +@@ -715,9 +715,9 @@ static void get_dp_stats(const struct da percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { @@ -8966,11 +59498,10 @@ index 93c596e3b22b9..b05458c170484 100644 stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; -diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c -index d4a2db0b22998..0a0e4c283f02e 100644 ---- a/net/openvswitch/flow_table.c -+++ b/net/openvswitch/flow_table.c -@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) +diff -rupN linux.orig/net/openvswitch/flow_table.c linux/net/openvswitch/flow_table.c +--- linux.orig/net/openvswitch/flow_table.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/flow_table.c 2022-12-04 10:40:26.736033993 -0500 +@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counter stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { @@ -8982,7 +59513,7 @@ index d4a2db0b22998..0a0e4c283f02e 100644 ma->masks_usage_zero_cntr[i] += counter; } -@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table) +@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flo stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do {