Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

2019-01-29 21:18:54 -08:00 · 2019-01-29 21:18:54 -08:00 · eaf2a47f40
parent c829f5f52d 6296789878
commit eaf2a47f40
78 changed files with 755 additions and 284 deletions
--- a/18
+++ b/18
@ -16689,6 +16689,24 @@ T:	git git://linuxtv.org/media_tree.git
 S:	Maintained
 F:	drivers/media/tuners/tuner-xc2028.*

+XDP (eXpress Data Path)
+M:	Alexei Starovoitov <ast@kernel.org>
+M:	Daniel Borkmann <daniel@iogearbox.net>
+M:	David S. Miller <davem@davemloft.net>
+M:	Jakub Kicinski <jakub.kicinski@netronome.com>
+M:	Jesper Dangaard Brouer <hawk@kernel.org>
+M:	John Fastabend <john.fastabend@gmail.com>
+L:	netdev@vger.kernel.org
+L:	xdp-newbies@vger.kernel.org
+S:	Supported
+F:	net/core/xdp.c
+F:	include/net/xdp.h
+F:	kernel/bpf/devmap.c
+F:	kernel/bpf/cpumap.c
+F:	include/trace/events/xdp.h
+K:	xdp
+N:	xdp
+
 XDP SOCKETS (AF_XDP)
 M:	Björn Töpel <bjorn.topel@intel.com>
 M:	Magnus Karlsson <magnus.karlsson@intel.com>
--- a/2
+++ b/2
@ -2,7 +2,7 @@
 VERSION = 5
 PATCHLEVEL = 0
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
 NAME = Shy Crocodile

 # *DOCUMENTATION*
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@ -198,7 +198,7 @@ config X86
 	select IRQ_FORCED_THREADING
 	select NEED_SG_DMA_LENGTH
 	select PCI_DOMAINS			if PCI
-	select PCI_LOCKLESS_CONFIG
+	select PCI_LOCKLESS_CONFIG		if PCI
 	select PERF_EVENTS
 	select RTC_LIB
 	select RTC_MC146818_LIB
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@ -361,7 +361,8 @@ ENTRY(entry_INT80_compat)

 	/* Need to switch before accessing the thread stack. */
 	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-	movq	%rsp, %rdi
+	/* In the Xen PV case we already run on the thread stack. */
+	ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

 	pushq	6*8(%rdi)		/* regs->ss */
@ -370,8 +371,9 @@ ENTRY(entry_INT80_compat)
 	pushq	3*8(%rdi)		/* regs->cs */
 	pushq	2*8(%rdi)		/* regs->ip */
 	pushq	1*8(%rdi)		/* regs->orig_ax */
-
 	pushq	(%rdi)			/* pt_regs->di */
+.Lint80_keep_stack:
+
 	pushq	%rsi			/* pt_regs->si */
 	xorl	%esi, %esi		/* nospec   si */
 	pushq	%rdx			/* pt_regs->dx */
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@ -178,6 +178,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)

 void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

+/*
+ * Init a new mm.  Used on mm copies, like at fork()
+ * and on mm's that are brand-new, like at execve().
+ */
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
@ -228,8 +232,22 @@ do {						\
 } while (0)
 #endif

+static inline void arch_dup_pkeys(struct mm_struct *oldmm,
+				  struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+		return;
+
+	/* Duplicate the oldmm pkey state in mm: */
+	mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
+	mm->context.execute_only_pkey   = oldmm->context.execute_only_pkey;
+#endif
+}
+
 static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
+	arch_dup_pkeys(oldmm, mm);
 	paravirt_arch_dup_mmap(oldmm, mm);
 	return ldt_dup_context(oldmm, mm);
 }
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@ -470,6 +470,7 @@ int crash_load_segments(struct kimage *image)

 	kbuf.memsz = kbuf.bufsz;
 	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	ret = kexec_add_buffer(&kbuf);
 	if (ret) {
 		vfree((void *)image->arch.elf_headers);
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@ -21,10 +21,6 @@

 #define HPET_MASK			CLOCKSOURCE_MASK(32)

-/* FSEC = 10^-15
-   NSEC = 10^-9 */
-#define FSEC_PER_NSEC			1000000L
-
 #define HPET_DEV_USED_BIT		2
 #define HPET_DEV_USED			(1 << HPET_DEV_USED_BIT)
 #define HPET_DEV_VALID			0x8
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@ -434,6 +434,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	kbuf.memsz = PAGE_ALIGN(header->init_size);
 	kbuf.buf_align = header->kernel_alignment;
 	kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	ret = kexec_add_buffer(&kbuf);
 	if (ret)
 		goto out_free_params;
@ -448,6 +449,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 		kbuf.bufsz = kbuf.memsz = initrd_len;
 		kbuf.buf_align = PAGE_SIZE;
 		kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+		kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 		ret = kexec_add_buffer(&kbuf);
 		if (ret)
 			goto out_free_params;
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@ -297,15 +297,16 @@ static int __init tsc_setup(char *str)

 __setup("tsc=", tsc_setup);

-#define MAX_RETRIES     5
-#define SMI_TRESHOLD    50000
+#define MAX_RETRIES		5
+#define TSC_DEFAULT_THRESHOLD	0x20000

 /*
- * Read TSC and the reference counters. Take care of SMI disturbance
+ * Read TSC and the reference counters. Take care of any disturbances
 */
 static u64 tsc_read_refs(u64 *p, int hpet)
 {
 	u64 t1, t2;
+	u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
 	int i;

 	for (i = 0; i < MAX_RETRIES; i++) {
@ -315,7 +316,7 @@ static u64 tsc_read_refs(u64 *p, int hpet)
 		else
 			*p = acpi_pm_read_early();
 		t2 = get_cycles();
-		if ((t2 - t1) < SMI_TRESHOLD)
+		if ((t2 - t1) < thresh)
 			return t2;
 	}
 	return ULLONG_MAX;
@ -703,15 +704,15 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
 	 * zero. In each wait loop iteration we read the TSC and check
 	 * the delta to the previous read. We keep track of the min
 	 * and max values of that delta. The delta is mostly defined
-	 * by the IO time of the PIT access, so we can detect when a
-	 * SMI/SMM disturbance happened between the two reads. If the
+	 * by the IO time of the PIT access, so we can detect when
+	 * any disturbance happened between the two reads. If the
 	 * maximum time is significantly larger than the minimum time,
 	 * then we discard the result and have another try.
 	 *
 	 * 2) Reference counter. If available we use the HPET or the
 	 * PMTIMER as a reference to check the sanity of that value.
 	 * We use separate TSC readouts and check inside of the
-	 * reference read for a SMI/SMM disturbance. We dicard
+	 * reference read for any possible disturbance. We dicard
 	 * disturbed values here as well. We do that around the PIT
 	 * calibration delay loop as we have to wait for a certain
 	 * amount of time anyway.
@ -744,7 +745,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
 		if (ref1 == ref2)
 			continue;

-		/* Check, whether the sampling was disturbed by an SMI */
+		/* Check, whether the sampling was disturbed */
 		if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
 			continue;

@ -1268,7 +1269,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
 */
 static void tsc_refine_calibration_work(struct work_struct *work)
 {
-	static u64 tsc_start = -1, ref_start;
+	static u64 tsc_start = ULLONG_MAX, ref_start;
 	static int hpet;
 	u64 tsc_stop, ref_stop, delta;
 	unsigned long freq;
@ -1283,14 +1284,15 @@ static void tsc_refine_calibration_work(struct work_struct *work)
 	 * delayed the first time we expire. So set the workqueue
 	 * again once we know timers are working.
 	 */
-	if (tsc_start == -1) {
+	if (tsc_start == ULLONG_MAX) {
+restart:
 		/*
 		 * Only set hpet once, to avoid mixing hardware
 		 * if the hpet becomes enabled later.
 		 */
 		hpet = is_hpet_enabled();
-		schedule_delayed_work(&tsc_irqwork, HZ);
 		tsc_start = tsc_read_refs(&ref_start, hpet);
+		schedule_delayed_work(&tsc_irqwork, HZ);
 		return;
 	}

@ -1300,9 +1302,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
 	if (ref_start == ref_stop)
 		goto out;

-	/* Check, whether the sampling was disturbed by an SMI */
-	if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
-		goto out;
+	/* Check, whether the sampling was disturbed */
+	if (tsc_stop == ULLONG_MAX)
+		goto restart;

 	delta = tsc_stop - tsc_start;
 	delta *= 1000000LL;
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@ -36,8 +36,8 @@ static inline u16 i8254(void)
 	u16 status, timer;

 	do {
-		outb(I8254_PORT_CONTROL,
-		     I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+		outb(I8254_CMD_READBACK | I8254_SELECT_COUNTER0,
+		     I8254_PORT_CONTROL);
 		status = inb(I8254_PORT_COUNTER0);
 		timer  = inb(I8254_PORT_COUNTER0);
 		timer |= inb(I8254_PORT_COUNTER0) << 8;
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@ -158,8 +158,8 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
 	pmd = pmd_offset(pud, ppd->vaddr);
 	if (pmd_none(*pmd)) {
 		pte = ppd->pgtable_area;
-		memset(pte, 0, sizeof(pte) * PTRS_PER_PTE);
-		ppd->pgtable_area += sizeof(pte) * PTRS_PER_PTE;
+		memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE);
+		ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE;
 		set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
 	}

--- a/block/blk-core.c
+++ b/block/blk-core.c
@ -1083,18 +1083,7 @@ blk_qc_t generic_make_request(struct bio *bio)
 			/* Create a fresh bio_list for all subordinate requests */
 			bio_list_on_stack[1] = bio_list_on_stack[0];
 			bio_list_init(&bio_list_on_stack[0]);
-
-			/*
-			 * Since we're recursing into make_request here, ensure
-			 * that we mark this bio as already having entered the queue.
-			 * If not, and the queue is going away, we can get stuck
-			 * forever on waiting for the queue reference to drop. But
-			 * that will never happen, as we're already holding a
-			 * reference to it.
-			 */
-			bio_set_flag(bio, BIO_QUEUE_ENTERED);
 			ret = q->make_request_fn(q, bio);
-			bio_clear_flag(bio, BIO_QUEUE_ENTERED);

 			/* sort new bios into those for a lower level
 			 * and those for the same level
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@ -272,6 +272,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 		/* there isn't chance to merge the splitted bio */
 		split->bi_opf |= REQ_NOMERGE;

+		/*
+		 * Since we're recursing into make_request here, ensure
+		 * that we mark this bio as already having entered the queue.
+		 * If not, and the queue is going away, we can get stuck
+		 * forever on waiting for the queue reference to drop. But
+		 * that will never happen, as we're already holding a
+		 * reference to it.
+		 */
+		bio_set_flag(*bio, BIO_QUEUE_ENTERED);
+
 		bio_chain(split, *bio);
 		trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
 		generic_make_request(*bio);
--- a/drivers/edac/altera_edac.h
+++ b/drivers/edac/altera_edac.h
@ -295,8 +295,8 @@ struct altr_sdram_mc_data {
 #define S10_SYSMGR_ECC_INTSTAT_DERR_OFST  0xA0

 /* Sticky registers for Uncorrected Errors */
-#define S10_SYSMGR_UE_VAL_OFST            0x120
-#define S10_SYSMGR_UE_ADDR_OFST           0x124
+#define S10_SYSMGR_UE_VAL_OFST            0x220
+#define S10_SYSMGR_UE_ADDR_OFST           0x224

 #define S10_DDR0_IRQ_MASK                 BIT(16)

--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@ -2399,13 +2399,14 @@ static void its_free_device(struct its_device *its_dev)
 	kfree(its_dev);
 }

-static int its_alloc_device_irq(struct its_device *dev, irq_hw_number_t *hwirq)
+static int its_alloc_device_irq(struct its_device *dev, int nvecs, irq_hw_number_t *hwirq)
 {
 	int idx;

-	idx = find_first_zero_bit(dev->event_map.lpi_map,
-				  dev->event_map.nr_lpis);
-	if (idx == dev->event_map.nr_lpis)
+	idx = bitmap_find_free_region(dev->event_map.lpi_map,
+				      dev->event_map.nr_lpis,
+				      get_count_order(nvecs));
+	if (idx < 0)
 		return -ENOSPC;

 	*hwirq = dev->event_map.lpi_base + idx;
@ -2501,21 +2502,21 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 	int err;
 	int i;

-	for (i = 0; i < nr_irqs; i++) {
-		err = its_alloc_device_irq(its_dev, &hwirq);
-		if (err)
-			return err;
+	err = its_alloc_device_irq(its_dev, nr_irqs, &hwirq);
+	if (err)
+		return err;

-		err = its_irq_gic_domain_alloc(domain, virq + i, hwirq);
+	for (i = 0; i < nr_irqs; i++) {
+		err = its_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
 		if (err)
 			return err;

 		irq_domain_set_hwirq_and_chip(domain, virq + i,
-					      hwirq, &its_irq_chip, its_dev);
+					      hwirq + i, &its_irq_chip, its_dev);
 		irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(virq + i)));
 		pr_debug("ID:%d pID:%d vID:%d\n",
-			 (int)(hwirq - its_dev->event_map.lpi_base),
-			 (int) hwirq, virq + i);
+			 (int)(hwirq + i - its_dev->event_map.lpi_base),
+			 (int)(hwirq + i), virq + i);
 	}

 	return 0;
--- a/drivers/irqchip/irq-gic-v3-mbi.c
+++ b/drivers/irqchip/irq-gic-v3-mbi.c
@ -24,7 +24,7 @@ struct mbi_range {
 	unsigned long		*bm;
 };

-static struct mutex		mbi_lock;
+static DEFINE_MUTEX(mbi_lock);
 static phys_addr_t		mbi_phys_base;
 static struct mbi_range		*mbi_ranges;
 static unsigned int		mbi_range_nr;
--- a/drivers/irqchip/irq-madera.c
+++ b/drivers/irqchip/irq-madera.c
@ -7,7 +7,6 @@
 */

 #include <linux/module.h>
-#include <linux/gpio.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
@ -16,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_gpio.h>
 #include <linux/of_irq.h>
 #include <linux/irqchip/irq-madera.h>
 #include <linux/mfd/madera/core.h>
--- a/drivers/irqchip/irq-stm32-exti.c
+++ b/drivers/irqchip/irq-stm32-exti.c
@ -822,6 +822,7 @@ out_unmap:
 static const struct irq_domain_ops stm32_exti_h_domain_ops = {
 	.alloc	= stm32_exti_h_domain_alloc,
 	.free	= irq_domain_free_irqs_common,
+	.xlate = irq_domain_xlate_twocell,
 };

 static int
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@ -257,10 +257,7 @@ static int handle_tx(struct ser_device *ser)
 		if (skb->len == 0) {
 			struct sk_buff *tmp = skb_dequeue(&ser->head);
 			WARN_ON(tmp != skb);
-			if (in_interrupt())
-				dev_kfree_skb_irq(skb);
-			else
-				kfree_skb(skb);
+			dev_consume_skb_any(skb);
 		}
 	}
 	/* Send flow off if queue is empty */
--- a/drivers/net/dsa/mv88e6xxx/serdes.c
+++ b/drivers/net/dsa/mv88e6xxx/serdes.c
@ -664,7 +664,7 @@ int mv88e6390_serdes_irq_setup(struct mv88e6xxx_chip *chip, int port)
 	if (port < 9)
 		return 0;

-	return mv88e6390_serdes_irq_setup(chip, port);
+	return mv88e6390x_serdes_irq_setup(chip, port);
 }

 void mv88e6390x_serdes_irq_free(struct mv88e6xxx_chip *chip, int port)
--- a/drivers/net/ethernet/alteon/acenic.c
+++ b/drivers/net/ethernet/alteon/acenic.c
@ -2059,7 +2059,7 @@ static inline void ace_tx_int(struct net_device *dev,
 		if (skb) {
 			dev->stats.tx_packets++;
 			dev->stats.tx_bytes += skb->len;
-			dev_kfree_skb_irq(skb);
+			dev_consume_skb_irq(skb);
 			info->skb = NULL;
 		}

--- a/drivers/net/ethernet/altera/altera_msgdma.c
+++ b/drivers/net/ethernet/altera/altera_msgdma.c
@ -145,7 +145,8 @@ u32 msgdma_tx_completions(struct altera_tse_private *priv)
 			& 0xffff;

 	if (inuse) { /* Tx FIFO is not empty */
-		ready = priv->tx_prod - priv->tx_cons - inuse - 1;
+		ready = max_t(int,
+			      priv->tx_prod - priv->tx_cons - inuse - 1, 0);
 	} else {
 		/* Check for buffered last packet */
 		status = csrrd32(priv->tx_dma_csr, msgdma_csroffs(status));
--- a/drivers/net/ethernet/amd/amd8111e.c
+++ b/drivers/net/ethernet/amd/amd8111e.c
@ -666,7 +666,7 @@ static int amd8111e_tx(struct net_device *dev)
 			pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[tx_index],
 				  	lp->tx_skbuff[tx_index]->len,
 					PCI_DMA_TODEVICE);
-			dev_kfree_skb_irq (lp->tx_skbuff[tx_index]);
+			dev_consume_skb_irq(lp->tx_skbuff[tx_index]);
 			lp->tx_skbuff[tx_index] = NULL;
 			lp->tx_dma_addr[tx_index] = 0;
 		}
--- a/drivers/net/ethernet/apple/bmac.c
+++ b/drivers/net/ethernet/apple/bmac.c
@ -777,7 +777,7 @@ static irqreturn_t bmac_txdma_intr(int irq, void *dev_id)

 		if (bp->tx_bufs[bp->tx_empty]) {
 			++dev->stats.tx_packets;
-			dev_kfree_skb_irq(bp->tx_bufs[bp->tx_empty]);
+			dev_consume_skb_irq(bp->tx_bufs[bp->tx_empty]);
 		}
 		bp->tx_bufs[bp->tx_empty] = NULL;
 		bp->tx_fullup = 0;
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@ -638,7 +638,7 @@ static void b44_tx(struct b44 *bp)
 		bytes_compl += skb->len;
 		pkts_compl++;

-		dev_kfree_skb_irq(skb);
+		dev_consume_skb_irq(skb);
 	}

 	netdev_completed_queue(bp->dev, pkts_compl, bytes_compl);
@ -1012,7 +1012,7 @@ static netdev_tx_t b44_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		}

 		skb_copy_from_linear_data(skb, skb_put(bounce_skb, len), len);
-		dev_kfree_skb_any(skb);
+		dev_consume_skb_any(skb);
 		skb = bounce_skb;
 	}

--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@ -643,6 +643,7 @@
 #define MACB_CAPS_JUMBO				0x00000020
 #define MACB_CAPS_GEM_HAS_PTP			0x00000040
 #define MACB_CAPS_BD_RD_PREFETCH		0x00000080
+#define MACB_CAPS_NEEDS_RSTONUBR		0x00000100
 #define MACB_CAPS_FIFO_MODE			0x10000000
 #define MACB_CAPS_GIGABIT_MODE_AVAILABLE	0x20000000
 #define MACB_CAPS_SG_DISABLED			0x40000000
@ -1214,6 +1215,8 @@ struct macb {

 	int	rx_bd_rd_prefetch;
 	int	tx_bd_rd_prefetch;
+
+	u32	rx_intr_mask;
 };

 #ifdef CONFIG_MACB_USE_HWSTAMP
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@ -56,8 +56,7 @@
 /* level of occupied TX descriptors under which we wake up TX process */
 #define MACB_TX_WAKEUP_THRESH(bp)	(3 * (bp)->tx_ring_size / 4)

-#define MACB_RX_INT_FLAGS	(MACB_BIT(RCOMP) | MACB_BIT(RXUBR)	\
-				 | MACB_BIT(ISR_ROVR))
+#define MACB_RX_INT_FLAGS	(MACB_BIT(RCOMP) | MACB_BIT(ISR_ROVR))
 #define MACB_TX_ERR_FLAGS	(MACB_BIT(ISR_TUND)			\
 					| MACB_BIT(ISR_RLE)		\
 					| MACB_BIT(TXERR))
@ -1270,7 +1269,7 @@ static int macb_poll(struct napi_struct *napi, int budget)
 				queue_writel(queue, ISR, MACB_BIT(RCOMP));
 			napi_reschedule(napi);
 		} else {
-			queue_writel(queue, IER, MACB_RX_INT_FLAGS);
+			queue_writel(queue, IER, bp->rx_intr_mask);
 		}
 	}

@ -1288,7 +1287,7 @@ static void macb_hresp_error_task(unsigned long data)
 	u32 ctrl;

 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
-		queue_writel(queue, IDR, MACB_RX_INT_FLAGS |
+		queue_writel(queue, IDR, bp->rx_intr_mask |
 					 MACB_TX_INT_FLAGS |
 					 MACB_BIT(HRESP));
 	}
@ -1318,7 +1317,7 @@ static void macb_hresp_error_task(unsigned long data)

 		/* Enable interrupts */
 		queue_writel(queue, IER,
-			     MACB_RX_INT_FLAGS |
+			     bp->rx_intr_mask |
 			     MACB_TX_INT_FLAGS |
 			     MACB_BIT(HRESP));
 	}
@ -1372,14 +1371,14 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			    (unsigned int)(queue - bp->queues),
 			    (unsigned long)status);

-		if (status & MACB_RX_INT_FLAGS) {
+		if (status & bp->rx_intr_mask) {
 			/* There's no point taking any more interrupts
 			 * until we have processed the buffers. The
 			 * scheduling call may fail if the poll routine
 			 * is already scheduled, so disable interrupts
 			 * now.
 			 */
-			queue_writel(queue, IDR, MACB_RX_INT_FLAGS);
+			queue_writel(queue, IDR, bp->rx_intr_mask);
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
 				queue_writel(queue, ISR, MACB_BIT(RCOMP));

@ -1412,8 +1411,9 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 		/* There is a hardware issue under heavy load where DMA can
 		 * stop, this causes endless "used buffer descriptor read"
 		 * interrupts but it can be cleared by re-enabling RX. See
-		 * the at91 manual, section 41.3.1 or the Zynq manual
-		 * section 16.7.4 for details.
+		 * the at91rm9200 manual, section 41.3.1 or the Zynq manual
+		 * section 16.7.4 for details. RXUBR is only enabled for
+		 * these two versions.
 		 */
 		if (status & MACB_BIT(RXUBR)) {
 			ctrl = macb_readl(bp, NCR);
@ -2259,7 +2259,7 @@ static void macb_init_hw(struct macb *bp)

 		/* Enable interrupts */
 		queue_writel(queue, IER,
-			     MACB_RX_INT_FLAGS |
+			     bp->rx_intr_mask |
 			     MACB_TX_INT_FLAGS |
 			     MACB_BIT(HRESP));
 	}
@ -3907,6 +3907,7 @@ static const struct macb_config sama5d4_config = {
 };

 static const struct macb_config emac_config = {
+	.caps = MACB_CAPS_NEEDS_RSTONUBR,
 	.clk_init = at91ether_clk_init,
 	.init = at91ether_init,
 };
@ -3928,7 +3929,8 @@ static const struct macb_config zynqmp_config = {
 };

 static const struct macb_config zynq_config = {
-	.caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE | MACB_CAPS_NO_GIGABIT_HALF,
+	.caps = MACB_CAPS_GIGABIT_MODE_AVAILABLE | MACB_CAPS_NO_GIGABIT_HALF |
+		MACB_CAPS_NEEDS_RSTONUBR,
 	.dma_burst_length = 16,
 	.clk_init = macb_clk_init,
 	.init = macb_init,
@ -4083,6 +4085,10 @@ static int macb_probe(struct platform_device *pdev)
 						macb_dma_desc_get_size(bp);
 	}

+	bp->rx_intr_mask = MACB_RX_INT_FLAGS;
+	if (bp->caps & MACB_CAPS_NEEDS_RSTONUBR)
+		bp->rx_intr_mask |= MACB_BIT(RXUBR);
+
 	mac = of_get_mac_address(np);
 	if (mac) {
 		ether_addr_copy(bp->dev->dev_addr, mac);
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@ -2418,6 +2418,8 @@ static int hns_nic_dev_probe(struct platform_device *pdev)
 out_notify_fail:
 	(void)cancel_work_sync(&priv->service_task);
 out_read_prop_fail:
+	/* safe for ACPI FW */
+	of_node_put(to_of_node(priv->fwnode));
 	free_netdev(ndev);
 	return ret;
 }
@ -2447,6 +2449,9 @@ static int hns_nic_dev_remove(struct platform_device *pdev)
 	set_bit(NIC_STATE_REMOVING, &priv->state);
 	(void)cancel_work_sync(&priv->service_task);

+	/* safe for ACPI FW */
+	of_node_put(to_of_node(priv->fwnode));
+
 	free_netdev(ndev);
 	return 0;
 }
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@ -1157,16 +1157,18 @@ static int hns_get_regs_len(struct net_device *net_dev)
 */
 static int hns_nic_nway_reset(struct net_device *netdev)
 {
-	int ret = 0;
 	struct phy_device *phy = netdev->phydev;

-	if (netif_running(netdev)) {
-		/* if autoneg is disabled, don't restart auto-negotiation */
-		if (phy && phy->autoneg == AUTONEG_ENABLE)
-			ret = genphy_restart_aneg(phy);
-	}
+	if (!netif_running(netdev))
+		return 0;

-	return ret;
+	if (!phy)
+		return -EOPNOTSUPP;
+
+	if (phy->autoneg != AUTONEG_ENABLE)
+		return -EINVAL;
+
+	return genphy_restart_aneg(phy);
 }

 static u32
--- a/drivers/net/ethernet/hisilicon/hns_mdio.c
+++ b/drivers/net/ethernet/hisilicon/hns_mdio.c
@ -321,7 +321,7 @@ static int hns_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 		}

 		hns_mdio_cmd_write(mdio_dev, is_c45,
-				   MDIO_C45_WRITE_ADDR, phy_id, devad);
+				   MDIO_C45_READ, phy_id, devad);
 	}

 	/* Step 5: waitting for MDIO_COMMAND_REG 's mdio_start==0,*/
--- a/drivers/net/ethernet/i825xx/82596.c
+++ b/drivers/net/ethernet/i825xx/82596.c
@ -1310,7 +1310,7 @@ static irqreturn_t i596_interrupt(int irq, void *dev_id)
 						dev->stats.tx_aborted_errors++;
 				}

-				dev_kfree_skb_irq(skb);
+				dev_consume_skb_irq(skb);

 				tx_cmd->cmd.command = 0; /* Mark free */
 				break;
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@ -949,7 +949,7 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 	if (params->rx_dim_enabled)
 		__set_bit(MLX5E_RQ_STATE_AM, &c->rq.state);

-	if (params->pflags & MLX5E_PFLAG_RX_NO_CSUM_COMPLETE)
+	if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE))
 		__set_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &c->rq.state);

 	return 0;
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@ -1114,9 +1114,17 @@ static int mlx5e_rep_get_phys_port_name(struct net_device *dev,
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
-	int ret;
+	int ret, pf_num;
+
+	ret = mlx5_lag_get_pf_num(priv->mdev, &pf_num);
+	if (ret)
+		return ret;
+
+	if (rep->vport == FDB_UPLINK_VPORT)
+		ret = snprintf(buf, len, "p%d", pf_num);
+	else
+		ret = snprintf(buf, len, "pf%dvf%d", pf_num, rep->vport - 1);

-	ret = snprintf(buf, len, "%d", rep->vport - 1);
 	if (ret >= len)
 		return -EOPNOTSUPP;

@ -1264,6 +1272,18 @@ static int mlx5e_uplink_rep_set_mac(struct net_device *netdev, void *addr)
 	return 0;
 }

+static int mlx5e_uplink_rep_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
+					__be16 vlan_proto)
+{
+	netdev_warn_once(dev, "legacy vf vlan setting isn't supported in switchdev mode\n");
+
+	if (vlan != 0)
+		return -EOPNOTSUPP;
+
+	/* allow setting 0-vid for compatibility with libvirt */
+	return 0;
+}
+
 static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
 	.switchdev_port_attr_get	= mlx5e_attr_get,
 };
@ -1298,6 +1318,7 @@ static const struct net_device_ops mlx5e_netdev_ops_uplink_rep = {
 	.ndo_set_vf_rate         = mlx5e_set_vf_rate,
 	.ndo_get_vf_config       = mlx5e_get_vf_config,
 	.ndo_get_vf_stats        = mlx5e_get_vf_stats,
+	.ndo_set_vf_vlan         = mlx5e_uplink_rep_set_vf_vlan,
 };

 bool mlx5e_eswitch_rep(struct net_device *netdev)
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@ -1134,13 +1134,6 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 	int err = 0;
 	u8 *smac_v;

-	if (vport->info.spoofchk && !is_valid_ether_addr(vport->info.mac)) {
-		mlx5_core_warn(esw->dev,
-			       "vport[%d] configure ingress rules failed, illegal mac with spoofchk\n",
-			       vport->vport);
-		return -EPERM;
-	}
-
 	esw_vport_cleanup_ingress_rules(esw, vport);

 	if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) {
@ -1728,7 +1721,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	int vport_num;
 	int err;

-	if (!MLX5_ESWITCH_MANAGER(dev))
+	if (!MLX5_VPORT_MANAGER(dev))
 		return 0;

 	esw_info(dev,
@ -1797,7 +1790,7 @@ abort:

 void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
 {
-	if (!esw || !MLX5_ESWITCH_MANAGER(esw->dev))
+	if (!esw || !MLX5_VPORT_MANAGER(esw->dev))
 		return;

 	esw_info(esw->dev, "cleanup\n");
@ -1827,13 +1820,10 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 	mutex_lock(&esw->state_lock);
 	evport = &esw->vports[vport];

-	if (evport->info.spoofchk && !is_valid_ether_addr(mac)) {
+	if (evport->info.spoofchk && !is_valid_ether_addr(mac))
 		mlx5_core_warn(esw->dev,
-			       "MAC invalidation is not allowed when spoofchk is on, vport(%d)\n",
+			       "Set invalid MAC while spoofchk is on, vport(%d)\n",
 			       vport);
-		err = -EPERM;
-		goto unlock;
-	}

 	err = mlx5_modify_nic_vport_mac_address(esw->dev, vport, mac);
 	if (err) {
@ -1979,6 +1969,10 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
 	evport = &esw->vports[vport];
 	pschk = evport->info.spoofchk;
 	evport->info.spoofchk = spoofchk;
+	if (pschk && !is_valid_ether_addr(evport->info.mac))
+		mlx5_core_warn(esw->dev,
+			       "Spoofchk in set while MAC is invalid, vport(%d)\n",
+			       evport->vport);
 	if (evport->enabled && esw->mode == SRIOV_LEGACY)
 		err = esw_vport_ingress_config(esw, evport);
 	if (err)
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@ -616,6 +616,27 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 	}
 }

+int mlx5_lag_get_pf_num(struct mlx5_core_dev *dev, int *pf_num)
+{
+	struct mlx5_lag *ldev;
+	int n;
+
+	ldev = mlx5_lag_dev_get(dev);
+	if (!ldev) {
+		mlx5_core_warn(dev, "no lag device, can't get pf num\n");
+		return -EINVAL;
+	}
+
+	for (n = 0; n < MLX5_MAX_PORTS; n++)
+		if (ldev->pf[n].dev == dev) {
+			*pf_num = n;
+			return 0;
+		}
+
+	mlx5_core_warn(dev, "wasn't able to locate pf in the lag device\n");
+	return -EINVAL;
+}
+
 /* Must be called with intf_mutex held */
 void mlx5_lag_remove(struct mlx5_core_dev *dev)
 {
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@ -187,6 +187,8 @@ static inline int mlx5_lag_is_lacp_owner(struct mlx5_core_dev *dev)
 		    MLX5_CAP_GEN(dev, lag_master);
 }

+int mlx5_lag_get_pf_num(struct mlx5_core_dev *dev, int *pf_num);
+
 void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol);
 void mlx5_lag_update(struct mlx5_core_dev *dev);

--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@ -44,14 +44,15 @@ static struct mlx5_core_rsc_common *
 mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn)
 {
 	struct mlx5_core_rsc_common *common;
+	unsigned long flags;

-	spin_lock(&table->lock);
+	spin_lock_irqsave(&table->lock, flags);

 	common = radix_tree_lookup(&table->tree, rsn);
 	if (common)
 		atomic_inc(&common->refcount);

-	spin_unlock(&table->lock);
+	spin_unlock_irqrestore(&table->lock, flags);

 	return common;
 }
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@ -795,19 +795,19 @@ static void qed_init_qm_pq(struct qed_hwfn *p_hwfn,

 /* get pq index according to PQ_FLAGS */
 static u16 *qed_init_qm_get_idx_from_flags(struct qed_hwfn *p_hwfn,
-					   u32 pq_flags)
+					   unsigned long pq_flags)
 {
 	struct qed_qm_info *qm_info = &p_hwfn->qm_info;

 	/* Can't have multiple flags set here */
-	if (bitmap_weight((unsigned long *)&pq_flags,
+	if (bitmap_weight(&pq_flags,
 			  sizeof(pq_flags) * BITS_PER_BYTE) > 1) {
-		DP_ERR(p_hwfn, "requested multiple pq flags 0x%x\n", pq_flags);
+		DP_ERR(p_hwfn, "requested multiple pq flags 0x%lx\n", pq_flags);
 		goto err;
 	}

 	if (!(qed_get_pq_flags(p_hwfn) & pq_flags)) {
-		DP_ERR(p_hwfn, "pq flag 0x%x is not set\n", pq_flags);
+		DP_ERR(p_hwfn, "pq flag 0x%lx is not set\n", pq_flags);
 		goto err;
 	}

--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@ -609,6 +609,10 @@ qed_sp_update_accept_mode(struct qed_hwfn *p_hwfn,
 			  (!!(accept_filter & QED_ACCEPT_MCAST_MATCHED) &&
 			   !!(accept_filter & QED_ACCEPT_MCAST_UNMATCHED)));

+		SET_FIELD(state, ETH_VPORT_TX_MODE_UCAST_ACCEPT_ALL,
+			  (!!(accept_filter & QED_ACCEPT_UCAST_MATCHED) &&
+			   !!(accept_filter & QED_ACCEPT_UCAST_UNMATCHED)));
+
 		SET_FIELD(state, ETH_VPORT_TX_MODE_BCAST_ACCEPT_ALL,
 			  !!(accept_filter & QED_ACCEPT_BCAST));

@ -744,6 +748,11 @@ int qed_sp_vport_update(struct qed_hwfn *p_hwfn,
 		return rc;
 	}

+	if (p_params->update_ctl_frame_check) {
+		p_cmn->ctl_frame_mac_check_en = p_params->mac_chk_en;
+		p_cmn->ctl_frame_ethtype_check_en = p_params->ethtype_chk_en;
+	}
+
 	/* Update mcast bins for VFs, PF doesn't use this functionality */
 	qed_sp_update_mcast_bin(p_hwfn, p_ramrod, p_params);

@ -2688,7 +2697,8 @@ static int qed_configure_filter_rx_mode(struct qed_dev *cdev,
 	if (type == QED_FILTER_RX_MODE_TYPE_PROMISC) {
 		accept_flags.rx_accept_filter |= QED_ACCEPT_UCAST_UNMATCHED |
 						 QED_ACCEPT_MCAST_UNMATCHED;
-		accept_flags.tx_accept_filter |= QED_ACCEPT_MCAST_UNMATCHED;
+		accept_flags.tx_accept_filter |= QED_ACCEPT_UCAST_UNMATCHED |
+						 QED_ACCEPT_MCAST_UNMATCHED;
 	} else if (type == QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC) {
 		accept_flags.rx_accept_filter |= QED_ACCEPT_MCAST_UNMATCHED;
 		accept_flags.tx_accept_filter |= QED_ACCEPT_MCAST_UNMATCHED;
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@ -219,6 +219,9 @@ struct qed_sp_vport_update_params {
 	struct qed_rss_params		*rss_params;
 	struct qed_filter_accept_flags	accept_flags;
 	struct qed_sge_tpa_params	*sge_tpa_params;
+	u8				update_ctl_frame_check;
+	u8				mac_chk_en;
+	u8				ethtype_chk_en;
 };

 int qed_sp_vport_update(struct qed_hwfn *p_hwfn,
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@ -2451,19 +2451,24 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb,
 {
 	struct qed_ll2_tx_pkt_info pkt;
 	const skb_frag_t *frag;
+	u8 flags = 0, nr_frags;
 	int rc = -EINVAL, i;
 	dma_addr_t mapping;
 	u16 vlan = 0;
-	u8 flags = 0;

 	if (unlikely(skb->ip_summed != CHECKSUM_NONE)) {
 		DP_INFO(cdev, "Cannot transmit a checksummed packet\n");
 		return -EINVAL;
 	}

-	if (1 + skb_shinfo(skb)->nr_frags > CORE_LL2_TX_MAX_BDS_PER_PACKET) {
+	/* Cache number of fragments from SKB since SKB may be freed by
+	 * the completion routine after calling qed_ll2_prepare_tx_packet()
+	 */
+	nr_frags = skb_shinfo(skb)->nr_frags;
+
+	if (1 + nr_frags > CORE_LL2_TX_MAX_BDS_PER_PACKET) {
 		DP_ERR(cdev, "Cannot transmit a packet with %d fragments\n",
-		       1 + skb_shinfo(skb)->nr_frags);
+		       1 + nr_frags);
 		return -EINVAL;
 	}

@ -2485,7 +2490,7 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb,
 	}

 	memset(&pkt, 0, sizeof(pkt));
-	pkt.num_of_bds = 1 + skb_shinfo(skb)->nr_frags;
+	pkt.num_of_bds = 1 + nr_frags;
 	pkt.vlan = vlan;
 	pkt.bd_flags = flags;
 	pkt.tx_dest = QED_LL2_TX_DEST_NW;
@ -2496,12 +2501,17 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb,
 	    test_bit(QED_LL2_XMIT_FLAGS_FIP_DISCOVERY, &xmit_flags))
 		pkt.remove_stag = true;

+	/* qed_ll2_prepare_tx_packet() may actually send the packet if
+	 * there are no fragments in the skb and subsequently the completion
+	 * routine may run and free the SKB, so no dereferencing the SKB
+	 * beyond this point unless skb has any fragments.
+	 */
 	rc = qed_ll2_prepare_tx_packet(&cdev->hwfns[0], cdev->ll2->handle,
 				       &pkt, 1);
 	if (rc)
 		goto err;

-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+	for (i = 0; i < nr_frags; i++) {
 		frag = &skb_shinfo(skb)->frags[i];

 		mapping = skb_frag_dma_map(&cdev->pdev->dev, frag, 0,
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@ -1969,7 +1969,9 @@ static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn,
 	params.vport_id = vf->vport_id;
 	params.max_buffers_per_cqe = start->max_buffers_per_cqe;
 	params.mtu = vf->mtu;
-	params.check_mac = true;
+
+	/* Non trusted VFs should enable control frame filtering */
+	params.check_mac = !vf->p_vf_info.is_trusted_configured;

 	rc = qed_sp_eth_vport_start(p_hwfn, &params);
 	if (rc) {
@ -5137,6 +5139,9 @@ static void qed_iov_handle_trust_change(struct qed_hwfn *hwfn)
 		params.opaque_fid = vf->opaque_fid;
 		params.vport_id = vf->vport_id;

+		params.update_ctl_frame_check = 1;
+		params.mac_chk_en = !vf_info->is_trusted_configured;
+
 		if (vf_info->rx_accept_mode & mask) {
 			flags->update_rx_mode_config = 1;
 			flags->rx_accept_filter = vf_info->rx_accept_mode;
@ -5154,7 +5159,8 @@ static void qed_iov_handle_trust_change(struct qed_hwfn *hwfn)
 		}

 		if (flags->update_rx_mode_config ||
-		    flags->update_tx_mode_config)
+		    flags->update_tx_mode_config ||
+		    params.update_ctl_frame_check)
 			qed_sp_vport_update(hwfn, &params,
 					    QED_SPQ_MODE_EBLOCK, NULL);
 	}
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@ -261,6 +261,7 @@ static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn)
 	struct pfvf_acquire_resp_tlv *resp = &p_iov->pf2vf_reply->acquire_resp;
 	struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info;
 	struct vf_pf_resc_request *p_resc;
+	u8 retry_cnt = VF_ACQUIRE_THRESH;
 	bool resources_acquired = false;
 	struct vfpf_acquire_tlv *req;
 	int rc = 0, attempts = 0;
@ -314,6 +315,15 @@ static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn)

 		/* send acquire request */
 		rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp));
+
+		/* Re-try acquire in case of vf-pf hw channel timeout */
+		if (retry_cnt && rc == -EBUSY) {
+			DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+				   "VF retrying to acquire due to VPC timeout\n");
+			retry_cnt--;
+			continue;
+		}
+
 		if (rc)
 			goto exit;

--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@ -691,7 +691,7 @@ static void cp_tx (struct cp_private *cp)
 			}
 			bytes_compl += skb->len;
 			pkts_compl++;
-			dev_kfree_skb_irq(skb);
+			dev_consume_skb_irq(skb);
 		}

 		cp->tx_skb[tx_tail] = NULL;
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@ -1342,8 +1342,10 @@ static int rk_gmac_powerup(struct rk_priv_data *bsp_priv)
 	}

 	ret = phy_power_on(bsp_priv, true);
-	if (ret)
+	if (ret) {
+		gmac_clk_enable(bsp_priv, false);
 		return ret;
+	}

 	pm_runtime_enable(dev);
 	pm_runtime_get_sync(dev);
--- a/drivers/net/ethernet/ti/cpmac.c
+++ b/drivers/net/ethernet/ti/cpmac.c
@ -608,7 +608,7 @@ static void cpmac_end_xmit(struct net_device *dev, int queue)
 			netdev_dbg(dev, "sent 0x%p, len=%d\n",
 				   desc->skb, desc->skb->len);

-		dev_kfree_skb_irq(desc->skb);
+		dev_consume_skb_irq(desc->skb);
 		desc->skb = NULL;
 		if (__netif_subqueue_stopped(dev, queue))
 			netif_wake_subqueue(dev, queue);
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@ -1337,7 +1337,8 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		n->vqs[i].rx_ring = NULL;
 		vhost_net_buf_init(&n->vqs[i].rxq);
 	}
-	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
+	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
+		       UIO_MAXIOV + VHOST_NET_BATCH);

 	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
 	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@ -1627,7 +1627,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 		vqs[i] = &vs->vqs[i].vq;
 		vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
 	}
-	vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ);
+	vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV);

 	vhost_scsi_init_inflight(vs, NULL);

--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@ -390,9 +390,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 		vq->indirect = kmalloc_array(UIO_MAXIOV,
 					     sizeof(*vq->indirect),
 					     GFP_KERNEL);
-		vq->log = kmalloc_array(UIO_MAXIOV, sizeof(*vq->log),
+		vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
 					GFP_KERNEL);
-		vq->heads = kmalloc_array(UIO_MAXIOV, sizeof(*vq->heads),
+		vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
 					  GFP_KERNEL);
 		if (!vq->indirect || !vq->log || !vq->heads)
 			goto err_nomem;
@ -414,7 +414,7 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 }

 void vhost_dev_init(struct vhost_dev *dev,
-		    struct vhost_virtqueue **vqs, int nvqs)
+		    struct vhost_virtqueue **vqs, int nvqs, int iov_limit)
 {
 	struct vhost_virtqueue *vq;
 	int i;
@ -427,6 +427,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->iotlb = NULL;
 	dev->mm = NULL;
 	dev->worker = NULL;
+	dev->iov_limit = iov_limit;
 	init_llist_head(&dev->work_list);
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@ -170,9 +170,11 @@ struct vhost_dev {
 	struct list_head read_list;
 	struct list_head pending_list;
 	wait_queue_head_t wait;
+	int iov_limit;
 };

-void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
+void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
+		    int nvqs, int iov_limit);
 long vhost_dev_set_owner(struct vhost_dev *dev);
 bool vhost_dev_has_owner(struct vhost_dev *dev);
 long vhost_dev_check_owner(struct vhost_dev *);
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@ -531,7 +531,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
 	vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
 	vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;

-	vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs));
+	vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), UIO_MAXIOV);

 	file->private_data = vsock;
 	spin_lock_init(&vsock->send_pkt_list_lock);
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@ -260,6 +260,7 @@ struct irq_affinity {
 /**
 * struct irq_affinity_desc - Interrupt affinity descriptor
 * @mask:	cpumask to hold the affinity assignment
+ * @is_managed: 1 if the interrupt is managed internally
 */
 struct irq_affinity_desc {
 	struct cpumask	mask;
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@ -24,9 +24,13 @@
 * called near the end of a function. Otherwise, the list can be
 * re-initialized for later re-use by wake_q_init().
 *
- * Note that this can cause spurious wakeups. schedule() callers
+ * NOTE that this can cause spurious wakeups. schedule() callers
 * must ensure the call is done inside a loop, confirming that the
 * wakeup condition has in fact occurred.
+ *
+ * NOTE that there is no guarantee the wakeup will happen any later than the
+ * wake_q_add() location. Therefore task must be ready to be woken at the
+ * location of the wake_q_add().
 */

 #include <linux/sched.h>
--- a/include/net/tls.h
+++ b/include/net/tls.h
@ -120,6 +120,8 @@ struct tls_rec {
 	struct scatterlist sg_aead_out[2];

 	char aad_space[TLS_AAD_SPACE_SIZE];
+	u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE +
+		   TLS_CIPHER_AES_GCM_128_SALT_SIZE];
 	struct aead_request aead_req;
 	u8 aead_req_ctx[];
 };
--- a/kernel/exit.c
+++ b/kernel/exit.c
@ -307,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w)
 	 *        MB (A)	      MB (B)
 	 *    [L] cond		  [L] tsk
 	 */
-	smp_rmb(); /* (B) */
+	smp_mb(); /* (B) */

 	/*
 	 * Avoid using task_rcu_dereference() magic as long as we are careful,
--- a/kernel/futex.c
+++ b/kernel/futex.c
@ -1452,11 +1452,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
 		return;

-	/*
-	 * Queue the task for later wakeup for after we've released
-	 * the hb->lock. wake_q_add() grabs reference to p.
-	 */
-	wake_q_add(wake_q, p);
+	get_task_struct(p);
 	__unqueue_futex(q);
 	/*
 	 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@ -1466,6 +1462,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 	 * plist_del in __unqueue_futex().
 	 */
 	smp_store_release(&q->lock_ptr, NULL);
+
+	/*
+	 * Queue the task for later wakeup for after we've released
+	 * the hb->lock. wake_q_add() grabs reference to p.
+	 */
+	wake_q_add(wake_q, p);
+	put_task_struct(p);
 }

 /*
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@ -457,7 +457,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,

 	/* Validate affinity mask(s) */
 	if (affinity) {
-		for (i = 0; i < cnt; i++, i++) {
+		for (i = 0; i < cnt; i++) {
 			if (cpumask_empty(&affinity[i].mask))
 				return -EINVAL;
 		}
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@ -393,6 +393,9 @@ int irq_setup_affinity(struct irq_desc *desc)
 	}

 	cpumask_and(&mask, cpu_online_mask, set);
+	if (cpumask_empty(&mask))
+		cpumask_copy(&mask, cpu_online_mask);
+
 	if (node != NUMA_NO_NODE) {
 		const struct cpumask *nodemask = cpumask_of_node(node);

--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@ -198,15 +198,22 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 		woken++;
 		tsk = waiter->task;

-		wake_q_add(wake_q, tsk);
+		get_task_struct(tsk);
 		list_del(&waiter->list);
 		/*
-		 * Ensure that the last operation is setting the reader
+		 * Ensure calling get_task_struct() before setting the reader
 		 * waiter to nil such that rwsem_down_read_failed() cannot
 		 * race with do_exit() by always holding a reference count
 		 * to the task to wakeup.
 		 */
 		smp_store_release(&waiter->task, NULL);
+		/*
+		 * Ensure issuing the wakeup (either by us or someone else)
+		 * after setting the reader waiter to nil.
+		 */
+		wake_q_add(wake_q, tsk);
+		/* wake_q_add() already take the task ref */
+		put_task_struct(tsk);
 	}

 	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@ -396,6 +396,18 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 #endif

+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 {
 	struct wake_q_node *node = &task->wake_q;
@ -405,10 +417,11 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 	 * its already queued (either by us or someone else) and will get the
 	 * wakeup due to that.
 	 *
-	 * This cmpxchg() executes a full barrier, which pairs with the full
-	 * barrier executed by the wakeup in wake_up_q().
+	 * In order to ensure that a pending wakeup will observe our pending
+	 * state, even in the failed case, an explicit smp_mb() must be used.
 	 */
-	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+	smp_mb__before_atomic();
+	if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
 		return;

 	get_task_struct(task);
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@ -685,6 +685,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	 * set up the signal and overrun bookkeeping.
 	 */
 	timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
+	timer->it_interval = ns_to_ktime(timer->it.cpu.incr);

 	/*
 	 * This acts as a modification timestamp for the timer,
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@ -5701,18 +5701,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			cond_resched();
 		}
 	}
-#ifdef CONFIG_SPARSEMEM
-	/*
-	 * If the zone does not span the rest of the section then
-	 * we should at least initialize those pages. Otherwise we
-	 * could blow up on a poisoned page in some paths which depend
-	 * on full sections being initialized (e.g. memory hotplug).
-	 */
-	while (end_pfn % PAGES_PER_SECTION) {
-		__init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
-		end_pfn++;
-	}
-#endif
 }

 #ifdef CONFIG_ZONE_DEVICE
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@ -2293,9 +2293,12 @@ static int compat_do_replace(struct net *net, void __user *user,

 	xt_compat_lock(NFPROTO_BRIDGE);

-	ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
-	if (ret < 0)
-		goto out_unlock;
+	if (tmp.nentries) {
+		ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
+		if (ret < 0)
+			goto out_unlock;
+	}
+
 	ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
 	if (ret < 0)
 		goto out_unlock;
--- a/net/core/dev.c
+++ b/net/core/dev.c
@ -8712,6 +8712,9 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);

+	/* napi_busy_loop stats accounting wants this */
+	dev_net_set(dev, &init_net);
+
 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
 	 * because users of this 'device' dont need to change
 	 * its refcount.
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@ -56,7 +56,7 @@
 #include <net/dn_neigh.h>
 #include <net/dn_fib.h>

-#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn))
+#define DN_IFREQ_SIZE (offsetof(struct ifreq, ifr_ifru) + sizeof(struct sockaddr_dn))

 static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00};
 static char dn_rt_all_rt_mcast[ETH_ALEN]  = {0xAB,0x00,0x00,0x03,0x00,0x00};
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@ -74,6 +74,33 @@ drop:
 	return 0;
 }

+static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi,
+		     int encap_type)
+{
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->saddr, iph->daddr, 0);
+	if (tunnel) {
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+
+		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+
+		skb->dev = tunnel->dev;
+
+		return xfrm_input(skb, nexthdr, spi, encap_type);
+	}
+
+	return -EINVAL;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
 static int vti_rcv(struct sk_buff *skb)
 {
 	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
@ -82,6 +109,14 @@ static int vti_rcv(struct sk_buff *skb)
 	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
 }

+static int vti_rcv_ipip(struct sk_buff *skb)
+{
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0);
+}
+
 static int vti_rcv_cb(struct sk_buff *skb, int err)
 {
 	unsigned short family;
@ -435,6 +470,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
 	.priority	=	100,
 };

+static struct xfrm_tunnel ipip_handler __read_mostly = {
+	.handler	=	vti_rcv_ipip,
+	.err_handler	=	vti4_err,
+	.priority	=	0,
+};
+
 static int __net_init vti_init_net(struct net *net)
 {
 	int err;
@ -603,6 +644,13 @@ static int __init vti_init(void)
 	if (err < 0)
 		goto xfrm_proto_comp_failed;

+	msg = "ipip tunnel";
+	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+	if (err < 0) {
+		pr_info("%s: cant't register tunnel\n",__func__);
+		goto xfrm_tunnel_failed;
+	}
+
 	msg = "netlink interface";
 	err = rtnl_link_register(&vti_link_ops);
 	if (err < 0)
@ -612,6 +660,8 @@ static int __init vti_init(void)

 rtnl_link_failed:
 	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_tunnel_failed:
+	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
 xfrm_proto_comp_failed:
 	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
 xfrm_proto_ah_failed:
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@ -846,9 +846,9 @@ static int clusterip_net_init(struct net *net)

 static void clusterip_net_exit(struct net *net)
 {
+#ifdef CONFIG_PROC_FS
 	struct clusterip_net *cn = clusterip_pernet(net);

-#ifdef CONFIG_PROC_FS
 	mutex_lock(&cn->mutex);
 	proc_remove(cn->procdir);
 	cn->procdir = NULL;
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@ -1516,6 +1516,9 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 			continue;
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
 		list_del_rcu(&c->list);
+		call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
+					       FIB_EVENT_ENTRY_DEL,
+					       (struct mfc6_cache *)c, mrt->id);
 		mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
 		mr_cache_put(c);
 	}
@ -1524,10 +1527,6 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
-			call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
-						       FIB_EVENT_ENTRY_DEL,
-						       (struct mfc6_cache *)c,
-						       mrt->id);
 			mr6_netlink_event(mrt, (struct mfc6_cache *)c,
 					  RTM_DELROUTE);
 			ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@ -2220,6 +2220,18 @@ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user
 		  u->tcp_fin_timeout,
 		  u->udp_timeout);

+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
+	    u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
+		return -EINVAL;
+	}
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
+		return -EINVAL;
+#endif
+
 #ifdef CONFIG_IP_VS_PROTO_TCP
 	if (u->tcp_timeout) {
 		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@ -66,6 +66,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 			     int ttl_check,
 			     struct nf_osf_hdr_ctx *ctx)
 {
+	const __u8 *optpinit = ctx->optp;
 	unsigned int check_WSS = 0;
 	int fmatch = FMATCH_WRONG;
 	int foptsize, optnum;
@ -155,6 +156,9 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 		}
 	}

+	if (fmatch != FMATCH_OK)
+		ctx->optp = optpinit;
+
 	return fmatch == FMATCH_OK;
 }

--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@ -22,11 +22,15 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_arp/arp_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netns/generic.h>

 struct nft_xt {
 	struct list_head	head;
 	struct nft_expr_ops	ops;
-	unsigned int		refcnt;
+	refcount_t		refcnt;
+
+	/* used only when transaction mutex is locked */
+	unsigned int		listcnt;

 	/* Unlike other expressions, ops doesn't have static storage duration.
 	 * nft core assumes they do.  We use kfree_rcu so that nft core can
@ -43,10 +47,24 @@ struct nft_xt_match_priv {
 	void *info;
 };

+struct nft_compat_net {
+	struct list_head nft_target_list;
+	struct list_head nft_match_list;
+};
+
+static unsigned int nft_compat_net_id __read_mostly;
+static struct nft_expr_type nft_match_type;
+static struct nft_expr_type nft_target_type;
+
+static struct nft_compat_net *nft_compat_pernet(struct net *net)
+{
+	return net_generic(net, nft_compat_net_id);
+}
+
 static bool nft_xt_put(struct nft_xt *xt)
 {
-	if (--xt->refcnt == 0) {
-		list_del(&xt->head);
+	if (refcount_dec_and_test(&xt->refcnt)) {
+		WARN_ON_ONCE(!list_empty(&xt->head));
 		kfree_rcu(xt, rcu_head);
 		return true;
 	}
@ -273,7 +291,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return -EINVAL;

 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	nft_xt->refcnt++;
+	refcount_inc(&nft_xt->refcnt);
 	return 0;
 }

@ -486,7 +504,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return ret;

 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	nft_xt->refcnt++;
+	refcount_inc(&nft_xt->refcnt);
 	return 0;
 }

@ -540,6 +558,43 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
 }

+static void nft_compat_activate(const struct nft_ctx *ctx,
+				const struct nft_expr *expr,
+				struct list_head *h)
+{
+	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
+
+	if (xt->listcnt == 0)
+		list_add(&xt->head, h);
+
+	xt->listcnt++;
+}
+
+static void nft_compat_activate_mt(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
+
+	nft_compat_activate(ctx, expr, &cn->nft_match_list);
+}
+
+static void nft_compat_activate_tg(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
+
+	nft_compat_activate(ctx, expr, &cn->nft_target_list);
+}
+
+static void nft_compat_deactivate(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr)
+{
+	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
+
+	if (--xt->listcnt == 0)
+		list_del_init(&xt->head);
+}
+
 static void
 nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
@ -734,10 +789,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
 	.cb		= nfnl_nft_compat_cb,
 };

-static LIST_HEAD(nft_match_list);
-
-static struct nft_expr_type nft_match_type;
-
 static bool nft_match_cmp(const struct xt_match *match,
 			  const char *name, u32 rev, u32 family)
 {
@ -749,6 +800,7 @@ static const struct nft_expr_ops *
 nft_match_select_ops(const struct nft_ctx *ctx,
 		     const struct nlattr * const tb[])
 {
+	struct nft_compat_net *cn;
 	struct nft_xt *nft_match;
 	struct xt_match *match;
 	unsigned int matchsize;
@ -765,8 +817,10 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
 	family = ctx->family;

+	cn = nft_compat_pernet(ctx->net);
+
 	/* Re-use the existing match if it's already loaded. */
-	list_for_each_entry(nft_match, &nft_match_list, head) {
+	list_for_each_entry(nft_match, &cn->nft_match_list, head) {
 		struct xt_match *match = nft_match->ops.data;

 		if (nft_match_cmp(match, mt_name, rev, family))
@ -789,11 +843,13 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 		goto err;
 	}

-	nft_match->refcnt = 0;
+	refcount_set(&nft_match->refcnt, 0);
 	nft_match->ops.type = &nft_match_type;
 	nft_match->ops.eval = nft_match_eval;
 	nft_match->ops.init = nft_match_init;
 	nft_match->ops.destroy = nft_match_destroy;
+	nft_match->ops.activate = nft_compat_activate_mt;
+	nft_match->ops.deactivate = nft_compat_deactivate;
 	nft_match->ops.dump = nft_match_dump;
 	nft_match->ops.validate = nft_match_validate;
 	nft_match->ops.data = match;
@ -810,7 +866,8 @@ nft_match_select_ops(const struct nft_ctx *ctx,

 	nft_match->ops.size = matchsize;

-	list_add(&nft_match->head, &nft_match_list);
+	nft_match->listcnt = 1;
+	list_add(&nft_match->head, &cn->nft_match_list);

 	return &nft_match->ops;
 err:
@ -826,10 +883,6 @@ static struct nft_expr_type nft_match_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };

-static LIST_HEAD(nft_target_list);
-
-static struct nft_expr_type nft_target_type;
-
 static bool nft_target_cmp(const struct xt_target *tg,
 			   const char *name, u32 rev, u32 family)
 {
@ -841,6 +894,7 @@ static const struct nft_expr_ops *
 nft_target_select_ops(const struct nft_ctx *ctx,
 		      const struct nlattr * const tb[])
 {
+	struct nft_compat_net *cn;
 	struct nft_xt *nft_target;
 	struct xt_target *target;
 	char *tg_name;
@ -861,8 +915,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	    strcmp(tg_name, "standard") == 0)
 		return ERR_PTR(-EINVAL);

+	cn = nft_compat_pernet(ctx->net);
 	/* Re-use the existing target if it's already loaded. */
-	list_for_each_entry(nft_target, &nft_target_list, head) {
+	list_for_each_entry(nft_target, &cn->nft_target_list, head) {
 		struct xt_target *target = nft_target->ops.data;

 		if (!target->target)
@ -893,11 +948,13 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 		goto err;
 	}

-	nft_target->refcnt = 0;
+	refcount_set(&nft_target->refcnt, 0);
 	nft_target->ops.type = &nft_target_type;
 	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
 	nft_target->ops.init = nft_target_init;
 	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.activate = nft_compat_activate_tg;
+	nft_target->ops.deactivate = nft_compat_deactivate;
 	nft_target->ops.dump = nft_target_dump;
 	nft_target->ops.validate = nft_target_validate;
 	nft_target->ops.data = target;
@ -907,7 +964,8 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	else
 		nft_target->ops.eval = nft_target_eval_xt;

-	list_add(&nft_target->head, &nft_target_list);
+	nft_target->listcnt = 1;
+	list_add(&nft_target->head, &cn->nft_target_list);

 	return &nft_target->ops;
 err:
@ -923,13 +981,74 @@ static struct nft_expr_type nft_target_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };

+static int __net_init nft_compat_init_net(struct net *net)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(net);
+
+	INIT_LIST_HEAD(&cn->nft_target_list);
+	INIT_LIST_HEAD(&cn->nft_match_list);
+
+	return 0;
+}
+
+static void __net_exit nft_compat_exit_net(struct net *net)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(net);
+	struct nft_xt *xt, *next;
+
+	if (list_empty(&cn->nft_match_list) &&
+	    list_empty(&cn->nft_target_list))
+		return;
+
+	/* If there was an error that caused nft_xt expr to not be initialized
+	 * fully and noone else requested the same expression later, the lists
+	 * contain 0-refcount entries that still hold module reference.
+	 *
+	 * Clean them here.
+	 */
+	mutex_lock(&net->nft.commit_mutex);
+	list_for_each_entry_safe(xt, next, &cn->nft_target_list, head) {
+		struct xt_target *target = xt->ops.data;
+
+		list_del_init(&xt->head);
+
+		if (refcount_read(&xt->refcnt))
+			continue;
+		module_put(target->me);
+		kfree(xt);
+	}
+
+	list_for_each_entry_safe(xt, next, &cn->nft_match_list, head) {
+		struct xt_match *match = xt->ops.data;
+
+		list_del_init(&xt->head);
+
+		if (refcount_read(&xt->refcnt))
+			continue;
+		module_put(match->me);
+		kfree(xt);
+	}
+	mutex_unlock(&net->nft.commit_mutex);
+}
+
+static struct pernet_operations nft_compat_net_ops = {
+	.init	= nft_compat_init_net,
+	.exit	= nft_compat_exit_net,
+	.id	= &nft_compat_net_id,
+	.size	= sizeof(struct nft_compat_net),
+};
+
 static int __init nft_compat_module_init(void)
 {
 	int ret;

+	ret = register_pernet_subsys(&nft_compat_net_ops);
+	if (ret < 0)
+		goto err_target;
+
 	ret = nft_register_expr(&nft_match_type);
 	if (ret < 0)
-		return ret;
+		goto err_pernet;

 	ret = nft_register_expr(&nft_target_type);
 	if (ret < 0)
@ -942,45 +1061,21 @@ static int __init nft_compat_module_init(void)
 	}

 	return ret;
-
 err_target:
 	nft_unregister_expr(&nft_target_type);
 err_match:
 	nft_unregister_expr(&nft_match_type);
+err_pernet:
+	unregister_pernet_subsys(&nft_compat_net_ops);
 	return ret;
 }

 static void __exit nft_compat_module_exit(void)
 {
-	struct nft_xt *xt, *next;
-
-	/* list should be empty here, it can be non-empty only in case there
-	 * was an error that caused nft_xt expr to not be initialized fully
-	 * and noone else requested the same expression later.
-	 *
-	 * In this case, the lists contain 0-refcount entries that still
-	 * hold module reference.
-	 */
-	list_for_each_entry_safe(xt, next, &nft_target_list, head) {
-		struct xt_target *target = xt->ops.data;
-
-		if (WARN_ON_ONCE(xt->refcnt))
-			continue;
-		module_put(target->me);
-		kfree(xt);
-	}
-
-	list_for_each_entry_safe(xt, next, &nft_match_list, head) {
-		struct xt_match *match = xt->ops.data;
-
-		if (WARN_ON_ONCE(xt->refcnt))
-			continue;
-		module_put(match->me);
-		kfree(xt);
-	}
 	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
 	nft_unregister_expr(&nft_target_type);
 	nft_unregister_expr(&nft_match_type);
+	unregister_pernet_subsys(&nft_compat_net_ops);
 }

 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@ -52,21 +52,21 @@ void nr_start_t1timer(struct sock *sk)
 {
 	struct nr_sock *nr = nr_sk(sk);

-	mod_timer(&nr->t1timer, jiffies + nr->t1);
+	sk_reset_timer(sk, &nr->t1timer, jiffies + nr->t1);
 }

 void nr_start_t2timer(struct sock *sk)
 {
 	struct nr_sock *nr = nr_sk(sk);

-	mod_timer(&nr->t2timer, jiffies + nr->t2);
+	sk_reset_timer(sk, &nr->t2timer, jiffies + nr->t2);
 }

 void nr_start_t4timer(struct sock *sk)
 {
 	struct nr_sock *nr = nr_sk(sk);

-	mod_timer(&nr->t4timer, jiffies + nr->t4);
+	sk_reset_timer(sk, &nr->t4timer, jiffies + nr->t4);
 }

 void nr_start_idletimer(struct sock *sk)
@ -74,37 +74,37 @@ void nr_start_idletimer(struct sock *sk)
 	struct nr_sock *nr = nr_sk(sk);

 	if (nr->idle > 0)
-		mod_timer(&nr->idletimer, jiffies + nr->idle);
+		sk_reset_timer(sk, &nr->idletimer, jiffies + nr->idle);
 }

 void nr_start_heartbeat(struct sock *sk)
 {
-	mod_timer(&sk->sk_timer, jiffies + 5 * HZ);
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + 5 * HZ);
 }

 void nr_stop_t1timer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->t1timer);
+	sk_stop_timer(sk, &nr_sk(sk)->t1timer);
 }

 void nr_stop_t2timer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->t2timer);
+	sk_stop_timer(sk, &nr_sk(sk)->t2timer);
 }

 void nr_stop_t4timer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->t4timer);
+	sk_stop_timer(sk, &nr_sk(sk)->t4timer);
 }

 void nr_stop_idletimer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->idletimer);
+	sk_stop_timer(sk, &nr_sk(sk)->idletimer);
 }

 void nr_stop_heartbeat(struct sock *sk)
 {
-	del_timer(&sk->sk_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
 }

 int nr_t1timer_running(struct sock *sk)
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@ -850,6 +850,7 @@ void rose_link_device_down(struct net_device *dev)

 /*
 *	Route a frame to an appropriate AX.25 connection.
+ *	A NULL ax25_cb indicates an internally generated frame.
 */
 int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
 {
@ -867,6 +868,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)

 	if (skb->len < ROSE_MIN_LEN)
 		return res;
+
+	if (!ax25)
+		return rose_loopback_queue(skb, NULL);
+
 	frametype = skb->data[2];
 	lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
 	if (frametype == ROSE_CALL_REQUEST &&
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@ -447,6 +447,8 @@ static int tls_do_encryption(struct sock *sk,
 	struct scatterlist *sge = sk_msg_elem(msg_en, start);
 	int rc;

+	memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data));
+
 	sge->offset += tls_ctx->tx.prepend_size;
 	sge->length -= tls_ctx->tx.prepend_size;

@ -456,7 +458,7 @@ static int tls_do_encryption(struct sock *sk,
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
 	aead_request_set_crypt(aead_req, rec->sg_aead_in,
 			       rec->sg_aead_out,
-			       data_len, tls_ctx->tx.iv);
+			       data_len, rec->iv_data);

 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				  tls_encrypt_done, sk);
@ -1901,7 +1903,9 @@ void tls_sw_free_resources_tx(struct sock *sk)
 	if (atomic_read(&ctx->encrypt_pending))
 		crypto_wait_req(-EINPROGRESS, &ctx->async_wait);

+	release_sock(sk);
 	cancel_delayed_work_sync(&ctx->tx_work.work);
+	lock_sock(sk);

 	/* Tx whatever records we can transmit and abandon the rest */
 	tls_tx_records(sk, -1);
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@ -680,16 +680,6 @@ static void xfrm_hash_resize(struct work_struct *work)
 	mutex_unlock(&hash_resize_mutex);
 }

-static void xfrm_hash_reset_inexact_table(struct net *net)
-{
-	struct xfrm_pol_inexact_bin *b;
-
-	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
-
-	list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins)
-		INIT_HLIST_HEAD(&b->hhead);
-}
-
 /* Make sure *pol can be inserted into fastbin.
 * Useful to check that later insert requests will be sucessful
 * (provided xfrm_policy_lock is held throughout).
@ -833,13 +823,13 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
 					      u16 family)
 {
 	unsigned int matched_s, matched_d;
-	struct hlist_node *newpos = NULL;
 	struct xfrm_policy *policy, *p;

 	matched_s = 0;
 	matched_d = 0;

 	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+		struct hlist_node *newpos = NULL;
 		bool matches_s, matches_d;

 		if (!policy->bydst_reinsert)
@ -849,16 +839,19 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,

 		policy->bydst_reinsert = false;
 		hlist_for_each_entry(p, &n->hhead, bydst) {
-			if (policy->priority >= p->priority)
+			if (policy->priority > p->priority)
+				newpos = &p->bydst;
+			else if (policy->priority == p->priority &&
+				 policy->pos > p->pos)
 				newpos = &p->bydst;
 			else
 				break;
 		}

 		if (newpos)
-			hlist_add_behind(&policy->bydst, newpos);
+			hlist_add_behind_rcu(&policy->bydst, newpos);
 		else
-			hlist_add_head(&policy->bydst, &n->hhead);
+			hlist_add_head_rcu(&policy->bydst, &n->hhead);

 		/* paranoia checks follow.
 		 * Check that the reinserted policy matches at least
@ -893,12 +886,13 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net,
 					      struct rb_root *new,
 					      u16 family)
 {
-	struct rb_node **p, *parent = NULL;
 	struct xfrm_pol_inexact_node *node;
+	struct rb_node **p, *parent;

 	/* we should not have another subtree here */
 	WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
-
+restart:
+	parent = NULL;
 	p = &new->rb_node;
 	while (*p) {
 		u8 prefixlen;
@ -918,12 +912,11 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net,
 		} else {
 			struct xfrm_policy *tmp;

-			hlist_for_each_entry(tmp, &node->hhead, bydst)
-				tmp->bydst_reinsert = true;
-			hlist_for_each_entry(tmp, &n->hhead, bydst)
+			hlist_for_each_entry(tmp, &n->hhead, bydst) {
 				tmp->bydst_reinsert = true;
+				hlist_del_rcu(&tmp->bydst);
+			}

-			INIT_HLIST_HEAD(&node->hhead);
 			xfrm_policy_inexact_list_reinsert(net, node, family);

 			if (node->prefixlen == n->prefixlen) {
@ -935,8 +928,7 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net,
 			kfree_rcu(n, rcu);
 			n = node;
 			n->prefixlen = prefixlen;
-			*p = new->rb_node;
-			parent = NULL;
+			goto restart;
 		}
 	}

@ -965,12 +957,11 @@ static void xfrm_policy_inexact_node_merge(struct net *net,
 						  family);
 	}

-	hlist_for_each_entry(tmp, &v->hhead, bydst)
-		tmp->bydst_reinsert = true;
-	hlist_for_each_entry(tmp, &n->hhead, bydst)
+	hlist_for_each_entry(tmp, &v->hhead, bydst) {
 		tmp->bydst_reinsert = true;
+		hlist_del_rcu(&tmp->bydst);
+	}

-	INIT_HLIST_HEAD(&n->hhead);
 	xfrm_policy_inexact_list_reinsert(net, n, family);
 }

@ -1235,6 +1226,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+	write_seqcount_begin(&xfrm_policy_hash_generation);

 	/* make sure that we can insert the indirect policies again before
 	 * we start with destructive action.
@ -1278,10 +1270,14 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 	}

 	/* reset the bydst and inexact table in all directions */
-	xfrm_hash_reset_inexact_table(net);
-
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
-		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
+		struct hlist_node *n;
+
+		hlist_for_each_entry_safe(policy, n,
+					  &net->xfrm.policy_inexact[dir],
+					  bydst_inexact_list)
+			hlist_del_init(&policy->bydst_inexact_list);
+
 		hmask = net->xfrm.policy_bydst[dir].hmask;
 		odst = net->xfrm.policy_bydst[dir].table;
 		for (i = hmask; i >= 0; i--)
@ -1313,6 +1309,9 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		newpos = NULL;
 		chain = policy_hash_bysel(net, &policy->selector,
 					  policy->family, dir);
+
+		hlist_del_rcu(&policy->bydst);
+
 		if (!chain) {
 			void *p = xfrm_policy_inexact_insert(policy, dir, 0);

@ -1334,6 +1333,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)

 out_unlock:
 	__xfrm_policy_inexact_flush(net);
+	write_seqcount_end(&xfrm_policy_hash_generation);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

 	mutex_unlock(&hash_resize_mutex);
@ -2600,7 +2600,10 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst_copy_metrics(dst1, dst);

 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
-			__u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+			__u32 mark = 0;
+
+			if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
+				mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);

 			family = xfrm[i]->props.family;
 			dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@ -1488,10 +1488,15 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 		if (!ut[i].family)
 			ut[i].family = family;

-		if ((ut[i].mode == XFRM_MODE_TRANSPORT) &&
-		    (ut[i].family != prev_family))
-			return -EINVAL;
-
+		switch (ut[i].mode) {
+		case XFRM_MODE_TUNNEL:
+		case XFRM_MODE_BEET:
+			break;
+		default:
+			if (ut[i].family != prev_family)
+				return -EINVAL;
+			break;
+		}
 		if (ut[i].mode >= XFRM_MODE_MAX)
 			return -EINVAL;

--- a/tools/testing/selftests/net/xfrm_policy.sh
+++ b/tools/testing/selftests/net/xfrm_policy.sh
@ -28,6 +28,19 @@ KEY_AES=0x0123456789abcdef0123456789012345
 SPI1=0x1
 SPI2=0x2

+do_esp_policy() {
+    local ns=$1
+    local me=$2
+    local remote=$3
+    local lnet=$4
+    local rnet=$5
+
+    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
+    ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 100 action allow
+    # to fwd decrypted packets after esp processing:
+    ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 100 action allow
+}
+
 do_esp() {
    local ns=$1
    local me=$2
@ -40,10 +53,59 @@ do_esp() {
    ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in  enc aes $KEY_AES  auth sha1 $KEY_SHA  mode tunnel sel src $rnet dst $lnet
    ip -net $ns xfrm state add src $me  dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet

-    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
-    ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 100 action allow
-    # to fwd decrypted packets after esp processing:
-    ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 100 action allow
+    do_esp_policy $ns $me $remote $lnet $rnet
+}
+
+# add policies with different netmasks, to make sure kernel carries
+# the policies contained within new netmask over when search tree is
+# re-built.
+# peer netns that are supposed to be encapsulated via esp have addresses
+# in the 10.0.1.0/24 and 10.0.2.0/24 subnets, respectively.
+#
+# Adding a policy for '10.0.1.0/23' will make it necessary to
+# alter the prefix of 10.0.1.0 subnet.
+# In case new prefix overlaps with existing node, the node and all
+# policies it carries need to be merged with the existing one(s).
+#
+# Do that here.
+do_overlap()
+{
+    local ns=$1
+
+    # adds new nodes to tree (neither network exists yet in policy database).
+    ip -net $ns xfrm policy add src 10.1.0.0/24 dst 10.0.0.0/24 dir fwd priority 200 action block
+
+    # adds a new node in the 10.0.0.0/24 tree (dst node exists).
+    ip -net $ns xfrm policy add src 10.2.0.0/24 dst 10.0.0.0/24 dir fwd priority 200 action block
+
+    # adds a 10.2.0.0/23 node, but for different dst.
+    ip -net $ns xfrm policy add src 10.2.0.0/23 dst 10.0.1.0/24 dir fwd priority 200 action block
+
+    # dst now overlaps with the 10.0.1.0/24 ESP policy in fwd.
+    # kernel must 'promote' existing one (10.0.0.0/24) to 10.0.0.0/23.
+    # But 10.0.0.0/23 also includes existing 10.0.1.0/24, so that node
+    # also has to be merged too, including source-sorted subtrees.
+    # old:
+    # 10.0.0.0/24 (node 1 in dst tree of the bin)
+    #    10.1.0.0/24 (node in src tree of dst node 1)
+    #    10.2.0.0/24 (node in src tree of dst node 1)
+    # 10.0.1.0/24 (node 2 in dst tree of the bin)
+    #    10.0.2.0/24 (node in src tree of dst node 2)
+    #    10.2.0.0/24 (node in src tree of dst node 2)
+    #
+    # The next 'policy add' adds dst '10.0.0.0/23', which means
+    # that dst node 1 and dst node 2 have to be merged including
+    # the sub-tree.  As no duplicates are allowed, policies in
+    # the two '10.0.2.0/24' are also merged.
+    #
+    # after the 'add', internal search tree should look like this:
+    # 10.0.0.0/23 (node in dst tree of bin)
+    #     10.0.2.0/24 (node in src tree of dst node)
+    #     10.1.0.0/24 (node in src tree of dst node)
+    #     10.2.0.0/24 (node in src tree of dst node)
+    #
+    # 10.0.0.0/24 and 10.0.1.0/24 nodes have been merged as 10.0.0.0/23.
+    ip -net $ns xfrm policy add src 10.1.0.0/24 dst 10.0.0.0/23 dir fwd priority 200 action block
 }

 do_esp_policy_get_check() {
@ -160,6 +222,41 @@ check_xfrm() {
 	return $lret
 }

+check_exceptions()
+{
+	logpostfix="$1"
+	local lret=0
+
+	# ping to .254 should be excluded from the tunnel (exception is in place).
+	check_xfrm 0 254
+	if [ $? -ne 0 ]; then
+		echo "FAIL: expected ping to .254 to fail ($logpostfix)"
+		lret=1
+	else
+		echo "PASS: ping to .254 bypassed ipsec tunnel ($logpostfix)"
+	fi
+
+	# ping to .253 should use use ipsec due to direct policy exception.
+	check_xfrm 1 253
+	if [ $? -ne 0 ]; then
+		echo "FAIL: expected ping to .253 to use ipsec tunnel ($logpostfix)"
+		lret=1
+	else
+		echo "PASS: direct policy matches ($logpostfix)"
+	fi
+
+	# ping to .2 should use ipsec.
+	check_xfrm 1 2
+	if [ $? -ne 0 ]; then
+		echo "FAIL: expected ping to .2 to use ipsec tunnel ($logpostfix)"
+		lret=1
+	else
+		echo "PASS: policy matches ($logpostfix)"
+	fi
+
+	return $lret
+}
+
 #check for needed privileges
 if [ "$(id -u)" -ne 0 ];then
 	echo "SKIP: Need root privileges"
@ -270,32 +367,44 @@ do_exception ns4 10.0.3.10 10.0.3.1 10.0.1.253 10.0.1.240/28
 do_exception ns3 dead:3::1 dead:3::10 dead:2::fd  dead:2:f0::/96
 do_exception ns4 dead:3::10 dead:3::1 dead:1::fd  dead:1:f0::/96

-# ping to .254 should now be excluded from the tunnel
-check_xfrm 0 254
+check_exceptions "exceptions"
 if [ $? -ne 0 ]; then
-	echo "FAIL: expected ping to .254 to fail"
 	ret=1
-else
-	echo "PASS: ping to .254 bypassed ipsec tunnel"
 fi

-# ping to .253 should use use ipsec due to direct policy exception.
-check_xfrm 1 253
+# insert block policies with adjacent/overlapping netmasks
+do_overlap ns3
+
+check_exceptions "exceptions and block policies"
 if [ $? -ne 0 ]; then
-	echo "FAIL: expected ping to .253 to use ipsec tunnel"
 	ret=1
-else
-	echo "PASS: direct policy matches"
 fi

-# ping to .2 should use ipsec.
-check_xfrm 1 2
-if [ $? -ne 0 ]; then
-	echo "FAIL: expected ping to .2 to use ipsec tunnel"
-	ret=1
-else
-	echo "PASS: policy matches"
-fi
+for n in ns3 ns4;do
+	ip -net $n xfrm policy set hthresh4 28 24 hthresh6 126 125
+	sleep $((RANDOM%5))
+done
+
+check_exceptions "exceptions and block policies after hresh changes"
+
+# full flush of policy db, check everything gets freed incl. internal meta data
+ip -net ns3 xfrm policy flush
+
+do_esp_policy ns3 10.0.3.1 10.0.3.10 10.0.1.0/24 10.0.2.0/24
+do_exception ns3 10.0.3.1 10.0.3.10 10.0.2.253 10.0.2.240/28
+
+# move inexact policies to hash table
+ip -net ns3 xfrm policy set hthresh4 16 16
+
+sleep $((RANDOM%5))
+check_exceptions "exceptions and block policies after hthresh change in ns3"
+
+# restore original hthresh settings -- move policies back to tables
+for n in ns3 ns4;do
+	ip -net $n xfrm policy set hthresh4 32 32 hthresh6 128 128
+	sleep $((RANDOM%5))
+done
+check_exceptions "exceptions and block policies after hresh change to normal"

 for i in 1 2 3 4;do ip netns del ns$i;done

--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@ -1133,6 +1133,21 @@ void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
 	pkey_assert(err);
 }

+void become_child(void)
+{
+	pid_t forkret;
+
+	forkret = fork();
+	pkey_assert(forkret >= 0);
+	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+	if (!forkret) {
+		/* in the child */
+		return;
+	}
+	exit(0);
+}
+
 /* Assumes that all pkeys other than 'pkey' are unallocated */
 void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
 {
@ -1141,7 +1156,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
 	int nr_allocated_pkeys = 0;
 	int i;

-	for (i = 0; i < NR_PKEYS*2; i++) {
+	for (i = 0; i < NR_PKEYS*3; i++) {
 		int new_pkey;
 		dprintf1("%s() alloc loop: %d\n", __func__, i);
 		new_pkey = alloc_pkey();
@ -1152,20 +1167,26 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
 		if ((new_pkey == -1) && (errno == ENOSPC)) {
 			dprintf2("%s() failed to allocate pkey after %d tries\n",
 				__func__, nr_allocated_pkeys);
-			break;
+		} else {
+			/*
+			 * Ensure the number of successes never
+			 * exceeds the number of keys supported
+			 * in the hardware.
+			 */
+			pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+			allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
 		}
-		pkey_assert(nr_allocated_pkeys < NR_PKEYS);
-		allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+
+		/*
+		 * Make sure that allocation state is properly
+		 * preserved across fork().
+		 */
+		if (i == NR_PKEYS*2)
+			become_child();
 	}

 	dprintf3("%s()::%d\n", __func__, __LINE__);

-	/*
-	 * ensure it did not reach the end of the loop without
-	 * failure:
-	 */
-	pkey_assert(i < NR_PKEYS*2);
-
 	/*
 	 * There are 16 pkeys supported in hardware.  Three are
 	 * allocated by the time we get here: