net: report RCU QS on threaded NAPI repolling

[sfrench/cifs-2.6.git] / net / core / dev.c
diff --git a/net/core/dev.c b/net/core/dev.c

index 76e6438f4858e246dfebb78364a253e77f9a86b4..9a67003e49db87f3f92b6c6296b3e7a5ca9d9171 100644 (file)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,41 +153,21 @@
  #include <linux/prandom.h>
  #include <linux/once_lite.h>
  #include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
+#include <net/rps.h>
  
  #include "dev.h"
  #include "net-sysfs.h"
  
  static DEFINE_SPINLOCK(ptype_lock);
  struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-struct list_head ptype_all __read_mostly;      /* Taps */
  
  static int netif_rx_internal(struct sk_buff *skb);
  static int call_netdevice_notifiers_extack(unsigned long val,
                                            struct net_device *dev,
                                            struct netlink_ext_ack *extack);
  
-/*
- * The @dev_base_head list is protected by @dev_base_lock and the rtnl
- * semaphore.
- *
- * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
- *
- * Writers must hold the rtnl semaphore while they loop through the
- * dev_base_head list, and hold dev_base_lock for writing when they do the
- * actual updates.  This allows pure readers to access the list even
- * while a writer is preparing to update it.
- *
- * To put it another way, dev_base_lock is held for writing only to
- * protect against pure readers; the rtnl semaphore provides the
- * protection against other writers.
- *
- * See, for example usages, register_netdevice() and
- * unregister_netdevice(), which must be called with the rtnl
- * semaphore held.
- */
-DEFINE_RWLOCK(dev_base_lock);
-EXPORT_SYMBOL(dev_base_lock);
-
  static DEFINE_MUTEX(ifalias_mutex);
  
  /* protects napi_hash addition/deletion and napi_gen_id */
@@ -200,8 +180,9 @@ static DECLARE_RWSEM(devnet_rename_sem);
  
  static inline void dev_base_seq_inc(struct net *net)
  {
-       while (++net->dev_base_seq == 0)
-               ;
+       unsigned int val = net->dev_base_seq + 1;
+
+       WRITE_ONCE(net->dev_base_seq, val ?: 1);
  }
  
  static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -341,13 +322,22 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name)
         return 0;
  }
  
-static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+static void netdev_name_node_alt_free(struct rcu_head *head)
  {
-       list_del(&name_node->list);
+       struct netdev_name_node *name_node =
+               container_of(head, struct netdev_name_node, rcu);
+
         kfree(name_node->name);
         netdev_name_node_free(name_node);
  }
  
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+       netdev_name_node_del(name_node);
+       list_del(&name_node->list);
+       call_rcu(&name_node->rcu, netdev_name_node_alt_free);
+}
+
  int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  {
         struct netdev_name_node *name_node;
@@ -362,10 +352,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
         if (name_node == dev->name_node || name_node->dev != dev)
                 return -EINVAL;
  
-       netdev_name_node_del(name_node);
-       synchronize_rcu();
         __netdev_name_node_alt_destroy(name_node);
-
         return 0;
  }
  
@@ -373,8 +360,10 @@ static void netdev_name_node_alt_flush(struct net_device *dev)
  {
         struct netdev_name_node *name_node, *tmp;
  
-       list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
-               __netdev_name_node_alt_destroy(name_node);
+       list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
+               list_del(&name_node->list);
+               netdev_name_node_alt_free(&name_node->rcu);
+       }
  }
  
  /* Device list insertion */
@@ -385,12 +374,10 @@ static void list_netdevice(struct net_device *dev)
  
         ASSERT_RTNL();
  
-       write_lock(&dev_base_lock);
         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
         netdev_name_node_add(net, dev->name_node);
         hlist_add_head_rcu(&dev->index_hlist,
                            dev_index_hash(net, dev->ifindex));
-       write_unlock(&dev_base_lock);
  
         netdev_for_each_altname(dev, name_node)
                 netdev_name_node_add(net, name_node);
@@ -404,7 +391,7 @@ static void list_netdevice(struct net_device *dev)
  /* Device list removal
   * caller must respect a RCU grace period before freeing/reusing dev
   */
-static void unlist_netdevice(struct net_device *dev, bool lock)
+static void unlist_netdevice(struct net_device *dev)
  {
         struct netdev_name_node *name_node;
         struct net *net = dev_net(dev);
@@ -417,13 +404,9 @@ static void unlist_netdevice(struct net_device *dev, bool lock)
                 netdev_name_node_del(name_node);
  
         /* Unlink dev from the device chain */
-       if (lock)
-               write_lock(&dev_base_lock);
         list_del_rcu(&dev->dev_list);
         netdev_name_node_del(dev->name_node);
         hlist_del_rcu(&dev->index_hlist);
-       if (lock)
-               write_unlock(&dev_base_lock);
  
         dev_base_seq_inc(dev_net(dev));
  }
@@ -442,6 +425,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
  DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  EXPORT_PER_CPU_SYMBOL(softnet_data);
  
+/* Page_pool has a lockless array/stack to alloc/recycle pages.
+ * PP consumers must pay attention to run APIs in the appropriate context
+ * (e.g. NAPI context).
+ */
+static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool);
+
  #ifdef CONFIG_LOCKDEP
  /*
   * register_netdevice() inits txq->_xmit_lock and sets lockdep class
@@ -551,7 +540,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  static inline struct list_head *ptype_head(const struct packet_type *pt)
  {
         if (pt->type == htons(ETH_P_ALL))
-               return pt->dev ? &pt->dev->ptype_all : &ptype_all;
+               return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
         else
                 return pt->dev ? &pt->dev->ptype_specific :
                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
@@ -653,7 +642,7 @@ int dev_get_iflink(const struct net_device *dev)
         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
                 return dev->netdev_ops->ndo_get_iflink(dev);
  
-       return dev->ifindex;
+       return READ_ONCE(dev->ifindex);
  }
  EXPORT_SYMBOL(dev_get_iflink);
  
@@ -738,9 +727,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path);
   *     @net: the applicable net namespace
   *     @name: name to find
   *
- *     Find an interface by name. Must be called under RTNL semaphore
- *     or @dev_base_lock. If the name is found a pointer to the device
- *     is returned. If the name is not found then %NULL is returned. The
+ *     Find an interface by name. Must be called under RTNL semaphore.
+ *     If the name is found a pointer to the device is returned.
+ *     If the name is not found then %NULL is returned. The
   *     reference counters are not incremented so the caller must be
   *     careful with locks.
   */
@@ -821,8 +810,7 @@ EXPORT_SYMBOL(netdev_get_by_name);
   *     Search for an interface by index. Returns %NULL if the device
   *     is not found or a pointer to the device. The device has not
   *     had its reference counter increased so the caller must be careful
- *     about locking. The caller must hold either the RTNL semaphore
- *     or @dev_base_lock.
+ *     about locking. The caller must hold the RTNL semaphore.
   */
  
  struct net_device *__dev_get_by_index(struct net *net, int ifindex)
@@ -1212,13 +1200,13 @@ int dev_change_name(struct net_device *dev, const char *newname)
                             dev->flags & IFF_UP ? " (while UP)" : "");
  
         old_assign_type = dev->name_assign_type;
-       dev->name_assign_type = NET_NAME_RENAMED;
+       WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
  
  rollback:
         ret = device_rename(&dev->dev, dev->name);
         if (ret) {
                 memcpy(dev->name, oldname, IFNAMSIZ);
-               dev->name_assign_type = old_assign_type;
+               WRITE_ONCE(dev->name_assign_type, old_assign_type);
                 up_write(&devnet_rename_sem);
                 return ret;
         }
@@ -1227,15 +1215,11 @@ rollback:
  
         netdev_adjacent_rename_links(dev, oldname);
  
-       write_lock(&dev_base_lock);
         netdev_name_node_del(dev->name_node);
-       write_unlock(&dev_base_lock);
  
-       synchronize_rcu();
+       synchronize_net();
  
-       write_lock(&dev_base_lock);
         netdev_name_node_add(net, dev->name_node);
-       write_unlock(&dev_base_lock);
  
         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
         ret = notifier_to_errno(ret);
@@ -1247,7 +1231,7 @@ rollback:
                         down_write(&devnet_rename_sem);
                         memcpy(dev->name, oldname, IFNAMSIZ);
                         memcpy(oldname, newname, IFNAMSIZ);
-                       dev->name_assign_type = old_assign_type;
+                       WRITE_ONCE(dev->name_assign_type, old_assign_type);
                         old_assign_type = NET_NAME_RENAMED;
                         goto rollback;
                 } else {
@@ -2242,7 +2226,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
   */
  bool dev_nit_active(struct net_device *dev)
  {
-       return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
+       return !list_empty(&net_hotdata.ptype_all) ||
+              !list_empty(&dev->ptype_all);
  }
  EXPORT_SYMBOL_GPL(dev_nit_active);
  
@@ -2253,15 +2238,14 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
  
  void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
  {
-       struct packet_type *ptype;
+       struct list_head *ptype_list = &net_hotdata.ptype_all;
+       struct packet_type *ptype, *pt_prev = NULL;
         struct sk_buff *skb2 = NULL;
-       struct packet_type *pt_prev = NULL;
-       struct list_head *ptype_list = &ptype_all;
  
         rcu_read_lock();
  again:
         list_for_each_entry_rcu(ptype, ptype_list, list) {
-               if (ptype->ignore_outgoing)
+               if (READ_ONCE(ptype->ignore_outgoing))
                         continue;
  
                 /* Never send packets back to the socket
@@ -2302,7 +2286,7 @@ again:
                 pt_prev = ptype;
         }
  
-       if (ptype_list == &ptype_all) {
+       if (ptype_list == &net_hotdata.ptype_all) {
                 ptype_list = &dev->ptype_all;
                 goto again;
         }
@@ -4421,19 +4405,10 @@ EXPORT_SYMBOL(__dev_direct_xmit);
   *                     Receiver routines
   *************************************************************************/
  
-int netdev_max_backlog __read_mostly = 1000;
-EXPORT_SYMBOL(netdev_max_backlog);
-
-int netdev_tstamp_prequeue __read_mostly = 1;
  unsigned int sysctl_skb_defer_max __read_mostly = 64;
-int netdev_budget __read_mostly = 300;
-/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
-unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
  int weight_p __read_mostly = 64;           /* old backlog weight */
  int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
  int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
-int dev_rx_weight __read_mostly = 64;
-int dev_tx_weight __read_mostly = 64;
  
  /* Called with irq disabled */
  static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4475,12 +4450,6 @@ static inline void ____napi_schedule(struct softnet_data *sd,
  
  #ifdef CONFIG_RPS
  
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
-
  struct static_key_false rps_needed __read_mostly;
  EXPORT_SYMBOL(rps_needed);
  struct static_key_false rfs_needed __read_mostly;
@@ -4572,7 +4541,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
         if (!hash)
                 goto done;
  
-       sock_flow_table = rcu_dereference(rps_sock_flow_table);
+       sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
         if (flow_table && sock_flow_table) {
                 struct rps_dev_flow *rflow;
                 u32 next_cpu;
@@ -4582,10 +4551,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                  * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
                  */
                 ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
-               if ((ident ^ hash) & ~rps_cpu_mask)
+               if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
                         goto try_rps;
  
-               next_cpu = ident & rps_cpu_mask;
+               next_cpu = ident & net_hotdata.rps_cpu_mask;
  
                 /* OK, now we know there is a match,
                  * we can look at the local (per receive queue) flow table
@@ -4734,7 +4703,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
         struct softnet_data *sd;
         unsigned int old_flow, new_flow;
  
-       if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
+       if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
                 return false;
  
         sd = this_cpu_ptr(&softnet_data);
@@ -4782,7 +4751,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
         if (!netif_running(skb->dev))
                 goto drop;
         qlen = skb_queue_len(&sd->input_pkt_queue);
-       if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
+       if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
+           !skb_flow_limit(skb, qlen)) {
                 if (qlen) {
  enqueue:
                         __skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -4858,6 +4828,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
         xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
         xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
                          skb_headlen(skb) + mac_len, true);
+       if (skb_is_nonlinear(skb)) {
+               skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+               xdp_buff_set_frags_flag(xdp);
+       } else {
+               xdp_buff_clear_frags_flag(xdp);
+       }
  
         orig_data_end = xdp->data_end;
         orig_data = xdp->data;
@@ -4887,6 +4863,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                 skb->len += off; /* positive on grow, negative on shrink */
         }
  
+       /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+        * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+        */
+       if (xdp_buff_has_frags(xdp))
+               skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+       else
+               skb->data_len = 0;
+
         /* check if XDP changed eth hdr such SKB needs update */
         eth = (struct ethhdr *)xdp->data;
         if ((orig_eth_type != eth->h_proto) ||
@@ -4920,11 +4904,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
         return act;
  }
  
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+static int
+netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+{
+       struct sk_buff *skb = *pskb;
+       int err, hroom, troom;
+
+       if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+               return 0;
+
+       /* In case we have to go down the path and also linearize,
+        * then lets do the pskb_expand_head() work just once here.
+        */
+       hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+       troom = skb->tail + skb->data_len - skb->end;
+       err = pskb_expand_head(skb,
+                              hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+                              troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+       if (err)
+               return err;
+
+       return skb_linearize(skb);
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
                                      struct xdp_buff *xdp,
                                      struct bpf_prog *xdp_prog)
  {
-       u32 act = XDP_DROP;
+       struct sk_buff *skb = *pskb;
+       u32 mac_len, act = XDP_DROP;
  
         /* Reinjected packets coming from act_mirred or similar should
          * not get XDP generic processing.
@@ -4932,41 +4940,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
         if (skb_is_redirected(skb))
                 return XDP_PASS;
  
-       /* XDP packets must be linear and must have sufficient headroom
-        * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
-        * native XDP provides, thus we need to do it here as well.
+       /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+        * bytes. This is the guarantee that also native XDP provides,
+        * thus we need to do it here as well.
          */
+       mac_len = skb->data - skb_mac_header(skb);
+       __skb_push(skb, mac_len);
+
         if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
             skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-               int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
-               int troom = skb->tail + skb->data_len - skb->end;
-
-               /* In case we have to go down the path and also linearize,
-                * then lets do the pskb_expand_head() work just once here.
-                */
-               if (pskb_expand_head(skb,
-                                    hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
-                                    troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
-                       goto do_drop;
-               if (skb_linearize(skb))
+               if (netif_skb_check_for_xdp(pskb, xdp_prog))
                         goto do_drop;
         }
  
-       act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+       __skb_pull(*pskb, mac_len);
+
+       act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
         switch (act) {
         case XDP_REDIRECT:
         case XDP_TX:
         case XDP_PASS:
                 break;
         default:
-               bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+               bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
                 fallthrough;
         case XDP_ABORTED:
-               trace_xdp_exception(skb->dev, xdp_prog, act);
+               trace_xdp_exception((*pskb)->dev, xdp_prog, act);
                 fallthrough;
         case XDP_DROP:
         do_drop:
-               kfree_skb(skb);
+               kfree_skb(*pskb);
                 break;
         }
  
@@ -5004,24 +5007,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
  
  static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
  
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
  {
         if (xdp_prog) {
                 struct xdp_buff xdp;
                 u32 act;
                 int err;
  
-               act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
+               act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
                 if (act != XDP_PASS) {
                         switch (act) {
                         case XDP_REDIRECT:
-                               err = xdp_do_generic_redirect(skb->dev, skb,
+                               err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
                                                               &xdp, xdp_prog);
                                 if (err)
                                         goto out_redir;
                                 break;
                         case XDP_TX:
-                               generic_xdp_tx(skb, xdp_prog);
+                               generic_xdp_tx(*pskb, xdp_prog);
                                 break;
                         }
                         return XDP_DROP;
@@ -5029,7 +5032,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
         }
         return XDP_PASS;
  out_redir:
-       kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
+       kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
         return XDP_DROP;
  }
  EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -5038,7 +5041,7 @@ static int netif_rx_internal(struct sk_buff *skb)
  {
         int ret;
  
-       net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+       net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
  
         trace_netif_rx(skb);
  
@@ -5330,7 +5333,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
         int ret = NET_RX_DROP;
         __be16 type;
  
-       net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
+       net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
  
         trace_netif_receive_skb(skb);
  
@@ -5352,7 +5355,8 @@ another_round:
                 int ret2;
  
                 migrate_disable();
-               ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+               ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+                                     &skb);
                 migrate_enable();
  
                 if (ret2 != XDP_PASS) {
@@ -5373,7 +5377,7 @@ another_round:
         if (pfmemalloc)
                 goto skip_taps;
  
-       list_for_each_entry_rcu(ptype, &ptype_all, list) {
+       list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
                 if (pt_prev)
                         ret = deliver_skb(skb, pt_prev, orig_dev);
                 pt_prev = ptype;
@@ -5713,7 +5717,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
  {
         int ret;
  
-       net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+       net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
  
         if (skb_defer_rx_timestamp(skb))
                 return NET_RX_SUCCESS;
@@ -5743,7 +5747,8 @@ void netif_receive_skb_list_internal(struct list_head *head)
  
         INIT_LIST_HEAD(&sublist);
         list_for_each_entry_safe(skb, next, head, list) {
-               net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+               net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
+                                   skb);
                 skb_list_del_init(skb);
                 if (!skb_defer_rx_timestamp(skb))
                         list_add_tail(&skb->list, &sublist);
@@ -5967,7 +5972,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
                 net_rps_action_and_irq_enable(sd);
         }
  
-       napi->weight = READ_ONCE(dev_rx_weight);
+       napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
         while (again) {
                 struct sk_buff *skb;
  
@@ -6156,6 +6161,27 @@ struct napi_struct *napi_by_id(unsigned int napi_id)
         return NULL;
  }
  
+static void skb_defer_free_flush(struct softnet_data *sd)
+{
+       struct sk_buff *skb, *next;
+
+       /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
+       if (!READ_ONCE(sd->defer_list))
+               return;
+
+       spin_lock(&sd->defer_lock);
+       skb = sd->defer_list;
+       sd->defer_list = NULL;
+       sd->defer_count = 0;
+       spin_unlock(&sd->defer_lock);
+
+       while (skb != NULL) {
+               next = skb->next;
+               napi_consume_skb(skb, 1);
+               skb = next;
+       }
+}
+
  #if defined(CONFIG_NET_RX_BUSY_POLL)
  
  static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
@@ -6177,8 +6203,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
         clear_bit(NAPI_STATE_SCHED, &napi->state);
  }
  
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
-                          u16 budget)
+enum {
+       NAPI_F_PREFER_BUSY_POLL = 1,
+       NAPI_F_END_ON_RESCHED   = 2,
+};
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
+                          unsigned flags, u16 budget)
  {
         bool skip_schedule = false;
         unsigned long timeout;
@@ -6198,7 +6229,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
  
         local_bh_disable();
  
-       if (prefer_busy_poll) {
+       if (flags & NAPI_F_PREFER_BUSY_POLL) {
                 napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
                 timeout = READ_ONCE(napi->dev->gro_flush_timeout);
                 if (napi->defer_hard_irqs_count && timeout) {
@@ -6222,23 +6253,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
         local_bh_enable();
  }
  
-void napi_busy_loop(unsigned int napi_id,
-                   bool (*loop_end)(void *, unsigned long),
-                   void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+static void __napi_busy_loop(unsigned int napi_id,
+                     bool (*loop_end)(void *, unsigned long),
+                     void *loop_end_arg, unsigned flags, u16 budget)
  {
         unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
         int (*napi_poll)(struct napi_struct *napi, int budget);
         void *have_poll_lock = NULL;
         struct napi_struct *napi;
  
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
  restart:
         napi_poll = NULL;
  
-       rcu_read_lock();
-
         napi = napi_by_id(napi_id);
         if (!napi)
-               goto out;
+               return;
  
         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                 preempt_disable();
@@ -6254,14 +6285,14 @@ restart:
                          */
                         if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
                                    NAPIF_STATE_IN_BUSY_POLL)) {
-                               if (prefer_busy_poll)
+                               if (flags & NAPI_F_PREFER_BUSY_POLL)
                                         set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                 goto count;
                         }
                         if (cmpxchg(&napi->state, val,
                                     val | NAPIF_STATE_IN_BUSY_POLL |
                                           NAPIF_STATE_SCHED) != val) {
-                               if (prefer_busy_poll)
+                               if (flags & NAPI_F_PREFER_BUSY_POLL)
                                         set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                 goto count;
                         }
@@ -6275,18 +6306,22 @@ count:
                 if (work > 0)
                         __NET_ADD_STATS(dev_net(napi->dev),
                                         LINUX_MIB_BUSYPOLLRXPACKETS, work);
+               skb_defer_free_flush(this_cpu_ptr(&softnet_data));
                 local_bh_enable();
  
                 if (!loop_end || loop_end(loop_end_arg, start_time))
                         break;
  
                 if (unlikely(need_resched())) {
+                       if (flags & NAPI_F_END_ON_RESCHED)
+                               break;
                         if (napi_poll)
-                               busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
+                               busy_poll_stop(napi, have_poll_lock, flags, budget);
                         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                                 preempt_enable();
                         rcu_read_unlock();
                         cond_resched();
+                       rcu_read_lock();
                         if (loop_end(loop_end_arg, start_time))
                                 return;
                         goto restart;
@@ -6294,10 +6329,31 @@ count:
                 cpu_relax();
         }
         if (napi_poll)
-               busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
+               busy_poll_stop(napi, have_poll_lock, flags, budget);
         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                 preempt_enable();
-out:
+}
+
+void napi_busy_loop_rcu(unsigned int napi_id,
+                       bool (*loop_end)(void *, unsigned long),
+                       void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+       unsigned flags = NAPI_F_END_ON_RESCHED;
+
+       if (prefer_busy_poll)
+               flags |= NAPI_F_PREFER_BUSY_POLL;
+
+       __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
+}
+
+void napi_busy_loop(unsigned int napi_id,
+                   bool (*loop_end)(void *, unsigned long),
+                   void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+       unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
+
+       rcu_read_lock();
+       __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
         rcu_read_unlock();
  }
  EXPORT_SYMBOL(napi_busy_loop);
@@ -6680,27 +6736,6 @@ static int napi_thread_wait(struct napi_struct *napi)
         return -1;
  }
  
-static void skb_defer_free_flush(struct softnet_data *sd)
-{
-       struct sk_buff *skb, *next;
-
-       /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
-       if (!READ_ONCE(sd->defer_list))
-               return;
-
-       spin_lock(&sd->defer_lock);
-       skb = sd->defer_list;
-       sd->defer_list = NULL;
-       sd->defer_count = 0;
-       spin_unlock(&sd->defer_lock);
-
-       while (skb != NULL) {
-               next = skb->next;
-               napi_consume_skb(skb, 1);
-               skb = next;
-       }
-}
-
  static int napi_threaded_poll(void *data)
  {
         struct napi_struct *napi = data;
@@ -6708,6 +6743,8 @@ static int napi_threaded_poll(void *data)
         void *have;
  
         while (!napi_thread_wait(napi)) {
+               unsigned long last_qs = jiffies;
+
                 for (;;) {
                         bool repoll = false;
  
@@ -6732,6 +6769,7 @@ static int napi_threaded_poll(void *data)
                         if (!repoll)
                                 break;
  
+                       rcu_softirq_qs_periodic(last_qs);
                         cond_resched();
                 }
         }
@@ -6742,8 +6780,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
  {
         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
         unsigned long time_limit = jiffies +
-               usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
-       int budget = READ_ONCE(netdev_budget);
+               usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+       int budget = READ_ONCE(net_hotdata.netdev_budget);
         LIST_HEAD(list);
         LIST_HEAD(repoll);
  
@@ -8586,12 +8624,12 @@ unsigned int dev_get_flags(const struct net_device *dev)
  {
         unsigned int flags;
  
-       flags = (dev->flags & ~(IFF_PROMISC |
+       flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
                                 IFF_ALLMULTI |
                                 IFF_RUNNING |
                                 IFF_LOWER_UP |
                                 IFF_DORMANT)) |
-               (dev->gflags & (IFF_PROMISC |
+               (READ_ONCE(dev->gflags) & (IFF_PROMISC |
                                 IFF_ALLMULTI));
  
         if (netif_running(dev)) {
@@ -8914,7 +8952,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
  }
  EXPORT_SYMBOL(dev_set_mac_address);
  
-static DECLARE_RWSEM(dev_addr_sem);
+DECLARE_RWSEM(dev_addr_sem);
  
  int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                              struct netlink_ext_ack *extack)
@@ -9668,11 +9706,11 @@ static void dev_index_release(struct net *net, int ifindex)
  /* Delayed registration/unregisteration */
  LIST_HEAD(net_todo_list);
  DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+atomic_t dev_unreg_count = ATOMIC_INIT(0);
  
  static void net_set_todo(struct net_device *dev)
  {
         list_add_tail(&dev->todo_list, &net_todo_list);
-       atomic_inc(&dev_net(dev)->dev_unreg_count);
  }
  
  static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -10237,9 +10275,9 @@ int register_netdevice(struct net_device *dev)
                 goto err_ifindex_release;
  
         ret = netdev_register_kobject(dev);
-       write_lock(&dev_base_lock);
-       dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
-       write_unlock(&dev_base_lock);
+
+       WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+
         if (ret)
                 goto err_uninit_notify;
  
@@ -10315,7 +10353,7 @@ EXPORT_SYMBOL(register_netdevice);
   *     that need to tie several hardware interfaces to a single NAPI
   *     poll scheduler due to HW limitations.
   */
-int init_dummy_netdev(struct net_device *dev)
+void init_dummy_netdev(struct net_device *dev)
  {
         /* Clear everything. Note we don't initialize spinlocks
          * are they aren't supposed to be taken by any of the
@@ -10343,8 +10381,6 @@ int init_dummy_netdev(struct net_device *dev)
          * because users of this 'device' dont need to change
          * its refcount.
          */
-
-       return 0;
  }
  EXPORT_SYMBOL_GPL(init_dummy_netdev);
  
@@ -10499,6 +10535,7 @@ void netdev_run_todo(void)
  {
         struct net_device *dev, *tmp;
         struct list_head list;
+       int cnt;
  #ifdef CONFIG_LOCKDEP
         struct list_head unlink_list;
  
@@ -10529,12 +10566,11 @@ void netdev_run_todo(void)
                         continue;
                 }
  
-               write_lock(&dev_base_lock);
-               dev->reg_state = NETREG_UNREGISTERED;
-               write_unlock(&dev_base_lock);
+               WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
                 linkwatch_sync_dev(dev);
         }
  
+       cnt = 0;
         while (!list_empty(&list)) {
                 dev = netdev_wait_allrefs_any(&list);
                 list_del(&dev->todo_list);
@@ -10552,12 +10588,13 @@ void netdev_run_todo(void)
                 if (dev->needs_free_netdev)
                         free_netdev(dev);
  
-               if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
-                       wake_up(&netdev_unregistering_wq);
+               cnt++;
  
                 /* Free network device */
                 kobject_put(&dev->dev.kobj);
         }
+       if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
+               wake_up(&netdev_unregistering_wq);
  }
  
  /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
@@ -10634,6 +10671,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                 ops->ndo_get_stats64(dev, storage);
         } else if (ops->ndo_get_stats) {
                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+       } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
+               dev_get_tstats64(dev, storage);
         } else {
                 netdev_stats_to_stats64(storage, &dev->stats);
         }
@@ -10948,7 +10987,7 @@ void free_netdev(struct net_device *dev)
         }
  
         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
-       dev->reg_state = NETREG_RELEASED;
+       WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
  
         /* will free via device release */
         put_device(&dev->dev);
@@ -11004,6 +11043,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
  {
         struct net_device *dev, *tmp;
         LIST_HEAD(close_head);
+       int cnt = 0;
  
         BUG_ON(dev_boot_phase);
         ASSERT_RTNL();
@@ -11035,10 +11075,8 @@ void unregister_netdevice_many_notify(struct list_head *head,
  
         list_for_each_entry(dev, head, unreg_list) {
                 /* And unlink it from device chain. */
-               write_lock(&dev_base_lock);
-               unlist_netdevice(dev, false);
-               dev->reg_state = NETREG_UNREGISTERING;
-               write_unlock(&dev_base_lock);
+               unlist_netdevice(dev);
+               WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
         }
         flush_all_backlogs();
  
@@ -11100,7 +11138,9 @@ void unregister_netdevice_many_notify(struct list_head *head,
         list_for_each_entry(dev, head, unreg_list) {
                 netdev_put(dev, &dev->dev_registered_tracker);
                 net_set_todo(dev);
+               cnt++;
         }
+       atomic_add(cnt, &dev_unreg_count);
  
         list_del(head);
  }
@@ -11218,7 +11258,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
         dev_close(dev);
  
         /* And unlink it from device chain */
-       unlist_netdevice(dev, true);
+       unlist_netdevice(dev);
  
         synchronize_net();
  
@@ -11554,11 +11594,8 @@ static void __net_exit default_device_exit_net(struct net *net)
                         snprintf(fb_name, IFNAMSIZ, "dev%%d");
  
                 netdev_for_each_altname_safe(dev, name_node, tmp)
-                       if (netdev_name_in_use(&init_net, name_node->name)) {
-                               netdev_name_node_del(name_node);
-                               synchronize_rcu();
+                       if (netdev_name_in_use(&init_net, name_node->name))
                                 __netdev_name_node_alt_destroy(name_node);
-                       }
  
                 err = dev_change_net_namespace(dev, &init_net, fb_name);
                 if (err) {
@@ -11631,11 +11668,12 @@ static void __init net_dev_struct_check(void)
  
         /* TXRX read-mostly hotpath */
         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
+       CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
-       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 38);
+       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
  
         /* RX read-mostly hotpath */
         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
@@ -11665,6 +11703,28 @@ static void __init net_dev_struct_check(void)
   *
   */
  
+/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
+#define SYSTEM_PERCPU_PAGE_POOL_SIZE   ((1 << 20) / PAGE_SIZE)
+
+static int net_page_pool_create(int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+       struct page_pool_params page_pool_params = {
+               .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
+               .flags = PP_FLAG_SYSTEM_POOL,
+               .nid = NUMA_NO_NODE,
+       };
+       struct page_pool *pp_ptr;
+
+       pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
+       if (IS_ERR(pp_ptr))
+               return -ENOMEM;
+
+       per_cpu(system_page_pool, cpuid) = pp_ptr;
+#endif
+       return 0;
+}
+
  /*
   *       This is called single threaded during boot, so no need
   *       to take the rtnl semaphore.
@@ -11683,7 +11743,6 @@ static int __init net_dev_init(void)
         if (netdev_kobject_init())
                 goto out;
  
-       INIT_LIST_HEAD(&ptype_all);
         for (i = 0; i < PTYPE_HASH_SIZE; i++)
                 INIT_LIST_HEAD(&ptype_base[i]);
  
@@ -11717,6 +11776,9 @@ static int __init net_dev_init(void)
                 init_gro_hash(&sd->backlog);
                 sd->backlog.poll = process_backlog;
                 sd->backlog.weight = weight_p;
+
+               if (net_page_pool_create(i))
+                       goto out;
         }
  
         dev_boot_phase = 0;
@@ -11744,6 +11806,19 @@ static int __init net_dev_init(void)
         WARN_ON(rc < 0);
         rc = 0;
  out:
+       if (rc < 0) {
+               for_each_possible_cpu(i) {
+                       struct page_pool *pp_ptr;
+
+                       pp_ptr = per_cpu(system_page_pool, i);
+                       if (!pp_ptr)
+                               continue;
+
+                       page_pool_destroy(pp_ptr);
+                       per_cpu(system_page_pool, i) = NULL;
+               }
+       }
+
         return rc;
  }