Diffstat (limited to 'net/core')
 -rw-r--r--   net/core/dev.c               240
 -rw-r--r--   net/core/filter.c             85
 -rw-r--r--   net/core/neighbour.c         131
 -rw-r--r--   net/core/net_namespace.c       1
 -rw-r--r--   net/core/netmem_priv.h        16
 -rw-r--r--   net/core/skbuff.c             26
 -rw-r--r--   net/core/sock.c               92
 -rw-r--r--   net/core/sysctl_net_core.c    16
8 files changed, 452 insertions(+), 155 deletions(-)
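
The __dev_xmit_skb() rework in the net/core/dev.c diff below replaces the busylock heuristic with a lockless defer list: contending senders push their skb onto q->defer_list, and only the sender that found the list empty takes the qdisc root lock, drains the list in FIFO order and runs the qdisc. The following is a minimal user-space sketch of that pattern, assuming C11 atomics and pthreads; the node/queue/process_item names are illustrative and not part of the patch.

/*
 * Sketch (not kernel code) of the defer-list pattern used by the
 * __dev_xmit_skb() hunk below: producers push onto a lock-free stack,
 * and only the producer that found it empty takes the heavy lock,
 * grabs the whole list, restores FIFO order and processes it.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <pthread.h>

struct node {
	struct node *next;
	int item;
};

struct queue {
	_Atomic(struct node *) defer_list;	/* lock-free multi-producer stack */
	atomic_long defer_count;		/* soft bound on deferred nodes */
	long limit;
	pthread_mutex_t root_lock;		/* stands in for the qdisc root lock;
						 * initialise with pthread_mutex_init() */
};

static void process_item(int item)
{
	(void)item;				/* stand-in for qdisc enqueue + run */
}

/* Returns false if the node was dropped because the queue is over its limit. */
static bool defer_xmit(struct queue *q, struct node *n)
{
	struct node *first = atomic_load(&q->defer_list);
	long count = 0;

	do {
		/* Bump the counter at most once, and only for a non-empty list,
		 * mirroring the open-coded llist_add() in the patch.
		 */
		if (first && !count) {
			count = atomic_fetch_add(&q->defer_count, 1) + 1;
			if (count > q->limit)
				return false;	/* caller drops the node */
		}
		n->next = first;
	} while (!atomic_compare_exchange_weak(&q->defer_list, &first, n));

	/* The list was not empty: whoever queued the first node flushes it. */
	if (first)
		return true;

	pthread_mutex_lock(&q->root_lock);
	first = atomic_exchange(&q->defer_list, (struct node *)NULL);
	/* Not atomic with the exchange above, so the bound is only approximate,
	 * the same caveat the patch documents for q->defer_count.
	 */
	atomic_store(&q->defer_count, 0);

	/* The stack is LIFO; reverse it so items run in FIFO order. */
	struct node *rev = NULL;
	while (first) {
		struct node *next = first->next;
		first->next = rev;
		rev = first;
		first = next;
	}
	for (; rev; rev = rev->next)
		process_item(rev->item);
	pthread_mutex_unlock(&q->root_lock);
	return true;
}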
diff --git a/net/core/dev.c b/net/core/dev.c index 2acfa44927da..dccc1176f3c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1163,6 +1163,7 @@ void netdev_copy_name(struct net_device *dev, char *name) strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } +EXPORT_IPV6_MOD_GPL(netdev_copy_name); /** * netdev_get_name - get a netdevice name, knowing its ifindex. @@ -3373,6 +3374,13 @@ static void __netif_reschedule(struct Qdisc *q) void __netif_schedule(struct Qdisc *q) { + /* If q->defer_list is not empty, at least one thread is + * in __dev_xmit_skb() before llist_del_all(&q->defer_list). + * This thread will attempt to run the queue. + */ + if (!llist_empty(&q->defer_list)) + return; + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } @@ -4125,9 +4133,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { + struct sk_buff *next, *to_free = NULL; spinlock_t *root_lock = qdisc_lock(q); - struct sk_buff *to_free = NULL; - bool contended; + struct llist_node *ll_list, *first_n; + unsigned long defer_count = 0; int rc; qdisc_calculate_pkt_len(skb, q); @@ -4167,67 +4176,81 @@ no_lock_out: return rc; } - if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { - kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); - return NET_XMIT_DROP; - } - /* - * Heuristic to force contended enqueues to serialize on a - * separate lock before trying to get qdisc main lock. - * This permits qdisc->running owner to get the lock more - * often and dequeue packets faster. - * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit - * and then other tasks will only enqueue packets. The packets will be - * sent after the qdisc owner is scheduled again. To prevent this - * scenario the task always serialize on the lock. + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. + * In the try_cmpxchg() loop, we want to increment q->defer_count + * at most once to limit the number of skbs in defer_list. + * We perform the defer_count increment only if the list is not empty, + * because some arches have slow atomic_long_inc_return(). + */ + first_n = READ_ONCE(q->defer_list.first); + do { + if (first_n && !defer_count) { + defer_count = atomic_long_inc_return(&q->defer_count); + if (unlikely(defer_count > q->limit)) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + return NET_XMIT_DROP; + } + } + skb->ll_node.next = first_n; + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); + + /* If defer_list was not empty, we know the cpu which queued + * the first skb will process the whole list for us. */ - contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); - if (unlikely(contended)) - spin_lock(&q->busylock); + if (first_n) + return NET_XMIT_SUCCESS; spin_lock(root_lock); + + ll_list = llist_del_all(&q->defer_list); + /* There is a small race because we clear defer_count not atomically + * with the prior llist_del_all(). This means defer_list could grow + * over q->limit. 
+ */ + atomic_long_set(&q->defer_count, 0); + + ll_list = llist_reverse_order(ll_list); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - __qdisc_drop(skb, &to_free); + llist_for_each_entry_safe(skb, next, ll_list, ll_node) + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - qdisc_run_begin(q)) { + goto unlock; + } + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, + struct sk_buff, + ll_node)); qdisc_bstats_update(q, skb); - - if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - } - qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - WRITE_ONCE(q->owner, smp_processor_id()); - rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - WRITE_ONCE(q->owner, -1); - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } - __qdisc_run(q); - qdisc_run_end(q); + int count = 0; + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + prefetch(next); + skb_mark_not_on_list(skb); + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + count++; } + qdisc_run(q); + if (count != 1) + rc = NET_XMIT_SUCCESS; } +unlock: spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); - if (unlikely(contended)) - spin_unlock(&q->busylock); return rc; } @@ -4591,6 +4614,32 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_zero); +int sk_tx_queue_get(const struct sock *sk) +{ + int resel, val; + + if (!sk) + return -1; + /* Paired with WRITE_ONCE() in sk_tx_queue_clear() + * and sk_tx_queue_set(). 
+ */ + val = READ_ONCE(sk->sk_tx_queue_mapping); + + if (val == NO_QUEUE_MAPPING) + return -1; + + if (!sk_fullsock(sk)) + return val; + + resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); + if (resel && time_is_before_jiffies( + READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) + return -1; + + return val; +} +EXPORT_SYMBOL(sk_tx_queue_get); + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -4606,8 +4655,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); - if (queue_index != new_index && sk && - sk_fullsock(sk) && + if (sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); @@ -5202,14 +5250,15 @@ void kick_defer_list_purge(unsigned int cpu) int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif -static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, + int max_backlog) { #ifdef CONFIG_NET_FLOW_LIMIT - struct sd_flow_limit *fl; - struct softnet_data *sd; unsigned int old_flow, new_flow; + const struct softnet_data *sd; + struct sd_flow_limit *fl; - if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) + if (likely(qlen < (max_backlog >> 1))) return false; sd = this_cpu_ptr(&softnet_data); @@ -5254,19 +5303,19 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, u32 tail; reason = SKB_DROP_REASON_DEV_READY; - if (!netif_running(skb->dev)) + if (unlikely(!netif_running(skb->dev))) goto bad_dev; - reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); - if (unlikely(qlen > max_backlog)) + if (unlikely(qlen > max_backlog) || + skb_flow_limit(skb, qlen, max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { + if (likely(qlen <= max_backlog)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. @@ -5287,6 +5336,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: + reason = SKB_DROP_REASON_CPU_BACKLOG; numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); @@ -12646,6 +12696,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); +/** + * netdev_compute_master_upper_features - compute feature from lowers + * @dev: the upper device + * @update_header: whether to update upper device's header_len/headroom/tailroom + * + * Recompute the upper device's feature based on all lower devices. 
+ */ +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) +{ + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; + netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; + netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; + netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; + netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; + unsigned short max_header_len = ETH_HLEN; + unsigned int tso_max_size = TSO_MAX_SIZE; + unsigned short max_headroom = 0; + unsigned short max_tailroom = 0; + u16 tso_max_segs = TSO_MAX_SEGS; + struct net_device *lower_dev; + struct list_head *iter; + + mpls_features = netdev_base_features(mpls_features); + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + gso_partial_features = netdev_increment_features(gso_partial_features, + lower_dev->gso_partial_features, + MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); + + vlan_features = netdev_increment_features(vlan_features, + lower_dev->vlan_features, + MASTER_UPPER_DEV_VLAN_FEATURES); + + enc_features = netdev_increment_features(enc_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_ENC_FEATURES); + + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + xfrm_features = netdev_increment_features(xfrm_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_XFRM_FEATURES); + + mpls_features = netdev_increment_features(mpls_features, + lower_dev->mpls_features, + MASTER_UPPER_DEV_MPLS_FEATURES); + + dst_release_flag &= lower_dev->priv_flags; + + if (update_header) { + max_header_len = max(max_header_len, lower_dev->hard_header_len); + max_headroom = max(max_headroom, lower_dev->needed_headroom); + max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); + } + + tso_max_size = min(tso_max_size, lower_dev->tso_max_size); + tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); + } + + dev->gso_partial_features = gso_partial_features; + dev->vlan_features = vlan_features; + dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + dev->hw_enc_features |= xfrm_features; + dev->mpls_features = mpls_features; + + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + dev->priv_flags |= IFF_XMIT_DST_RELEASE; + + if (update_header) { + dev->hard_header_len = max_header_len; + dev->needed_headroom = max_headroom; + dev->needed_tailroom = max_tailroom; + } + + netif_set_tso_max_segs(dev, tso_max_segs); + netif_set_tso_max_size(dev, tso_max_size); + + netdev_change_features(dev); +} +EXPORT_SYMBOL(netdev_compute_master_upper_features); + static struct hlist_head * __net_init netdev_create_hash(void) { int i; diff --git a/net/core/filter.c b/net/core/filter.c index 76628df1fc82..16105f52927d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5733,6 +5733,77 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, + char *optval, int optlen, + bool getopt) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + if (!sk_has_account(sk)) + return -EOPNOTSUPP; + + if (getopt) { + *(int *)optval 
= sk->sk_bypass_prot_mem; + return 0; + } + + val = *(int *)optval; + if (val < 0 || val > 1) + return -EINVAL; + + sk->sk_bypass_prot_mem = val; + return 0; +} + +BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) + return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); + + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { + .func = bpf_sock_create_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { + int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); + + if (err) + memset(optval, 0, optlen); + + return err; + } + + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { + .func = bpf_sock_create_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -8062,6 +8133,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id, prog); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bddfa389effa..96a3b1a93252 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -81,7 +81,7 @@ static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family } /* - Neighbour hash table buckets are protected with rwlock tbl->lock. + Neighbour hash table buckets are protected with tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. 
- NOTHING clever should be made under this lock: no callbacks @@ -149,7 +149,7 @@ static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; @@ -172,14 +172,14 @@ static void neigh_update_gc_list(struct neighbour *n) } out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; @@ -193,7 +193,7 @@ static void neigh_update_managed_list(struct neighbour *n) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, @@ -263,7 +263,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { @@ -292,7 +292,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) WRITE_ONCE(tbl->last_flush, jiffies); unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); return shrunk; } @@ -454,23 +454,23 @@ static void neigh_flush_table(struct neigh_table *tbl) void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (likely(dev)) { neigh_flush_dev(tbl, dev, skip_perm); } else { DEBUG_NET_WARN_ON_ONCE(skip_perm); neigh_flush_table(tbl); } - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL, @@ -687,7 +687,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -722,13 +722,13 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); @@ -982,7 +982,7 @@ static void neigh_periodic_work(struct work_struct *work) NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -995,8 +995,7 @@ static void neigh_periodic_work(struct work_struct *work) WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) @@ -1037,9 +1036,9 @@ static void neigh_periodic_work(struct work_struct *work) * It's fine to release lock here, even if hash table * grows while we are preempted. */ - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); cond_resched(); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } @@ -1050,7 +1049,7 @@ out: */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) @@ -1642,12 +1641,12 @@ static void neigh_managed_work(struct work_struct *work) managed_work.work); struct neighbour *neigh; - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) @@ -1749,8 +1748,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; @@ -1763,9 +1761,9 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, return NULL; } - write_lock_bh(&tbl->lock); - list_add(&p->list, &tbl->parms.list); - write_unlock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); + list_add_rcu(&p->list, &tbl->parms.list); + spin_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } @@ -1785,10 +1783,12 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; - write_lock_bh(&tbl->lock); - list_del(&parms->list); + + spin_lock_bh(&tbl->lock); + list_del_rcu(&parms->list); parms->dead = 1; - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, 
neigh_rcu_free_parms); } @@ -1810,8 +1810,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); - tbl->parms.reachable_time = - neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); + neigh_set_reach_time(&tbl->parms); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); @@ -1838,7 +1837,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); - rwlock_init(&tbl->lock); + spin_lock_init(&tbl->lock); mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); @@ -1981,10 +1980,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out: return err; @@ -2179,7 +2178,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) return -ENOBUFS; if ((parms->dev && - nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || + nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || @@ -2194,7 +2193,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || - nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || @@ -2231,8 +2230,6 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -2258,11 +2255,9 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); - rcu_read_unlock(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; @@ -2300,12 +2295,10 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; - read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2324,8 +2317,6 @@ static int neightbl_fill_param_info(struct sk_buff *skb, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -2334,11 +2325,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb, neightbl_fill_parms(skb, parms) < 0) goto errout; - read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; errout: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2375,9 +2364,9 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct 
netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; - struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; @@ -2393,26 +2382,33 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = rcu_dereference_rtnl(neigh_tables[tidx]); + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } - if (!found) - return -ENOENT; + if (!found) { + rcu_read_unlock(); + err = -ENOENT; + goto errout; + } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; @@ -2475,8 +2471,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, @@ -2532,7 +2527,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, err = 0; errout_tbl_lock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); errout: return err; } @@ -2579,10 +2575,12 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = rcu_dereference_rtnl(neigh_tables[tidx]); + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; @@ -2596,7 +2594,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) nidx = 0; p = list_next_entry(&tbl->parms, list); - list_for_each_entry_from(p, &tbl->parms_list, list) { + list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; @@ -2616,6 +2614,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: + rcu_read_unlock(); + cb->args[0] = tidx; cb->args[1] = nidx; @@ -3127,14 +3127,14 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void rcu_read_lock(); nht = rcu_dereference(tbl->nht); - read_lock_bh(&tbl->lock); /* avoid resizes */ + spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); @@ -3404,7 +3404,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); - read_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); return *pos ? 
neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } @@ -3444,7 +3444,7 @@ void neigh_seq_stop(struct seq_file *seq, void *v) struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; - read_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); @@ -3721,8 +3721,7 @@ static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write * only be effective after the next time neigh_periodic_work * decides to recompute it */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } return ret; } @@ -3918,8 +3917,10 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, - {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, - {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; static int __init neigh_init(void) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0e0f22d7b21..adcfef55a66f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -395,6 +395,7 @@ static __net_init void preinit_net_sysctl(struct net *net) net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; net->core.sysctl_tstamp_allow_data = 1; + net->core.sysctl_txq_reselection = msecs_to_jiffies(1000); } /* init code that must occur even if setup_net() is not called. */ diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index cd95394399b4..23175cb2bd86 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,19 +5,19 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; + return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) { - __netmem_clear_lsb(netmem)->pp_magic |= pp_magic; + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; } static inline void netmem_clear_pp_magic(netmem_ref netmem) { - WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - __netmem_clear_lsb(netmem)->pp_magic = 0; + netmem_to_nmdesc(netmem)->pp_magic = 0; } static inline bool netmem_is_pp(netmem_ref netmem) @@ -27,13 +27,13 @@ static inline bool netmem_is_pp(netmem_ref netmem) static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { - __netmem_clear_lsb(netmem)->pp = pool; + netmem_to_nmdesc(netmem)->pp = pool; } static inline void netmem_set_dma_addr(netmem_ref netmem, unsigned long dma_addr) { - __netmem_clear_lsb(netmem)->dma_addr = dma_addr; + netmem_to_nmdesc(netmem)->dma_addr = dma_addr; } static inline unsigned long netmem_get_dma_index(netmem_ref netmem) @@ -43,7 +43,7 @@ static inline unsigned long netmem_get_dma_index(netmem_ref netmem) if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return 0; - magic = __netmem_clear_lsb(netmem)->pp_magic; + magic = netmem_to_nmdesc(netmem)->pp_magic; return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; } @@ -57,6 +57,6 @@ static inline void netmem_set_dma_index(netmem_ref netmem, return; 
magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); - __netmem_clear_lsb(netmem)->pp_magic = magic; + netmem_to_nmdesc(netmem)->pp_magic = magic; } #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6be01454f262..5b4bc8b1c7d5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -274,6 +274,11 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) } EXPORT_SYMBOL(__netdev_alloc_frag_align); +/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler + * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset. + */ +static u32 skbuff_cache_size __read_mostly; + static struct sk_buff *napi_skb_cache_get(void) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -293,7 +298,7 @@ static struct sk_buff *napi_skb_cache_get(void) skb = nc->skb_cache[--nc->skb_count]; local_unlock_nested_bh(&napi_alloc_cache.bh_lock); - kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); + kasan_mempool_unpoison_object(skb, skbuff_cache_size); return skb; } @@ -345,11 +350,9 @@ u32 napi_skb_cache_get_bulk(void **skbs, u32 n) get: for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { - u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); - skbs[i] = nc->skb_cache[base + i]; - kasan_mempool_unpoison_object(skbs[i], cache_size); + kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); memset(skbs[i], 0, offsetof(struct sk_buff, tail)); } @@ -1136,7 +1139,16 @@ void skb_release_head_state(struct sk_buff *skb) skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); - skb->destructor(skb); +#ifdef CONFIG_INET + INDIRECT_CALL_3(skb->destructor, + tcp_wfree, __sock_wfree, sock_wfree, + skb); +#else + INDIRECT_CALL_1(skb->destructor, + sock_wfree, + skb); + +#endif } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); @@ -1428,7 +1440,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) kasan_mempool_unpoison_object(nc->skb_cache[i], - kmem_cache_size(net_hotdata.skbuff_cache)); + skbuff_cache_size); kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); @@ -5116,6 +5128,8 @@ void __init skb_init(void) offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); + skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, diff --git a/net/core/sock.c b/net/core/sock.c index dc03d4b5909a..7a9bbc2afcf0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -155,7 +155,7 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -static void sock_def_write_space_wfree(struct sock *sk); +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); static void sock_def_write_space(struct sock *sk); /** @@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes) if (!charged) return -ENOMEM; + if (sk->sk_bypass_prot_mem) + goto success; + /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); + /* If the system goes into memory pressure with this * precharge, give up and return error. 
*/ @@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } + +success: sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, @@ -2300,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + + if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) + sk->sk_bypass_prot_mem = 1; + sk->sk_kern_sock = kern; sock_lock_init(sk); + sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); @@ -2313,7 +2324,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } sock_net_set(sk, net); - refcount_set(&sk->sk_wmem_alloc, 1); + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); @@ -2451,13 +2462,16 @@ static void sk_init_common(struct sock *sk) } /** - * sk_clone_lock - clone a socket, and lock its clone - * @sk: the socket to clone - * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * sk_clone - clone a socket + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @lock: if true, lock the cloned sk * - * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + * If @lock is true, the clone is locked by bh_lock_sock(), and + * caller must unlock socket even in error path by bh_unlock_sock(). */ -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, + bool lock) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; @@ -2486,16 +2500,19 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } + sk_node_init(&newsk->sk_node); sock_lock_init(newsk); - bh_lock_sock(newsk); + + if (lock) + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); - /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ - refcount_set(&newsk->sk_wmem_alloc, 1); + refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); @@ -2580,12 +2597,13 @@ free: * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); + if (lock) + bh_unlock_sock(newsk); sk_free(newsk); newsk = NULL; goto out; } -EXPORT_SYMBOL_GPL(sk_clone_lock); +EXPORT_SYMBOL_GPL(sk_clone); static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { @@ -2649,16 +2667,18 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); */ void sock_wfree(struct sk_buff *skb) { - struct sock *sk = skb->sk; unsigned int len = skb->truesize; + struct sock *sk = skb->sk; bool free; + int old; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); - free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); - sock_def_write_space_wfree(sk); + free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, + &old); + sock_def_write_space_wfree(sk, old - len); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); @@ -2695,6 +2715,8 @@ void __sock_wfree(struct sk_buff *skb) void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + int old_wmem; + skb_orphan(skb); #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) @@ 
-2708,7 +2730,15 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ - refcount_add(skb->truesize, &sk->sk_wmem_alloc); + __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); + + /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket + * is in a host queue (qdisc, NIC queue). + * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue + * based on XPS for better performance. + * Otherwise clear ooo_okay to not risk Out Of Order delivery. + */ + skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); } EXPORT_SYMBOL(skb_set_owner_w); @@ -3136,8 +3166,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; - sk_enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; } EXPORT_SYMBOL(sk_page_frag_refill); @@ -3254,10 +3287,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - long allocated; + long allocated = 0; - sk_memory_allocated_add(sk, amt); - allocated = sk_memory_allocated(sk); + if (!sk->sk_bypass_prot_mem) { + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); + } if (mem_cgroup_sk_enabled(sk)) { memcg_enabled = true; @@ -3266,6 +3301,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) goto suppress_allocation; } + if (!allocated) + return 1; + /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); @@ -3344,7 +3382,8 @@ suppress_allocation: trace_sock_exceed_buf_limit(sk, prot, allocated, kind); - sk_memory_allocated_sub(sk, amt); + if (allocated) + sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_sk_uncharge(sk, amt); @@ -3383,11 +3422,14 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, amount); - if (mem_cgroup_sk_enabled(sk)) mem_cgroup_sk_uncharge(sk, amount); + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_sub(sk, amount); + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); @@ -3580,12 +3622,12 @@ static void sock_def_write_space(struct sock *sk) * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ -static void sock_def_write_space_wfree(struct sock *sk) +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) { /* Do not wake up a writer until he can make "significant" * progress. 
--DaveM */ - if (sock_writeable(sk)) { + if (__sock_writeable(sk, wmem_alloc)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8cf04b57ade1..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -668,6 +668,13 @@ static struct ctl_table netns_core_table[] = { .proc_handler = proc_dou8vec_minmax, }, { + .procname = "txq_reselection_ms", + .data = &init_net.core.sysctl_txq_reselection, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { .procname = "tstamp_allow_data", .data = &init_net.core.sysctl_tstamp_allow_data, .maxlen = sizeof(u8), @@ -676,6 +683,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ |
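
The net/core/filter.c hunks above expose a new SOL_SOCKET option, SK_BPF_BYPASS_PROT_MEM, to bpf_setsockopt()/bpf_getsockopt() from BPF_CGROUP_INET_SOCK_CREATE programs. Below is a hedged sketch of how such a program might set it, assuming a build against the updated UAPI <linux/bpf.h>; the section name and the blanket opt-in policy are illustrative, not taken from the patch.

// SPDX-License-Identifier: GPL-2.0
/*
 * Sketch: opt every new socket of a cgroup out of global protocol memory
 * accounting using the SK_BPF_BYPASS_PROT_MEM option added above.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1		/* not provided by <linux/bpf.h> */
#endif

SEC("cgroup/sock_create")
int set_bypass_prot_mem(struct bpf_sock *ctx)
{
	int one = 1;

	/* Per the patch, only SOL_SOCKET/SK_BPF_BYPASS_PROT_MEM is special-cased
	 * at this hook; other options fall through to __bpf_setsockopt().
	 */
	bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM, &one, sizeof(one));

	return 1;	/* allow the socket to be created */
}

char _license[] SEC("license") = "GPL";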

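The sk_tx_queue_get() change in net/core/dev.c, together with the new net.core.txq_reselection_ms sysctl (default 1000 ms, see the net/core/net_namespace.c and net/core/sysctl_net_core.c hunks), makes a cached transmit-queue mapping expire so that queue selection can be redone for long-lived flows. A small user-space sketch of the expiry logic follows, using seconds instead of jiffies; the struct and helper names are illustrative, not kernel API.

/* Sketch of the cached-queue expiry check behind the txq_reselection sysctl. */
#include <time.h>

struct tx_cache {
	int	queue_mapping;		/* -1 == no cached queue */
	time_t	mapped_at;		/* when the mapping was recorded */
};

/* resel is the reselection interval in seconds; 0 keeps the cache forever. */
static int cached_tx_queue(const struct tx_cache *c, time_t now, int resel)
{
	if (c->queue_mapping < 0)
		return -1;
	if (resel && now - c->mapped_at > resel)
		return -1;		/* stale: caller re-runs queue selection */
	return c->queue_mapping;
}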