diff options
Diffstat (limited to 'net')
139 files changed, 2421 insertions, 1376 deletions
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index e7315c01a299..30493ea3c010 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -542,6 +542,11 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb, struct ddpehdr *ddp = (struct ddpehdr *)skb->data; int ft = 2; + if (!at) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + /* * Compressible ? * diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 358fbe5e4d1d..b991d937205a 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -179,6 +179,7 @@ as_indicate_complete: break; default: pr_alert("bad message type %d\n", (int)msg->type); + dev_kfree_skb(skb); /* Paired with find_get_vcc(msg->vcc) above */ sock_put(sk); return -EINVAL; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f28e9cbf8ad5..b8b1b997960a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -173,19 +173,12 @@ free_orig_node_hash: static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh) + struct batadv_orig_node *orig_node) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); - if (!neigh_node) - goto out; - - neigh_node->orig_node = orig_neigh; - -out: return neigh_node; } @@ -231,6 +224,8 @@ static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) hard_iface->bat_iv.ogm_buff = NULL; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); + + cancel_delayed_work_sync(&hard_iface->bat_iv.reschedule_work); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) @@ -335,7 +330,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); const char *fwd_str; u8 packet_num; - s16 buff_pos; + int buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; @@ -543,8 +538,10 @@ out: * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm + * + * Return: whether forward packet was scheduled */ -static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, +static bool batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, int packet_len, unsigned long send_time, bool direct_link, struct batadv_hard_iface *if_incoming, @@ -568,13 +565,13 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb = netdev_alloc_skb_ip_align(NULL, skb_size); if (!skb) - return; + return false; forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, queue_left, bat_priv, skb); if (!forw_packet_aggr) { kfree_skb(skb); - return; + return false; } forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; @@ -597,6 +594,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, batadv_iv_send_outstanding_bat_ogm_packet); batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time); + + return true; } /* aggregate a new packet into the existing ogm packet */ @@ -624,8 +623,10 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm * @send_time: timestamp (jiffies) when the packet is to be sent + * + * Return: whether forward packet was scheduled */ -static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, +static bool batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned char *packet_buff, int packet_len, struct batadv_hard_iface *if_incoming, @@ -677,14 +678,16 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, if (!own_packet && atomic_read(&bat_priv->aggregated_ogms)) send_time += max_aggregation_jiffies; - batadv_iv_ogm_aggregate_new(packet_buff, packet_len, - send_time, direct_link, - if_incoming, if_outgoing, - own_packet); + return batadv_iv_ogm_aggregate_new(packet_buff, packet_len, + send_time, direct_link, + if_incoming, if_outgoing, + own_packet); } else { batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff, packet_len, direct_link); spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + return true; } } @@ -797,6 +800,9 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) u32 seqno; u16 tvlv_len = 0; unsigned long send_time; + bool reschedule = false; + bool scheduled; + int ret; lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); @@ -820,9 +826,15 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); - tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, - ogm_buff_len, - BATADV_OGM_HLEN); + ret = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, + ogm_buff_len, + BATADV_OGM_HLEN); + if (ret < 0) { + reschedule = true; + goto out; + } + + tvlv_len = ret; } batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff); @@ -841,8 +853,11 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) /* OGMs from secondary interfaces are only scheduled on their * respective interfaces. */ - batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, - hard_iface, hard_iface, 1, send_time); + scheduled = batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, + hard_iface, hard_iface, 1, send_time); + if (!scheduled) + reschedule = true; + goto out; } @@ -854,15 +869,28 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) continue; - batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, - *ogm_buff_len, hard_iface, - tmp_hard_iface, 1, send_time); - + scheduled = batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, + *ogm_buff_len, hard_iface, + tmp_hard_iface, 1, send_time); batadv_hardif_put(tmp_hard_iface); + + if (!scheduled && tmp_hard_iface == hard_iface) + reschedule = true; } rcu_read_unlock(); out: + if (reschedule) { + /* there was a failure scheduling the own forward packet. + * as result, the batadv_iv_send_outstanding_bat_ogm_packet() + * work item is no longer scheduled. it is therefore necessary + * to reschedule it manually + */ + queue_delayed_work(batadv_event_workqueue, + &hard_iface->bat_iv.reschedule_work, + msecs_to_jiffies(atomic_read(&bat_priv->orig_interval))); + } + batadv_hardif_put(primary_if); } @@ -877,6 +905,17 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } +static void batadv_iv_ogm_reschedule(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct batadv_hard_iface *hard_iface; + + hard_iface = container_of(delayed_work, + struct batadv_hard_iface, + bat_iv.reschedule_work); + batadv_iv_ogm_schedule(hard_iface); +} + /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly @@ -907,6 +946,31 @@ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, } /** + * batadv_iv_ogm_neigh_ifinfo_sum() - Get bcast_own sum for a last-hop neighbor + * @bat_priv: the bat priv with all the mesh interface information + * @neigh_node: last-hop neighbor of an originator + * + * Return: Number of replied (rebroadcasted) OGMs for the originator currently + * announced by the neighbor. Returns 0 if the neighbor's originator entry is + * not available anymore. + */ +static u8 batadv_iv_ogm_neigh_ifinfo_sum(struct batadv_priv *bat_priv, + const struct batadv_neigh_node *neigh_node) +{ + struct batadv_orig_node *orig_neigh; + u8 sum; + + orig_neigh = batadv_orig_hash_find(bat_priv, neigh_node->addr); + if (!orig_neigh) + return 0; + + sum = batadv_iv_orig_ifinfo_sum(orig_neigh, neigh_node->if_incoming); + batadv_orig_node_put(orig_neigh); + + return sum; +} + +/** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the mesh interface information @@ -975,17 +1039,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, } if (!neigh_node) { - struct batadv_orig_node *orig_tmp; - - orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); - if (!orig_tmp) - goto unlock; - neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, - orig_node, orig_tmp); - - batadv_orig_node_put(orig_tmp); + orig_node); if (!neigh_node) goto unlock; } else { @@ -1037,10 +1093,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { - sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, - router->if_incoming); - sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, - neigh_node->if_incoming); + sum_orig = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, router); + sum_neigh = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, + neigh_node); if (sum_orig >= sum_neigh) goto out; } @@ -1106,7 +1161,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, - orig_neigh_node, orig_neigh_node); if (!neigh_node) @@ -1303,6 +1357,32 @@ out: } /** + * batadv_orig_to_direct_router() - get direct next hop neighbor to an orig address + * @bat_priv: the bat priv with all the mesh interface information + * @orig_addr: the originator MAC address to search the best next hop router for + * @if_outgoing: the interface where the OGM should be sent to + * + * Return: A neighbor node which is the best router towards the given originator + * address. Bonding candidates are ignored. + */ +static struct batadv_neigh_node * +batadv_orig_to_direct_router(struct batadv_priv *bat_priv, u8 *orig_addr, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_orig_node *orig_node; + + orig_node = batadv_orig_hash_find(bat_priv, orig_addr); + if (!orig_node) + return NULL; + + neigh_node = batadv_orig_router_get(orig_node, if_outgoing); + batadv_orig_node_put(orig_node); + + return neigh_node; +} + +/** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface * @skb: the skb containing the OGM @@ -1372,8 +1452,9 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { - router_router = batadv_orig_router_get(router->orig_node, - if_outgoing); + router_router = batadv_orig_to_direct_router(bat_priv, + router->addr, + if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } @@ -2227,6 +2308,8 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) { + INIT_DELAYED_WORK(&hard_iface->bat_iv.reschedule_work, batadv_iv_ogm_reschedule); + /* begin scheduling originator messages on that interface */ batadv_iv_ogm_schedule(hard_iface); } diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e3870492dab7..d66ca77b1aaa 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -113,14 +113,14 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) /** * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface + * @bat_priv: the bat priv with all the mesh interface information * @skb: the OGM to send * @hard_iface: the interface to use to send the OGM */ -static void batadv_v_ogm_send_to_if(struct sk_buff *skb, +static void batadv_v_ogm_send_to_if(struct batadv_priv *bat_priv, + struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); - if (hard_iface->if_status != BATADV_IF_ACTIVE) { kfree_skb(skb); return; @@ -187,6 +187,7 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) /** * batadv_v_ogm_aggr_send() - flush & send aggregation queue + * @bat_priv: the bat priv with all the mesh interface information * @hard_iface: the interface with the aggregation queue to flush * * Aggregates all OGMv2 packets currently in the aggregation queue into a @@ -196,7 +197,8 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ -static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) +static void batadv_v_ogm_aggr_send(struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface) { unsigned int aggr_len = hard_iface->bat_v.aggr_len; struct sk_buff *skb_aggr; @@ -226,27 +228,32 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) consume_skb(skb); } - batadv_v_ogm_send_to_if(skb_aggr, hard_iface); + batadv_v_ogm_send_to_if(bat_priv, skb_aggr, hard_iface); } /** * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface + * @bat_priv: the bat priv with all the mesh interface information * @skb: the OGM to queue * @hard_iface: the interface to queue the OGM on */ -static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, +static void batadv_v_ogm_queue_on_if(struct batadv_priv *bat_priv, + struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); + if (hard_iface->mesh_iface != bat_priv->mesh_iface) { + kfree_skb(skb); + return; + } if (!atomic_read(&bat_priv->aggregated_ogms)) { - batadv_v_ogm_send_to_if(skb, hard_iface); + batadv_v_ogm_send_to_if(bat_priv, skb, hard_iface); return; } spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) - batadv_v_ogm_aggr_send(hard_iface); + batadv_v_ogm_aggr_send(bat_priv, hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); @@ -262,10 +269,10 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; - unsigned char *ogm_buff; + unsigned char **ogm_buff; struct list_head *iter; - int ogm_buff_len; - u16 tvlv_len = 0; + int *ogm_buff_len; + u16 tvlv_len; int ret; lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); @@ -273,25 +280,27 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; - ogm_buff = bat_priv->bat_v.ogm_buff; - ogm_buff_len = bat_priv->bat_v.ogm_buff_len; + ogm_buff = &bat_priv->bat_v.ogm_buff; + ogm_buff_len = &bat_priv->bat_v.ogm_buff_len; + /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); - tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, - &ogm_buff_len, - BATADV_OGM2_HLEN); + ret = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, + ogm_buff_len, + BATADV_OGM2_HLEN); + if (ret < 0) + goto reschedule; - bat_priv->bat_v.ogm_buff = ogm_buff; - bat_priv->bat_v.ogm_buff_len = ogm_buff_len; + tvlv_len = ret; - skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); + skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + *ogm_buff_len); if (!skb) goto reschedule; skb_reserve(skb, ETH_HLEN); - skb_put_data(skb, ogm_buff, ogm_buff_len); + skb_put_data(skb, *ogm_buff, *ogm_buff_len); ogm_packet = (struct batadv_ogm2_packet *)skb->data; ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); @@ -343,7 +352,7 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) break; } - batadv_v_ogm_queue_on_if(skb_tmp, hard_iface); + batadv_v_ogm_queue_on_if(bat_priv, skb_tmp, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -383,12 +392,14 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) { struct batadv_hard_iface_bat_v *batv; struct batadv_hard_iface *hard_iface; + struct batadv_priv *bat_priv; batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); + bat_priv = netdev_priv(hard_iface->mesh_iface); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); - batadv_v_ogm_aggr_send(hard_iface); + batadv_v_ogm_aggr_send(bat_priv, hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); @@ -578,7 +589,7 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), ogm_forward->ttl, if_incoming->net_dev->name); - batadv_v_ogm_queue_on_if(skb, if_outgoing); + batadv_v_ogm_queue_on_if(bat_priv, skb, if_outgoing); out: batadv_orig_ifinfo_put(orig_ifinfo); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 51fe028b9088..ffe854018bd3 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -318,8 +318,8 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) if (claim->backbone_gw != backbone_gw) continue; - batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); + batadv_claim_put(claim); } spin_unlock_bh(list_lock); } @@ -356,12 +356,14 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, sizeof(local_claim_dest)); local_claim_dest.type = claimtype; - mesh_iface = primary_if->mesh_iface; + mesh_iface = READ_ONCE(primary_if->mesh_iface); + if (!mesh_iface) + goto out; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, /* IP DST: 0.0.0.0 */ zeroip, - primary_if->mesh_iface, + mesh_iface, /* IP SRC: 0.0.0.0 */ zeroip, /* Ethernet DST: Broadcast */ @@ -514,8 +516,8 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, entry->crc = BATADV_BLA_CRC_INIT; entry->bat_priv = bat_priv; spin_lock_init(&entry->crc_lock); - atomic_set(&entry->request_sent, 0); - atomic_set(&entry->wait_periods, 0); + entry->state = BATADV_BLA_BACKBONE_GW_SYNCED; + entry->wait_periods = 0; ether_addr_copy(entry->orig, orig); INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); kref_init(&entry->refcount); @@ -544,9 +546,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, batadv_bla_send_announce(bat_priv, entry); /* this will be decreased in the worker thread */ - atomic_inc(&entry->request_sent); - atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS); - atomic_inc(&bat_priv->bla.num_requests); + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (entry->state == BATADV_BLA_BACKBONE_GW_SYNCED) { + entry->state = BATADV_BLA_BACKBONE_GW_UNSYNCED; + entry->wait_periods = BATADV_BLA_WAIT_PERIODS; + atomic_inc(&bat_priv->bla.num_requests); + } + spin_unlock_bh(&bat_priv->bla.num_requests_lock); } return entry; @@ -649,10 +655,12 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST); /* no local broadcasts should be sent or received, for now. */ - if (!atomic_read(&backbone_gw->request_sent)) { + spin_lock_bh(&backbone_gw->bat_priv->bla.num_requests_lock); + if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_SYNCED) { + backbone_gw->state = BATADV_BLA_BACKBONE_GW_UNSYNCED; atomic_inc(&backbone_gw->bat_priv->bla.num_requests); - atomic_set(&backbone_gw->request_sent, 1); } + spin_unlock_bh(&backbone_gw->bat_priv->bla.num_requests_lock); } /** @@ -723,6 +731,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, if (unlikely(hash_added != 0)) { /* only local changes happened. */ + batadv_backbone_gw_put(backbone_gw); kfree(claim); return; } @@ -872,10 +881,12 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, /* if we have sent a request and the crc was OK, * we can allow traffic again. */ - if (atomic_read(&backbone_gw->request_sent)) { + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_UNSYNCED) { + backbone_gw->state = BATADV_BLA_BACKBONE_GW_SYNCED; atomic_dec(&backbone_gw->bat_priv->bla.num_requests); - atomic_set(&backbone_gw->request_sent, 0); } + spin_unlock_bh(&bat_priv->bla.num_requests_lock); } batadv_backbone_gw_put(backbone_gw); @@ -1223,6 +1234,7 @@ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) struct hlist_head *head; struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ + bool purged; int i; hash = bat_priv->bla.backbone_hash; @@ -1233,30 +1245,49 @@ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) head = &hash->table[i]; list_lock = &hash->list_locks[i]; - spin_lock_bh(list_lock); - hlist_for_each_entry_safe(backbone_gw, node_tmp, - head, hash_entry) { - if (now) - goto purge_now; - if (!batadv_has_timed_out(backbone_gw->lasttime, - BATADV_BLA_BACKBONE_TIMEOUT)) - continue; + do { + purged = false; + + spin_lock_bh(list_lock); + hlist_for_each_entry_safe(backbone_gw, node_tmp, + head, hash_entry) { + if (now) + goto purge_now; + if (!batadv_has_timed_out(backbone_gw->lasttime, + BATADV_BLA_BACKBONE_TIMEOUT)) + continue; - batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, - "%s(): backbone gw %pM timed out\n", - __func__, backbone_gw->orig); + batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, + "%s(): backbone gw %pM timed out\n", + __func__, backbone_gw->orig); purge_now: - /* don't wait for the pending request anymore */ - if (atomic_read(&backbone_gw->request_sent)) - atomic_dec(&bat_priv->bla.num_requests); + purged = true; - batadv_bla_del_backbone_claims(backbone_gw); + /* don't wait for the pending request anymore */ + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_UNSYNCED) + atomic_dec(&bat_priv->bla.num_requests); - hlist_del_rcu(&backbone_gw->hash_entry); - batadv_backbone_gw_put(backbone_gw); - } - spin_unlock_bh(list_lock); + backbone_gw->state = BATADV_BLA_BACKBONE_GW_STOPPED; + spin_unlock_bh(&bat_priv->bla.num_requests_lock); + + batadv_bla_del_backbone_claims(backbone_gw); + + hlist_del_rcu(&backbone_gw->hash_entry); + break; + } + spin_unlock_bh(list_lock); + + if (purged) { + /* reference for pending report_work */ + if (cancel_work_sync(&backbone_gw->report_work)) + batadv_backbone_gw_put(backbone_gw); + + /* reference for hash_entry */ + batadv_backbone_gw_put(backbone_gw); + } + } while (purged); } } @@ -1288,6 +1319,13 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + /* only purge claims not currently in the process of being released. + * Such claims could otherwise have a NULL-ptr backbone_gw set because + * they already went through batadv_claim_release() + */ + if (!kref_get_unless_zero(&claim->refcount)) + continue; + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; @@ -1313,6 +1351,7 @@ purge_now: claim->addr, claim->vid); skip: batadv_backbone_gw_put(backbone_gw); + batadv_claim_put(claim); } rcu_read_unlock(); } @@ -1483,7 +1522,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) batadv_bla_send_loopdetect(bat_priv, backbone_gw); - /* request_sent is only set after creation to avoid + /* state is only set to unsynced after creation to avoid * problems when we are not yet known as backbone gw * in the backbone. * @@ -1492,14 +1531,21 @@ static void batadv_bla_periodic_work(struct work_struct *work) * some grace time. */ - if (atomic_read(&backbone_gw->request_sent) == 0) - continue; + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (backbone_gw->state != BATADV_BLA_BACKBONE_GW_UNSYNCED) + goto unlock_next; - if (!atomic_dec_and_test(&backbone_gw->wait_periods)) - continue; + if (backbone_gw->wait_periods > 0) + backbone_gw->wait_periods--; + + if (backbone_gw->wait_periods > 0) + goto unlock_next; + backbone_gw->state = BATADV_BLA_BACKBONE_GW_SYNCED; atomic_dec(&backbone_gw->bat_priv->bla.num_requests); - atomic_set(&backbone_gw->request_sent, 0); + +unlock_next: + spin_unlock_bh(&bat_priv->bla.num_requests_lock); } rcu_read_unlock(); } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 3efc4cf50b46..0a8bd95e2f99 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -696,6 +696,9 @@ static bool batadv_dat_forward_data(struct batadv_priv *bat_priv, goto free_orig; tmp_skb = pskb_copy_for_clone(skb, GFP_ATOMIC); + if (!tmp_skb) + goto free_neigh; + if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb, cand[i].orig_node, packet_subtype)) { diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index f4e45cc25816..e9553db42349 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -17,6 +17,7 @@ #include <linux/lockdep.h> #include <linux/minmax.h> #include <linux/netdevice.h> +#include <linux/overflow.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -80,9 +81,9 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, * * Return: the maximum size of payload that can be fragmented. */ -static int batadv_frag_size_limit(void) +static size_t batadv_frag_size_limit(void) { - int limit = BATADV_FRAG_MAX_FRAG_SIZE; + size_t limit = BATADV_FRAG_MAX_FRAG_SIZE; limit -= sizeof(struct batadv_frag_packet); limit *= BATADV_FRAG_MAX_FRAGMENTS; @@ -143,7 +144,9 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, struct batadv_frag_packet *frag_packet; u8 bucket; u16 seqno, hdr_size = sizeof(struct batadv_frag_packet); + bool overflow = false; bool ret = false; + size_t data_len; /* Linearize packet to avoid linearizing 16 packets in a row when doing * the later merge. Non-linear merge should be added to remove this @@ -153,6 +156,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, goto err; frag_packet = (struct batadv_frag_packet *)skb->data; + data_len = skb->len - hdr_size; seqno = ntohs(frag_packet->seqno); bucket = seqno % BATADV_FRAG_BUFFER_COUNT; @@ -171,7 +175,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, spin_lock_bh(&chain->lock); if (batadv_frag_init_chain(chain, seqno)) { hlist_add_head(&frag_entry_new->list, &chain->fragment_list); - chain->size = skb->len - hdr_size; + chain->size = data_len; chain->timestamp = jiffies; chain->total_size = ntohs(frag_packet->total_size); ret = true; @@ -188,7 +192,11 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, if (frag_entry_curr->no < frag_entry_new->no) { hlist_add_before(&frag_entry_new->list, &frag_entry_curr->list); - chain->size += skb->len - hdr_size; + + if (check_add_overflow(chain->size, data_len, + &chain->size)) + overflow = true; + chain->timestamp = jiffies; ret = true; goto out; @@ -201,13 +209,16 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_last'. */ if (likely(frag_entry_last)) { hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list); - chain->size += skb->len - hdr_size; + + if (check_add_overflow(chain->size, data_len, &chain->size)) + overflow = true; + chain->timestamp = jiffies; ret = true; } out: - if (chain->size > batadv_frag_size_limit() || + if (overflow || chain->size > batadv_frag_size_limit() || chain->total_size != ntohs(frag_packet->total_size) || chain->total_size > batadv_frag_size_limit()) { /* Clear chain if total size of either the list or the packet @@ -294,6 +305,31 @@ free: } /** + * batadv_skb_is_frag() - check if newly merged skb contains unicast fragment + * @skb: newly merged skb + * + * Return: if newly merged skb is of type BATADV_UNICAST_FRAG + */ +static bool batadv_skb_is_frag(struct sk_buff *skb) +{ + struct batadv_ogm_packet *batadv_ogm_packet; + + /* packet should hold at least type and version */ + if (unlikely(!pskb_may_pull(skb, 2))) + return false; + + batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data; + + if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) + return false; + + if (batadv_ogm_packet->packet_type != BATADV_UNICAST_FRAG) + return false; + + return true; +} + +/** * batadv_frag_skb_buffer() - buffer fragment for later merge * @skb: skb to buffer * @orig_node_src: originator that the skb is received from @@ -326,6 +362,16 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, if (!skb_out) goto out_err; + /* fragment in fragment is not allowed. otherwise it is possible + * to exhaust the stack when receiving a matryoshka-style + * "fragments in a fragment packet" + */ + if (batadv_skb_is_frag(skb_out)) { + kfree_skb(skb_out); + skb_out = NULL; + goto out_err; + } + out: ret = true; out_err: diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 51e9c081a2a4..a9d0346e8332 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -478,10 +478,14 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, */ void batadv_gw_node_free(struct batadv_priv *bat_priv) { + struct batadv_gw_node *curr_gw; struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; spin_lock_bh(&bat_priv->gw.list_lock); + curr_gw = rcu_replace_pointer(bat_priv->gw.curr_gw, NULL, true); + batadv_gw_node_put(curr_gw); + hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3a35aadd8b41..a4d33ee0fda5 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -249,6 +249,7 @@ void batadv_mesh_free(struct net_device *mesh_iface) atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); batadv_purge_outstanding_packets(bat_priv, NULL); + batadv_tp_stop_all(bat_priv); batadv_gw_node_free(bat_priv); diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index 56ca1c1b83f2..e7aa45bc6b7a 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -787,6 +787,7 @@ static int batadv_meshif_init_late(struct net_device *dev) atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); + spin_lock_init(&bat_priv->bla.num_requests_lock); #endif atomic_set(&bat_priv->tp_num, 0); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index b3468ccab535..ad4921b659d9 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -835,8 +835,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) orig_node = container_of(rcu, struct batadv_orig_node, rcu); - batadv_mcast_purge_orig(orig_node); - batadv_frag_purge_orig(orig_node, NULL); kfree(orig_node->tt_buff); @@ -887,6 +885,8 @@ void batadv_orig_node_release(struct kref *ref) } spin_unlock_bh(&orig_node->vlan_list_lock); + batadv_mcast_purge_orig(orig_node); + call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 2e42f6b348c8..0fc4ca78e84e 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -8,10 +8,12 @@ #include "main.h" #include <linux/atomic.h> +#include <linux/bug.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> +#include <linux/completion.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/etherdevice.h> @@ -253,6 +255,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * batadv_tp_list_find() - find a tp_vars object in the global list * @bat_priv: the bat priv with all the mesh interface information * @dst: the other endpoint MAC address to look for + * @role: role of the session * * Look for a tp_vars object matching dst as end_point and return it after * having increment the refcounter. Return NULL is not found @@ -260,7 +263,8 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * Return: matching tp_vars or NULL when no tp_vars with @dst was found */ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, - const u8 *dst) + const u8 *dst, + enum batadv_tp_meter_role role) { struct batadv_tp_vars *pos, *tp_vars = NULL; @@ -269,6 +273,9 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, if (!batadv_compare_eth(pos->other_end, dst)) continue; + if (pos->role != role) + continue; + /* most of the time this function is invoked during the normal * process..it makes sens to pay more when the session is * finished and to speed the process up during the measurement @@ -285,11 +292,32 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, } /** + * batadv_tp_list_active() - check if session from/to destination is ongoing + * @bat_priv: the bat priv with all the mesh interface information + * @dst: the other endpoint MAC address to look for + * + * Return: if matching session with @dst was found + */ +static bool batadv_tp_list_active(struct batadv_priv *bat_priv, const u8 *dst) + __must_hold(&bat_priv->tp_list_lock) +{ + struct batadv_tp_vars *tp_vars; + + hlist_for_each_entry_rcu(tp_vars, &bat_priv->tp_list, list) { + if (batadv_compare_eth(tp_vars->other_end, dst)) + return true; + } + + return false; +} + +/** * batadv_tp_list_find_session() - find tp_vars session object in the global * list * @bat_priv: the bat priv with all the mesh interface information * @dst: the other endpoint MAC address to look for * @session: session identifier + * @role: role of the session * * Look for a tp_vars object matching dst as end_point, session as tp meter * session and return it after having increment the refcounter. Return NULL @@ -299,7 +327,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, */ static struct batadv_tp_vars * batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, - const u8 *session) + const u8 *session, enum batadv_tp_meter_role role) { struct batadv_tp_vars *pos, *tp_vars = NULL; @@ -311,6 +339,9 @@ batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, if (memcmp(pos->session, session, sizeof(pos->session)) != 0) continue; + if (pos->role != role) + continue; + /* most of the time this function is invoked during the normal * process..it makes sense to pay more when the session is * finished and to speed the process up during the measurement @@ -365,32 +396,41 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer - * @bat_priv: the bat priv with all the mesh interface information - * @tp_vars: the private data of the current TP meter session to cleanup + * batadv_tp_list_detach() - remove tp session from mesh session list once + * @tp_vars: the private data of the current TP meter session */ -static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, - struct batadv_tp_vars *tp_vars) +static void batadv_tp_list_detach(struct batadv_tp_vars *tp_vars) { - cancel_delayed_work(&tp_vars->finish_work); + bool detached = false; spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); + if (!hlist_unhashed(&tp_vars->list)) { + hlist_del_init_rcu(&tp_vars->list); + detached = true; + } spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + if (!detached) + return; + + atomic_dec(&tp_vars->bat_priv->tp_num); + /* drop list reference */ batadv_tp_vars_put(tp_vars); +} - atomic_dec(&tp_vars->bat_priv->tp_num); +/** + * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer + * @tp_vars: the private data of the current TP meter session to cleanup + */ +static void batadv_tp_sender_cleanup(struct batadv_tp_vars *tp_vars) +{ + cancel_delayed_work_sync(&tp_vars->finish_work); + + batadv_tp_list_detach(tp_vars); /* kill the timer and remove its reference */ - timer_delete_sync(&tp_vars->timer); - /* the worker might have rearmed itself therefore we kill it again. Note - * that if the worker should run again before invoking the following - * timer_delete(), it would not re-arm itself once again because the status - * is OFF now - */ - timer_delete(&tp_vars->timer); + timer_shutdown_sync(&tp_vars->timer); batadv_tp_vars_put(tp_vars); } @@ -402,11 +442,14 @@ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, static void batadv_tp_sender_end(struct batadv_priv *bat_priv, struct batadv_tp_vars *tp_vars) { + enum batadv_tp_meter_reason reason; u32 session_cookie; + reason = atomic_read(&tp_vars->send_result); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Test towards %pM finished..shutting down (reason=%d)\n", - tp_vars->other_end, tp_vars->reason); + tp_vars->other_end, reason); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n", @@ -419,7 +462,7 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv, session_cookie = batadv_tp_session_cookie(tp_vars->session, tp_vars->icmp_uid); - batadv_tp_batctl_notify(tp_vars->reason, + batadv_tp_batctl_notify(reason, tp_vars->other_end, bat_priv, tp_vars->start_time, @@ -435,10 +478,18 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv, static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars, enum batadv_tp_meter_reason reason) { - if (!atomic_dec_and_test(&tp_vars->sending)) - return; + atomic_cmpxchg(&tp_vars->send_result, 0, reason); +} - tp_vars->reason = reason; +/** + * batadv_tp_sender_stopped() - check if tp session was stopped with reason + * @tp_vars: the private data of the current TP meter session + * + * Return: whether stop reason was found + */ +static bool batadv_tp_sender_stopped(struct batadv_tp_vars *tp_vars) +{ + return atomic_read(&tp_vars->send_result) != 0; } /** @@ -468,7 +519,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) /* most of the time this function is invoked while normal packet * reception... */ - if (unlikely(atomic_read(&tp_vars->sending) == 0)) + if (unlikely(batadv_tp_sender_stopped(tp_vars))) /* timer ref will be dropped in batadv_tp_sender_cleanup */ return; @@ -488,7 +539,7 @@ static void batadv_tp_sender_timeout(struct timer_list *t) struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer); struct batadv_priv *bat_priv = tp_vars->bat_priv; - if (atomic_read(&tp_vars->sending) == 0) + if (batadv_tp_sender_stopped(tp_vars)) return; /* if the user waited long enough...shutdown the test */ @@ -643,11 +694,11 @@ static void batadv_tp_recv_ack(struct batadv_priv *bat_priv, /* find the tp_vars */ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, - icmp->session); + icmp->session, BATADV_TP_SENDER); if (unlikely(!tp_vars)) return; - if (unlikely(atomic_read(&tp_vars->sending) == 0)) + if (unlikely(batadv_tp_sender_stopped(tp_vars))) goto out; /* old ACK? silently drop it.. */ @@ -813,21 +864,21 @@ static int batadv_tp_send(void *arg) if (unlikely(tp_vars->role != BATADV_TP_SENDER)) { err = BATADV_TP_REASON_DST_UNREACHABLE; - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); goto out; } orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end); if (unlikely(!orig_node)) { err = BATADV_TP_REASON_DST_UNREACHABLE; - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); goto out; } primary_if = batadv_primary_if_get_selected(bat_priv); if (unlikely(!primary_if)) { err = BATADV_TP_REASON_DST_UNREACHABLE; - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); goto out; } @@ -846,7 +897,7 @@ static int batadv_tp_send(void *arg) queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work, msecs_to_jiffies(tp_vars->test_length)); - while (atomic_read(&tp_vars->sending) != 0) { + while (!batadv_tp_sender_stopped(tp_vars)) { if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) { batadv_tp_wait_available(tp_vars, payload_len); continue; @@ -869,8 +920,7 @@ static int batadv_tp_send(void *arg) "Meter: %s() cannot send packets (%d)\n", __func__, err); /* ensure nobody else tries to stop the thread now */ - if (atomic_dec_and_test(&tp_vars->sending)) - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); break; } @@ -886,7 +936,8 @@ out: batadv_orig_node_put(orig_node); batadv_tp_sender_end(bat_priv, tp_vars); - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); batadv_tp_vars_put(tp_vars); @@ -918,7 +969,8 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) batadv_tp_vars_put(tp_vars); /* cleanup of failed tp meter variables */ - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); return; } @@ -947,10 +999,15 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, /* look for an already existing test towards this node */ spin_lock_bh(&bat_priv->tp_list_lock); - tp_vars = batadv_tp_list_find(bat_priv, dst); - if (tp_vars) { + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_DST_UNREACHABLE, + dst, bat_priv, session_cookie); + return; + } + + if (batadv_tp_list_active(bat_priv, dst)) { spin_unlock_bh(&bat_priv->tp_list_lock); - batadv_tp_vars_put(tp_vars); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: test to or from the same node already ongoing, aborting\n"); batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING, @@ -969,6 +1026,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: %s cannot allocate list elements\n", @@ -982,7 +1040,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, ether_addr_copy(tp_vars->other_end, dst); kref_init(&tp_vars->refcount); tp_vars->role = BATADV_TP_SENDER; - atomic_set(&tp_vars->sending, 1); + atomic_set(&tp_vars->send_result, 0); memcpy(tp_vars->session, session_id, sizeof(session_id)); tp_vars->icmp_uid = icmp_uid; @@ -1017,6 +1075,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars->start_time = jiffies; init_waitqueue_head(&tp_vars->more_bytes); + init_completion(&tp_vars->finished); spin_lock_init(&tp_vars->unacked_lock); INIT_LIST_HEAD(&tp_vars->unacked_list); @@ -1069,16 +1128,16 @@ void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, if (!orig_node) return; - tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig); + tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig, BATADV_TP_SENDER); if (!tp_vars) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: trying to interrupt an already over connection\n"); - goto out; + goto out_put_orig_node; } batadv_tp_sender_shutdown(tp_vars, return_value); batadv_tp_vars_put(tp_vars); -out: +out_put_orig_node: batadv_orig_node_put(orig_node); } @@ -1119,14 +1178,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) "Shutting down for inactivity (more than %dms) from %pM\n", BATADV_TP_RECV_TIMEOUT, tp_vars->other_end); - spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); - spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); - - /* drop list reference */ - batadv_tp_vars_put(tp_vars); - - atomic_dec(&bat_priv->tp_num); + batadv_tp_list_detach(tp_vars); spin_lock_bh(&tp_vars->unacked_lock); list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { @@ -1136,6 +1188,9 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) spin_unlock_bh(&tp_vars->unacked_lock); /* drop reference of timer */ + if (WARN_ON(atomic_xchg(&tp_vars->receiving, 0) != 1)) + return; + batadv_tp_vars_put(tp_vars); } @@ -1329,11 +1384,14 @@ static struct batadv_tp_vars * batadv_tp_init_recv(struct batadv_priv *bat_priv, const struct batadv_icmp_tp_packet *icmp) { - struct batadv_tp_vars *tp_vars; + struct batadv_tp_vars *tp_vars = NULL; spin_lock_bh(&bat_priv->tp_list_lock); + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out_unlock; + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, - icmp->session); + icmp->session, BATADV_TP_RECEIVER); if (tp_vars) goto out_unlock; @@ -1344,11 +1402,14 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv, } tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); - if (!tp_vars) + if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); goto out_unlock; + } ether_addr_copy(tp_vars->other_end, icmp->orig); tp_vars->role = BATADV_TP_RECEIVER; + atomic_set(&tp_vars->receiving, 1); memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session)); tp_vars->last_recv = BATADV_TP_FIRST_SEQ; tp_vars->bat_priv = bat_priv; @@ -1401,7 +1462,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv, } } else { tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, - icmp->session); + icmp->session, BATADV_TP_RECEIVER); if (!tp_vars) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Unexpected packet from %pM!\n", @@ -1410,13 +1471,6 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv, } } - if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) { - batadv_dbg(BATADV_DBG_TP_METER, bat_priv, - "Meter: dropping packet: not expected (role=%u)\n", - tp_vars->role); - goto out; - } - tp_vars->last_recv_time = jiffies; /* if the packet is a duplicate, it may be the case that an ACK has been @@ -1464,6 +1518,9 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_icmp_tp_packet *icmp; + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out; + icmp = (struct batadv_icmp_tp_packet *)skb->data; switch (icmp->subtype) { @@ -1478,10 +1535,62 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) "Received unknown TP Metric packet type %u\n", icmp->subtype); } + +out: consume_skb(skb); } /** + * batadv_tp_stop_all() - stop all currently running tp meter sessions + * @bat_priv: the bat priv with all the mesh interface information + */ +void batadv_tp_stop_all(struct batadv_priv *bat_priv) +{ + struct batadv_tp_vars *tp_vars[BATADV_TP_MAX_NUM]; + struct batadv_tp_vars *tp_var; + size_t count = 0; + size_t i; + + spin_lock_bh(&bat_priv->tp_list_lock); + hlist_for_each_entry(tp_var, &bat_priv->tp_list, list) { + if (WARN_ON_ONCE(count >= BATADV_TP_MAX_NUM)) + break; + + if (!kref_get_unless_zero(&tp_var->refcount)) + continue; + + tp_vars[count++] = tp_var; + } + spin_unlock_bh(&bat_priv->tp_list_lock); + + for (i = 0; i < count; i++) { + tp_var = tp_vars[i]; + + switch (tp_var->role) { + case BATADV_TP_SENDER: + batadv_tp_sender_shutdown(tp_var, + BATADV_TP_REASON_CANCEL); + wake_up(&tp_var->more_bytes); + wait_for_completion(&tp_var->finished); + break; + case BATADV_TP_RECEIVER: + batadv_tp_list_detach(tp_var); + timer_shutdown_sync(&tp_var->timer); + + if (atomic_xchg(&tp_var->receiving, 0) != 1) + break; + + batadv_tp_vars_put(tp_var); + break; + } + + batadv_tp_vars_put(tp_var); + } + + synchronize_net(); +} + +/** * batadv_tp_meter_init() - initialize global tp_meter structures */ void __init batadv_tp_meter_init(void) diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index f0046d366eac..4e97cd10cd02 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -17,6 +17,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, u32 test_length, u32 *cookie); void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, u8 return_value); +void batadv_tp_stop_all(struct batadv_priv *bat_priv); void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb); #endif /* _NET_BATMAN_ADV_TP_METER_H_ */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 05cddcf994f6..9f6e67771ffa 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -797,24 +797,33 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, s32 *tt_len) { u16 num_vlan = 0; - u16 num_entries = 0; u16 tvlv_len = 0; unsigned int change_offset; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; + u16 total_entries = 0; u8 *tt_change_ptr; + int vlan_entries; + u16 sum_entries; spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { + vlan_entries = atomic_read(&vlan->tt.num_entries); + + if (check_add_overflow(vlan_entries, total_entries, &sum_entries)) { + *tt_len = 0; + goto out; + } + + total_entries = sum_entries; num_vlan++; - num_entries += atomic_read(&vlan->tt.num_entries); } change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) - *tt_len = batadv_tt_len(num_entries); + *tt_len = batadv_tt_len(total_entries); if (change_offset > U16_MAX || *tt_len > U16_MAX - change_offset) { *tt_len = 0; @@ -835,14 +844,26 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (*tt_data)->vlan_data; + num_vlan = 0; hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { + vlan_entries = atomic_read(&vlan->tt.num_entries); + if (vlan_entries < 1) + continue; + tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; + num_vlan++; } + /* recalculate in case number of VLANs reduced */ + change_offset = struct_size(*tt_data, vlan_data, num_vlan); + tvlv_len = *tt_len + change_offset; + + (*tt_data)->num_vlan = htons(num_vlan); + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; @@ -877,21 +898,25 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_meshif_vlan *vlan; + size_t change_offset; u16 num_vlan = 0; - u16 vlan_entries = 0; u16 total_entries = 0; u16 tvlv_len; u8 *tt_change_ptr; - int change_offset; + int vlan_entries; + u16 sum_entries; spin_lock_bh(&bat_priv->meshif_vlan_list_lock); hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); - if (vlan_entries < 1) - continue; + if (check_add_overflow(vlan_entries, total_entries, &sum_entries)) { + tvlv_len = 0; + goto out; + } + + total_entries = sum_entries; num_vlan++; - total_entries += vlan_entries; } change_offset = struct_size(*tt_data, vlan_data, num_vlan); @@ -900,8 +925,10 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, if (*tt_len < 0) *tt_len = batadv_tt_len(total_entries); - tvlv_len = *tt_len; - tvlv_len += change_offset; + if (check_add_overflow(*tt_len, change_offset, &tvlv_len)) { + tvlv_len = 0; + goto out; + } *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { @@ -914,6 +941,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (*tt_data)->vlan_data; + num_vlan = 0; hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) @@ -924,8 +952,15 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, tt_vlan->reserved = 0; tt_vlan++; + num_vlan++; } + /* recalculate in case number of VLANs reduced */ + change_offset = struct_size(*tt_data, vlan_data, num_vlan); + tvlv_len = *tt_len + change_offset; + + (*tt_data)->num_vlan = htons(num_vlan); + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 8129a3f9c44d..cc6ac580c620 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -8,10 +8,12 @@ #include <linux/byteorder/generic.h> #include <linux/container_of.h> +#include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kref.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -159,10 +161,10 @@ batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) * * Return: size of all currently registered tvlv containers in bytes. */ -static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) +static size_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; - u16 tvlv_len = 0; + size_t tvlv_len = 0; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); @@ -306,26 +308,35 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * - * Return: size of all appended tvlv containers in bytes. + * Return: size of all appended tvlv containers in bytes (max U16_MAX), negative + * if operation failed */ -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, +int batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, int *packet_buff_len, int packet_min_len) { struct batadv_tvlv_container *tvlv; struct batadv_tvlv_hdr *tvlv_hdr; - u16 tvlv_value_len; + size_t tvlv_value_len; void *tvlv_value; + int tvlv_len_ret; bool ret; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); + if (tvlv_value_len > U16_MAX) { + tvlv_len_ret = -E2BIG; + goto end; + } ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, packet_min_len, tvlv_value_len); - - if (!ret) + if (!ret) { + tvlv_len_ret = -ENOMEM; goto end; + } + + tvlv_len_ret = tvlv_value_len; if (!tvlv_value_len) goto end; @@ -344,7 +355,8 @@ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, end: spin_unlock_bh(&bat_priv->tvlv.container_list_lock); - return tvlv_value_len; + + return tvlv_len_ret; } /** diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index e5697230d991..f96f6b3f44a0 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -16,7 +16,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len); -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, +int batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, int *packet_buff_len, int packet_min_len); void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 8fc5fe0e9b05..a01ee46d97f3 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -14,6 +14,7 @@ #include <linux/average.h> #include <linux/bitops.h> #include <linux/compiler.h> +#include <linux/completion.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/kref.h> @@ -82,6 +83,9 @@ struct batadv_hard_iface_bat_iv { /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; + /** @reschedule_work: recover OGM schedule after schedule error */ + struct delayed_work reschedule_work; + /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */ struct mutex ogm_buff_mutex; }; @@ -300,7 +304,7 @@ struct batadv_frag_table_entry { u16 seqno; /** @size: accumulated size of packets in list */ - u16 size; + size_t size; /** @total_size: expected size of the assembled packet */ u16 total_size; @@ -451,7 +455,7 @@ struct batadv_orig_node { * @tt_buff_len: length of the last tt changeset this node received * from the orig node */ - s16 tt_buff_len; + u16 tt_buff_len; /** @tt_buff_lock: lock that protects tt_buff and tt_buff_len */ spinlock_t tt_buff_lock; @@ -992,7 +996,7 @@ struct batadv_priv_tt { * @last_changeset_len: length of last tt changeset this host has * generated */ - s16 last_changeset_len; + u16 last_changeset_len; /** * @last_changeset_lock: lock protecting last_changeset & @@ -1023,6 +1027,12 @@ struct batadv_priv_bla { atomic_t num_requests; /** + * @num_requests_lock: locks update num_requests + + * batadv_backbone_gw::state + batadv_backbone_gw::wait_periods update + */ + spinlock_t num_requests_lock; + + /** * @claim_hash: hash table containing mesh nodes this host has claimed */ struct batadv_hashtable *claim_hash; @@ -1319,15 +1329,21 @@ struct batadv_tp_vars { /** @role: receiver/sender modi */ enum batadv_tp_meter_role role; - /** @sending: sending binary semaphore: 1 if sending, 0 is not */ - atomic_t sending; + /** + * @send_result: 0 when sending is ongoing and otherwise + * enum batadv_tp_meter_reason + */ + atomic_t send_result; - /** @reason: reason for a stopped session */ - enum batadv_tp_meter_reason reason; + /** @receiving: receiving binary semaphore: 1 if receiving, 0 is not */ + atomic_t receiving; /** @finish_work: work item for the finishing procedure */ struct delayed_work finish_work; + /** @finished: completion signaled when a sender thread exits */ + struct completion finished; + /** @test_length: test length in milliseconds */ u32 test_length; @@ -1662,6 +1678,27 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_BLA +enum batadv_bla_backbone_gw_state { + /** + * @BATADV_BLA_BACKBONE_GW_STOPPED: backbone gw is being removed + * and it must not longer work on requests + */ + BATADV_BLA_BACKBONE_GW_STOPPED, + + /** + * @BATADV_BLA_BACKBONE_GW_UNSYNCED: backbone was detected out + * of sync and a request was send. No traffic is forwarded until the + * situation is resolved + */ + BATADV_BLA_BACKBONE_GW_UNSYNCED, + + /** + * @BATADV_BLA_BACKBONE_GW_SYNCED: backbone is consider to be in + * sync. traffic can be forwarded + */ + BATADV_BLA_BACKBONE_GW_SYNCED, +}; + /** * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN */ @@ -1687,16 +1724,12 @@ struct batadv_bla_backbone_gw { /** * @wait_periods: grace time for bridge forward delays and bla group * forming at bootup phase - no bcast traffic is formwared until it has - * elapsed + * elapsed. Must only be access with num_requests_lock. */ - atomic_t wait_periods; + u8 wait_periods; - /** - * @request_sent: if this bool is set to true we are out of sync with - * this backbone gateway - no bcast traffic is formwared until the - * situation was resolved - */ - atomic_t request_sent; + /** @state: sync state. Must only be access with num_requests_lock. */ + enum batadv_bla_backbone_gw_state state; /** @crc: crc16 checksum over all claims */ u16 crc; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 33d053d63407..1a6aa3f8d4d6 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -154,6 +154,7 @@ struct sock *bt_sock_alloc(struct net *net, struct socket *sock, sock_init_data(sock, sk); INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + spin_lock_init(&bt_sk(sk)->accept_q_lock); sock_reset_flag(sk, SOCK_ZAPPED); @@ -214,6 +215,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { const struct cred *old_cred; struct pid *old_pid; + struct bt_sock *par = bt_sk(parent); BT_DBG("parent %p, sk %p", parent, sk); @@ -224,9 +226,13 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; + spin_lock_bh(&par->accept_q_lock); + list_add_tail(&bt_sk(sk)->accept_q, &par->accept_q); + sk_acceptq_added(parent); + spin_unlock_bh(&par->accept_q_lock); + /* Copy credentials from parent since for incoming connections the * socket is allocated by the kernel. */ @@ -244,8 +250,6 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) bh_unlock_sock(sk); else release_sock(sk); - - sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -254,45 +258,72 @@ EXPORT_SYMBOL(bt_accept_enqueue); */ void bt_accept_unlink(struct sock *sk) { + struct sock *parent = bt_sk(sk)->parent; + BT_DBG("sk %p state %d", sk, sk->sk_state); + spin_lock_bh(&bt_sk(parent)->accept_q_lock); list_del_init(&bt_sk(sk)->accept_q); - sk_acceptq_removed(bt_sk(sk)->parent); + sk_acceptq_removed(parent); + spin_unlock_bh(&bt_sk(parent)->accept_q_lock); bt_sk(sk)->parent = NULL; sock_put(sk); } EXPORT_SYMBOL(bt_accept_unlink); +static struct sock *bt_accept_get(struct sock *parent, struct sock *sk) +{ + struct bt_sock *bt = bt_sk(parent); + struct sock *next = NULL; + + /* accept_q is modified from child teardown paths too, so take a + * temporary reference before dropping the queue lock. + */ + spin_lock_bh(&bt->accept_q_lock); + + if (sk) { + if (bt_sk(sk)->parent != parent) + goto out; + + if (!list_is_last(&bt_sk(sk)->accept_q, &bt->accept_q)) { + next = &list_next_entry(bt_sk(sk), accept_q)->sk; + sock_hold(next); + } + } else if (!list_empty(&bt->accept_q)) { + next = &list_first_entry(&bt->accept_q, + struct bt_sock, accept_q)->sk; + sock_hold(next); + } + +out: + spin_unlock_bh(&bt->accept_q_lock); + return next; +} + struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) { - struct bt_sock *s, *n; - struct sock *sk; + struct sock *sk, *next; BT_DBG("parent %p", parent); restart: - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)s; - + for (sk = bt_accept_get(parent, NULL); sk; sk = next) { /* Prevent early freeing of sk due to unlink and sock_kill */ - sock_hold(sk); lock_sock(sk); /* Check sk has not already been unlinked via * bt_accept_unlink() due to serialisation caused by sk locking */ - if (!bt_sk(sk)->parent) { + if (bt_sk(sk)->parent != parent) { BT_DBG("sk %p, already unlinked", sk); release_sock(sk); sock_put(sk); - /* Restart the loop as sk is no longer in the list - * and also avoid a potential infinite loop because - * list_for_each_entry_safe() is not thread safe. - */ goto restart; } + next = bt_accept_get(parent, sk); + /* sk is safely in the parent list so reduce reference count */ sock_put(sk); @@ -309,7 +340,19 @@ restart: if (newsock) sock_graft(sk, newsock); + /* Hand the caller a reference taken while sk is + * still locked. bt_accept_unlink() just dropped + * the accept-queue reference; without this hold a + * concurrent teardown (e.g. l2cap_conn_del() -> + * l2cap_sock_kill()) could free sk between + * release_sock() and the caller using it. Every + * caller drops this with sock_put() when done. + */ + sock_hold(sk); + release_sock(sk); + if (next) + sock_put(next); return sk; } @@ -518,18 +561,28 @@ EXPORT_SYMBOL(bt_sock_stream_recvmsg); static inline __poll_t bt_accept_poll(struct sock *parent) { - struct bt_sock *s, *n; + struct bt_sock *bt = bt_sk(parent); + struct bt_sock *s; struct sock *sk; + __poll_t mask = 0; + + spin_lock_bh(&bt->accept_q_lock); + list_for_each_entry(s, &bt->accept_q, accept_q) { + int state; - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; - if (sk->sk_state == BT_CONNECTED || - (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) && - sk->sk_state == BT_CONNECT2)) - return EPOLLIN | EPOLLRDNORM; + state = READ_ONCE(sk->sk_state); + + if (state == BT_CONNECTED || + (test_bit(BT_SK_DEFER_SETUP, &bt->flags) && + state == BT_CONNECT2)) { + mask = EPOLLIN | EPOLLRDNORM; + break; + } } + spin_unlock_bh(&bt->accept_q_lock); - return 0; + return mask; } __poll_t bt_sock_poll(struct file *file, struct socket *sock, diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 853c8d7644b5..0de5df690bd0 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -645,8 +645,8 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) goto failed; } - up_write(&bnep_session_sem); strcpy(req->device, dev->name); + up_write(&bnep_session_sem); return 0; failed: diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index fd3aacdea512..aff8562a8690 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4438,6 +4438,9 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) events[4] |= 0x02; /* LE BIG Info Advertising Report */ } + if (ll_ext_feature_capable(hdev)) + events[5] |= BIT(2); + if (le_cs_capable(hdev)) { /* Channel Sounding events */ events[5] |= 0x08; /* LE CS Read Remote Supported Cap Complete event */ @@ -7413,9 +7416,6 @@ static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev, sizeof(cp), &cp, HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE, HCI_CMD_TIMEOUT, NULL); - - return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 7cb2864fe872..d7af617cda45 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -751,6 +751,8 @@ static void iso_sock_cleanup_listen(struct sock *parent) while ((sk = bt_accept_dequeue(parent, NULL))) { iso_sock_close(sk); iso_sock_kill(sk); + /* Drop the reference handed back by bt_accept_dequeue(). */ + sock_put(sk); } /* If listening socket has a hcon, properly disconnect it */ @@ -1356,8 +1358,13 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock, } ch = bt_accept_dequeue(sk, newsock); - if (ch) + if (ch) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps ch alive from here. + */ + sock_put(ch); break; + } if (!timeo) { err = -EAGAIN; @@ -2593,6 +2600,11 @@ int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags) break; case ISO_END: + if (!conn->rx_len) { + BT_ERR("Unexpected end frame (len %d)", skb->len); + goto drop; + } + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 7701528f1167..fdccd62ccca8 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7274,7 +7274,7 @@ static void l2cap_ecred_reconfigure(struct l2cap_chan *chan) chan->ident = l2cap_get_ident(conn); l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_RECONF_REQ, - sizeof(pdu), &pdu); + struct_size(pdu, scid, 1), pdu); } int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index cf590a67d364..b34e7da8d906 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -349,8 +349,13 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, } nsk = bt_accept_dequeue(sk, newsock); - if (nsk) + if (nsk) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps nsk alive from here. + */ + sock_put(nsk); break; + } if (!timeo) { err = -EAGAIN; @@ -1475,22 +1480,54 @@ static void l2cap_sock_cleanup_listen(struct sock *parent) BT_DBG("parent %p state %s", parent, state_to_string(parent->sk_state)); - /* Close not yet accepted channels */ + /* Close not yet accepted channels. + * + * bt_accept_dequeue() now returns sk with an extra reference held + * (taken while sk was still locked) so a concurrent l2cap_conn_del() + * -> l2cap_sock_kill() cannot free sk under us. + * + * cleanup_listen() runs under the parent sk lock, so unlike + * l2cap_sock_shutdown() we must NOT take conn->lock here: that would + * establish sk_lock -> conn->lock and invert the established + * conn->lock -> chan->lock -> sk_lock order (lockdep deadlock). + * + * Instead, briefly take the child sk lock to fetch and pin its chan. + * l2cap_conn_del() reaches the chan free only via + * l2cap_chan_del() -> l2cap_sock_teardown_cb(), which itself takes + * the child sk lock; holding it across l2cap_chan_hold_unless_zero() + * therefore guarantees the chan cannot be freed while we read and + * pin it (hold_unless_zero() additionally skips a chan already past + * its last reference). We then drop the sk lock before taking + * chan->lock, so sk and chan locks are never held together. + */ while ((sk = bt_accept_dequeue(parent, NULL))) { - struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct l2cap_chan *chan; + + lock_sock_nested(sk, L2CAP_NESTING_NORMAL); + chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); + release_sock(sk); + if (!chan) { + /* l2cap_conn_del() already tearing this child down */ + sock_put(sk); + continue; + } BT_DBG("child chan %p state %s", chan, state_to_string(chan->state)); - l2cap_chan_hold(chan); l2cap_chan_lock(chan); - __clear_chan_timer(chan); l2cap_chan_close(chan, ECONNRESET); - l2cap_sock_kill(sk); - + /* l2cap_conn_del() may already have killed this socket + * (it sets SOCK_DEAD); skip the duplicate to avoid a + * double sock_put()/l2cap_chan_put(). + */ + if (!sock_flag(sk, SOCK_DEAD)) + l2cap_sock_kill(sk); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); + sock_put(sk); } } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b05bb380e5f8..de5bd6b637b2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -9110,9 +9110,15 @@ static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data, struct adv_info *adv_instance; int err = 0; struct mgmt_pending_cmd *cmd; + u16 expected_len; BT_DBG("%s", hdev->name); + expected_len = struct_size(cp, data, cp->adv_data_len + cp->scan_rsp_len); + if (expected_len != data_len) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); adv_instance = hci_find_adv_instance(hdev, cp->instance); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index be6639cd6f59..bd7d959c6e9e 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -180,6 +180,8 @@ static void rfcomm_sock_cleanup_listen(struct sock *parent) while ((sk = bt_accept_dequeue(parent, NULL))) { rfcomm_sock_close(sk); rfcomm_sock_kill(sk); + /* Drop the reference handed back by bt_accept_dequeue(). */ + sock_put(sk); } parent->sk_state = BT_CLOSED; @@ -497,8 +499,13 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, } nsk = bt_accept_dequeue(sk, newsock); - if (nsk) + if (nsk) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps nsk alive from here. + */ + sock_put(nsk); break; + } if (!timeo) { err = -EAGAIN; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index eba44525d41d..f1799c6a6f87 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -502,6 +502,8 @@ static void sco_sock_cleanup_listen(struct sock *parent) while ((sk = bt_accept_dequeue(parent, NULL))) { sco_sock_close(sk); sco_sock_kill(sk); + /* Drop the reference handed back by bt_accept_dequeue(). */ + sock_put(sk); } parent->sk_state = BT_CLOSED; @@ -765,8 +767,13 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock, } ch = bt_accept_dequeue(sk, newsock); - if (ch) + if (ch) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps ch alive from here. + */ + sock_put(ch); break; + } if (!timeo) { err = -EAGAIN; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 881d866d687a..2eef4f3345cd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4640,10 +4640,24 @@ static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, rcu_read_unlock(); } -static void br_multicast_del_grps(struct net_bridge *br) +static void br_multicast_enable_all_ports(struct net_bridge *br) { struct net_bridge_port *port; + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + + list_for_each_entry(port, &br->port_list, list) + __br_multicast_enable_port_ctx(&port->multicast_ctx); +} + +static void br_multicast_disable_all_ports(struct net_bridge *br) +{ + struct net_bridge_port *port; + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + list_for_each_entry(port, &br->port_list, list) __br_multicast_disable_port_ctx(&port->multicast_ctx); } @@ -4651,7 +4665,6 @@ static void br_multicast_del_grps(struct net_bridge *br) int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - struct net_bridge_port *port; bool change_snoopers = false; int err = 0; @@ -4668,7 +4681,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; - br_multicast_del_grps(br); + br_multicast_disable_all_ports(br); goto unlock; } @@ -4676,8 +4689,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, goto unlock; br_multicast_open(br); - list_for_each_entry(port, &br->port_list, list) - __br_multicast_enable_port_ctx(&port->multicast_ctx); + br_multicast_enable_all_ports(br); change_snoopers = true; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0ab1c94db4b9..0a394e5f4391 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -297,7 +297,11 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ goto free_skb; } - neigh_hh_bridge(&neigh->hh, skb); + if (neigh_hh_bridge(&neigh->hh, skb)) { + neigh_release(neigh); + goto free_skb; + } + skb->dev = br_indev; ret = br_handle_frame_finish(net, sk, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6fd5386a1d64..c04a4d0889ae 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1824,6 +1824,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr) { + unsigned int limit = U16_MAX - nla_total_size(0); struct nlattr *nla __maybe_unused; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; @@ -1841,6 +1842,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, p = br_port_get_rtnl(dev); if (!p) return 0; + limit -= nla_total_size_64bit(sizeof(p->stp_xstats)); br = p->br; vg = nbp_vlan_group(p); break; @@ -1855,6 +1857,9 @@ static int br_fill_linkxstats(struct sk_buff *skb, if (vg) { u16 pvid; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + limit -= nla_total_size_64bit(sizeof(struct br_mcast_stats)); +#endif pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; @@ -1862,6 +1867,11 @@ static int br_fill_linkxstats(struct sk_buff *skb, if (++vl_idx < *prividx) continue; + + if (skb_tail_pointer(skb) - (unsigned char *)nest + + nla_total_size(sizeof(vxi)) >= limit) + goto nla_put_failure; + memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; vxi.flags = v->flags; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 741360219552..f05c79f215ea 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -112,24 +112,22 @@ static struct pernet_operations broute_net_ops = { static int __init ebtable_broute_init(void) { - int ret = ebt_register_template(&broute_table, broute_table_init); + int ret = register_pernet_subsys(&broute_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&broute_net_ops); - if (ret) { - ebt_unregister_template(&broute_table); - return ret; - } + ret = ebt_register_template(&broute_table, broute_table_init); + if (ret) + unregister_pernet_subsys(&broute_net_ops); - return 0; + return ret; } static void __exit ebtable_broute_fini(void) { - unregister_pernet_subsys(&broute_net_ops); ebt_unregister_template(&broute_table); + unregister_pernet_subsys(&broute_net_ops); } module_init(ebtable_broute_init); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index dacd81b12e62..0fc03b07e62a 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,24 +93,22 @@ static struct pernet_operations frame_filter_net_ops = { static int __init ebtable_filter_init(void) { - int ret = ebt_register_template(&frame_filter, frame_filter_table_init); + int ret = register_pernet_subsys(&frame_filter_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_filter_net_ops); - if (ret) { - ebt_unregister_template(&frame_filter); - return ret; - } + ret = ebt_register_template(&frame_filter, frame_filter_table_init); + if (ret) + unregister_pernet_subsys(&frame_filter_net_ops); - return 0; + return ret; } static void __exit ebtable_filter_fini(void) { - unregister_pernet_subsys(&frame_filter_net_ops); ebt_unregister_template(&frame_filter); + unregister_pernet_subsys(&frame_filter_net_ops); } module_init(ebtable_filter_init); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0f2a8c6118d4..8a10375d8909 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,24 +93,22 @@ static struct pernet_operations frame_nat_net_ops = { static int __init ebtable_nat_init(void) { - int ret = ebt_register_template(&frame_nat, frame_nat_table_init); + int ret = register_pernet_subsys(&frame_nat_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_nat_net_ops); - if (ret) { - ebt_unregister_template(&frame_nat); - return ret; - } + ret = ebt_register_template(&frame_nat, frame_nat_table_init); + if (ret) + unregister_pernet_subsys(&frame_nat_net_ops); return ret; } static void __exit ebtable_nat_fini(void) { - unregister_pernet_subsys(&frame_nat_net_ops); ebt_unregister_template(&frame_nat); + unregister_pernet_subsys(&frame_nat_net_ops); } module_init(ebtable_nat_init); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index aea3e19875c6..b9f4daac09af 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -42,6 +42,7 @@ struct ebt_pernet { struct list_head tables; + struct list_head dead_tables; }; struct ebt_template { @@ -1162,11 +1163,6 @@ free_newinfo: static void __ebt_unregister_table(struct net *net, struct ebt_table *table) { - mutex_lock(&ebt_mutex); - list_del(&table->list); - mutex_unlock(&ebt_mutex); - audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, ebt_cleanup_entry, net, NULL); if (table->private->nentries) @@ -1267,13 +1263,15 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, for (i = 0; i < num_ops; i++) ops[i].priv = table; - list_add(&table->list, &ebt_net->tables); - mutex_unlock(&ebt_mutex); - table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); - if (ret) + if (ret) { + synchronize_rcu(); __ebt_unregister_table(net, table); + } else { + list_add(&table->list, &ebt_net->tables); + } + mutex_unlock(&ebt_mutex); audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REGISTER, GFP_KERNEL); @@ -1339,7 +1337,7 @@ void ebt_unregister_template(const struct ebt_table *t) } EXPORT_SYMBOL(ebt_unregister_template); -static struct ebt_table *__ebt_find_table(struct net *net, const char *name) +void ebt_unregister_table_pre_exit(struct net *net, const char *name) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table *t; @@ -1348,30 +1346,36 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name) list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, name) == 0) { + list_move(&t->list, &ebt_net->dead_tables); mutex_unlock(&ebt_mutex); - return t; + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; } } mutex_unlock(&ebt_mutex); - return NULL; -} - -void ebt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct ebt_table *table = __ebt_find_table(net, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(ebt_unregister_table_pre_exit); void ebt_unregister_table(struct net *net, const char *name) { - struct ebt_table *table = __ebt_find_table(net, name); + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + struct ebt_table *t; - if (table) - __ebt_unregister_table(net, table); + mutex_lock(&ebt_mutex); + + list_for_each_entry(t, &ebt_net->dead_tables, list) { + if (strcmp(t->name, name) == 0) { + list_del(&t->list); + audit_log_nfcfg(t->name, AF_BRIDGE, t->private->nentries, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + __ebt_unregister_table(net, t); + mutex_unlock(&ebt_mutex); + return; + } + } + + mutex_unlock(&ebt_mutex); } /* userspace just supplied us with counters */ @@ -2556,11 +2560,21 @@ static int __net_init ebt_pernet_init(struct net *net) struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); INIT_LIST_HEAD(&ebt_net->tables); + INIT_LIST_HEAD(&ebt_net->dead_tables); return 0; } +static void __net_exit ebt_pernet_exit(struct net *net) +{ + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + + WARN_ON_ONCE(!list_empty(&ebt_net->tables)); + WARN_ON_ONCE(!list_empty(&ebt_net->dead_tables)); +} + static struct pernet_operations ebt_net_ops = { .init = ebt_pernet_init, + .exit = ebt_pernet_exit, .id = &ebt_pernet_id, .size = sizeof(struct ebt_pernet), }; @@ -2569,19 +2583,20 @@ static int __init ebtables_init(void) { int ret; - ret = xt_register_target(&ebt_standard_target); + ret = register_pernet_subsys(&ebt_net_ops); if (ret < 0) return ret; - ret = nf_register_sockopt(&ebt_sockopts); + + ret = xt_register_target(&ebt_standard_target); if (ret < 0) { - xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } - ret = register_pernet_subsys(&ebt_net_ops); + ret = nf_register_sockopt(&ebt_sockopts); if (ret < 0) { - nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 692e0b868822..9e64e82d0b63 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -115,6 +115,11 @@ static int __ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, if (ret) return ret; + if (plaintext_len < sizeof(*hdr)) { + pr_err("%s plaintext too small %d\n", __func__, plaintext_len); + return -EINVAL; + } + hdr = p + ceph_crypt_data_offset(key); if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { pr_err("%s bad magic\n", __func__); diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 254ded0b05f6..521aec1d5fc0 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -47,7 +47,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_list(struct crush_bucket_list *b) @@ -55,14 +54,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) kfree(b->item_weights); kfree(b->sum_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { kfree(b->h.items); kfree(b->node_weights); - kfree(b); } void crush_destroy_bucket_straw(struct crush_bucket_straw *b) @@ -70,14 +67,12 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b->straws); kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket(struct crush_bucket *b) @@ -99,6 +94,7 @@ void crush_destroy_bucket(struct crush_bucket *b) crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); break; } + kfree(b); } /** diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c89e66d4fcb7..8b5b0587a0cf 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -72,8 +72,7 @@ static int crush_decode_uniform_bucket(void **p, void *end, struct crush_bucket_uniform *b) { dout("crush_decode_uniform_bucket %p to %p\n", *p, end); - ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); - b->item_weight = ceph_decode_32(p); + ceph_decode_32_safe(p, end, b->item_weight, bad); return 0; bad: return -EINVAL; @@ -389,11 +388,15 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) goto fail; if (arg->ids_size && - arg->ids_size != c->buckets[bucket_index]->size) + (!c->buckets[bucket_index] || + arg->ids_size != c->buckets[bucket_index]->size)) goto e_inval; } - insert_choose_arg_map(&c->choose_args, arg_map); + if (!__insert_choose_arg_map(&c->choose_args, arg_map)) { + ret = -EEXIST; + goto fail; + } } return 0; @@ -516,6 +519,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->id = ceph_decode_32(p); b->type = ceph_decode_16(p); b->alg = ceph_decode_8(p); + if (b->alg != alg) { + b->alg = 0; + goto bad; + } b->hash = ceph_decode_8(p); b->weight = ceph_decode_32(p); b->size = ceph_decode_32(p); @@ -1702,7 +1709,7 @@ static int osdmap_decode(void **p, void *end, bool msgr2, ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(struct_v >= 5 ? sizeof(u32) : sizeof(u8)) + - sizeof(*map->osd_weight), e_inval); + map->max_osd*sizeof(*map->osd_weight), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 14eb7812bda4..ecd659f79fd4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -172,7 +172,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); - if (!(smap->map.map_flags & BPF_F_CLONE)) + if (!smap || !(smap->map.map_flags & BPF_F_CLONE)) continue; /* Note that for lockless listeners adding new element @@ -531,10 +531,10 @@ err_free: } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); -static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) +static int diag_get(struct bpf_local_storage_map *smap, + struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; - struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); @@ -543,7 +543,6 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_stg) return -EMSGSIZE; - smap = rcu_dereference(sdata->smap); if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) goto errout; @@ -558,6 +557,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) sdata->data, true); else copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + check_and_init_map_value(&smap->map, nla_data(nla_value)); nla_nest_end(skb, nla_stg); return 0; @@ -596,9 +596,11 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, saved_len = skb->len; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { smap = rcu_dereference(SDATA(selem)->smap); + if (!smap) + continue; diag_size += nla_value_size(smap->map.value_size); - if (nla_stgs && diag_get(SDATA(selem), skb)) + if (nla_stgs && diag_get(smap, SDATA(selem), skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } @@ -665,7 +667,7 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, diag_size += nla_value_size(diag->maps[i]->value_size); - if (nla_stgs && diag_get(sdata, skb)) + if (nla_stgs && diag_get((struct bpf_local_storage_map *)diag->maps[i], sdata, skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } diff --git a/net/core/dev.c b/net/core/dev.c index 8bfa8313ef62..0c6c270d9f7d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void) #if defined(CONFIG_NET_RX_BUSY_POLL) -static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout) { - if (!skip_schedule) { + if (!timeout) { gro_normal_list(&napi->gro); __napi_schedule(napi); return; @@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); } enum { @@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - bool skip_schedule = false; - unsigned long timeout; + unsigned long timeout = 0; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); - timeout = napi_get_gro_flush_timeout(napi); - if (napi->defer_hard_irqs_count && timeout) { - hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - skip_schedule = true; + if (napi->defer_hard_irqs_count) { + /* A short enough gro flush timeout and long enough + * poll can result in timer firing too early. + * Timer will be armed later if necessary. + */ + timeout = napi_get_gro_flush_timeout(napi); } } @@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) - __busy_poll_stop(napi, skip_schedule); + __busy_poll_stop(napi, timeout); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } diff --git a/net/core/devmem.c b/net/core/devmem.c index 468344739db2..4f71de44c0fb 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -241,6 +241,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, } if (direction == DMA_TO_DEVICE) { + if (!IS_ALIGNED(dmabuf->size, PAGE_SIZE)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "TX dma-buf size must be a multiple of PAGE_SIZE"); + goto err_unmap; + } binding->tx_vec = kvmalloc_objs(struct net_iov *, dmabuf->size / PAGE_SIZE); if (!binding->tx_vec) { @@ -267,6 +272,12 @@ net_devmem_bind_dmabuf(struct net_device *dev, size_t len = sg_dma_len(sg); struct net_iov *niov; + if (!IS_ALIGNED(len, PAGE_SIZE)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned"); + goto err_free_chunks; + } + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, dev_to_node(&dev->dev)); if (!owner) { diff --git a/net/core/failover.c b/net/core/failover.c index 11bb183c7a1b..e43c59cd6868 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -12,6 +12,7 @@ #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> +#include <net/netdev_lock.h> #include <net/failover.h> static LIST_HEAD(failover_list); @@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev) for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; - if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) { + netdev_lock_ops(dev); failover_slave_register(dev); + netdev_unlock_ops(dev); + } } rtnl_unlock(); } diff --git a/net/core/filter.c b/net/core/filter.c index 80a3b702a2d4..9590877b0714 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1654,15 +1654,24 @@ err_prog_put: return err; } +static void sk_reuseport_prog_free_rcu(struct rcu_head *rcu) +{ + struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + struct bpf_prog *prog = aux->prog; + + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; - if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - bpf_prog_put(prog); + if (bpf_prog_was_classic(prog)) + call_rcu(&prog->aux->rcu, sk_reuseport_prog_free_rcu); else - bpf_prog_destroy(prog); + bpf_prog_put(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, @@ -5481,7 +5490,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!sk_is_tcp(sk)) return -EINVAL; switch (optname) { @@ -5688,6 +5697,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + /* + * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters + * CA_EVENT_TX_START in bpf_tcp_cc. + */ + if (level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = { + .func = bpf_sk_setsockopt_nodelay, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { @@ -5833,6 +5866,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; + /* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */ + if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB || + bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) && + level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -6443,6 +6482,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; @@ -7443,7 +7484,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { - if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; @@ -11915,7 +11956,7 @@ BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_family == AF_INET6) + sk->sk_type == SOCK_STREAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; @@ -11931,7 +11972,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; diff --git a/net/core/gro.c b/net/core/gro.c index 31d21de5b15a..a84753983467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -109,6 +109,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; + if (skb_zcopy(p) || skb_zcopy(skb)) + return -ETOOMANYREFS; + if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) || NAPI_GRO_CB(skb)->flush)) return -E2BIG; @@ -213,10 +216,12 @@ done: p->data_len += len; p->truesize += delta_truesize; p->len += len; + skb_shinfo(p)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG; if (lp != p) { lp->data_len += len; lp->truesize += delta_truesize; lp->len += len; + skb_shinfo(lp)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG; } NAPI_GRO_CB(skb)->same_flow = 1; return 0; @@ -244,6 +249,8 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) p->truesize += skb->truesize; p->len += skb->len; + skb_shinfo(p)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + NAPI_GRO_CB(skb)->same_flow = 1; return 0; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 84faace50ac2..3f4a17fa5713 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -319,6 +319,8 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) lockdep_assert_irqs_disabled(); dev = np->dev; + /* npinfo->txq belongs to np->dev, so retries must stay bound to it. */ + skb->dev = dev; rcu_read_lock(); npinfo = rcu_dereference_bh(dev->npinfo); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index df042da422ef..511c25bf6f2a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6328,8 +6328,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, &filters, &idxattr, &prividx, extack); if (err < 0) { - /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ - WARN_ON(err == -EMSGSIZE); + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size + * or a too big nested attribute. + */ kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7dad68e3b518..44ac121cfccb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2248,6 +2248,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, skb_frag_ref(skb, i); } skb_shinfo(n)->nr_frags = i; + skb_shinfo(n)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } if (skb_has_frag_list(skb)) { @@ -4349,6 +4350,8 @@ onlymerged: tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; + skb_shinfo(tgt)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_len_add(skb, -shiftlen); skb_len_add(tgt, shiftlen); @@ -4959,7 +4962,8 @@ normal: skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + skb_shinfo(nskb)->flags |= (skb_shinfo(head_skb)->flags | + skb_shinfo(frag_skb)->flags) & SKBFL_SHARED_FRAG; if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) @@ -4976,6 +4980,9 @@ normal: nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; + + skb_shinfo(nskb)->flags |= skb_shinfo(frag_skb)->flags & SKBFL_SHARED_FRAG; + if (!skb_headlen(list_skb)) { BUG_ON(!nfrags); } else { @@ -6200,6 +6207,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, from_shinfo->frags, from_shinfo->nr_frags * sizeof(skb_frag_t)); to_shinfo->nr_frags += from_shinfo->nr_frags; + if (from_shinfo->nr_frags) + to_shinfo->flags |= from_shinfo->flags & SKBFL_SHARED_FRAG; if (!skb_cloned(from)) from_shinfo->nr_frags = 0; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 6187a83bd741..e1850caf1a71 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1268,12 +1268,19 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { const struct proto_ops *ops = NULL; + struct sk_psock *psock; struct socket *sock; int copied; trace_sk_data_ready(sk); rcu_read_lock(); + psock = sk_psock(sk); + if (psock && tls_sw_has_ctx_rx(sk)) { + psock->saved_data_ready(sk); + rcu_read_unlock(); + return; + } sock = READ_ONCE(sk->sk_socket); if (likely(sock)) ops = READ_ONCE(sock->ops); @@ -1283,8 +1290,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { - struct sk_psock *psock; - rcu_read_lock(); psock = sk_psock(sk); if (psock) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 02a68be3002a..99e3789492a0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1630,18 +1630,23 @@ void sock_map_unhash(struct sock *sk) void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + if (unlikely(saved_unhash == sock_map_unhash)) + goto retry; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); + + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; } - if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) - return; + if (saved_unhash) saved_unhash(sk); } @@ -1652,20 +1657,25 @@ void sock_map_destroy(struct sock *sk) void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + if (unlikely(saved_destroy == sock_map_destroy)) + goto retry; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); + + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; } - if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) - return; + if (saved_destroy) saved_destroy(sk); } @@ -1676,32 +1686,33 @@ void sock_map_close(struct sock *sk, long timeout) void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; +retry: lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); - psock = sk_psock_get(sk); - if (unlikely(!psock)) - goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; } else { saved_close = READ_ONCE(sk->sk_prot)->close; -no_psock: rcu_read_unlock(); release_sock(sk); + + if (unlikely(saved_close == sock_map_close)) + goto retry; } - /* Make sure we do not recurse. This is a bug. - * Leak the socket instead of crashing on a stack overflow. - */ - if (WARN_ON_ONCE(saved_close == sock_map_close)) - return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 8bb98d3ea3db..a3a2cc6480a0 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -92,7 +92,7 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, u32 mask; if (end <= start) - return true; + return false; if (start % 32) { mask = ethnl_upper_bits(start); @@ -105,11 +105,11 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, start_word++; } - if (!memchr_inv(map + start_word, '\0', - (end_word - start_word) * sizeof(u32))) + if (memchr_inv(map + start_word, '\0', + (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) - return true; + return false; return map[end_word] & ethnl_lower_bits(end); } diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index d4e6887055ab..ddc6eab701ed 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -76,6 +76,7 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, struct nlattr **tb = info->attrs; struct phy_device_node *pdn; struct phy_device *phydev; + int ret; /* RTNL is held by the caller */ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, @@ -88,8 +89,19 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, return -EOPNOTSUPP; rep_data->phyindex = phydev->phyindex; + rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); - rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->name) + return -ENOMEM; + + if (phydev->drv) { + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->drvname) { + ret = -ENOMEM; + goto err_free_name; + } + } + rep_data->upstream_type = pdn->upstream_type; if (pdn->upstream_type == PHY_UPSTREAM_PHY) { @@ -97,15 +109,33 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, rep_data->upstream_index = upstream->phyindex; } - if (pdn->parent_sfp_bus) + if (pdn->parent_sfp_bus) { rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), GFP_KERNEL); + if (!rep_data->upstream_sfp_name) { + ret = -ENOMEM; + goto err_free_drvname; + } + } - if (phydev->sfp_bus) + if (phydev->sfp_bus) { rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), GFP_KERNEL); + if (!rep_data->downstream_sfp_name) { + ret = -ENOMEM; + goto err_free_upstream_sfp; + } + } return 0; + +err_free_upstream_sfp: + kfree(rep_data->upstream_sfp_name); +err_free_drvname: + kfree(rep_data->drvname); +err_free_name: + kfree(rep_data->name); + return ret; } static int phy_fill_reply(struct sk_buff *skb, diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index d09875b33588..b514e43766ef 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -163,8 +163,8 @@ void hsr_del_nodes(struct list_head *node_db) struct hsr_node *tmp; list_for_each_entry_safe(node, tmp, node_db, mac_list) { - list_del(&node->mac_list); - hsr_free_node(node); + list_del_rcu(&node->mac_list); + call_rcu(&node->rcu_head, hsr_free_node_rcu); } } @@ -889,7 +889,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); - *addr_b_ifindex = port->dev->ifindex; + if (port) + *addr_b_ifindex = port->dev->ifindex; + else + *addr_b_ifindex = -1; } else { *addr_b_ifindex = -1; } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 008edc7f6688..791e15063237 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, */ if (prog_ops_moff(prog) != offsetof(struct tcp_congestion_ops, release)) - return &bpf_sk_setsockopt_proto; + return &bpf_sk_setsockopt_nodelay_proto; return NULL; case BPF_FUNC_getsockopt: /* Since get/setsockopt is usually expected to diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7eeff658b467..23e921d313b3 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -961,6 +961,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (IS_ERR(rt)) goto out_unlock; + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto ende; + /* peer icmp_ratelimit */ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 928654c34156..dbcd37dfdc15 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1108,7 +1108,7 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ - __inet_csk_reqsk_queue_drop(sk_listener, nreq, true); + __inet_csk_reqsk_queue_drop(sk_listener, nreq, false); goto no_ownership; } @@ -1134,7 +1134,7 @@ no_ownership: } drop: - __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); + __inet_csk_reqsk_queue_drop(oreq->rsk_listener, oreq, true); reqsk_put(oreq); } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 97ead883e4a1..ad2259678c78 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1501,13 +1501,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len static void __arpt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; + void *loc_cpu_entry; struct arpt_entry *iter; - private = xt_unregister_table(table); - /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) @@ -1515,6 +1513,7 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int arpt_register_table(struct net *net, @@ -1522,13 +1521,11 @@ int arpt_register_table(struct net *net, const struct arpt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1543,7 +1540,7 @@ int arpt_register_table(struct net *net, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct arpt_entry *iter; @@ -1553,46 +1550,12 @@ int arpt_register_table(struct net *net, return PTR_ERR(new_table); } - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - return ret; - -out_free: - __arpt_unregister_table(net, new_table); - return ret; -} - -void arpt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } -EXPORT_SYMBOL(arpt_unregister_table_pre_exit); void arpt_unregister_table(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_ARP, name); if (table) __arpt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 78cd5ee24448..370b635e3523 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -43,7 +43,7 @@ static int arptable_filter_table_init(struct net *net) static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - arpt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_ARP, "filter"); } static void __net_exit arptable_filter_net_exit(struct net *net) @@ -58,32 +58,33 @@ static struct pernet_operations arptable_filter_net_ops = { static int __init arptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - arptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table); - if (IS_ERR(arpfilter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(arpfilter_ops)) return PTR_ERR(arpfilter_ops); - } ret = register_pernet_subsys(&arptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + arptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(arpfilter_ops); - return ret; + unregister_pernet_subsys(&arptable_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(arpfilter_ops); return ret; } static void __exit arptable_filter_fini(void) { - unregister_pernet_subsys(&arptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&arptable_filter_net_ops); kfree(arpfilter_ops); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 23c8deff8095..5cbdb0815857 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1704,12 +1704,10 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ipt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ipt_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1718,19 +1716,18 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1745,7 +1742,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ipt_entry *iter; @@ -1755,51 +1752,12 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - /* No template? No need to do anything. This is used by 'nat' table, it registers - * with the nat core instead of the netfilter core. - */ - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - return ret; - -out_free: - __ipt_unregister_table(net, new_table); - return ret; -} - -void ipt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ipt_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV4, name); if (table) __ipt_unregister_table(net, table); @@ -1887,7 +1845,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 3ab908b74795..672d7da1071d 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -61,7 +61,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) @@ -77,32 +77,33 @@ static struct pernet_operations iptable_filter_net_ops = { static int __init iptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - iptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&iptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + iptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&iptable_filter_net_ops); + goto err_free; } return 0; +err_free: + kfree(filter_ops); + return ret; } static void __exit iptable_filter_fini(void) { - unregister_pernet_subsys(&iptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&iptable_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 385d945d8ebe..13d25d9a4610 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -96,7 +96,7 @@ static int iptable_mangle_table_init(struct net *net) static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) @@ -111,32 +111,33 @@ static struct pernet_operations iptable_mangle_net_ops = { static int __init iptable_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - iptable_mangle_table_init); - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); - ret = PTR_ERR(mangle_ops); - return ret; - } + if (IS_ERR(mangle_ops)) + return PTR_ERR(mangle_ops); ret = register_pernet_subsys(&iptable_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + iptable_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&iptable_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } static void __exit iptable_mangle_fini(void) { - unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&iptable_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 625a1ca13b1b..a0df72554025 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -119,8 +119,11 @@ static int iptable_nat_table_init(struct net *net) } ret = ipt_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); + synchronize_rcu(); ipt_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; @@ -129,6 +132,7 @@ static int iptable_nat_table_init(struct net *net) static void __net_exit iptable_nat_net_pre_exit(struct net *net) { ipt_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); } static void __net_exit iptable_nat_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0e7f53964d0a..2745c22f4034 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -53,7 +53,7 @@ static int iptable_raw_table_init(struct net *net) static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) @@ -77,32 +77,32 @@ static int __init iptable_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, - iptable_raw_table_init); - if (ret < 0) - return ret; - rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&iptable_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, + iptable_raw_table_init); if (ret < 0) { - xt_unregister_template(table); - kfree(rawtable_ops); - return ret; + unregister_pernet_subsys(&iptable_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } static void __exit iptable_raw_fini(void) { + xt_unregister_template(&packet_raw); unregister_pernet_subsys(&iptable_raw_net_ops); kfree(rawtable_ops); - xt_unregister_template(&packet_raw); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index d885443cb267..491894511c54 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -50,7 +50,7 @@ static int iptable_security_table_init(struct net *net) static void __net_exit iptable_security_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) @@ -65,33 +65,34 @@ static struct pernet_operations iptable_security_net_ops = { static int __init iptable_security_init(void) { - int ret = xt_register_template(&security_table, - iptable_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + iptable_security_table_init); if (ret < 0) { - xt_unregister_template(&security_table); - kfree(sectbl_ops); - return ret; + unregister_pernet_subsys(&iptable_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } static void __exit iptable_security_fini(void) { + xt_unregister_template(&security_table); unregister_pernet_subsys(&iptable_security_net_ops); kfree(sectbl_ops); - xt_unregister_template(&security_table); } module_init(iptable_security_init); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5aaf9c62c8e1..68e88cb3e55c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -391,7 +391,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, * in, reject the frame as invalid */ err = -EINVAL; - if (iphlen > length) + if (iphlen > length || iphlen < sizeof(*iph)) goto error_free; if (iphlen >= sizeof(*iph)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bc1296f0ea69..3d62d45d84bd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1272,7 +1272,7 @@ static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 432fa28e47d4..389a7cc17110 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -299,9 +299,6 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); -DEFINE_PER_CPU(u32, tcp_tw_isn); -EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); - long sysctl_tcp_mem[3] __read_mostly; DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index a97cdf3e6af4..0a4b38b315fe 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -116,7 +116,8 @@ struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, { struct tcp_ao_key *key; - hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { + hlist_for_each_entry_rcu(key, &ao->head, node, + sk_fullsock(sk) && lockdep_sock_is_held(sk)) { if ((sndid >= 0 && key->sndid != sndid) || (rcvid >= 0 && key->rcvid != rcvid)) continue; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d5c9e65d9760..de9f68a9c0cf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7589,6 +7589,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; + u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7599,20 +7600,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; - u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - isn = __this_cpu_read(tcp_tw_isn); - if (isn) { - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - __this_cpu_write(tcp_tw_isn, 0); - } else { + /* If isn is non-zero, this SYN originally matched a TIME_WAIT socket. + * TW sockets are converted to open requests without limitations, + * we skip the queue limits and syncookie checks in the block below. + */ + if (!isn) { syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c0526cc03980..fdc81150ff6c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2198,6 +2198,7 @@ lookup: } } + isn = 0; process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ @@ -2227,6 +2228,7 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -2313,7 +2315,6 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9d8755705f7..6e4bb411dc04 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2626,6 +2626,7 @@ static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, todo = min_t(int, skb_frag_size(fragfrom), probe_size - len); len += todo; + skb_shinfo(to)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; if (lastfrag && skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a0813d425b71..29651b1a0bc7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -482,11 +482,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, struct sock *sk = gso_skb->sk; unsigned int sum_truesize = 0; struct sk_buff *segs, *seg; - __be16 newlen, msslen; struct udphdr *uh; unsigned int mss; bool copy_dtor; __sum16 check; + __be16 newlen; int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; @@ -555,15 +555,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return segs; } - msslen = htons(sizeof(*uh) + mss); - - /* GSO partial and frag_list segmentation only requires splitting - * the frame into an MSS multiple and possibly a remainder, both - * cases return a GSO skb. So update the mss now. - */ - if (skb_is_gso(segs)) - mss *= skb_shinfo(segs)->gso_segs; - seg = segs; uh = udp_hdr(seg); @@ -586,7 +577,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (!seg->next) break; - uh->len = msslen; + uh->len = newlen; uh->check = check; if (seg->ip_summed == CHECKSUM_PARTIAL) @@ -599,9 +590,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, uh = udp_hdr(seg); } - /* last packet can be partial gso_size, account for that in checksum */ - newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) + - seg->data_len); + /* Unless skb fits perfectly as GSO_PARTIAL, the trailing + * segment may not be full MSS, account for that in the checksum + */ + if (!skb_is_gso(seg)) + newlen = htons(skb_tail_pointer(seg) - + skb_transport_header(seg) + seg->data_len); check = csum16_add(csum16_sub(uh->check, uh->len), newlen); uh->len = newlen; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 03cbce842c1a..cf90f933ca1a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -910,16 +910,27 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) { + enum skb_drop_reason drop_reason; struct ioam6_trace_hdr *trace; struct ioam6_namespace *ns; + struct inet6_dev *idev; struct ioam6_hdr *hdr; + drop_reason = SKB_DROP_REASON_IP_INHDR; + /* Bad alignment (must be 4n-aligned) */ if (optoff & 3) goto drop; + /* Does the device still have IPv6 configuration? */ + idev = __in6_dev_get(skb->dev); + if (!idev) { + drop_reason = SKB_DROP_REASON_IPV6DISABLED; + goto drop; + } + /* Ignore if IOAM is not enabled on ingress */ - if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled)) + if (!READ_ONCE(idev->cnf.ioam6_enabled)) goto ignore; /* Truncated Option header */ @@ -955,9 +966,9 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len)) goto drop; - /* Trace pointer may have changed */ - trace = (struct ioam6_trace_hdr *)(skb_network_header(skb) - + optoff + sizeof(*hdr)); + /* Trace and hdr pointers may have changed */ + hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); + trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); ioam6_fill_trace_data(skb, ns, trace, true); @@ -972,7 +983,7 @@ ignore: return true; drop: - kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); + kfree_skb_reason(skb, drop_reason); return false; } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index c92f98c6f6ec..b1ccdf0dc646 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -36,11 +36,11 @@ /* FL hash table */ #define FL_MAX_PER_SOCK 32 -#define FL_MAX_SIZE 4096 +#define FL_MAX_SIZE 8192 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) -static atomic_t fl_size = ATOMIC_INIT(0); +static int fl_size; static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); @@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused) ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; + fl_size--; + fl->fl_net->ipv6.flowlabel_count--; fl_free(fl); - atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) @@ -172,7 +173,7 @@ static void ip6_fl_gc(struct timer_list *unused) flp = &fl->next; } } - if (!sched && atomic_read(&fl_size)) + if (!sched && fl_size) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); @@ -196,7 +197,8 @@ static void __net_exit ip6_fl_purge(struct net *net) atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; + net->ipv6.flowlabel_count--; continue; } flp = &fl->next; @@ -210,10 +212,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net, { struct ip6_flowlabel *lfl; + lockdep_assert_held(&ip6_fl_lock); + fl->label = label & IPV6_FLOWLABEL_MASK; - rcu_read_lock(); - spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; @@ -235,8 +237,6 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); return lfl; } } @@ -244,9 +244,8 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); - atomic_inc(&fl_size); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); + fl_size++; + net->ipv6.flowlabel_count++; return NULL; } @@ -464,10 +463,17 @@ done: static int mem_check(struct sock *sk) { - int room = FL_MAX_SIZE - atomic_read(&fl_size); + const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4); + const int unpriv_user_limit = unpriv_total_limit / 2; + struct net *net = sock_net(sk); + int room; struct ipv6_fl_socklist *sfl; int count = 0; + lockdep_assert_held(&ip6_fl_lock); + + room = FL_MAX_SIZE - fl_size; + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; @@ -478,7 +484,9 @@ static int mem_check(struct sock *sk) if (room <= 0 || ((count >= FL_MAX_PER_SOCK || - (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + (count > 0 && room < FL_MAX_SIZE / 2) || + room < FL_MAX_SIZE / 4 || + net->ipv6.flowlabel_count >= unpriv_user_limit) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; @@ -692,11 +700,19 @@ release: if (!sfl1) goto done; + rcu_read_lock(); + spin_lock_bh(&ip6_fl_lock); err = mem_check(sk); + if (err == 0) + fl1 = fl_intern(net, fl, freq->flr_label); + else + fl1 = NULL; + spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock(); + if (err != 0) goto done; - fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d585ac3c1113..9d9c3763f2f5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1713,12 +1713,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ip6t_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1727,19 +1725,18 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1754,7 +1751,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; @@ -1764,48 +1761,12 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - return ret; - -out_free: - __ip6t_unregister_table(net, new_table); - return ret; -} - -void ip6t_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ip6t_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); @@ -1894,7 +1855,6 @@ static void __exit ip6_tables_fini(void) } EXPORT_SYMBOL(ip6t_register_table); -EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index e7a3fb9355ee..450dd53846a2 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -168,6 +168,10 @@ static int hbh_mt6_check(const struct xt_mtchk_param *par) pr_debug("unknown flags %X\n", optsinfo->invflags); return -EINVAL; } + if (optsinfo->optsnr > IP6T_OPTS_OPTSNR) { + pr_debug("too many supported opts specified\n"); + return -EINVAL; + } if (optsinfo->flags & IP6T_OPTS_NSTRICT) { pr_debug("Not strict - not implemented"); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index e8992693e14a..b074fc477676 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -60,7 +60,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) @@ -76,32 +76,32 @@ static struct pernet_operations ip6table_filter_net_ops = { static int __init ip6table_filter_init(void) { - int ret = xt_register_template(&packet_filter, - ip6table_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&ip6table_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, ip6table_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&ip6table_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(filter_ops); return ret; } static void __exit ip6table_filter_fini(void) { - unregister_pernet_subsys(&ip6table_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&ip6table_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 8dd4cd0c47bd..e6ee036a9b2c 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -89,7 +89,7 @@ static int ip6table_mangle_table_init(struct net *net) static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) @@ -104,32 +104,33 @@ static struct pernet_operations ip6table_mangle_net_ops = { static int __init ip6table_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - ip6table_mangle_table_init); - - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); + if (IS_ERR(mangle_ops)) return PTR_ERR(mangle_ops); - } ret = register_pernet_subsys(&ip6table_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + ip6table_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&ip6table_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } static void __exit ip6table_mangle_fini(void) { - unregister_pernet_subsys(&ip6table_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&ip6table_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 5be723232df8..c2394e2c94b5 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -121,8 +121,11 @@ static int ip6table_nat_table_init(struct net *net) } ret = ip6t_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); + synchronize_rcu(); ip6t_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; @@ -131,6 +134,7 @@ static int ip6table_nat_table_init(struct net *net) static void __net_exit ip6table_nat_net_pre_exit(struct net *net) { ip6t_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); } static void __net_exit ip6table_nat_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index fc9f6754028f..3b161ee875bc 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -52,7 +52,7 @@ static int ip6table_raw_table_init(struct net *net) static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) @@ -75,31 +75,31 @@ static int __init ip6table_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, ip6table_raw_table_init); - if (ret < 0) - return ret; - /* Register hooks */ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&ip6table_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, ip6table_raw_table_init); if (ret < 0) { - kfree(rawtable_ops); - xt_unregister_template(table); - return ret; + unregister_pernet_subsys(&ip6table_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } static void __exit ip6table_raw_fini(void) { - unregister_pernet_subsys(&ip6table_raw_net_ops); xt_unregister_template(&packet_raw); + unregister_pernet_subsys(&ip6table_raw_net_ops); kfree(rawtable_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 4df14a9bae78..4bd5d97b8ab6 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -49,7 +49,7 @@ static int ip6table_security_table_init(struct net *net) static void __net_exit ip6table_security_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) @@ -64,32 +64,33 @@ static struct pernet_operations ip6table_security_net_ops = { static int __init ip6table_security_init(void) { - int ret = xt_register_template(&security_table, - ip6table_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&ip6table_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + ip6table_security_table_init); if (ret < 0) { - kfree(sectbl_ops); - xt_unregister_template(&security_table); - return ret; + unregister_pernet_subsys(&ip6table_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } static void __exit ip6table_security_fini(void) { - unregister_pernet_subsys(&ip6table_security_net_ops); xt_unregister_template(&security_table); + unregister_pernet_subsys(&ip6table_security_net_ops); kfree(sectbl_ops); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e3d355d1fbd6..b106e5fef9cb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6933,7 +6933,7 @@ int __init ip6_route_init(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) ret = bpf_iter_register(); if (ret) - goto out_register_late_subsys; + goto out_register_notifier; #endif for_each_possible_cpu(cpu) { @@ -6946,6 +6946,10 @@ int __init ip6_route_init(void) out: return ret; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +out_register_notifier: + unregister_netdevice_notifier(&ip6_route_dev_notifier); +#endif out_register_late_subsys: rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d13d49bfef19..36d75fb50a70 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1839,6 +1839,7 @@ lookup: } } + isn = 0; process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ @@ -1868,6 +1869,7 @@ process: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -1956,7 +1958,6 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 157fc23ce4e1..1455f67e01dd 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1360,7 +1360,7 @@ static void l2tp_session_unhash(struct l2tp_session *session) spin_lock_bh(&pn->l2tp_session_idr_lock); /* Remove from the per-tunnel list */ - list_del_init(&session->list); + list_del_rcu(&session->list); /* Remove from per-net IDR */ if (tunnel->version == L2TP_HDR_VER_3) { diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7b77d57c9f96..f9ee9947a94d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2344,8 +2344,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta->sta.max_sp = params->max_sp; } - ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab, - params->ext_capab_len); + if (params->ext_capab) + ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab, + params->ext_capab_len); /* * cfg80211 validates this (1-2007) and allows setting the AID diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0a0f27836d57..b98ddfa3003e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8164,6 +8164,7 @@ ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, "No active links for TID %d", tid); return -EINVAL; } + pos += map_size; } else { map = 0; } @@ -8182,7 +8183,6 @@ ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, default: return -EINVAL; } - pos += map_size; } return 0; } @@ -11232,6 +11232,9 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, control = get_unaligned_le16(pos); link_id = control & IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID; + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + continue; + link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index 2b3632c6008a..77894d997113 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -34,6 +34,22 @@ #include "led.h" #include "wep.h" +static const u8 empty_non_inheritance[] = { + WLAN_EID_EXTENSION, 1, WLAN_EID_EXT_NON_INHERITANCE, + /* + * cfg80211_is_element_inherited() hardcodes elements that + * cannot be inherited, so we just need an empty one to be + * calling it at all. + */ +}; + +struct ieee80211_elem_defrag { + const struct element *elem; + /* container start/len */ + const u8 *start; + size_t len; +}; + struct ieee80211_elems_parse { /* must be first for kfree to work */ struct ieee802_11_elems elems; @@ -41,11 +57,7 @@ struct ieee80211_elems_parse { /* The basic Multi-Link element in the original elements */ const struct element *ml_basic_elem; - /* The reconfiguration Multi-Link element in the original elements */ - const struct element *ml_reconf_elem; - - /* The EPCS Multi-Link element in the original elements */ - const struct element *ml_epcs_elem; + struct ieee80211_elem_defrag ml_reconf, ml_epcs; bool multi_link_inner; bool skip_vendor; @@ -162,10 +174,14 @@ ieee80211_parse_extension_element(u32 *crc, } break; case IEEE80211_ML_CONTROL_TYPE_RECONF: - elems_parse->ml_reconf_elem = elem; + elems_parse->ml_reconf.elem = elem; + elems_parse->ml_reconf.start = params->start; + elems_parse->ml_reconf.len = params->len; break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - elems_parse->ml_epcs_elem = elem; + elems_parse->ml_epcs.elem = elem; + elems_parse->ml_epcs.start = params->start; + elems_parse->ml_epcs.len = params->len; break; default: break; @@ -916,7 +932,7 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, { struct ieee802_11_elems *elems = &elems_parse->elems; struct ieee80211_mle_per_sta_profile *prof; - const struct element *tmp; + const struct element *tmp, *ret; ssize_t ml_len; const u8 *end; @@ -986,50 +1002,40 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, sub->from_ap = params->from_ap; sub->link_id = -1; - return cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - sub->start, sub->len); -} - -static void -ieee80211_mle_defrag_reconf(struct ieee80211_elems_parse *elems_parse) -{ - struct ieee802_11_elems *elems = &elems_parse->elems; - ssize_t ml_len; + ret = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub->start, sub->len); + if (ret) + return ret; - ml_len = cfg80211_defragment_element(elems_parse->ml_reconf_elem, - elems->ie_start, - elems->total_len, - elems_parse->scratch_pos, - elems_parse->scratch + - elems_parse->scratch_len - - elems_parse->scratch_pos, - WLAN_EID_FRAGMENT); - if (ml_len < 0) - return; - elems->ml_reconf = (void *)elems_parse->scratch_pos; - elems->ml_reconf_len = ml_len; - elems_parse->scratch_pos += ml_len; + /* + * Since we know we want and found a profile, apply an empty + * non-inheritance if the profile didn't have one, so that any + * element that shouldn't be inherited by spec isn't. + */ + return (const void *)empty_non_inheritance; } -static void -ieee80211_mle_defrag_epcs(struct ieee80211_elems_parse *elems_parse) +static const void * +ieee80211_mle_defrag(struct ieee80211_elems_parse *elems_parse, + struct ieee80211_elem_defrag *defrag, + size_t *out_len) { - struct ieee802_11_elems *elems = &elems_parse->elems; + const void *ret; ssize_t ml_len; - ml_len = cfg80211_defragment_element(elems_parse->ml_epcs_elem, - elems->ie_start, - elems->total_len, + ml_len = cfg80211_defragment_element(defrag->elem, + defrag->start, defrag->len, elems_parse->scratch_pos, elems_parse->scratch + elems_parse->scratch_len - elems_parse->scratch_pos, WLAN_EID_FRAGMENT); if (ml_len < 0) - return; - elems->ml_epcs = (void *)elems_parse->scratch_pos; - elems->ml_epcs_len = ml_len; + return NULL; + ret = elems_parse->scratch_pos; + *out_len = ml_len; elems_parse->scratch_pos += ml_len; + return ret; } struct ieee802_11_elems * @@ -1042,6 +1048,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) size_t scratch_len = 3 * params->len; bool multi_link_inner = false; + BUILD_BUG_ON(sizeof(empty_non_inheritance) != empty_non_inheritance[1] + 2); BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0); /* cannot parse for both a specific link and non-transmitted BSS */ @@ -1089,6 +1096,17 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, sub.start, nontx_len); + /* + * If it's a non-transmitted BSS, we shouldn't pick + * any elements in the outer parsing that shouldn't + * be inherited. If the profile has a non-inheritance + * element this automatically happens, but if not then + * provide an empty one so that the hard-coded elements + * in cfg80211_is_element_inherited() are ignored, but + * it must be called. + */ + if (params->bss->transmitted_bss && !non_inherit) + non_inherit = (const void *)empty_non_inheritance; } else { /* must always parse to get elems_parse->ml_basic_elem */ non_inherit = ieee80211_prep_mle_link_parse(elems_parse, params, @@ -1109,9 +1127,12 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) _ieee802_11_parse_elems_full(&sub, elems_parse, NULL); } - ieee80211_mle_defrag_reconf(elems_parse); - - ieee80211_mle_defrag_epcs(elems_parse); + elems->ml_reconf = ieee80211_mle_defrag(elems_parse, + &elems_parse->ml_reconf, + &elems->ml_reconf_len); + elems->ml_epcs = ieee80211_mle_defrag(elems_parse, + &elems_parse->ml_epcs, + &elems->ml_epcs_len); if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d18e962126ce..3fb40449c6c5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4984,6 +4984,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, u8 sa[ETH_ALEN]; } addrs __aligned(2); struct ieee80211_sta_rx_stats *stats; + u32 encoded_rate; /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write * to a common data structure; drivers can implement that per queue @@ -5091,11 +5092,14 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* push the addresses in front */ memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs)); + /* capture before mesh forward may memset or free skb->cb */ + encoded_rate = sta_stats_encode_rate(status); + res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb); switch (res) { case RX_QUEUED: stats->last_rx = jiffies; - stats->last_rate = sta_stats_encode_rate(status); + stats->last_rate = encoded_rate; return true; case RX_CONTINUE: break; diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index 8a16672b94e2..4cc16cbeb328 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -14,7 +14,7 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk) && sk_is_mptcp(sk)) return mptcp_sk(mptcp_subflow_ctx(sk)->conn); return NULL; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 3c152bf66cd5..3e770c7407e1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -364,7 +364,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer) spin_lock_bh(&msk->pm.lock); - if (!mptcp_pm_should_add_signal_addr(msk)) { + /* The cancel path (mptcp_pm_del_add_timer()) can race with this + * callback. Once cancel updates retrans_times to MAX, suppress further + * retransmissions here. If this callback acquires pm.lock first, one + * final transmit attempt is still possible. + */ + if (entry->retrans_times < ADD_ADDR_RETRANS_MAX && + !mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); @@ -414,8 +420,12 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, /* Note: entry might have been removed by another thread. * We hold rcu_read_lock() to ensure it is not freed under us. */ - if (stop_timer) - sk_stop_timer_sync(sk, &entry->add_timer); + if (stop_timer) { + if (check_id) + sk_stop_timer(sk, &entry->add_timer); + else + sk_stop_timer_sync(sk, &entry->add_timer); + } rcu_read_unlock(); return entry; @@ -882,6 +892,7 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, struct mptcp_addr_info *addr, bool *echo, bool *drop_other_suboptions) { + bool skip_add_addr = false; int ret = false; u8 add_addr; u8 family; @@ -903,24 +914,49 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, } *echo = mptcp_pm_should_add_signal_echo(msk); - port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); - - family = *echo ? msk->pm.remote.family : msk->pm.local.family; - if (remaining < mptcp_add_addr_len(family, *echo, port)) - goto out_unlock; - if (*echo) { *addr = msk->pm.remote; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); + port = !!msk->pm.remote.port; + family = msk->pm.remote.family; } else { *addr = msk->pm.local; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); + port = !!msk->pm.local.port; + family = msk->pm.local.family; } - WRITE_ONCE(msk->pm.addr_signal, add_addr); + + if (remaining < mptcp_add_addr_len(family, *echo, port)) { + struct net *net = sock_net((struct sock *)msk); + + if (!*drop_other_suboptions) + goto out_unlock; + + if (*echo) { + MPTCP_INC_STATS(net, MPTCP_MIB_ECHOADDTXDROP); + } else { + skip_add_addr = true; + MPTCP_INC_STATS(net, MPTCP_MIB_ADDADDRTXDROP); + } + goto drop_signal_mark; + } + ret = true; +drop_signal_mark: + WRITE_ONCE(msk->pm.addr_signal, add_addr); + out_unlock: spin_unlock_bh(&msk->pm.lock); + + /* On pure-ACK option-space exhaustion, stop retrying this ADD_ADDR: + * clear the signal bit, cancel the matching retransmission timer, and + * let the PM state machine progress. + */ + if (skip_add_addr) { + mptcp_pm_del_add_timer(msk, addr, true); + mptcp_pm_subflow_established(msk); + } return ret; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4546a8b09884..a72a6ad6ee8b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -397,12 +397,26 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) return false; } - /* old data, keep it simple and drop the whole pkt, sender - * will retransmit as needed, if needed. + /* Completely old data? */ + if (!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + mptcp_drop(sk, skb); + return false; + } + + /* Partial packet: map_seq < ack_seq < end_seq. + * Skip the already-acked bytes and enqueue the new data. */ - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); - mptcp_drop(sk, skb); - return false; + copy_len = MPTCP_SKB_CB(skb)->end_seq - msk->ack_seq; + MPTCP_SKB_CB(skb)->offset += msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq - + MPTCP_SKB_CB(skb)->map_seq; + msk->bytes_received += copy_len; + WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + return true; } static void mptcp_stop_rtx_timer(struct sock *sk) @@ -3473,6 +3487,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) /* for fallback's sake */ WRITE_ONCE(msk->ack_seq, 0); + atomic64_set(&msk->rcv_wnd_sent, 0); WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 1cf608e7357b..87b5796d0135 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -67,6 +67,12 @@ static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval, return 0; } +static void __mptcp_subflow_set_rcvbuf(struct sock *ssk, int val) +{ + WRITE_ONCE(ssk->sk_rcvbuf, val); + tcp_set_rcvbuf(ssk, val); +} + static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val) { struct mptcp_subflow_context *subflow; @@ -100,7 +106,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in case SO_RCVBUF: case SO_RCVBUFFORCE: ssk->sk_userlocks |= SOCK_RCVBUF_LOCK; - WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf); break; case SO_MARK: if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { @@ -1560,7 +1566,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; } if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) - WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf); } if (sock_flag(sk, SOCK_LINGER)) { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c5a26236a0bb..3706b4a85a0f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1613,6 +1613,7 @@ dump_last: ((dump_type == DUMP_ALL) == !!(set->type->features & IPSET_DUMP_LAST))) { write_unlock_bh(&ip_set_ref_lock); + set = NULL; continue; } pr_debug("List set: %s\n", set->name); @@ -1648,13 +1649,13 @@ dump_last: if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN && nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index))) goto nla_put_failure; + if (set->variant->uref) + set->variant->uref(set, cb, true); ret = set->variant->head(set, skb); if (ret < 0) goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; - if (set->variant->uref) - set->variant->uref(set, cb, true); fallthrough; default: ret = set->variant->list(set, skb, cb); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index b79e5dd2af03..04e4627ddfc1 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -386,8 +386,9 @@ static void mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) { int i; + u8 pos = smp_load_acquire(&n->pos); - for (i = 0; i < n->pos; i++) + for (i = 0; i < pos; i++) if (test_bit(i, n->used)) ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); } @@ -490,7 +491,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif - u8 htable_bits = t->htable_bits; + u8 pos, htable_bits = t->htable_bits; spin_lock_bh(&t->hregion[r].lock); for (i = ahash_bucket_start(r, htable_bits); @@ -498,7 +499,8 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) n = __ipset_dereference(hbucket(t, i)); if (!n) continue; - for (j = 0, d = 0; j < n->pos; j++) { + pos = smp_load_acquire(&n->pos); + for (j = 0, d = 0; j < pos; j++) { if (!test_bit(j, n->used)) { d++; continue; @@ -534,7 +536,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) /* Still try to delete expired elements. */ continue; tmp->size = n->size - AHASH_INIT_SIZE; - for (j = 0, d = 0; j < n->pos; j++) { + for (j = 0, d = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); @@ -623,7 +625,7 @@ mtype_resize(struct ip_set *set, bool retried) { struct htype *h = set->data; struct htable *t, *orig; - u8 htable_bits; + u8 pos, htable_bits; size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; @@ -685,7 +687,8 @@ retry: n = __ipset_dereference(hbucket(orig, i)); if (!n) continue; - for (j = 0; j < n->pos; j++) { + pos = smp_load_acquire(&n->pos); + for (j = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); @@ -809,9 +812,10 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) { struct htype *h = set->data; const struct htable *t; - u32 i, j, r; struct hbucket *n; struct mtype_elem *data; + u32 i, j, r; + u8 pos; t = rcu_dereference_bh(h->table); for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { @@ -820,7 +824,8 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) n = rcu_dereference_bh(hbucket(t, i)); if (!n) continue; - for (j = 0; j < n->pos; j++) { + pos = smp_load_acquire(&n->pos); + for (j = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, set->dsize); @@ -848,6 +853,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, bool flag_exist = flags & IPSET_FLAG_EXIST; bool deleted = false, forceadd = false, reuse = false; u32 r, key, multi = 0, elements, maxelem; + u8 npos = 0; rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); @@ -889,7 +895,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ext_size(AHASH_INIT_SIZE, set->dsize); goto copy_elem; } - for (i = 0; i < n->pos; i++) { + npos = smp_load_acquire(&n->pos); + for (i = 0; i < npos; i++) { if (!test_bit(i, n->used)) { /* Reuse first deleted entry */ if (j == -1) { @@ -933,7 +940,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (elements >= maxelem) goto set_full; /* Create a new slot */ - if (n->pos >= n->size) { + if (npos >= n->size) { #ifdef IP_SET_HASH_WITH_MULTI if (h->bucketsize >= AHASH_MAX_TUNED) goto set_full; @@ -962,7 +969,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } copy_elem: - j = n->pos++; + j = npos++; data = ahash_data(n, j, set->dsize); copy_data: t->hregion[r].elements++; @@ -985,6 +992,8 @@ overwrite_extensions: if (SET_WITH_TIMEOUT(set)) ip_set_timeout_set(ext_timeout(data, set), ext->timeout); smp_mb__before_atomic(); + /* Ensure all data writes are visible before updating position */ + smp_store_release(&n->pos, npos); set_bit(j, n->used); if (old != ERR_PTR(-ENOENT)) { rcu_assign_pointer(hbucket(t, key), n); @@ -1043,6 +1052,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, j, k, r, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; size_t dsize = set->dsize; + u8 pos; /* Userspace add and resize is excluded by the mutex. * Kernespace add does not trigger resize. @@ -1058,7 +1068,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, n = rcu_dereference_bh(hbucket(t, key)); if (!n) goto out; - for (i = 0, k = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0, k = 0; i < pos; i++) { if (!test_bit(i, n->used)) { k++; continue; @@ -1072,8 +1083,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, ret = 0; clear_bit(i, n->used); smp_mb__after_atomic(); - if (i + 1 == n->pos) - n->pos--; + if (i + 1 == pos) + smp_store_release(&n->pos, --pos); t->hregion[r].elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) @@ -1094,11 +1105,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, x->flags = flags; } } - for (; i < n->pos; i++) { + for (; i < pos; i++) { if (!test_bit(i, n->used)) k++; } - if (k == n->pos) { + if (k == pos) { t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); @@ -1109,7 +1120,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!tmp) goto out; tmp->size = n->size - AHASH_INIT_SIZE; - for (j = 0, k = 0; j < n->pos; j++) { + for (j = 0, k = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); @@ -1170,6 +1181,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, int ret, i, j = 0; #endif u32 key, multi = 0; + u8 pos; pr_debug("test by nets\n"); for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) { @@ -1187,7 +1199,8 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, n = rcu_dereference_bh(hbucket(t, key)); if (!n) continue; - for (i = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0; i < pos; i++) { if (!test_bit(i, n->used)) continue; data = ahash_data(n, i, set->dsize); @@ -1221,6 +1234,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct mtype_elem *data; int i, ret = 0; u32 key, multi = 0; + u8 pos; rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); @@ -1243,7 +1257,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, ret = 0; goto out; } - for (i = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0; i < pos; i++) { if (!test_bit(i, n->used)) continue; data = ahash_data(n, i, set->dsize); @@ -1360,6 +1375,7 @@ mtype_list(const struct ip_set *set, /* We assume that one hash bucket fills into one page */ void *incomplete; int i, ret = 0; + u8 pos; atd = nla_nest_start(skb, IPSET_ATTR_ADT); if (!atd) @@ -1378,7 +1394,8 @@ mtype_list(const struct ip_set *set, cb->args[IPSET_CB_ARG0], t, n); if (!n) continue; - for (i = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0; i < pos; i++) { if (!test_bit(i, n->used)) continue; e = ahash_data(n, i, set->dsize); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index a22ec1a6f6ec..e26ca2a370e3 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -150,7 +150,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++, i++) { + for (; ip <= ip_to; i++) { e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_ipmark4_data_next(&h->next, &e); @@ -162,6 +162,10 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; ret = 0; + + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index e977b5a9c48d..41ca24a22a02 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -186,7 +186,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to;) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { @@ -203,6 +203,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], ret = 0; } + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 39a01934b153..b9ac2efaa15c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -182,7 +182,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to;) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { @@ -199,6 +199,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], ret = 0; } + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5c6de605a9fb..2d6652d43199 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -274,7 +274,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], p = port; ip2 = ip2_from; } - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to;) { e.ip = htonl(ip); for (; p <= port_to; p++) { e.port = htons(p); @@ -298,6 +298,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip2 = ip2_from; } p = port; + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c7c7f6a7a9f6..bd9cae44d214 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -327,18 +327,22 @@ ip_vs_use_count_dec(void) /* Service hashing: * Operation Locking order * --------------------------------------------------------------------------- - * add table service_mutex, svc_resize_sem(W) - * del table service_mutex - * move between tables svc_resize_sem(W), seqcount_t(W), bit lock - * add/del service service_mutex, bit lock + * add first table service_mutex + * attach new table service_mutex + * add/del service service_mutex, RCU, bit lock + * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock + * replace old with attached svc_resize_sem(W), svc_replace_sem(W) * find service RCU, seqcount_t(R) * walk services(blocking) service_mutex, svc_resize_sem(R) * walk services(non-blocking) RCU, seqcount_t(R) + * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R) + * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R) + * del table service_mutex after stopped work * - * - new tables are linked/unlinked under service_mutex and svc_resize_sem - * - new table is linked on resizing and all operations can run in parallel - * in 2 tables until the new table is registered as current one - * - two contexts can modify buckets: config and table resize, both in + * - new table is attached on resizing under service_mutex and all operations + * can run in parallel in 2 tables until the new table is registered as current + * one + * - two contexts can modify buckets: config and table resize (work), both in * process context * - only table resizer can move entries, so we do not protect t->seqc[] * items with t->lock[] @@ -346,9 +350,13 @@ ip_vs_use_count_dec(void) * services are moved to new table * - move operations may disturb readers: find operation will not miss entries * but walkers may see same entry twice if they are forced to retry chains - * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold - * service_mutex to disallow new tables to be installed or to check + * or to walk the newly attached second table + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check * svc_table_changes and repeat the RCU read section if new table is installed + * - walkers may serialize with the whole resizing process (svc_resize_sem) + * to prevent seeing same service twice or just with the svc_table + * replace (svc_replace_sem) when we can see entries twice but we + * prefer to run concurrently with the rehashing. */ /* @@ -387,9 +395,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); + /* New entries go into recent table */ - t = rcu_dereference_protected(ipvs->svc_table, 1); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); if (svc->fwmark == 0) { /* @@ -410,6 +425,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) hlist_bl_add_head_rcu(&svc->s_list, head); hlist_bl_unlock(head); + rcu_read_unlock(); + return 1; } @@ -432,7 +449,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) return 0; } - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); hash_key = READ_ONCE(svc->hash_key); /* We need to lock the bucket in the right table */ if (ip_vs_rht_same_table(t, hash_key)) { @@ -443,13 +466,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* Moved to new table ? */ if (hash_key != hash_key2) { hlist_bl_unlock(head); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key2 & t->mask); hlist_bl_lock(head); } } else { /* It is already moved to new table */ - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key & t->mask); hlist_bl_lock(head); } @@ -459,6 +482,8 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); hlist_bl_unlock(head); + + rcu_read_unlock(); return 1; } @@ -666,15 +691,14 @@ static void svc_resize_work_handler(struct work_struct *work) goto unlock_sem; more_work = false; clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + if (!READ_ONCE(ipvs->enable)) goto unlock_m; t = rcu_dereference_protected(ipvs->svc_table, 1); /* Do nothing if table is removed */ if (!t) goto unlock_m; - /* New table needs to be registered? BUG! */ - if (t != rcu_dereference_protected(t->new_tbl, 1)) + /* New table already attached? BUG! */ + if (t != rcu_access_pointer(t->new_tbl)) goto unlock_m; lfactor = sysctl_svc_lfactor(ipvs); @@ -691,6 +715,7 @@ static void svc_resize_work_handler(struct work_struct *work) /* Flip the table_id */ t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + /* Attach new table */ rcu_assign_pointer(t->new_tbl, t_new); /* Allow add/del to new_tbl while moving from old table */ mutex_unlock(&ipvs->service_mutex); @@ -698,8 +723,8 @@ static void svc_resize_work_handler(struct work_struct *work) ip_vs_rht_for_each_bucket(t, bucket, head) { same_bucket: if (++limit >= 16) { - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, + /* Check if work is stopped */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) goto unlock_sem; if (resched_score >= 100) { @@ -764,16 +789,12 @@ same_bucket: goto same_bucket; } - /* Tables can be switched only under service_mutex */ - while (!mutex_trylock(&ipvs->service_mutex)) { - cond_resched(); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_sem; - } - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_m; + /* Serialize with readers that don't like svc_table changes */ + down_write(&ipvs->svc_replace_sem); + + /* Check if work is stopped to avoid synchronize_rcu() */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_repl; rcu_assign_pointer(ipvs->svc_table, t_new); /* Inform readers that new table is installed */ @@ -781,8 +802,8 @@ same_bucket: atomic_inc(&ipvs->svc_table_changes); t_free = t; -unlock_m: - mutex_unlock(&ipvs->service_mutex); +unlock_repl: + up_write(&ipvs->svc_replace_sem); unlock_sem: up_write(&ipvs->svc_resize_sem); @@ -801,6 +822,11 @@ out: test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) return; queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); + return; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + goto unlock_sem; } static inline void @@ -1691,6 +1717,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_pe *pe = NULL; int ret_hooks = -1; int ret = 0; + bool grow; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1732,16 +1759,25 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* The old table can be freed, protect it with RCU */ + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); if (!t) { int lfactor = sysctl_svc_lfactor(ipvs); int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + rcu_read_unlock(); t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); if (!t_new) { ret = -ENOMEM; goto out_err; } + grow = false; + } else { + /* Even the currently attached new table may need to grow */ + t = rcu_dereference(t->new_tbl); + grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh; + rcu_read_unlock(); } if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { @@ -1800,6 +1836,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, goto out_err; if (t_new) { + /* Add table for first time */ clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); rcu_assign_pointer(ipvs->svc_table, t_new); t_new = NULL; @@ -1831,8 +1868,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); /* Schedule resize work */ - if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); @@ -2054,7 +2090,6 @@ static int ip_vs_del_service(struct ip_vs_service *svc) return -EEXIST; ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - t = rcu_dereference_protected(ipvs->svc_table, 1); /* Drop the table if no more services */ ns = ip_vs_get_num_services(ipvs); @@ -2062,6 +2097,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* Stop the resizer and drop the tables */ set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); cancel_delayed_work_sync(&ipvs->svc_resize_work); + t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); /* Inform readers that table is removed */ @@ -2075,11 +2111,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc) t = p; } } - } else if (ns <= t->l_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, - &ipvs->work_flags)) { - queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, - 1); + } else { + bool shrink; + + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); + /* Even the currently attached new table may need to shrink */ + t = rcu_dereference(t->new_tbl); + shrink = ns <= t->l_thresh; + rcu_read_unlock(); + if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) + queue_delayed_work(system_unbound_wq, + &ipvs->svc_resize_work, 1); } return 0; } @@ -2184,17 +2228,21 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct ip_vs_service *svc; struct hlist_bl_node *e; struct ip_vs_dest *dest; - int old_gen, new_gen; + int old_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + /* Allow concurrent rehashing on resize but to avoid loop + * serialize with installing the new table. + */ + down_read(&ipvs->svc_replace_sem); + old_gen = atomic_read(&ipvs->svc_table_changes); rcu_read_lock(); -repeat: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { @@ -2207,17 +2255,17 @@ repeat: } resched_score++; if (resched_score >= 100) { - resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - old_gen = new_gen; - goto repeat; - } + /* Flushed? So no more dev refs */ + if (atomic_read(&ipvs->svc_table_changes) != old_gen) + goto done; + resched_score = 0; } } + +done: rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); return NOTIFY_DONE; } @@ -2244,6 +2292,10 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; struct hlist_bl_node *e; + /* svc_table can not be replaced (svc_replace_sem) or + * removed (service_mutex) + */ + down_read(&ipvs->svc_replace_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { @@ -2259,6 +2311,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; @@ -3062,6 +3115,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) u32 sum; int i; + /* Info for conns */ rcu_read_lock(); t = rcu_dereference(ipvs->conn_tab); @@ -3123,6 +3177,12 @@ repeat_conn: } after_conns: + rcu_read_unlock(); + + /* Info for services */ + down_read(&ipvs->svc_replace_sem); + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); count = ip_vs_get_num_services(ipvs); @@ -3133,9 +3193,7 @@ after_conns: if (!count) goto after_svc; old_gen = atomic_read(&ipvs->svc_table_changes); - loops = 0; -repeat_svc: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ memset(counts, 0, sizeof(counts)); ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { @@ -3157,15 +3215,10 @@ repeat_svc: if (resched_score >= 100) { resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - /* Too many changes? */ - if (++loops >= 5) - goto after_svc; - old_gen = new_gen; - goto repeat_svc; - } + /* Flushed? */ + if (atomic_read(&ipvs->svc_table_changes) != + old_gen) + goto after_svc; } counts[count]++; } @@ -3184,6 +3237,9 @@ repeat_svc: } after_svc: + rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); + seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", ipvs->est_kt_count, ipvs->est_max_threads); seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); @@ -3191,7 +3247,6 @@ after_svc: ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * IPVS_EST_NTICKS); - rcu_read_unlock(); return 0; } @@ -3503,7 +3558,7 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, int ret = 0; lockdep_assert_held(&ipvs->svc_resize_sem); - /* All service modifications are disabled, go ahead */ + /* All svc_table modifications are disabled, go ahead */ ip_vs_rht_walk_buckets(ipvs->svc_table, head) { hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ @@ -3687,7 +3742,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) pr_err("length: %u != %zu\n", *len, size); return -EINVAL; } - /* Protect against table resizer moving the entries. + /* Prevent modifications to the list with services. * Try reverse locking, so that we do not hold the mutex * while waiting for semaphore. */ @@ -4029,6 +4084,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int start = cb->args[0]; int idx = 0; + /* Make sure we do not see same service twice during resize */ down_read(&ipvs->svc_resize_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { @@ -5072,6 +5128,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); init_rwsem(&ipvs->svc_resize_sem); + init_rwsem(&ipvs->svc_replace_sem); INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); atomic_set(&ipvs->svc_table_changes, 0); RCU_INIT_POINTER(ipvs->svc_table, NULL); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 4f39bf7c843f..75e53fde6b29 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -72,6 +72,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, exp->flags = NF_CT_EXPECT_PERMANENT; exp->class = NF_CT_EXPECT_CLASS_DEFAULT; rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b08189226320..8ba5b22a1eef 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1811,14 +1811,17 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { + struct nf_conntrack_helper *assign_helper; + /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; - if (exp->helper) { + assign_helper = rcu_dereference(exp->assign_helper); + if (assign_helper) { help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) - rcu_assign_pointer(help->helper, exp->helper); + rcu_assign_pointer(help->helper, assign_helper); } #ifdef CONFIG_NF_CONNTRACK_MARK diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 24d0576d84b7..8e943efbdf0a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -344,6 +344,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 3f5c50455b71..b2fe6554b9cf 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -643,7 +643,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, &nf_conntrack_helper_h245); + rcu_assign_pointer(exp->assign_helper, &nf_conntrack_helper_h245); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -767,7 +767,7 @@ static int expect_callforwarding(struct sk_buff *skb, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -1234,7 +1234,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3 : NULL, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nathook = rcu_dereference(nfct_h323_nat_hook); @@ -1306,7 +1306,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_UDP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_ras); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_ras); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect RAS "); @@ -1523,7 +1523,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1577,7 +1577,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a715304a53d8..17e971bd4c74 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -321,8 +321,8 @@ __printf(3, 4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...) { + const char *helper_name = "(null)"; const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; struct va_format vaf; va_list args; @@ -331,14 +331,17 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, vaf.fmt = fmt; vaf.va = &args; - /* Called from the helper function, this call never fails */ help = nfct_help(ct); + if (help) { + const struct nf_conntrack_helper *helper; - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); + helper = rcu_dereference(help->helper); + if (helper) + helper_name = helper->name; + } nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, - "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); + "helper %s dropping packet: %pV ", helper_name, &vaf); va_end(args); } @@ -400,6 +403,11 @@ static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) this = rcu_dereference_protected(exp->helper, lockdep_is_held(&nf_conntrack_expect_lock)); + if (this == me) + return true; + + this = rcu_dereference_protected(exp->assign_helper, + lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eda5fe4a75c8..befa7e83ee49 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2634,6 +2634,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); @@ -2860,6 +2861,7 @@ static int ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report) { + struct nf_conntrack_helper *assign_helper = NULL; struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; struct nf_conntrack_expect *exp; @@ -2870,13 +2872,26 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; + if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK]) + return -EINVAL; + err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, ct, &tuple, &mask); if (err < 0) return err; + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + assign_helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + tuple.dst.protonum); + if (!assign_helper) + return -EOPNOTSUPP; + } + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, - &tuple, &mask); + assign_helper, &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); @@ -3515,6 +3530,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { @@ -3568,6 +3584,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->zone = ct->zone; #endif rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, assign_helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; @@ -3623,7 +3640,7 @@ ctnetlink_create_expect(struct net *net, ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); - exp = ctnetlink_alloc_expect(cda, ct, &tuple, &mask); + exp = ctnetlink_alloc_expect(cda, ct, NULL, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 1eb55907d470..e69941f1a101 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1366,6 +1366,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, goto store_cseq; } + helper = rcu_dereference(nfct_help(ct)->helper); + if (!helper) + return NF_DROP; + exp = nf_ct_expect_alloc(ct); if (!exp) { nf_ct_helper_log(skb, ct, "cannot alloc expectation"); @@ -1376,14 +1380,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, if (sip_direct_signalling) saddr = &ct->tuplehash[!dir].tuple.src.u3; - helper = rcu_dereference(nfct_help(ct)->helper); - if (!helper) - return NF_DROP; - nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; - rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, helper); exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; hooks = rcu_dereference(nf_nat_sip_hooks); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a6c81c04b3a5..57b450024a99 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -61,6 +61,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ + dev_put(entry->skb_dev); dev_put(state->in); dev_put(state->out); if (state->sk) @@ -102,6 +103,7 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) return false; + dev_hold(entry->skb_dev); dev_hold(state->in); dev_hold(state->out); @@ -202,11 +204,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, *entry = (struct nf_queue_entry) { .skb = skb, + .skb_dev = skb->dev, .state = *state, .hook_index = index, .size = sizeof(*entry) + route_key_size, }; - __nf_queue_entry_init_physdevs(entry); if (!nf_queue_entry_get_refs(entry)) { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 58304fd1f70f..984a0eb9e149 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1212,6 +1212,8 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) if (physinif == ifindex || physoutif == ifindex) return 1; #endif + if (entry->skb_dev && entry->skb_dev->ifindex == ifindex) + return 1; if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 60ee8d932fcb..fa2cc556331c 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1334,6 +1334,8 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj, if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; + + nf_ct_expect_put(exp); } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 03ffb1159fc1..d14ca157910b 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -163,7 +163,6 @@ static int nft_inner_parse_l2l3(const struct nft_inner *priv, return -1; if (fragoff == 0) { - thoff = nhoff + sizeof(_ip6h); ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; ctx->inner_thoff = thoff; ctx->l4proto = l4proto; @@ -247,8 +246,8 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { - local_bh_enable(); local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); return false; } *tun_ctx = *this_cpu_tun_ctx; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2c67c2e6b132..4e6708c23922 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -55,6 +55,9 @@ static struct list_head xt_templates[NFPROTO_NUMPROTO]; struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; + + /* stash area used during netns exit */ + struct list_head dead_tables[NFPROTO_NUMPROTO]; }; struct compat_delta { @@ -1472,11 +1475,9 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); -struct xt_table_info * -xt_replace_table(struct xt_table *table, - unsigned int num_counters, - struct xt_table_info *newinfo, - int *error) +static struct xt_table_info * +do_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, int *error) { struct xt_table_info *private; unsigned int cpu; @@ -1531,30 +1532,54 @@ xt_replace_table(struct xt_table *table, } } - audit_log_nfcfg(table->name, table->af, private->number, - !private->number ? AUDIT_XT_OP_REGISTER : - AUDIT_XT_OP_REPLACE, - GFP_KERNEL); + return private; +} + +struct xt_table_info * +xt_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, + int *error) +{ + struct xt_table_info *private; + + private = do_replace_table(table, num_counters, newinfo, error); + if (private) + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_REPLACE, + GFP_KERNEL); + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); struct xt_table *xt_register_table(struct net *net, const struct xt_table *input_table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t, *table = NULL; + struct nf_hook_ops *ops = NULL; struct xt_table_info *private; - struct xt_table *t, *table; - int ret; + unsigned int num_ops; + int ret = -EINVAL; + + num_ops = hweight32(input_table->valid_hooks); + if (num_ops == 0) + goto out; + + ret = -ENOMEM; + if (template_ops) { + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); + if (!ops) + goto out; + } /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); - if (!table) { - ret = -ENOMEM; + if (!table) goto out; - } mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ @@ -1568,7 +1593,7 @@ struct xt_table *xt_register_table(struct net *net, /* Simplifies replace_table code. */ table->private = bootstrap; - if (!xt_replace_table(table, 0, newinfo, &ret)) + if (!do_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; @@ -1577,34 +1602,122 @@ struct xt_table *xt_register_table(struct net *net, /* save number of initial entries */ private->initial_entries = private->number; + if (ops) { + int i; + + for (i = 0; i < num_ops; i++) + ops[i].priv = table; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) { + mutex_unlock(&xt[table->af].mutex); + /* nf_register_net_hooks() might have published a + * base chain before internal error unwind. + */ + synchronize_rcu(); + goto out; + } + + table->ops = ops; + } + + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_REGISTER, GFP_KERNEL); + list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); - kfree(table); out: + kfree(table); + kfree(ops); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); -void *xt_unregister_table(struct xt_table *table) +/** + * xt_unregister_table_pre_exit - pre-shutdown unregister of a table + * @net: network namespace + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Unregisters the specified netfilter table from the given network namespace + * and also unregisters the hooks from netfilter core: no new packets will be + * processed. + * + * This must be called prior to xt_unregister_table_exit() from the pernet + * .pre_exit callback. After this call, the table is no longer visible to + * the get/setsockopt path. In case of rmmod, module exit path must have + * called xt_unregister_template() prior to unregistering pernet ops to + * prevent re-instantiation of the table. + * + * See also: xt_unregister_table_exit() + */ +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) { - struct xt_table_info *private; + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; - mutex_lock(&xt[table->af].mutex); - private = table->private; - list_del(&table->list); - mutex_unlock(&xt[table->af].mutex); - audit_log_nfcfg(table->name, table->af, private->number, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); - kfree(table->ops); - kfree(table); + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + list_move(&t->list, &xt_net->dead_tables[af]); + mutex_unlock(&xt[af].mutex); - return private; + if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; + } + } + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_table_pre_exit); + +/** + * xt_unregister_table_exit - remove a table during namespace teardown + * @net: the network namespace from which to unregister the table + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Completes the unregister process for a table. This must be called from + * the pernet ops .exit callback. This is the second stage after + * xt_unregister_table_pre_exit(). + * + * pair with xt_unregister_table_pre_exit() during namespace shutdown. + * + * Return: the unregistered table or NULL if the table was never + * instantiated. The caller needs to kfree() the table after it + * has removed the family specific matches/targets. + */ +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *table; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(table, &xt_net->dead_tables[af], list) { + struct nf_hook_ops *ops = NULL; + + if (strcmp(table->name, name) != 0) + continue; + + list_del(&table->list); + + audit_log_nfcfg(table->name, table->af, table->private->number, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + swap(table->ops, ops); + mutex_unlock(&xt[af].mutex); + + kfree(ops); + return table; + } + mutex_unlock(&xt[af].mutex); + + return NULL; } -EXPORT_SYMBOL_GPL(xt_unregister_table); +EXPORT_SYMBOL_GPL(xt_unregister_table_exit); #endif #ifdef CONFIG_PROC_FS @@ -2051,8 +2164,10 @@ static int __net_init xt_net_init(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { INIT_LIST_HEAD(&xt_net->tables[i]); + INIT_LIST_HEAD(&xt_net->dead_tables[i]); + } return 0; } @@ -2061,8 +2176,10 @@ static void __net_exit xt_net_exit(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); + WARN_ON_ONCE(!list_empty(&xt_net->dead_tables[i])); + } } static struct pernet_operations xt_net_ops = { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d251d894afd4..0da39eaed255 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1972,8 +1972,10 @@ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group); @@ -1986,8 +1988,10 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return; + } group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index c42642075685..e7e8490a53d8 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -83,6 +83,14 @@ struct vport *ovs_netdev_link(struct vport *vport, bool tunnel) } rtnl_lock(); + /* Do not link devices that are not registered to avoid a potential + * race with the NETDEV_UNREGISTER notification in dp_device_event(). + */ + if (vport->dev->reg_state != NETREG_REGISTERED) { + err = -ENODEV; + goto error_put_unlock; + } + err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 4dbf0914df7d..706927139393 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -671,8 +671,23 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) /* Look for an existing pipe handle */ sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle); - if (sknode) - return sk_receive_skb(sknode, skb, 1); + if (sknode) { + int rc; + + /* pep_do_rcv() runs from two contexts: from softirq via + * phonet_rcv() -> __sk_receive_skb() with BH disabled, + * and from process context via + * release_sock() -> __release_sock(), which drops + * the listener slock with spin_unlock_bh() before draining + * the backlog. The child pipe slock is taken below via + * bh_lock_sock_nested(), which does not itself disable BH, so + * disable BH here to keep both acquire contexts consistent. + */ + local_bh_disable(); + rc = sk_receive_skb(sknode, skb, 1); + local_bh_enable(); + return rc; + } switch (hdr->message_id) { case PNS_PEP_CONNECT_REQ: diff --git a/net/rds/message.c b/net/rds/message.c index 25fedcb3cd00..7feb0eb6537d 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -448,6 +448,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * for (i = 0; i < rm->data.op_nents; i++) put_page(sg_page(&rm->data.op_sg[i])); + rm->data.op_nents = 0; mmp = &rm->data.op_mmp_znotifier->z_mmp; mm_unaccount_pinned_pages(mmp); ret = -EFAULT; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 654e23d13e3d..5830b31a1f37 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -198,8 +198,13 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); write_lock_bh(&sock->sk->sk_callback_lock); - /* done under the callback_lock to serialize with write_space */ + /* done under the callback_lock to serialize with write_space. + * Set t_sock inside rds_tcp_tc_list_lock so readers walking + * rds_tcp_tc_list under the same lock cannot observe an + * entry whose t_sock is NULL. + */ spin_lock(&rds_tcp_tc_list_lock); + tc->t_sock = sock; list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); #if IS_ENABLED(CONFIG_IPV6) rds6_tcp_tc_count++; @@ -211,8 +216,6 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* accepted sockets need our listen data ready undone */ if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) sock->sk->sk_data_ready = sock->sk->sk_user_data; - - tc->t_sock = sock; if (!tc->t_rtn) tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid); tc->t_cpath = cp; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 27c2aa2dd023..98f2165159d7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -213,8 +213,6 @@ struct rxrpc_skb_priv { struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ - u8 flags; -#define RXRPC_RX_VERIFIED 0x01 }; struct { rxrpc_seq_t first_ack; /* First packet in acks table */ @@ -309,15 +307,16 @@ struct rxrpc_security { struct sk_buff *challenge); /* verify a response */ - int (*verify_response)(struct rxrpc_connection *, - struct sk_buff *); + int (*verify_response)(struct rxrpc_connection *conn, + struct sk_buff *response_skb, + void *response, unsigned int len); /* clear connection security */ void (*clear)(struct rxrpc_connection *); /* Default ticket -> key decoder */ int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int ticket_offset, unsigned int ticket_len, + void *ticket, unsigned int ticket_len, struct key **_key); }; @@ -774,6 +773,11 @@ struct rxrpc_call { struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ + void *rx_dec_buffer; /* Decryption buffer */ + unsigned short rx_dec_bsize; /* rx_dec_buffer size */ + unsigned short rx_dec_offset; /* Decrypted packet data offset */ + unsigned short rx_dec_len; /* Decrypted packet data len */ + rxrpc_seq_t rx_dec_seq; /* Packet in decryption buffer */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ rxrpc_seq_t rx_consumed; /* Highest packet consumed */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fdd683261226..fec59d9338b9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -332,25 +332,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.securityIndex != 0 && - skb_cloned(skb)) { - /* Unshare the packet so that it can be - * modified by in-place decryption. - */ - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); - - if (nskb) { - rxrpc_new_skb(nskb, rxrpc_skb_new_unshared); - rxrpc_input_call_packet(call, nskb); - rxrpc_free_skb(nskb, rxrpc_skb_put_call_rx); - } else { - /* OOM - Drop the packet. */ - rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem); - } - } else { - rxrpc_input_call_packet(call, skb); - } + rxrpc_input_call_packet(call, skb); rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); did_receive = true; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f035f486c139..fcb9d38bb521 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -152,6 +152,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, spin_lock_init(&call->notify_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; + call->rx_pkt_offset = USHRT_MAX; call->tx_total_len = -1; call->tx_jumbo_max = 1; call->next_rx_timo = 20 * HZ; @@ -553,6 +554,7 @@ static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) rxrpc_purge_queue(&call->recvmsg_queue); rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); + kfree(call->rx_dec_buffer); } /* diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a2130d25aaa9..c96ca615b787 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -243,27 +243,22 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) static int rxrpc_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + void *buffer; int ret; - if (skb_cloned(skb)) { - /* Copy the packet if shared so that we can do in-place - * decryption. - */ - struct sk_buff *nskb = skb_copy(skb, GFP_NOFS); + buffer = kmalloc(len, GFP_NOFS); + if (!buffer) + return -ENOMEM; - if (nskb) { - rxrpc_new_skb(nskb, rxrpc_skb_new_unshared); - ret = conn->security->verify_response(conn, nskb); - rxrpc_free_skb(nskb, rxrpc_skb_put_response_copy); - } else { - /* OOM - Drop the packet. */ - rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem); - ret = -ENOMEM; - } - } else { - ret = conn->security->verify_response(conn, skb); - } + ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), buffer, len); + if (ret < 0) + goto out; + + ret = conn->security->verify_response(conn, skb, buffer, len); +out: + kfree(buffer); return ret; } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 0a260df45d25..0b39046bdc61 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -32,9 +32,6 @@ static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - sp->flags |= RXRPC_RX_VERIFIED; return 0; } @@ -57,9 +54,10 @@ static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *response_skb, + void *response, unsigned int len) { - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + return rxrpc_abort_conn(conn, response_skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_response); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index e1f7513a46db..c940600117a4 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -147,15 +147,52 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) } /* - * Decrypt and verify a DATA packet. + * Decrypt and verify a DATA packet. The content of the packet is pulled out + * into a flat buffer rather than decrypting in place in the skbuff. This also + * has the advantage of aligning the buffer correctly for the crypto routines. + * + * We keep track of the sequence number of the packet currently decrypted into + * the buffer in ->rx_dec_seq. If MSG_PEEK is used and steps onto a new + * packet, subsequent recvmsg() calls will have to go back and re-decrypt the + * current packet. */ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + int ret; - if (sp->flags & RXRPC_RX_VERIFIED) - return 0; - return call->security->verify_packet(call, skb); + if (sp->len > call->rx_dec_bsize) { + /* Make sure we can hold a 1412-byte jumbo subpacket and make + * sure that the buffer size is aligned to a crypto blocksize. + */ + size_t size = clamp(round_up(sp->len, 32), 2048, 65535); + void *buffer = krealloc(call->rx_dec_buffer, size, GFP_NOFS); + + if (!buffer) + return -ENOMEM; + call->rx_dec_buffer = buffer; + call->rx_dec_bsize = size; + } + + ret = -EFAULT; + if (skb_copy_bits(skb, sp->offset, call->rx_dec_buffer, sp->len) < 0) + goto err; + + call->rx_dec_offset = 0; + call->rx_dec_len = sp->len; + call->rx_dec_seq = sp->hdr.seq; + ret = call->security->verify_packet(call, skb); + if (ret < 0) + goto err; + return 0; + +err: + kfree(call->rx_dec_buffer); + call->rx_dec_buffer = NULL; + call->rx_dec_bsize = 0; + call->rx_dec_offset = 0; + call->rx_dec_len = 0; + return ret; } /* @@ -283,16 +320,21 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (msg) sock_recv_timestamp(msg, sock->sk, skb); - if (rx_pkt_offset == 0) { + if (call->rx_dec_seq != sp->hdr.seq || + !call->rx_dec_buffer) { ret2 = rxrpc_verify_data(call, skb); trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, - sp->offset, sp->len, ret2); + call->rx_dec_offset, + call->rx_dec_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } - rx_pkt_offset = sp->offset; - rx_pkt_len = sp->len; + } + + if (rx_pkt_offset == USHRT_MAX) { + rx_pkt_offset = call->rx_dec_offset; + rx_pkt_len = call->rx_dec_len; } else { trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); @@ -304,10 +346,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (copy > remain) copy = remain; if (copy > 0) { - ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, - copy); - if (ret2 < 0) { - ret = ret2; + ret2 = copy_to_iter(call->rx_dec_buffer + rx_pkt_offset, + copy, iter); + if (ret2 != copy) { + ret = -EFAULT; goto out; } @@ -328,7 +370,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, /* The whole packet has been transferred. */ if (sp->hdr.flags & RXRPC_LAST_PACKET) ret = 1; - rx_pkt_offset = 0; + rx_pkt_offset = USHRT_MAX; rx_pkt_len = 0; skb = skb_peek_next(skb, &call->recvmsg_queue); diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 0d5e654da918..a1ee102abae1 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -473,15 +473,20 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxgk_header *hdr; struct krb5_buffer metadata; - unsigned int offset = sp->offset, len = sp->len; + unsigned int len = call->rx_dec_len; size_t data_offset = 0, data_len = len; + void *data = call->rx_dec_buffer, *p = data; u32 ac = 0; int ret = -ENOMEM; _enter(""); - crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, - &data_offset, &data_len); + if (crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_1_short_header); + goto put_gk; + } hdr = kzalloc_obj(*hdr, GFP_NOFS); if (!hdr) @@ -496,16 +501,15 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, metadata.len = sizeof(*hdr); metadata.data = hdr; - ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, - skb, &offset, &len, &ac); + ret = rxgk_verify_mic(gk->krb5, gk->rx_Kc, &metadata, &p, &len, &ac); kfree(hdr); if (ret < 0) { if (ret != -ENOMEM) rxrpc_abort_eproto(call, skb, ac, rxgk_abort_1_verify_mic_eproto); } else { - sp->offset = offset; - sp->len = len; + call->rx_dec_offset = p - data; + call->rx_dec_len = len; } put_gk: @@ -522,49 +526,53 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxgk_header hdr; - unsigned int offset = sp->offset, len = sp->len; + struct rxgk_header *hdr; + unsigned int offset = 0, len = call->rx_dec_len; + void *data = call->rx_dec_buffer, *p = data; int ret; u32 ac = 0; _enter(""); - ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (crypto_krb5_check_data_len(gk->krb5, KRB5_ENCRYPT_MODE, + len, sizeof(*hdr)) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + ret = rxgk_decrypt(gk->krb5, gk->rx_enc, &p, &len, &ac); if (ret < 0) { if (ret != -ENOMEM) rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; } + offset = p - data; - if (len < sizeof(hdr)) { + if (len < sizeof(*hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, rxgk_abort_2_short_header); goto error; } /* Extract the header from the skb */ - ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); - if (ret < 0) { - ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, - rxgk_abort_2_short_encdata); - goto error; - } - offset += sizeof(hdr); - len -= sizeof(hdr); - - if (ntohl(hdr.epoch) != call->conn->proto.epoch || - ntohl(hdr.cid) != call->cid || - ntohl(hdr.call_number) != call->call_id || - ntohl(hdr.seq) != sp->hdr.seq || - ntohl(hdr.sec_index) != call->security_ix || - ntohl(hdr.data_len) > len) { + hdr = data + offset; + offset += sizeof(*hdr); + len -= sizeof(*hdr); + + if (ntohl(hdr->epoch) != call->conn->proto.epoch || + ntohl(hdr->cid) != call->cid || + ntohl(hdr->call_number) != call->call_id || + ntohl(hdr->seq) != sp->hdr.seq || + ntohl(hdr->sec_index) != call->security_ix || + ntohl(hdr->data_len) > len) { ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, rxgk_abort_2_short_data); goto error; } - sp->offset = offset; - sp->len = ntohl(hdr.data_len); + call->rx_dec_offset = offset; + call->rx_dec_len = ntohl(hdr->data_len); ret = 0; error: rxgk_put(gk); @@ -1076,11 +1084,12 @@ static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, * unsigned int call_numbers<>; * }; */ -static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, - const struct krb5_enctype *krb5, - struct sk_buff *skb, - __be32 *p, __be32 *end) +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + void *auth, unsigned int auth_len) { + __be32 *p = auth, *end = auth + auth_len; u32 app_len, call_count, level, epoch, cid, i; _enter(""); @@ -1144,37 +1153,6 @@ static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, } /* - * Extract the authenticator and verify it. - */ -static int rxgk_verify_authenticator(struct rxrpc_connection *conn, - const struct krb5_enctype *krb5, - struct sk_buff *skb, - unsigned int auth_offset, unsigned int auth_len) -{ - void *auth; - __be32 *p; - int ret; - - auth = kmalloc(auth_len, GFP_NOFS); - if (!auth) - return -ENOMEM; - - ret = skb_copy_bits(skb, auth_offset, auth, auth_len); - if (ret < 0) { - ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, - rxgk_abort_resp_short_auth); - goto error; - } - - p = auth; - ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, - p + auth_len / sizeof(*p)); -error: - kfree(auth); - return ret; -} - -/* * Verify a response. * * struct RXGK_Response { @@ -1184,49 +1162,45 @@ error: * }; */ static int rxgk_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *skb, + void *buffer, unsigned int len) { const struct krb5_enctype *krb5; struct rxrpc_key_token *token; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxgk_response rhdr; + struct rxgk_response *rhdr; struct rxgk_context *gk; struct key *key = NULL; - unsigned int offset = sizeof(struct rxrpc_wire_header); - unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); - unsigned int token_offset, token_len; - unsigned int auth_offset, auth_len; + unsigned int resp_token_len, auth_len; + void *resp_token, *auth; __be32 xauth_len; int ret, ec; _enter("{%d}", conn->debug_id); /* Parse the RXGK_Response object */ - if (sizeof(rhdr) + sizeof(__be32) > len) + if (len < sizeof(*rhdr) + sizeof(__be32)) goto short_packet; - - if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) - goto short_packet; - offset += sizeof(rhdr); - len -= sizeof(rhdr); - - token_offset = offset; - token_len = ntohl(rhdr.token_len); - if (token_len > len || - xdr_round_up(token_len) + sizeof(__be32) > len) + rhdr = buffer; + buffer += sizeof(*rhdr); + len -= sizeof(*rhdr); + + resp_token = buffer; + resp_token_len = ntohl(rhdr->token_len); + if (resp_token_len > len || + xdr_round_up(resp_token_len) + sizeof(__be32) > len) goto short_packet; - trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, resp_token_len); - offset += xdr_round_up(token_len); - len -= xdr_round_up(token_len); + buffer += xdr_round_up(resp_token_len); + len -= xdr_round_up(resp_token_len); - if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) - goto short_packet; - offset += sizeof(xauth_len); + xauth_len = *(__be32 *)buffer; + buffer += sizeof(xauth_len); len -= sizeof(xauth_len); - auth_offset = offset; + auth = buffer; auth_len = ntohl(xauth_len); if (auth_len > len) goto short_packet; @@ -1241,7 +1215,7 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, * to the app to deal with - which might mean a round trip to * userspace. */ - ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + ret = rxgk_extract_token(conn, skb, resp_token, resp_token_len, &key); if (ret < 0) goto out; @@ -1255,7 +1229,7 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, */ token = key->payload.data[0]; conn->security_level = token->rxgk->level; - conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + conn->rxgk.start_time = __be64_to_cpu(rhdr->start_time); gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); if (IS_ERR(gk)) { @@ -1265,18 +1239,18 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, krb5 = gk->krb5; - trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, + resp_token_len); /* Decrypt, parse and verify the authenticator. */ - ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, - &auth_offset, &auth_len, &ec); + ret = rxgk_decrypt(krb5, gk->resp_enc, &auth, &auth_len, &ec); if (ret < 0) { rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, rxgk_abort_resp_auth_dec); goto out_gk; } - ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + ret = rxgk_verify_authenticator(conn, krb5, skb, auth, auth_len); if (ret < 0) goto out_gk; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index 0ef2a29eb695..200a30064fae 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -40,7 +40,7 @@ * }; */ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int ticket_offset, unsigned int ticket_len, + void *buffer, unsigned int ticket_len, struct key **_key) { struct rxrpc_key_token *token; @@ -49,7 +49,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, size_t pre_ticket_len, payload_len; unsigned int klen, enctype; void *payload, *ticket; - __be32 *t, *p, *q, tmp[2]; + __be32 *t, *p, *q, *tmp; int ret; _enter(""); @@ -59,10 +59,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, rxgk_abort_resp_short_yfs_tkt); /* Get the session key length */ - ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, - rxgk_abort_resp_short_yfs_klen); + tmp = buffer; enctype = ntohl(tmp[0]); klen = ntohl(tmp[1]); @@ -84,12 +81,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, * it. */ ticket = payload + pre_ticket_len; - ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); - if (ret < 0) { - ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, - rxgk_abort_resp_short_yfs_tkt); - goto error; - } + memcpy(ticket, buffer, ticket_len); /* Fill out the form header. */ p = payload; @@ -131,7 +123,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, goto error; } - /* Ticket read in with skb_copy_bits above */ + /* Ticket appended above. */ q += xdr_round_up(ticket_len) / 4; if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { ret = -EIO; @@ -182,14 +174,15 @@ error: * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] */ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int token_offset, unsigned int token_len, + void *token, unsigned int token_len, struct key **_key) { const struct krb5_enctype *krb5; const struct krb5_buffer *server_secret; struct crypto_aead *token_enc = NULL; struct key *server_key; - unsigned int ticket_offset, ticket_len; + unsigned int ticket_len; + void *ticket; u32 kvno, enctype; int ret, ec = 0; @@ -197,24 +190,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 kvno; __be32 enctype; __be32 token_len; - } container; + } *container; - if (token_len < sizeof(container)) + if (token_len < sizeof(*container)) goto short_packet; /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. */ - if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - goto short_packet; + container = token; + token += sizeof(*container); - kvno = ntohl(container.kvno); - enctype = ntohl(container.enctype); - ticket_len = ntohl(container.token_len); - ticket_offset = token_offset + sizeof(container); + kvno = ntohl(container->kvno); + enctype = ntohl(container->enctype); + ticket_len = ntohl(container->token_len); - if (ticket_len > xdr_round_down(token_len - sizeof(container))) + if (ticket_len > xdr_round_down(token_len - sizeof(*container))) goto short_packet; _debug("KVNO %u", kvno); @@ -237,8 +229,8 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, * gain access to K0, from which we can derive the transport key and * thence decode the authenticator. */ - ret = rxgk_decrypt_skb(krb5, token_enc, skb, - &ticket_offset, &ticket_len, &ec); + ticket = token; + ret = rxgk_decrypt(krb5, token_enc, &ticket, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; if (ret < 0) { @@ -248,7 +240,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, return ret; } - ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ret = conn->security->default_decode_ticket(conn, skb, ticket, ticket_len, _key); if (ret < 0) goto cant_get_token; diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 1e257d7ab8ec..3deed5863f5a 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -41,10 +41,10 @@ struct rxgk_context { * rxgk_app.c */ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int ticket_offset, unsigned int ticket_len, + void *ticket, unsigned int ticket_len, struct key **_key); int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int token_offset, unsigned int token_len, + void *token, unsigned int token_len, struct key **_key); /* @@ -62,31 +62,30 @@ int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, gfp_t gfp); /* - * Apply decryption and checksumming functions to part of an skbuff. The - * offset and length are updated to reflect the actual content of the encrypted + * Apply decryption and checksumming functions a flat data buffer. The data + * point and length are updated to reflect the actual content of the encrypted * region. */ -static inline -int rxgk_decrypt_skb(const struct krb5_enctype *krb5, - struct crypto_aead *aead, - struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len, - int *_error_code) +static inline int rxgk_decrypt(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + void **_data, unsigned int *_len, + int *_error_code) { - struct scatterlist sg[16]; + struct scatterlist sg[1]; size_t offset = 0, len = *_len; - int nr_sg, ret; + int ret; - sg_init_table(sg, ARRAY_SIZE(sg)); - nr_sg = skb_to_sgvec(skb, sg, *_offset, len); - if (unlikely(nr_sg < 0)) - return nr_sg; + sg_init_one(sg, *_data, len); - ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, - &offset, &len); + ret = crypto_krb5_decrypt(krb5, aead, sg, 1, &offset, &len); switch (ret) { case 0: - *_offset += offset; + if (offset & 3) { + *_error_code = RXGK_INCONSISTENCY; + ret = -EPROTO; + break; + } + *_data += offset; *_len = len; break; case -EBADMSG: /* Checksum mismatch. */ @@ -106,31 +105,26 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, } /* - * Check the MIC on a region of an skbuff. The offset and length are updated - * to reflect the actual content of the secure region. + * Check the MIC on a flat buffer. The data pointer and length are updated to + * reflect the actual content of the secure region. */ static inline -int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, - struct crypto_shash *shash, - const struct krb5_buffer *metadata, - struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len, - u32 *_error_code) +int rxgk_verify_mic(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + void **_data, unsigned int *_len, + u32 *_error_code) { - struct scatterlist sg[16]; + struct scatterlist sg[1]; size_t offset = 0, len = *_len; - int nr_sg, ret; + int ret; - sg_init_table(sg, ARRAY_SIZE(sg)); - nr_sg = skb_to_sgvec(skb, sg, *_offset, len); - if (unlikely(nr_sg < 0)) - return nr_sg; + sg_init_one(sg, *_data, len); - ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, - &offset, &len); + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, 1, &offset, &len); switch (ret) { case 0: - *_offset += offset; + *_data += offset; *_len = len; break; case -EBADMSG: /* Checksum mismatch */ diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index cba7935977f0..6fbd883401ac 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -430,27 +430,25 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_seq_t seq, struct skcipher_request *req) { - struct rxkad_level1_hdr sechdr; + struct rxkad_level1_hdr *sechdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; - struct scatterlist sg[16]; - u32 data_size, buf; + struct scatterlist sg[1]; + void *data = call->rx_dec_buffer; + u32 len = sp->len, data_size, buf; u16 check; int ret; _enter(""); - if (sp->len < 8) + if (len < 8) return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_1_short_header); /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. */ - sg_init_table(sg, ARRAY_SIZE(sg)); - ret = skb_to_sgvec(skb, sg, sp->offset, 8); - if (unlikely(ret < 0)) - return ret; + sg_init_one(sg, data, len); /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); @@ -464,13 +462,11 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, return ret; /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) - return rxrpc_abort_eproto(call, skb, RXKADDATALEN, - rxkad_abort_1_short_encdata); - sp->offset += sizeof(sechdr); - sp->len -= sizeof(sechdr); + sechdr = data; + call->rx_dec_offset = sizeof(*sechdr); + len -= sizeof(*sechdr); - buf = ntohl(sechdr.data_size); + buf = ntohl(sechdr->data_size); data_size = buf & 0xffff; check = buf >> 16; @@ -479,10 +475,10 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, if (check != 0) return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_1_short_check); - if (data_size > sp->len) + if (data_size > len) return rxrpc_abort_eproto(call, skb, RXKADDATALEN, rxkad_abort_1_short_data); - sp->len = data_size; + call->rx_dec_len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; @@ -496,43 +492,28 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxkad_level2_hdr sechdr; + struct rxkad_level2_hdr *sechdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; - struct scatterlist _sg[4], *sg; - u32 data_size, buf; + struct scatterlist sg[1]; + void *data = call->rx_dec_buffer; + u32 len = sp->len, data_size, buf; u16 check; - int nsg, ret; + int ret; - _enter(",{%d}", sp->len); + _enter(",{%d}", len); - if (sp->len < 8) + if (len < 8) return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_2_short_header); /* Don't let the crypto algo see a misaligned length. */ - sp->len = round_down(sp->len, 8); + len = round_down(len, 8); - /* Decrypt the skbuff in-place. TODO: We really want to decrypt - * directly into the target buffer. + /* Decrypt in place in the call's decryption buffer. TODO: We really + * want to decrypt directly into the target buffer. */ - sg = _sg; - nsg = skb_shinfo(skb)->nr_frags + 1; - if (nsg <= 4) { - nsg = 4; - } else { - sg = kmalloc_objs(*sg, nsg, GFP_NOIO); - if (!sg) - return -ENOMEM; - } - - sg_init_table(sg, nsg); - ret = skb_to_sgvec(skb, sg, sp->offset, sp->len); - if (unlikely(ret < 0)) { - if (sg != _sg) - kfree(sg); - return ret; - } + sg_init_one(sg, data, len); /* decrypt from the session key */ token = call->conn->key->payload.data[0]; @@ -540,11 +521,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sp->len, iv.x); + skcipher_request_set_crypt(req, sg, sg, len, iv.x); ret = crypto_skcipher_decrypt(req); skcipher_request_zero(req); - if (sg != _sg) - kfree(sg); if (ret < 0) { if (ret == -ENOMEM) return ret; @@ -553,13 +532,11 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) - return rxrpc_abort_eproto(call, skb, RXKADDATALEN, - rxkad_abort_2_short_len); - sp->offset += sizeof(sechdr); - sp->len -= sizeof(sechdr); + sechdr = data; + call->rx_dec_offset = sizeof(*sechdr); + len -= sizeof(*sechdr); - buf = ntohl(sechdr.data_size); + buf = ntohl(sechdr->data_size); data_size = buf & 0xffff; check = buf >> 16; @@ -569,17 +546,18 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_2_short_check); - if (data_size > sp->len) + if (data_size > len) return rxrpc_abort_eproto(call, skb, RXKADDATALEN, rxkad_abort_2_short_data); - sp->len = data_size; + call->rx_dec_len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; } /* - * Verify the security on a received packet and the subpackets therein. + * Verify the security on a received (sub)packet. If the packet needs + * modifying (e.g. decrypting), it must be copied. */ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { @@ -985,7 +963,6 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, *_expiry = 0; ASSERT(server_key->payload.data[0] != NULL); - ASSERTCMP((unsigned long) ticket & 7UL, ==, 0); memcpy(&iv, &server_key->payload.data[2], sizeof(iv)); @@ -1134,14 +1111,15 @@ unlock: * verify a response */ static int rxkad_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *skb, + void *buffer, unsigned int len) { struct rxkad_response *response; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; struct key *server_key; time64_t expiry; - void *ticket = NULL; + void *ticket; u32 version, kvno, ticket_len, level; __be32 csum; int ret, i; @@ -1164,13 +1142,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, } } - ret = -ENOMEM; - response = kzalloc_obj(struct rxkad_response, GFP_NOFS); - if (!response) - goto error; - - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - response, sizeof(*response)) < 0) { + response = buffer; + if (len < sizeof(*response)) { ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, rxkad_abort_resp_short); goto error; @@ -1182,6 +1155,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len); + buffer += sizeof(*response); + len -= sizeof(*response); + if (version != RXKAD_VERSION) { ret = rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, rxkad_abort_resp_version); @@ -1201,13 +1177,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, } /* extract the kerberos ticket and decrypt and decode it */ - ret = -ENOMEM; - ticket = kmalloc(ticket_len, GFP_NOFS); - if (!ticket) - goto error; - - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), - ticket, ticket_len) < 0) { + ticket = buffer; + if (ticket_len > len) { ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, rxkad_abort_resp_short_tkt); goto error; @@ -1287,8 +1258,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno); error: - kfree(ticket); - kfree(response); key_put(server_key); _leave(" = %d", ret); return ret; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 8c9a0400c862..0f953bd46b58 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -243,6 +243,20 @@ static struct sk_buff *cbs_dequeue(struct Qdisc *sch) return q->dequeue(sch); } +static void cbs_reset(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; + + qdisc_reset(q->qdisc); + qdisc_watchdog_cancel(&q->watchdog); + q->credits = 0; + q->last = 0; +} + static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; @@ -540,7 +554,7 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .dequeue = cbs_dequeue, .peek = qdisc_peek_dequeued, .init = cbs_init, - .reset = qdisc_reset_queue, + .reset = cbs_reset, .destroy = cbs_destroy, .change = cbs_change, .dump = cbs_dump, diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 241e6a46bd00..a22489c14458 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -938,6 +938,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, int err; sch->flags |= TCQ_F_DEQUEUE_DROPS; + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1), extack); @@ -950,8 +952,6 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 58d0d9747f0b..1d2568bb6bc2 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1986,6 +1986,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_unlock; iov_iter_revert(&msg->msg_iter, err); + + /* sctp_sendmsg_to_asoc() may have released the socket + * lock (sctp_wait_for_sndbuf), during which other + * associations on ep->asocs could have been peeled + * off or freed. @asoc itself is revalidated by the + * base.dead and base.sk checks in sctp_wait_for_sndbuf, + * so re-derive the cached cursor from it. + */ + tmp = list_next_entry(asoc, asocs); } goto out_unlock; diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 94bc9c7382ea..dea9270f3e57 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -21,6 +21,8 @@ #define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK +static_assert(NET_SHAPER_ID_UNSPEC == NET_SHAPER_MAX_HANDLE_ID + 1); + struct net_shaper_hierarchy { struct xarray shapers; }; @@ -90,6 +92,12 @@ static int net_shaper_handle_size(void) nla_total_size(sizeof(u32))); } +static int net_shaper_group_reply_size(void) +{ + return nla_total_size(sizeof(u32)) + /* NET_SHAPER_A_IFINDEX */ + net_shaper_handle_size(); /* NET_SHAPER_A_HANDLE */ +} + static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) @@ -130,35 +138,58 @@ handle_nest_cancel: return -EMSGSIZE; } +static void net_shaper_copy(struct net_shaper *dst, + const struct net_shaper *src) +{ + WRITE_ONCE(dst->parent.scope, READ_ONCE(src->parent.scope)); + WRITE_ONCE(dst->parent.id, READ_ONCE(src->parent.id)); + WRITE_ONCE(dst->handle.scope, READ_ONCE(src->handle.scope)); + WRITE_ONCE(dst->handle.id, READ_ONCE(src->handle.id)); + + WRITE_ONCE(dst->metric, READ_ONCE(src->metric)); + WRITE_ONCE(dst->bw_min, READ_ONCE(src->bw_min)); + WRITE_ONCE(dst->bw_max, READ_ONCE(src->bw_max)); + WRITE_ONCE(dst->burst, READ_ONCE(src->burst)); + WRITE_ONCE(dst->priority, READ_ONCE(src->priority)); + WRITE_ONCE(dst->weight, READ_ONCE(src->weight)); + + /* private fields are only used on the write path under the lock */ + data_race(dst->leaves = src->leaves); +} + static int net_shaper_fill_one(struct sk_buff *msg, const struct net_shaper_binding *binding, const struct net_shaper *shaper, const struct genl_info *info) { + struct net_shaper cur; void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) return -EMSGSIZE; + /* Make a copy to avoid data races */ + net_shaper_copy(&cur, shaper); + if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || - net_shaper_fill_handle(msg, &shaper->parent, + net_shaper_fill_handle(msg, &cur.parent, NET_SHAPER_A_PARENT) || - net_shaper_fill_handle(msg, &shaper->handle, + net_shaper_fill_handle(msg, &cur.handle, NET_SHAPER_A_HANDLE) || - ((shaper->bw_min || shaper->bw_max || shaper->burst) && - nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) || - (shaper->bw_min && - nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) || - (shaper->bw_max && - nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) || - (shaper->burst && - nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) || - (shaper->priority && - nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) || - (shaper->weight && - nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight))) + ((cur.bw_min || cur.bw_max || cur.burst) && + nla_put_u32(msg, NET_SHAPER_A_METRIC, cur.metric)) || + (cur.bw_min && + nla_put_uint(msg, NET_SHAPER_A_BW_MIN, cur.bw_min)) || + (cur.bw_max && + nla_put_uint(msg, NET_SHAPER_A_BW_MAX, cur.bw_max)) || + (cur.burst && + nla_put_uint(msg, NET_SHAPER_A_BURST, cur.burst)) || + (cur.priority && + nla_put_u32(msg, NET_SHAPER_A_PRIORITY, cur.priority)) || + (cur.weight && + nla_put_u32(msg, NET_SHAPER_A_WEIGHT, cur.weight))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -275,25 +306,24 @@ static void net_shaper_default_parent(const struct net_shaper_handle *handle, parent->id = 0; } -/* - * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as - * it's cleared by xa_store(). - */ -#define NET_SHAPER_NOT_VALID XA_MARK_1 - static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) { u32 index = net_shaper_handle_to_index(handle); struct net_shaper_hierarchy *hierarchy; + struct net_shaper *cur; hierarchy = net_shaper_hierarchy_rcu(binding); - if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID)) + if (!hierarchy) + return NULL; + + cur = xa_load(&hierarchy->shapers, index); + /* Check valid before reading fields */ + if (!cur || !smp_load_acquire(&cur->valid)) return NULL; - return xa_load(&hierarchy->shapers, index); + return cur; } /* Allocate on demand the per device shaper's hierarchy container. @@ -348,7 +378,7 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, handle->id == NET_SHAPER_ID_UNSPEC) { u32 min, max; - handle->id = NET_SHAPER_ID_MASK - 1; + handle->id = NET_SHAPER_MAX_HANDLE_ID; max = net_shaper_handle_to_index(handle); handle->id = 0; min = net_shaper_handle_to_index(handle); @@ -370,13 +400,10 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, goto free_id; } - /* Mark 'tentative' shaper inside the hierarchy container. - * xa_set_mark is a no-op if the previous store fails. + /* Insert as 'tentative' (no VALID mark). The mark will be set by + * net_shaper_commit() once the driver-side configuration succeeds. */ - xa_lock(&hierarchy->shapers); - prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); - xa_unlock(&hierarchy->shapers); + prev = xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); if (xa_err(prev)) { NL_SET_ERR_MSG(extack, "Can't insert shaper into device store"); kfree_rcu(cur, rcu); @@ -410,12 +437,10 @@ static void net_shaper_commit(struct net_shaper_binding *binding, if (WARN_ON_ONCE(!cur)) continue; - /* Successful update: drop the tentative mark - * and update the hierarchy container. - */ - __xa_clear_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID); - *cur = shapers[i]; + /* Successful update: update the hierarchy container... */ + net_shaper_copy(cur, &shapers[i]); + /* ... publish to lockless readers. */ + smp_store_release(&cur->valid, true); } xa_unlock(&hierarchy->shapers); } @@ -431,10 +456,11 @@ static void net_shaper_rollback(struct net_shaper_binding *binding) return; xa_lock(&hierarchy->shapers); - xa_for_each_marked(&hierarchy->shapers, index, cur, - NET_SHAPER_NOT_VALID) { + xa_for_each(&hierarchy->shapers, index, cur) { + if (cur->valid) + continue; __xa_erase(&hierarchy->shapers, index); - kfree(cur); + kfree_rcu(cur, rcu); } xa_unlock(&hierarchy->shapers); } @@ -465,10 +491,21 @@ static int net_shaper_parse_handle(const struct nlattr *attr, * shaper (any other value). */ id_attr = tb[NET_SHAPER_A_HANDLE_ID]; - if (id_attr) + if (id_attr) { id = nla_get_u32(id_attr); - else if (handle->scope == NET_SHAPER_SCOPE_NODE) + } else if (handle->scope == NET_SHAPER_SCOPE_NODE) { id = NET_SHAPER_ID_UNSPEC; + } else if (handle->scope == NET_SHAPER_SCOPE_QUEUE) { + NL_SET_ERR_ATTR_MISS(info->extack, attr, + NET_SHAPER_A_HANDLE_ID); + return -EINVAL; + } + + if (id && handle->scope == NET_SHAPER_SCOPE_NETDEV) { + NL_SET_ERR_MSG_ATTR(info->extack, id_attr, + "Netdev scope is a singleton, must use ID 0"); + return -EINVAL; + } handle->id = id; return 0; @@ -836,7 +873,12 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, goto out_unlock; for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, - U32_MAX, XA_PRESENT)); ctx->start_index++) { + U32_MAX, XA_PRESENT)); + ctx->start_index++) { + /* Check valid before reading fields */ + if (!smp_load_acquire(&shaper->valid)) + continue; + ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; @@ -932,6 +974,46 @@ static int net_shaper_handle_cmp(const struct net_shaper_handle *a, return memcmp(a, b, sizeof(*a)); } +static int net_shaper_parse_leaves(struct net_shaper_binding *binding, + struct genl_info *info, + const struct net_shaper *node, + struct net_shaper *leaves, + int leaves_count) +{ + struct nlattr *attr; + int i, j, ret, rem; + + i = 0; + nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + if (WARN_ON_ONCE(i >= leaves_count)) + return -EINVAL; + + ret = net_shaper_parse_leaf(binding, attr, info, + node, &leaves[i]); + if (ret) + return ret; + + /* Reject duplicates */ + for (j = 0; j < i; j++) { + if (net_shaper_handle_cmp(&leaves[i].handle, + &leaves[j].handle)) + continue; + + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "Duplicate leaf shaper %d:%d", + leaves[i].handle.scope, + leaves[i].handle.id); + return -EINVAL; + } + + i++; + } + + return 0; +} + static int net_shaper_parent_from_leaves(int leaves_count, const struct net_shaper *leaves, struct net_shaper *node, @@ -964,15 +1046,22 @@ static int __net_shaper_group(struct net_shaper_binding *binding, int i, ret; if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { + struct net_shaper *cur = NULL; + new_node = node->handle.id == NET_SHAPER_ID_UNSPEC; - if (!new_node && !net_shaper_lookup(binding, &node->handle)) { - /* The related attribute is not available when - * reaching here from the delete() op. - */ - NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists", - node->handle.scope, node->handle.id); - return -ENOENT; + if (!new_node) { + cur = net_shaper_lookup(binding, &node->handle); + if (!cur) { + /* The related attribute is not available + * when reaching here from the delete() op. + */ + NL_SET_ERR_MSG_FMT(extack, + "Node shaper %d:%d does not exist", + node->handle.scope, + node->handle.id); + return -ENOENT; + } } /* When unspecified, the node parent scope is inherited from @@ -986,6 +1075,15 @@ static int __net_shaper_group(struct net_shaper_binding *binding, return ret; } + if (cur && net_shaper_handle_cmp(&cur->parent, + &node->parent)) { + NL_SET_ERR_MSG_FMT(extack, + "Cannot reparent node shaper %d:%d", + node->handle.scope, + node->handle.id); + return -EOPNOTSUPP; + } + } else { net_shaper_default_parent(&node->handle, &node->parent); } @@ -1162,7 +1260,7 @@ static int net_shaper_group_send_reply(struct net_shaper_binding *binding, free_msg: /* Should never happen as msg is pre-allocated with enough space. */ WARN_ONCE(true, "calculated message payload length (%d)", - net_shaper_handle_size()); + net_shaper_group_reply_size()); nlmsg_free(msg); return -EMSGSIZE; } @@ -1172,10 +1270,9 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) struct net_shaper **old_nodes, *leaves, node = {}; struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; - int i, ret, rem, leaves_count; + int i, ret, leaves_count; int old_nodes_count = 0; struct sk_buff *msg; - struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES)) return -EINVAL; @@ -1203,26 +1300,19 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_leaves; - i = 0; - nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, - genlmsg_data(info->genlhdr), - genlmsg_len(info->genlhdr), rem) { - if (WARN_ON_ONCE(i >= leaves_count)) - goto free_leaves; - - ret = net_shaper_parse_leaf(binding, attr, info, - &node, &leaves[i]); - if (ret) - goto free_leaves; - i++; - } + ret = net_shaper_parse_leaves(binding, info, &node, + leaves, leaves_count); + if (ret) + goto free_leaves; /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. */ - msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); - if (!msg) + msg = genlmsg_new(net_shaper_group_reply_size(), GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; goto free_leaves; + } hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 9b29be3ef19a..76eff85ec66d 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -11,10 +11,15 @@ #include <uapi/linux/net_shaper.h> +/* Integer value ranges */ +static const struct netlink_range_validation net_shaper_a_handle_id_range = { + .max = NET_SHAPER_MAX_HANDLE_ID, +}; + /* Common nested types */ const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = { [NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), - [NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &net_shaper_a_handle_id_range), }; const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = { diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index 42c46c52c775..2406652a9014 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -12,6 +12,8 @@ #include <uapi/linux/net_shaper.h> +#define NET_SHAPER_MAX_HANDLE_ID 67108862 + /* Common nested types */ extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1]; extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1]; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 185dbed7de5d..dffbd529762d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1400,7 +1400,8 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { - if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { + if (ini->ism_dev[i] && + ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } @@ -3054,18 +3055,17 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, smc = smc_sk(sk); + /* pre-fetch user data outside the lock */ + if (optname == SMC_LIMIT_HS) { + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + } + lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: - if (optlen < sizeof(int)) { - rc = -EINVAL; - break; - } - if (copy_from_sockptr(&val, optval, sizeof(int))) { - rc = -EFAULT; - break; - } - smc->limit_smc_hs = !!val; rc = 0; break; diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index a9a6e3c1113a..53da84f57fd6 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __field(const void *, smc) __field(u64, net_cookie) __field(size_t, len) - __string(name, smc->conn.lnk->ibname) + __string(name, smc->conn.lnk ? smc->conn.lnk->ibname : "") ), TP_fast_assign( diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7081c1214e6c..b5474ce534fb 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -403,7 +403,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) INIT_LIST_HEAD(&cd->readers); spin_lock_init(&cd->queue_lock); init_waitqueue_head(&cd->queue_wait); - cd->next_seqno = 0; + cd->next_seqno = 1; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2590e855f6a5..964ebc268ee4 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -789,23 +789,33 @@ static int tls_push_record(struct sock *sk, int flags, i = msg_pl->sg.end; sk_msg_iter_var_prev(i); + /* msg_pl->sg.data is a ring; data[MAX+1] is reserved for the wrap + * link (frags won't use it). 'i' is now the last filled entry: + * + * i end start + * v v v [ rsv ] + * [ d ][ d ][ ][ ]...[ ][ d ][ d ][ d ][chain] + * ^ END v + * `-----------------------------------------' + * + * Note that SGL does not allow chain-after-chain, so for TLS 1.3, + * we must make sure we don't create the wrap entry and then chain + * link to content_type immediately at index 0. + */ + if (i < msg_pl->sg.start) + sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), + msg_pl->sg.data); + rec->content_type = record_type; if (prot->version == TLS_1_3_VERSION) { /* Add content type to end of message. No padding added */ sg_set_buf(&rec->sg_content_type, &rec->content_type, 1); sg_mark_end(&rec->sg_content_type); - sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1, - &rec->sg_content_type); + sg_chain(msg_pl->sg.data, i + 2, &rec->sg_content_type); } else { sg_mark_end(sk_msg_elem(msg_pl, i)); } - if (msg_pl->sg.end < msg_pl->sg.start) { - sg_chain(&msg_pl->sg.data[msg_pl->sg.start], - MAX_SKB_FRAGS - msg_pl->sg.start + 1, - msg_pl->sg.data); - } - i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); @@ -1356,9 +1366,14 @@ unlock: mutex_unlock(&tls_ctx->tx_lock); } +/* When has_copied is true the caller has already moved bytes to + * userspace. Report sk_err but leave it set so the next read + * surfaces it instead of a spurious EOF, otherwise sk_err is + * consumed via sock_error(). + */ static int tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, - bool released) + bool released, bool has_copied) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -1376,8 +1391,11 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, if (!sk_psock_queue_empty(psock)) return 0; - if (sk->sk_err) + if (sk->sk_err) { + if (has_copied) + return -READ_ONCE(sk->sk_err); return sock_error(sk); + } if (ret < 0) return ret; @@ -1413,7 +1431,7 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, } if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) - return tls_rx_rec_wait(sk, psock, nonblock, false); + return tls_rx_rec_wait(sk, psock, nonblock, false, has_copied); return 1; } @@ -2100,7 +2118,7 @@ int tls_sw_recvmsg(struct sock *sk, int to_decrypt, chunk; err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, - released); + released, !!(decrypted + copied)); if (err <= 0) { if (psock) { chunk = sk_msg_recvmsg(sk, psock, msg, len, @@ -2287,7 +2305,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct tls_decrypt_arg darg; err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK, - true); + true, false); if (err <= 0) goto splice_read_end; @@ -2373,7 +2391,7 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, } else { struct tls_decrypt_arg darg; - err = tls_rx_rec_wait(sk, NULL, true, released); + err = tls_rx_rec_wait(sk, NULL, true, released, !!copied); if (err <= 0) goto read_sock_end; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1cbf36ea043b..dc71ed79be4a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2711,8 +2711,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last, unsigned int last_len, - bool freezable) + struct sk_buff *last, bool freezable) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; @@ -2725,7 +2724,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || - (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || @@ -2921,7 +2919,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, int flags = state->flags; bool check_creds = false; struct scm_cookie scm; - unsigned int last_len; struct unix_sock *u; int copied = 0; int err = 0; @@ -2967,7 +2964,6 @@ redo: goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); - last_len = last ? last->len : 0; again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -3001,8 +2997,7 @@ again: mutex_unlock(&u->iolock); - timeo = unix_stream_data_wait(sk, timeo, last, - last_len, freezable); + timeo = unix_stream_data_wait(sk, timeo, last, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); @@ -3019,7 +3014,6 @@ unlock: while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; - last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -3094,7 +3088,6 @@ unlock: skip = 0; last = skb; - last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 9b8014516f4f..df3b418e0392 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -70,34 +70,6 @@ static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, return true; } -static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, - struct sk_buff *skb, - struct msghdr *msg, - size_t pkt_len, - bool zerocopy) -{ - struct ubuf_info *uarg; - - if (msg->msg_ubuf) { - uarg = msg->msg_ubuf; - net_zcopy_get(uarg); - } else { - struct ubuf_info_msgzc *uarg_zc; - - uarg = msg_zerocopy_realloc(sk_vsock(vsk), - pkt_len, NULL, false); - if (!uarg) - return -1; - - uarg_zc = uarg_to_msgzc(uarg); - uarg_zc->zerocopy = zerocopy ? 1 : 0; - } - - skb_zcopy_init(skb, uarg); - - return 0; -} - static int virtio_transport_fill_skb(struct sk_buff *skb, struct virtio_vsock_pkt_info *info, size_t len, @@ -136,27 +108,6 @@ static void virtio_transport_init_hdr(struct sk_buff *skb, hdr->fwd_cnt = cpu_to_le32(0); } -static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, - void *dst, - size_t len) -{ - struct iov_iter iov_iter = { 0 }; - struct kvec kvec; - size_t to_copy; - - kvec.iov_base = dst; - kvec.iov_len = len; - - iov_iter.iter_type = ITER_KVEC; - iov_iter.kvec = &kvec; - iov_iter.nr_segs = 1; - - to_copy = min_t(size_t, len, skb->len); - - skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, - &iov_iter, to_copy); -} - /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { @@ -166,12 +117,12 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) struct sk_buff *skb; size_t payload_len; - /* A packet could be split to fit the RX buffer, so we can retrieve - * the payload length from the header and the buffer pointer taking - * care of the offset in the original packet. + /* A packet could be split to fit the RX buffer, so we use + * the payload length from the header, which has been updated + * by the sender to reflect the fragment size. */ pkt_hdr = virtio_vsock_hdr(pkt); - payload_len = pkt->len; + payload_len = le32_to_cpu(pkt_hdr->len); skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); @@ -214,12 +165,18 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { - if (skb_is_nonlinear(pkt)) { - void *data = skb_put(skb, payload_len); - - virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); - } else { - skb_put_data(skb, pkt->data, payload_len); + struct iov_iter iov_iter; + struct kvec kvec; + void *data = skb_put(skb, payload_len); + + kvec.iov_base = data; + kvec.iov_len = payload_len; + iov_iter_kvec(&iov_iter, ITER_DEST, &kvec, 1, payload_len); + + if (skb_copy_datagram_iter(pkt, VIRTIO_VSOCK_SKB_CB(pkt)->offset, + &iov_iter, payload_len)) { + kfree_skb(skb); + return NULL; } } @@ -332,8 +289,10 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, u32 src_cid, src_port, dst_cid, dst_port; const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; + struct ubuf_info *uarg = NULL; u32 pkt_len = info->pkt_len; bool can_zcopy = false; + bool have_uref = false; u32 rest_len; int ret; @@ -375,6 +334,25 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (can_zcopy) max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, (MAX_SKB_FRAGS * PAGE_SIZE)); + + if (info->msg->msg_flags & MSG_ZEROCOPY && + info->op == VIRTIO_VSOCK_OP_RW) { + uarg = info->msg->msg_ubuf; + + if (!uarg) { + uarg = msg_zerocopy_realloc(sk_vsock(vsk), + pkt_len, NULL, false); + if (!uarg) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + + if (!can_zcopy) + uarg_to_msgzc(uarg)->zerocopy = 0; + + have_uref = true; + } + } } rest_len = pkt_len; @@ -393,27 +371,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, break; } - /* We process buffer part by part, allocating skb on - * each iteration. If this is last skb for this buffer - * and MSG_ZEROCOPY mode is in use - we must allocate - * completion for the current syscall. - * - * Pass pkt_len because msg iter is already consumed - * by virtio_transport_fill_skb(), so iter->count - * can not be used for RLIMIT_MEMLOCK pinned-pages - * accounting done by msg_zerocopy_realloc(). - */ - if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY && - skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) { - if (virtio_transport_init_zcopy_skb(vsk, skb, - info->msg, - pkt_len, - can_zcopy)) { - kfree_skb(skb); - ret = -ENOMEM; - break; - } - } + skb_zcopy_set(skb, uarg, NULL); virtio_transport_inc_tx_pkt(vvs, skb); @@ -437,6 +395,18 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_put_credit(vvs, rest_len); + /* msg_zerocopy_realloc() initializes the ubuf_info refcnt to 1. + * skb_zcopy_set() increases it for each skb, so we can drop that + * initial reference to keep it balanced. + */ + if (have_uref) { + if (rest_len == pkt_len) + /* No data sent, abort the notification. */ + net_zcopy_put_abort(uarg, true); + else + net_zcopy_put(uarg); + } + /* Return number of bytes, if any data has been sent. */ if (rest_len != pkt_len) ret = pkt_len - rest_len; @@ -449,7 +419,14 @@ static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, { u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0); - if (skb_overhead + vvs->buf_used + len > vvs->buf_alloc) + /* Allow at most buf_alloc * 2 total budget (payload + overhead), + * similar to how SO_RCVBUF is doubled to reserve space for sk_buff + * metadata. Check payload against buf_alloc to be sure the other + * peer is respecting the credit, and sk_buff overhead to bound + * queue growth. + */ + if ((u64)vvs->buf_used + len > vvs->buf_alloc || + skb_overhead > vvs->buf_alloc) return false; vvs->rx_bytes += len; @@ -1365,7 +1342,7 @@ destroy: return err; } -static void +static bool virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct sk_buff *skb) { @@ -1380,10 +1357,8 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, spin_lock_bh(&vvs->rx_lock); can_enqueue = virtio_transport_inc_rx_pkt(vvs, len); - if (!can_enqueue) { - free_pkt = true; + if (!can_enqueue) goto out; - } if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; @@ -1423,6 +1398,8 @@ out: spin_unlock_bh(&vvs->rx_lock); if (free_pkt) kfree_skb(skb); + + return can_enqueue; } static int @@ -1435,7 +1412,17 @@ virtio_transport_recv_connected(struct sock *sk, switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RW: - virtio_transport_recv_enqueue(vsk, skb); + if (!virtio_transport_recv_enqueue(vsk, skb)) { + /* There is no more space to queue the packet, so let's + * close the connection; otherwise, we'll lose data. + */ + (void)virtio_transport_reset(vsk, skb); + virtio_transport_do_close(vsk, true); + sk->sk_err = ENOBUFS; + sk_error_report(sk); + vsock_remove_sock(vsk); + break; + } vsock_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_REQUEST: diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 4296ca1183f1..d2579380f51e 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1164,7 +1164,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, /* Close and cleanup the connection. */ vmci_transport_send_reset(pending, pkt); skerr = EPROTO; - err = pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST ? 0 : -EINVAL; + err = -EINVAL; goto destroy; } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 328af43ef832..358cbc9e43d8 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2462,6 +2462,9 @@ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, memcpy(merged_ie + copied_len, next_sub->data, next_sub->datalen); copied_len += next_sub->datalen; + + mbssid_elem = next_mbssid; + sub_elem = next_sub; } return copied_len; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 22d9d9bae8f5..63d145b524c9 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -789,6 +789,8 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) return -EINVAL; + if (!cfg80211_chandef_valid(&chandef)) + return -EINVAL; return cfg80211_set_monitor_channel(rdev, dev, &chandef); case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index afa457506274..3bff346308d0 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -184,6 +184,10 @@ static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } xs = (struct xdp_sock *)sock->sk; + if (!READ_ONCE(xs->rx)) { + sockfd_put(sock); + return -ENOBUFS; + } map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); |
