Diffstat (limited to 'net')
718 files changed, 51989 insertions, 27458 deletions
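A reading aid, not part of the patch: the largest piece of this diff, the new net/802/mrp.c, builds and parses MRP VectorAttribute fields by packing three attribute events into each Vector byte using base-6 positional encoding (there are six vector events, so __MRP_VECATTR_EVENT_MAX is 6 in the new net/mrp.h). The following standalone userspace sketch mirrors that arithmetic; it is an illustration only, and the helper names are invented:

	#include <assert.h>
	#include <stdint.h>

	#define VECATTR_EVENT_MAX 6	/* NEW, JOIN_IN, IN, JOIN_MT, MT, LV */

	/* Pack three events (each 0..5) into one Vector byte, mirroring
	 * the switch on "pos" in mrp_pdu_append_vecattr_event(). */
	static uint8_t pack_events(uint8_t e0, uint8_t e1, uint8_t e2)
	{
		return e0 * (VECATTR_EVENT_MAX * VECATTR_EVENT_MAX) +
		       e1 * VECATTR_EVENT_MAX + e2;
	}

	/* Unpack one Vector byte, mirroring mrp_pdu_parse_vecattr(). */
	static void unpack_events(uint8_t b, uint8_t e[3])
	{
		e[0] = b / (VECATTR_EVENT_MAX * VECATTR_EVENT_MAX);
		b   %= VECATTR_EVENT_MAX * VECATTR_EVENT_MAX;
		e[1] = b / VECATTR_EVENT_MAX;
		e[2] = b % VECATTR_EVENT_MAX;
	}

	int main(void)
	{
		uint8_t e[3];

		unpack_events(pack_events(1, 4, 5), e);	/* JoinIn, Mt, Lv */
		assert(e[0] == 1 && e[1] == 4 && e[2] == 5);
		return 0;
	}

Since the maximum packed value is 5*36 + 5*6 + 5 = 215, three events always fit in one byte with room to detect malformed values (>= 216), which is exactly the check the parser performs on the first event.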
diff --git a/net/802/Kconfig b/net/802/Kconfig index be33d27c8e69..80d4bf78905d 100644 --- a/net/802/Kconfig +++ b/net/802/Kconfig @@ -5,3 +5,6 @@ config STP config GARP tristate select STP + +config MRP + tristate diff --git a/net/802/Makefile b/net/802/Makefile index a30d6e385aed..37e654d6615e 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o obj-$(CONFIG_ATALK) += p8022.o psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o +obj-$(CONFIG_MRP) += mrp.o diff --git a/net/802/garp.c b/net/802/garp.c index 8456f5d98b85..5d9630a0eb93 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -609,8 +609,12 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl /* Delete timer and generate a final TRANSMIT_PDU event to flush out * all pending messages before the applicant is gone. */ del_timer_sync(&app->join_timer); + + spin_lock_bh(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); garp_pdu_queue(app); + spin_unlock_bh(&app->lock); + garp_queue_xmit(app); dev_mc_del(dev, appl->proto.group_address); diff --git a/net/802/mrp.c b/net/802/mrp.c new file mode 100644 index 000000000000..e085bcc754f6 --- /dev/null +++ b/net/802/mrp.c @@ -0,0 +1,899 @@ +/* + * IEEE 802.1Q Multiple Registration Protocol (MRP) + * + * Copyright (c) 2012 Massachusetts Institute of Technology + * + * Adapted from code in net/802/garp.c + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <net/mrp.h> +#include <asm/unaligned.h> + +static unsigned int mrp_join_time __read_mostly = 200; +module_param(mrp_join_time, uint, 0644); +MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)"); +MODULE_LICENSE("GPL"); + +static const u8 +mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = { + [MRP_APPLICANT_VO] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, + [MRP_EVENT_LV] = MRP_APPLICANT_VO, + [MRP_EVENT_TX] = MRP_APPLICANT_VO, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO, + [MRP_EVENT_R_IN] = MRP_APPLICANT_VO, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO, + [MRP_EVENT_R_MT] = MRP_APPLICANT_VO, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO, + }, + [MRP_APPLICANT_VP] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, + [MRP_EVENT_LV] = MRP_APPLICANT_VO, + [MRP_EVENT_TX] = MRP_APPLICANT_AA, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP, + [MRP_EVENT_R_IN] = MRP_APPLICANT_VP, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP, + [MRP_EVENT_R_MT] = MRP_APPLICANT_VP, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP, + }, + [MRP_APPLICANT_VN] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_VN, + [MRP_EVENT_LV] = MRP_APPLICANT_LA, + [MRP_EVENT_TX] = MRP_APPLICANT_AN, + [MRP_EVENT_R_NEW] = 
MRP_APPLICANT_VN, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN, + [MRP_EVENT_R_IN] = MRP_APPLICANT_VN, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN, + [MRP_EVENT_R_MT] = MRP_APPLICANT_VN, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN, + }, + [MRP_APPLICANT_AN] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_AN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_AN, + [MRP_EVENT_LV] = MRP_APPLICANT_LA, + [MRP_EVENT_TX] = MRP_APPLICANT_QA, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN, + [MRP_EVENT_R_IN] = MRP_APPLICANT_AN, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN, + [MRP_EVENT_R_MT] = MRP_APPLICANT_AN, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN, + }, + [MRP_APPLICANT_AA] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, + [MRP_EVENT_LV] = MRP_APPLICANT_LA, + [MRP_EVENT_TX] = MRP_APPLICANT_QA, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, + [MRP_EVENT_R_IN] = MRP_APPLICANT_AA, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, + [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, + }, + [MRP_APPLICANT_QA] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_QA, + [MRP_EVENT_LV] = MRP_APPLICANT_LA, + [MRP_EVENT_TX] = MRP_APPLICANT_QA, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, + [MRP_EVENT_R_IN] = MRP_APPLICANT_QA, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, + [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, + }, + [MRP_APPLICANT_LA] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, + [MRP_EVENT_LV] = MRP_APPLICANT_LA, + [MRP_EVENT_TX] = MRP_APPLICANT_VO, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA, + [MRP_EVENT_R_IN] = MRP_APPLICANT_LA, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA, + [MRP_EVENT_R_MT] = MRP_APPLICANT_LA, + [MRP_EVENT_R_LV] = MRP_APPLICANT_LA, + [MRP_EVENT_R_LA] = MRP_APPLICANT_LA, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA, + }, + [MRP_APPLICANT_AO] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, + [MRP_EVENT_LV] = MRP_APPLICANT_AO, + [MRP_EVENT_TX] = MRP_APPLICANT_AO, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, + [MRP_EVENT_R_IN] = MRP_APPLICANT_AO, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, + [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO, + }, + [MRP_APPLICANT_QO] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, + [MRP_EVENT_LV] = MRP_APPLICANT_QO, + [MRP_EVENT_TX] = MRP_APPLICANT_QO, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, + [MRP_EVENT_R_IN] = MRP_APPLICANT_QO, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, + [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, + [MRP_EVENT_R_LA] 
= MRP_APPLICANT_VO, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO, + }, + [MRP_APPLICANT_AP] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, + [MRP_EVENT_LV] = MRP_APPLICANT_AO, + [MRP_EVENT_TX] = MRP_APPLICANT_QA, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, + [MRP_EVENT_R_IN] = MRP_APPLICANT_AP, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, + [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, + }, + [MRP_APPLICANT_QP] = { + [MRP_EVENT_NEW] = MRP_APPLICANT_VN, + [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, + [MRP_EVENT_LV] = MRP_APPLICANT_QO, + [MRP_EVENT_TX] = MRP_APPLICANT_QP, + [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP, + [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, + [MRP_EVENT_R_IN] = MRP_APPLICANT_QP, + [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, + [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, + [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, + [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, + [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, + [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, + }, +}; + +static const u8 +mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = { + [MRP_APPLICANT_VO] = MRP_TX_ACTION_S_IN_OPTIONAL, + [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN, + [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW, + [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW, + [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN, + [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL, + [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV, + [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL, + [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL, + [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN, + [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL, +}; + +static void mrp_attrvalue_inc(void *value, u8 len) +{ + u8 *v = (u8 *)value; + + /* Add 1 to the last byte. If it becomes zero, + * go to the previous byte and repeat. + */ + while (len > 0 && !++v[--len]) + ; +} + +static int mrp_attr_cmp(const struct mrp_attr *attr, + const void *value, u8 len, u8 type) +{ + if (attr->type != type) + return attr->type - type; + if (attr->len != len) + return attr->len - len; + return memcmp(attr->value, value, len); +} + +static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app, + const void *value, u8 len, u8 type) +{ + struct rb_node *parent = app->mad.rb_node; + struct mrp_attr *attr; + int d; + + while (parent) { + attr = rb_entry(parent, struct mrp_attr, node); + d = mrp_attr_cmp(attr, value, len, type); + if (d > 0) + parent = parent->rb_left; + else if (d < 0) + parent = parent->rb_right; + else + return attr; + } + return NULL; +} + +static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app, + const void *value, u8 len, u8 type) +{ + struct rb_node *parent = NULL, **p = &app->mad.rb_node; + struct mrp_attr *attr; + int d; + + while (*p) { + parent = *p; + attr = rb_entry(parent, struct mrp_attr, node); + d = mrp_attr_cmp(attr, value, len, type); + if (d > 0) + p = &parent->rb_left; + else if (d < 0) + p = &parent->rb_right; + else { + /* The attribute already exists; re-use it. 
*/ + return attr; + } + } + attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); + if (!attr) + return attr; + attr->state = MRP_APPLICANT_VO; + attr->type = type; + attr->len = len; + memcpy(attr->value, value, len); + + rb_link_node(&attr->node, parent, p); + rb_insert_color(&attr->node, &app->mad); + return attr; +} + +static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) +{ + rb_erase(&attr->node, &app->mad); + kfree(attr); +} + +static int mrp_pdu_init(struct mrp_applicant *app) +{ + struct sk_buff *skb; + struct mrp_pdu_hdr *ph; + + skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), + GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + skb->dev = app->dev; + skb->protocol = app->app->pkttype.type; + skb_reserve(skb, LL_RESERVED_SPACE(app->dev)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + ph = (struct mrp_pdu_hdr *)__skb_put(skb, sizeof(*ph)); + ph->version = app->app->version; + + app->pdu = skb; + return 0; +} + +static int mrp_pdu_append_end_mark(struct mrp_applicant *app) +{ + __be16 *endmark; + + if (skb_tailroom(app->pdu) < sizeof(*endmark)) + return -1; + endmark = (__be16 *)__skb_put(app->pdu, sizeof(*endmark)); + put_unaligned(MRP_END_MARK, endmark); + return 0; +} + +static void mrp_pdu_queue(struct mrp_applicant *app) +{ + if (!app->pdu) + return; + + if (mrp_cb(app->pdu)->mh) + mrp_pdu_append_end_mark(app); + mrp_pdu_append_end_mark(app); + + dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type), + app->app->group_address, app->dev->dev_addr, + app->pdu->len); + + skb_queue_tail(&app->queue, app->pdu); + app->pdu = NULL; +} + +static void mrp_queue_xmit(struct mrp_applicant *app) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&app->queue))) + dev_queue_xmit(skb); +} + +static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app, + u8 attrtype, u8 attrlen) +{ + struct mrp_msg_hdr *mh; + + if (mrp_cb(app->pdu)->mh) { + if (mrp_pdu_append_end_mark(app) < 0) + return -1; + mrp_cb(app->pdu)->mh = NULL; + mrp_cb(app->pdu)->vah = NULL; + } + + if (skb_tailroom(app->pdu) < sizeof(*mh)) + return -1; + mh = (struct mrp_msg_hdr *)__skb_put(app->pdu, sizeof(*mh)); + mh->attrtype = attrtype; + mh->attrlen = attrlen; + mrp_cb(app->pdu)->mh = mh; + return 0; +} + +static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app, + const void *firstattrvalue, u8 attrlen) +{ + struct mrp_vecattr_hdr *vah; + + if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen) + return -1; + vah = (struct mrp_vecattr_hdr *)__skb_put(app->pdu, + sizeof(*vah) + attrlen); + put_unaligned(0, &vah->lenflags); + memcpy(vah->firstattrvalue, firstattrvalue, attrlen); + mrp_cb(app->pdu)->vah = vah; + memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen); + return 0; +} + +static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app, + const struct mrp_attr *attr, + enum mrp_vecattr_event vaevent) +{ + u16 len, pos; + u8 *vaevents; + int err; +again: + if (!app->pdu) { + err = mrp_pdu_init(app); + if (err < 0) + return err; + } + + /* If there is no Message header in the PDU, or the Message header is + * for a different attribute type, add an EndMark (if necessary) and a + * new Message header to the PDU. 
+ */ + if (!mrp_cb(app->pdu)->mh || + mrp_cb(app->pdu)->mh->attrtype != attr->type || + mrp_cb(app->pdu)->mh->attrlen != attr->len) { + if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0) + goto queue; + } + + /* If there is no VectorAttribute header for this Message in the PDU, + * or this attribute's value does not sequentially follow the previous + * attribute's value, add a new VectorAttribute header to the PDU. + */ + if (!mrp_cb(app->pdu)->vah || + memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) { + if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0) + goto queue; + } + + len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags)); + pos = len % 3; + + /* Events are packed into Vectors in the PDU, three to a byte. Add a + * byte to the end of the Vector if necessary. + */ + if (!pos) { + if (skb_tailroom(app->pdu) < sizeof(u8)) + goto queue; + vaevents = (u8 *)__skb_put(app->pdu, sizeof(u8)); + } else { + vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8)); + } + + switch (pos) { + case 0: + *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX * + __MRP_VECATTR_EVENT_MAX); + break; + case 1: + *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX; + break; + case 2: + *vaevents += vaevent; + break; + default: + WARN_ON(1); + } + + /* Increment the length of the VectorAttribute in the PDU, as well as + * the value of the next attribute that would continue its Vector. + */ + put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags); + mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len); + + return 0; + +queue: + mrp_pdu_queue(app); + goto again; +} + +static void mrp_attr_event(struct mrp_applicant *app, + struct mrp_attr *attr, enum mrp_event event) +{ + enum mrp_applicant_state state; + + state = mrp_applicant_state_table[attr->state][event]; + if (state == MRP_APPLICANT_INVALID) { + WARN_ON(1); + return; + } + + if (event == MRP_EVENT_TX) { + /* When appending the attribute fails, don't update its state + * in order to retry at the next TX event. + */ + + switch (mrp_tx_action_table[attr->state]) { + case MRP_TX_ACTION_NONE: + case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL: + case MRP_TX_ACTION_S_IN_OPTIONAL: + break; + case MRP_TX_ACTION_S_NEW: + if (mrp_pdu_append_vecattr_event( + app, attr, MRP_VECATTR_EVENT_NEW) < 0) + return; + break; + case MRP_TX_ACTION_S_JOIN_IN: + if (mrp_pdu_append_vecattr_event( + app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0) + return; + break; + case MRP_TX_ACTION_S_LV: + if (mrp_pdu_append_vecattr_event( + app, attr, MRP_VECATTR_EVENT_LV) < 0) + return; + /* As a pure applicant, sending a leave message + * implies that the attribute was unregistered and + * can be destroyed. 
+ */ + mrp_attr_destroy(app, attr); + return; + default: + WARN_ON(1); + } + } + + attr->state = state; +} + +int mrp_request_join(const struct net_device *dev, + const struct mrp_application *appl, + const void *value, u8 len, u8 type) +{ + struct mrp_port *port = rtnl_dereference(dev->mrp_port); + struct mrp_applicant *app = rtnl_dereference( + port->applicants[appl->type]); + struct mrp_attr *attr; + + if (sizeof(struct mrp_skb_cb) + len > + FIELD_SIZEOF(struct sk_buff, cb)) + return -ENOMEM; + + spin_lock_bh(&app->lock); + attr = mrp_attr_create(app, value, len, type); + if (!attr) { + spin_unlock_bh(&app->lock); + return -ENOMEM; + } + mrp_attr_event(app, attr, MRP_EVENT_JOIN); + spin_unlock_bh(&app->lock); + return 0; +} +EXPORT_SYMBOL_GPL(mrp_request_join); + +void mrp_request_leave(const struct net_device *dev, + const struct mrp_application *appl, + const void *value, u8 len, u8 type) +{ + struct mrp_port *port = rtnl_dereference(dev->mrp_port); + struct mrp_applicant *app = rtnl_dereference( + port->applicants[appl->type]); + struct mrp_attr *attr; + + if (sizeof(struct mrp_skb_cb) + len > + FIELD_SIZEOF(struct sk_buff, cb)) + return; + + spin_lock_bh(&app->lock); + attr = mrp_attr_lookup(app, value, len, type); + if (!attr) { + spin_unlock_bh(&app->lock); + return; + } + mrp_attr_event(app, attr, MRP_EVENT_LV); + spin_unlock_bh(&app->lock); +} +EXPORT_SYMBOL_GPL(mrp_request_leave); + +static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event) +{ + struct rb_node *node, *next; + struct mrp_attr *attr; + + for (node = rb_first(&app->mad); + next = node ? rb_next(node) : NULL, node != NULL; + node = next) { + attr = rb_entry(node, struct mrp_attr, node); + mrp_attr_event(app, attr, event); + } +} + +static void mrp_join_timer_arm(struct mrp_applicant *app) +{ + unsigned long delay; + + delay = (u64)msecs_to_jiffies(mrp_join_time) * net_random() >> 32; + mod_timer(&app->join_timer, jiffies + delay); +} + +static void mrp_join_timer(unsigned long data) +{ + struct mrp_applicant *app = (struct mrp_applicant *)data; + + spin_lock(&app->lock); + mrp_mad_event(app, MRP_EVENT_TX); + mrp_pdu_queue(app); + spin_unlock(&app->lock); + + mrp_queue_xmit(app); + mrp_join_timer_arm(app); +} + +static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) +{ + __be16 endmark; + + if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0) + return -1; + if (endmark == MRP_END_MARK) { + *offset += sizeof(endmark); + return -1; + } + return 0; +} + +static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app, + struct sk_buff *skb, + enum mrp_vecattr_event vaevent) +{ + struct mrp_attr *attr; + enum mrp_event event; + + attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue, + mrp_cb(skb)->mh->attrlen, + mrp_cb(skb)->mh->attrtype); + if (attr == NULL) + return; + + switch (vaevent) { + case MRP_VECATTR_EVENT_NEW: + event = MRP_EVENT_R_NEW; + break; + case MRP_VECATTR_EVENT_JOIN_IN: + event = MRP_EVENT_R_JOIN_IN; + break; + case MRP_VECATTR_EVENT_IN: + event = MRP_EVENT_R_IN; + break; + case MRP_VECATTR_EVENT_JOIN_MT: + event = MRP_EVENT_R_JOIN_MT; + break; + case MRP_VECATTR_EVENT_MT: + event = MRP_EVENT_R_MT; + break; + case MRP_VECATTR_EVENT_LV: + event = MRP_EVENT_R_LV; + break; + default: + return; + } + + mrp_attr_event(app, attr, event); +} + +static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, + struct sk_buff *skb, int *offset) +{ + struct mrp_vecattr_hdr _vah; + u16 valen; + u8 vaevents, vaevent; + + mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, 
sizeof(_vah), + &_vah); + if (!mrp_cb(skb)->vah) + return -1; + *offset += sizeof(_vah); + + if (get_unaligned(&mrp_cb(skb)->vah->lenflags) & + MRP_VECATTR_HDR_FLAG_LA) + mrp_mad_event(app, MRP_EVENT_R_LA); + valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) & + MRP_VECATTR_HDR_LEN_MASK); + + /* The VectorAttribute structure in a PDU carries event information + * about one or more attributes having consecutive values. Only the + * value for the first attribute is contained in the structure. So + * we make a copy of that value, and then increment it each time we + * advance to the next event in its Vector. + */ + if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen > + FIELD_SIZEOF(struct sk_buff, cb)) + return -1; + if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue, + mrp_cb(skb)->mh->attrlen) < 0) + return -1; + *offset += mrp_cb(skb)->mh->attrlen; + + /* In a VectorAttribute, the Vector contains events which are packed + * three to a byte. We process one byte of the Vector at a time. + */ + while (valen > 0) { + if (skb_copy_bits(skb, *offset, &vaevents, + sizeof(vaevents)) < 0) + return -1; + *offset += sizeof(vaevents); + + /* Extract and process the first event. */ + vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX * + __MRP_VECATTR_EVENT_MAX); + if (vaevent >= __MRP_VECATTR_EVENT_MAX) { + /* The byte is malformed; stop processing. */ + return -1; + } + mrp_pdu_parse_vecattr_event(app, skb, vaevent); + + /* If present, extract and process the second event. */ + if (!--valen) + break; + mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, + mrp_cb(skb)->mh->attrlen); + vaevents %= (__MRP_VECATTR_EVENT_MAX * + __MRP_VECATTR_EVENT_MAX); + vaevent = vaevents / __MRP_VECATTR_EVENT_MAX; + mrp_pdu_parse_vecattr_event(app, skb, vaevent); + + /* If present, extract and process the third event. */ + if (!--valen) + break; + mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, + mrp_cb(skb)->mh->attrlen); + vaevents %= __MRP_VECATTR_EVENT_MAX; + vaevent = vaevents; + mrp_pdu_parse_vecattr_event(app, skb, vaevent); + } + return 0; +} + +static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb, + int *offset) +{ + struct mrp_msg_hdr _mh; + + mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh); + if (!mrp_cb(skb)->mh) + return -1; + *offset += sizeof(_mh); + + if (mrp_cb(skb)->mh->attrtype == 0 || + mrp_cb(skb)->mh->attrtype > app->app->maxattr || + mrp_cb(skb)->mh->attrlen == 0) + return -1; + + while (skb->len > *offset) { + if (mrp_pdu_parse_end_mark(skb, offset) < 0) + break; + if (mrp_pdu_parse_vecattr(app, skb, offset) < 0) + return -1; + } + return 0; +} + +static int mrp_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct mrp_application *appl = container_of(pt, struct mrp_application, + pkttype); + struct mrp_port *port; + struct mrp_applicant *app; + struct mrp_pdu_hdr _ph; + const struct mrp_pdu_hdr *ph; + int offset = skb_network_offset(skb); + + /* If the interface is in promiscuous mode, drop the packet if + * it was unicast to another host. 
+ */ + if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) + goto out; + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto out; + port = rcu_dereference(dev->mrp_port); + if (unlikely(!port)) + goto out; + app = rcu_dereference(port->applicants[appl->type]); + if (unlikely(!app)) + goto out; + + ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph); + if (!ph) + goto out; + offset += sizeof(_ph); + + if (ph->version != app->app->version) + goto out; + + spin_lock(&app->lock); + while (skb->len > offset) { + if (mrp_pdu_parse_end_mark(skb, &offset) < 0) + break; + if (mrp_pdu_parse_msg(app, skb, &offset) < 0) + break; + } + spin_unlock(&app->lock); +out: + kfree_skb(skb); + return 0; +} + +static int mrp_init_port(struct net_device *dev) +{ + struct mrp_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + rcu_assign_pointer(dev->mrp_port, port); + return 0; +} + +static void mrp_release_port(struct net_device *dev) +{ + struct mrp_port *port = rtnl_dereference(dev->mrp_port); + unsigned int i; + + for (i = 0; i <= MRP_APPLICATION_MAX; i++) { + if (rtnl_dereference(port->applicants[i])) + return; + } + RCU_INIT_POINTER(dev->mrp_port, NULL); + kfree_rcu(port, rcu); +} + +int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) +{ + struct mrp_applicant *app; + int err; + + ASSERT_RTNL(); + + if (!rtnl_dereference(dev->mrp_port)) { + err = mrp_init_port(dev); + if (err < 0) + goto err1; + } + + err = -ENOMEM; + app = kzalloc(sizeof(*app), GFP_KERNEL); + if (!app) + goto err2; + + err = dev_mc_add(dev, appl->group_address); + if (err < 0) + goto err3; + + app->dev = dev; + app->app = appl; + app->mad = RB_ROOT; + spin_lock_init(&app->lock); + skb_queue_head_init(&app->queue); + rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); + setup_timer(&app->join_timer, mrp_join_timer, (unsigned long)app); + mrp_join_timer_arm(app); + return 0; + +err3: + kfree(app); +err2: + mrp_release_port(dev); +err1: + return err; +} +EXPORT_SYMBOL_GPL(mrp_init_applicant); + +void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) +{ + struct mrp_port *port = rtnl_dereference(dev->mrp_port); + struct mrp_applicant *app = rtnl_dereference( + port->applicants[appl->type]); + + ASSERT_RTNL(); + + RCU_INIT_POINTER(port->applicants[appl->type], NULL); + + /* Delete timer and generate a final TX event to flush out + * all pending messages before the applicant is gone. + */ + del_timer_sync(&app->join_timer); + + spin_lock(&app->lock); + mrp_mad_event(app, MRP_EVENT_TX); + mrp_pdu_queue(app); + spin_unlock(&app->lock); + + mrp_queue_xmit(app); + + dev_mc_del(dev, appl->group_address); + kfree_rcu(app, rcu); + mrp_release_port(dev); +} +EXPORT_SYMBOL_GPL(mrp_uninit_applicant); + +int mrp_register_application(struct mrp_application *appl) +{ + appl->pkttype.func = mrp_rcv; + dev_add_pack(&appl->pkttype); + return 0; +} +EXPORT_SYMBOL_GPL(mrp_register_application); + +void mrp_unregister_application(struct mrp_application *appl) +{ + dev_remove_pack(&appl->pkttype); +} +EXPORT_SYMBOL_GPL(mrp_unregister_application); diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig index fa073a54963e..b85a91fa61f1 100644 --- a/net/8021q/Kconfig +++ b/net/8021q/Kconfig @@ -3,7 +3,7 @@ # config VLAN_8021Q - tristate "802.1Q VLAN Support" + tristate "802.1Q/802.1ad VLAN Support" ---help--- Select this and you will be able to create 802.1Q VLAN interfaces on your ethernet interfaces. 
802.1Q VLAN supports almost @@ -27,3 +27,14 @@ config VLAN_8021Q_GVRP automatic propagation of registered VLANs to switches. If unsure, say N. + +config VLAN_8021Q_MVRP + bool "MVRP (Multiple VLAN Registration Protocol) support" + depends on VLAN_8021Q + select MRP + help + Select this to enable MVRP end-system support. MVRP is used for + automatic propagation of registered VLANs to switches; it + supersedes GVRP and is not backwards-compatible. + + If unsure, say N. diff --git a/net/8021q/Makefile b/net/8021q/Makefile index 9f4f174ead1c..7bc8db08d7ef 100644 --- a/net/8021q/Makefile +++ b/net/8021q/Makefile @@ -6,5 +6,6 @@ obj-$(CONFIG_VLAN_8021Q) += 8021q.o 8021q-y := vlan.o vlan_dev.o vlan_netlink.o 8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o +8021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o 8021q-$(CONFIG_PROC_FS) += vlanproc.o diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a292e8050ef2..9424f3718ea7 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -51,14 +51,18 @@ const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ -static int vlan_group_prealloc_vid(struct vlan_group *vg, u16 vlan_id) +static int vlan_group_prealloc_vid(struct vlan_group *vg, + __be16 vlan_proto, u16 vlan_id) { struct net_device **array; + unsigned int pidx, vidx; unsigned int size; ASSERT_RTNL(); - array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; + pidx = vlan_proto_idx(vlan_proto); + vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; + array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; @@ -67,7 +71,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, u16 vlan_id) if (array == NULL) return -ENOBUFS; - vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN] = array; + vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } @@ -86,49 +90,49 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) grp = &vlan_info->grp; - /* Take it out of our own structures, but be sure to interlock with - * HW accelerating devices or SW vlan input packet processing if - * VLAN is not 0 (leave it there for 802.1p). - */ - if (vlan_id) - vlan_vid_del(real_dev, vlan_id); - grp->nr_vlan_devs--; + if (vlan->flags & VLAN_FLAG_MVRP) + vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); - vlan_group_set_device(grp, vlan_id, NULL); + vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); - if (grp->nr_vlan_devs == 0) + netdev_upper_dev_unlink(real_dev, dev); + + if (grp->nr_vlan_devs == 0) { + vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); + } + + /* Take it out of our own structures, but be sure to interlock with + * HW accelerating devices or SW vlan input packet processing if + * VLAN is not 0 (leave it there for 802.1p). 
+ */ + if (vlan_id) + vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); /* Get rid of the vlan's reference to real_dev */ dev_put(real_dev); } -int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id) +int vlan_check_real_dev(struct net_device *real_dev, + __be16 protocol, u16 vlan_id) { const char *name = real_dev->name; - const struct net_device_ops *ops = real_dev->netdev_ops; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); return -EOPNOTSUPP; } - if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) && - (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) { - pr_info("Device %s has buggy VLAN hw accel\n", name); - return -EOPNOTSUPP; - } - - if (vlan_find_dev(real_dev, vlan_id) != NULL) + if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) return -EEXIST; return 0; @@ -143,7 +147,7 @@ int register_vlan_dev(struct net_device *dev) struct vlan_group *grp; int err; - err = vlan_vid_add(real_dev, vlan_id); + err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; @@ -156,15 +160,22 @@ int register_vlan_dev(struct net_device *dev) err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; + err = vlan_mvrp_init_applicant(real_dev); + if (err < 0) + goto out_uninit_gvrp; } - err = vlan_group_prealloc_vid(grp, vlan_id); + err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) - goto out_uninit_applicant; + goto out_uninit_mvrp; + + err = netdev_upper_dev_link(real_dev, dev); + if (err) + goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) - goto out_uninit_applicant; + goto out_upper_dev_unlink; /* Account for reference in struct vlan_dev_priv */ dev_hold(real_dev); @@ -175,16 +186,21 @@ int register_vlan_dev(struct net_device *dev) /* So, got the sucker initialized, now lets place * it into our local structure. 
*/ - vlan_group_set_device(grp, vlan_id, dev); + vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; -out_uninit_applicant: +out_upper_dev_unlink: + netdev_upper_dev_unlink(real_dev, dev); +out_uninit_mvrp: + if (grp->nr_vlan_devs == 0) + vlan_mvrp_uninit_applicant(real_dev); +out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: - vlan_vid_del(real_dev, vlan_id); + vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } @@ -202,7 +218,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) if (vlan_id >= VLAN_VID_MASK) return -ERANGE; - err = vlan_check_real_dev(real_dev, vlan_id); + err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id); if (err < 0) return err; @@ -244,6 +260,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) new_dev->mtu = real_dev->mtu; new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT); + vlan_dev_priv(new_dev)->vlan_proto = htons(ETH_P_8021Q); vlan_dev_priv(new_dev)->vlan_id = vlan_id; vlan_dev_priv(new_dev)->real_dev = real_dev; vlan_dev_priv(new_dev)->dent = NULL; @@ -290,7 +307,7 @@ static void vlan_transfer_features(struct net_device *dev, { vlandev->gso_max_size = dev->gso_max_size; - if (dev->features & NETIF_F_HW_VLAN_TX) + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; @@ -330,16 +347,17 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; + bool last = false; LIST_HEAD(list); if (is_vlan_dev(dev)) __vlan_device_event(dev, event); if ((event == NETDEV_UP) && - (dev->features & NETIF_F_HW_VLAN_FILTER)) { + (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); - vlan_vid_add(dev, 0); + vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } vlan_info = rtnl_dereference(dev->vlan_info); @@ -354,22 +372,13 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) netif_stacked_transfer_operstate(dev, vlandev); - } break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; @@ -379,11 +388,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, break; case NETDEV_CHANGEMTU: - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; @@ -393,26 +398,16 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); - } - break; case NETDEV_DOWN: - if (dev->features & NETIF_F_HW_VLAN_FILTER) - vlan_vid_del(dev, 0); 
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) + vlan_vid_del(dev, htons(ETH_P_8021Q), 0); /* Put all VLANs for this dev in the down state too. */ - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; @@ -426,11 +421,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (flgs & IFF_UP) continue; @@ -447,17 +438,15 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, if (dev->reg_state != NETREG_UNREGISTERING) break; - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) - i = VLAN_N_VID; + last = true; unregister_vlan_dev(vlandev, &list); + if (last) + break; } unregister_netdevice_many(&list); break; @@ -471,13 +460,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: /* Propagate to vlan devices */ - for (i = 0; i < VLAN_N_VID; i++) { - vlandev = vlan_group_get_device(grp, i); - if (!vlandev) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); - } break; } @@ -654,13 +638,19 @@ static int __init vlan_proto_init(void) if (err < 0) goto err3; - err = vlan_netlink_init(); + err = vlan_mvrp_init(); if (err < 0) goto err4; + err = vlan_netlink_init(); + if (err < 0) + goto err5; + vlan_ioctl_set(vlan_ioctl_handler); return 0; +err5: + vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: @@ -681,6 +671,7 @@ static void __exit vlan_cleanup_module(void) unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ + vlan_mvrp_uninit(); vlan_gvrp_uninit(); } diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index a4886d94c40c..ba5983f34c42 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -49,6 +49,7 @@ struct netpoll; * @ingress_priority_map: ingress priority mappings * @nr_egress_mappings: number of egress priority mappings * @egress_priority_map: hash of egress priority mappings + * @vlan_proto: VLAN encapsulation protocol * @vlan_id: VLAN identifier * @flags: device flags * @real_dev: underlying netdevice @@ -62,6 +63,7 @@ struct vlan_dev_priv { unsigned int nr_egress_mappings; struct vlan_priority_tci_mapping *egress_priority_map[16]; + __be16 vlan_proto; u16 vlan_id; u16 flags; @@ -87,10 +89,17 @@ static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) #define VLAN_GROUP_ARRAY_SPLIT_PARTS 8 #define VLAN_GROUP_ARRAY_PART_LEN (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS) +enum vlan_protos { + VLAN_PROTO_8021Q = 0, + VLAN_PROTO_8021AD, + VLAN_PROTO_NUM, +}; + struct vlan_group { unsigned int nr_vlan_devs; struct hlist_node hlist; /* linked list */ - struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS]; + struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM] + [VLAN_GROUP_ARRAY_SPLIT_PARTS]; }; struct vlan_info { @@ -103,37 +112,67 @@ struct vlan_info { struct rcu_head rcu; }; -static inline struct net_device 
*vlan_group_get_device(struct vlan_group *vg, - u16 vlan_id) +static inline unsigned int vlan_proto_idx(__be16 proto) +{ + switch (proto) { + case __constant_htons(ETH_P_8021Q): + return VLAN_PROTO_8021Q; + case __constant_htons(ETH_P_8021AD): + return VLAN_PROTO_8021AD; + default: + BUG(); + return 0; + } +} + +static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg, + unsigned int pidx, + u16 vlan_id) { struct net_device **array; - array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; + + array = vg->vlan_devices_arrays[pidx] + [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL; } +static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, + __be16 vlan_proto, + u16 vlan_id) +{ + return __vlan_group_get_device(vg, vlan_proto_idx(vlan_proto), vlan_id); +} + static inline void vlan_group_set_device(struct vlan_group *vg, - u16 vlan_id, + __be16 vlan_proto, u16 vlan_id, struct net_device *dev) { struct net_device **array; if (!vg) return; - array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; + array = vg->vlan_devices_arrays[vlan_proto_idx(vlan_proto)] + [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev; } /* Must be invoked with rcu_read_lock or with RTNL. */ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, - u16 vlan_id) + __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info); if (vlan_info) - return vlan_group_get_device(&vlan_info->grp, vlan_id); + return vlan_group_get_device(&vlan_info->grp, + vlan_proto, vlan_id); return NULL; } +#define vlan_group_for_each_dev(grp, i, dev) \ + for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \ + if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ + (i) % VLAN_N_VID))) + /* found in vlan_dev.c */ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); @@ -142,7 +181,8 @@ int vlan_dev_set_egress_priority(const struct net_device *dev, int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); -int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id); +int vlan_check_real_dev(struct net_device *real_dev, + __be16 protocol, u16 vlan_id); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); @@ -171,6 +211,22 @@ static inline int vlan_gvrp_init(void) { return 0; } static inline void vlan_gvrp_uninit(void) {} #endif +#ifdef CONFIG_VLAN_8021Q_MVRP +extern int vlan_mvrp_request_join(const struct net_device *dev); +extern void vlan_mvrp_request_leave(const struct net_device *dev); +extern int vlan_mvrp_init_applicant(struct net_device *dev); +extern void vlan_mvrp_uninit_applicant(struct net_device *dev); +extern int vlan_mvrp_init(void); +extern void vlan_mvrp_uninit(void); +#else +static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; } +static inline void vlan_mvrp_request_leave(const struct net_device *dev) {} +static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; } +static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {} +static inline int vlan_mvrp_init(void) { return 0; } +static inline void vlan_mvrp_uninit(void) {} +#endif + extern const char vlan_fullname[]; extern const char 
vlan_version[]; extern int vlan_netlink_init(void); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 65e06abe023f..8a15eaadc4bd 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -8,11 +8,12 @@ bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; + __be16 vlan_proto = skb->vlan_proto; u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; struct net_device *vlan_dev; struct vlan_pcpu_stats *rx_stats; - vlan_dev = vlan_find_dev(skb->dev, vlan_id); + vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id); if (!vlan_dev) return false; @@ -38,7 +39,8 @@ bool vlan_do_receive(struct sk_buff **skbp) * original position later */ skb_push(skb, offset); - skb = *skbp = vlan_insert_tag(skb, skb->vlan_tci); + skb = *skbp = vlan_insert_tag(skb, skb->vlan_proto, + skb->vlan_tci); if (!skb) return false; skb_pull(skb, offset + VLAN_HLEN); @@ -60,21 +62,27 @@ bool vlan_do_receive(struct sk_buff **skbp) return true; } -/* Must be invoked with rcu_read_lock or with RTNL. */ -struct net_device *__vlan_find_dev_deep(struct net_device *real_dev, - u16 vlan_id) +/* Must be invoked with rcu_read_lock. */ +struct net_device *__vlan_find_dev_deep(struct net_device *dev, + __be16 vlan_proto, u16 vlan_id) { - struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info); + struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info); if (vlan_info) { - return vlan_group_get_device(&vlan_info->grp, vlan_id); + return vlan_group_get_device(&vlan_info->grp, + vlan_proto, vlan_id); } else { /* - * Bonding slaves do not have grp assigned to themselves. - * Grp is assigned to bonding master instead. + * Lower devices of master uppers (bonding, team) do not have + * grp assigned to themselves. Grp is assigned to upper device + * instead. 
*/ - if (netif_is_bond_slave(real_dev)) - return __vlan_find_dev_deep(real_dev->master, vlan_id); + struct net_device *upper_dev; + + upper_dev = netdev_master_upper_dev_get_rcu(dev); + if (upper_dev) + return __vlan_find_dev_deep(upper_dev, + vlan_proto, vlan_id); } return NULL; @@ -121,7 +129,7 @@ struct sk_buff *vlan_untag(struct sk_buff *skb) vhdr = (struct vlan_hdr *) skb->data; vlan_tci = ntohs(vhdr->h_vlan_TCI); - __vlan_hwaccel_put_tag(skb, vlan_tci); + __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vhdr); @@ -140,6 +148,7 @@ err_free: kfree_skb(skb); return NULL; } +EXPORT_SYMBOL(vlan_untag); /* @@ -148,10 +157,11 @@ err_free: static void vlan_group_free(struct vlan_group *grp) { - int i; + int i, j; - for (i = 0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++) - kfree(grp->vlan_devices_arrays[i]); + for (i = 0; i < VLAN_PROTO_NUM; i++) + for (j = 0; j < VLAN_GROUP_ARRAY_SPLIT_PARTS; j++) + kfree(grp->vlan_devices_arrays[i][j]); } static void vlan_info_free(struct vlan_info *vlan_info) @@ -180,35 +190,49 @@ static struct vlan_info *vlan_info_alloc(struct net_device *dev) struct vlan_vid_info { struct list_head list; - unsigned short vid; + __be16 proto; + u16 vid; int refcount; }; +static bool vlan_hw_filter_capable(const struct net_device *dev, + const struct vlan_vid_info *vid_info) +{ + if (vid_info->proto == htons(ETH_P_8021Q) && + dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) + return true; + if (vid_info->proto == htons(ETH_P_8021AD) && + dev->features & NETIF_F_HW_VLAN_STAG_FILTER) + return true; + return false; +} + static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info, - unsigned short vid) + __be16 proto, u16 vid) { struct vlan_vid_info *vid_info; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { - if (vid_info->vid == vid) + if (vid_info->proto == proto && vid_info->vid == vid) return vid_info; } return NULL; } -static struct vlan_vid_info *vlan_vid_info_alloc(unsigned short vid) +static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid) { struct vlan_vid_info *vid_info; vid_info = kzalloc(sizeof(struct vlan_vid_info), GFP_KERNEL); if (!vid_info) return NULL; + vid_info->proto = proto; vid_info->vid = vid; return vid_info; } -static int __vlan_vid_add(struct vlan_info *vlan_info, unsigned short vid, +static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, struct vlan_vid_info **pvid_info) { struct net_device *dev = vlan_info->real_dev; @@ -216,13 +240,12 @@ static int __vlan_vid_add(struct vlan_info *vlan_info, unsigned short vid, struct vlan_vid_info *vid_info; int err; - vid_info = vlan_vid_info_alloc(vid); + vid_info = vlan_vid_info_alloc(proto, vid); if (!vid_info) return -ENOMEM; - if ((dev->features & NETIF_F_HW_VLAN_FILTER) && - ops->ndo_vlan_rx_add_vid) { - err = ops->ndo_vlan_rx_add_vid(dev, vid); + if (vlan_hw_filter_capable(dev, vid_info)) { + err = ops->ndo_vlan_rx_add_vid(dev, proto, vid); if (err) { kfree(vid_info); return err; @@ -234,7 +257,7 @@ static int __vlan_vid_add(struct vlan_info *vlan_info, unsigned short vid, return 0; } -int vlan_vid_add(struct net_device *dev, unsigned short vid) +int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { struct vlan_info *vlan_info; struct vlan_vid_info *vid_info; @@ -250,9 +273,9 @@ int vlan_vid_add(struct net_device *dev, unsigned short vid) return -ENOMEM; vlan_info_created = true; } - vid_info = vlan_vid_info_get(vlan_info, vid); + vid_info = vlan_vid_info_get(vlan_info, proto, 
vid); if (!vid_info) { - err = __vlan_vid_add(vlan_info, vid, &vid_info); + err = __vlan_vid_add(vlan_info, proto, vid, &vid_info); if (err) goto out_free_vlan_info; } @@ -275,15 +298,15 @@ static void __vlan_vid_del(struct vlan_info *vlan_info, { struct net_device *dev = vlan_info->real_dev; const struct net_device_ops *ops = dev->netdev_ops; - unsigned short vid = vid_info->vid; + __be16 proto = vid_info->proto; + u16 vid = vid_info->vid; int err; - if ((dev->features & NETIF_F_HW_VLAN_FILTER) && - ops->ndo_vlan_rx_kill_vid) { - err = ops->ndo_vlan_rx_kill_vid(dev, vid); + if (vlan_hw_filter_capable(dev, vid_info)) { + err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid); if (err) { - pr_warn("failed to kill vid %d for device %s\n", - vid, dev->name); + pr_warn("failed to kill vid %04x/%d for device %s\n", + proto, vid, dev->name); } } list_del(&vid_info->list); @@ -291,7 +314,7 @@ static void __vlan_vid_del(struct vlan_info *vlan_info, vlan_info->nr_vids--; } -void vlan_vid_del(struct net_device *dev, unsigned short vid) +void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid) { struct vlan_info *vlan_info; struct vlan_vid_info *vid_info; @@ -302,7 +325,7 @@ void vlan_vid_del(struct net_device *dev, unsigned short vid) if (!vlan_info) return; - vid_info = vlan_vid_info_get(vlan_info, vid); + vid_info = vlan_vid_info_get(vlan_info, proto, vid); if (!vid_info) return; vid_info->refcount--; @@ -330,7 +353,7 @@ int vlan_vids_add_by_dev(struct net_device *dev, return 0; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { - err = vlan_vid_add(dev, vid_info->vid); + err = vlan_vid_add(dev, vid_info->proto, vid_info->vid); if (err) goto unwind; } @@ -340,7 +363,7 @@ unwind: list_for_each_entry_continue_reverse(vid_info, &vlan_info->vid_list, list) { - vlan_vid_del(dev, vid_info->vid); + vlan_vid_del(dev, vid_info->proto, vid_info->vid); } return err; @@ -360,7 +383,7 @@ void vlan_vids_del_by_dev(struct net_device *dev, return; list_for_each_entry(vid_info, &vlan_info->vid_list, list) - vlan_vid_del(dev, vid_info->vid); + vlan_vid_del(dev, vid_info->proto, vid_info->vid); } EXPORT_SYMBOL(vlan_vids_del_by_dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4a6d31a082b9..3a8c8fd63c88 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -99,6 +99,7 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, const void *daddr, const void *saddr, unsigned int len) { + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_hdr *vhdr; unsigned int vhdrlen = 0; u16 vlan_tci = 0; @@ -120,8 +121,8 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, else vhdr->h_vlan_encapsulated_proto = htons(len); - skb->protocol = htons(ETH_P_8021Q); - type = ETH_P_8021Q; + skb->protocol = vlan->vlan_proto; + type = ntohs(vlan->vlan_proto); vhdrlen = VLAN_HLEN; } @@ -161,12 +162,12 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... 
*/ - if (veth->h_vlan_proto != htons(ETH_P_8021Q) || + if (veth->h_vlan_proto != vlan->vlan_proto || vlan->flags & VLAN_FLAG_REORDER_HDR) { u16 vlan_tci; vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb); - skb = __vlan_hwaccel_put_tag(skb, vlan_tci); + skb = __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci); } skb->dev = vlan->real_dev; @@ -261,7 +262,7 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) u32 old_flags = vlan->flags; if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING)) + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) return -EINVAL; vlan->flags = (old_flags & ~mask) | (flags & mask); @@ -272,6 +273,13 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) else vlan_gvrp_request_leave(dev); } + + if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_MVRP) { + if (vlan->flags & VLAN_FLAG_MVRP) + vlan_mvrp_request_join(dev); + else + vlan_mvrp_request_leave(dev); + } return 0; } @@ -312,6 +320,9 @@ static int vlan_dev_open(struct net_device *dev) if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); + if (vlan->flags & VLAN_FLAG_MVRP) + vlan_mvrp_request_join(dev); + if (netif_carrier_ok(real_dev)) netif_carrier_on(dev); return 0; @@ -573,7 +584,7 @@ static int vlan_dev_init(struct net_device *dev) #endif dev->needed_headroom = real_dev->needed_headroom; - if (real_dev->features & NETIF_F_HW_VLAN_TX) { + if (real_dev->features & NETIF_F_HW_VLAN_CTAG_TX) { dev->header_ops = real_dev->header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { @@ -617,7 +628,7 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, netdev_features_t features) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - u32 old_features = features; + netdev_features_t old_features = features; features &= real_dev->vlan_features; features |= NETIF_F_RXCSUM; @@ -640,9 +651,9 @@ static int vlan_ethtool_get_settings(struct net_device *dev, static void vlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strcpy(info->driver, vlan_fullname); - strcpy(info->version, vlan_version); - strcpy(info->fw_version, "N/A"); + strlcpy(info->driver, vlan_fullname, sizeof(info->driver)); + strlcpy(info->version, vlan_version, sizeof(info->version)); + strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); } static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) @@ -723,7 +734,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) vlan->netpoll = NULL; - __netpoll_free_rcu(netpoll); + __netpoll_free_async(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ diff --git a/net/8021q/vlan_gvrp.c b/net/8021q/vlan_gvrp.c index 6f9755352760..66a80320b032 100644 --- a/net/8021q/vlan_gvrp.c +++ b/net/8021q/vlan_gvrp.c @@ -32,6 +32,8 @@ int vlan_gvrp_request_join(const struct net_device *dev) const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); __be16 vlan_id = htons(vlan->vlan_id); + if (vlan->vlan_proto != htons(ETH_P_8021Q)) + return 0; return garp_request_join(vlan->real_dev, &vlan_gvrp_app, &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID); } @@ -41,6 +43,8 @@ void vlan_gvrp_request_leave(const struct net_device *dev) const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); __be16 vlan_id = htons(vlan->vlan_id); + if (vlan->vlan_proto != htons(ETH_P_8021Q)) + return; garp_request_leave(vlan->real_dev, &vlan_gvrp_app, &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID); } diff 
--git a/net/8021q/vlan_mvrp.c b/net/8021q/vlan_mvrp.c new file mode 100644 index 000000000000..e0fe091801b0 --- /dev/null +++ b/net/8021q/vlan_mvrp.c @@ -0,0 +1,76 @@ +/* + * IEEE 802.1Q Multiple VLAN Registration Protocol (MVRP) + * + * Copyright (c) 2012 Massachusetts Institute of Technology + * + * Adapted from code in net/8021q/vlan_gvrp.c + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ +#include <linux/types.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/mrp.h> +#include "vlan.h" + +#define MRP_MVRP_ADDRESS { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 } + +enum mvrp_attributes { + MVRP_ATTR_INVALID, + MVRP_ATTR_VID, + __MVRP_ATTR_MAX +}; +#define MVRP_ATTR_MAX (__MVRP_ATTR_MAX - 1) + +static struct mrp_application vlan_mrp_app __read_mostly = { + .type = MRP_APPLICATION_MVRP, + .maxattr = MVRP_ATTR_MAX, + .pkttype.type = htons(ETH_P_MVRP), + .group_address = MRP_MVRP_ADDRESS, + .version = 0, +}; + +int vlan_mvrp_request_join(const struct net_device *dev) +{ + const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + __be16 vlan_id = htons(vlan->vlan_id); + + if (vlan->vlan_proto != htons(ETH_P_8021Q)) + return 0; + return mrp_request_join(vlan->real_dev, &vlan_mrp_app, + &vlan_id, sizeof(vlan_id), MVRP_ATTR_VID); +} + +void vlan_mvrp_request_leave(const struct net_device *dev) +{ + const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + __be16 vlan_id = htons(vlan->vlan_id); + + if (vlan->vlan_proto != htons(ETH_P_8021Q)) + return; + mrp_request_leave(vlan->real_dev, &vlan_mrp_app, + &vlan_id, sizeof(vlan_id), MVRP_ATTR_VID); +} + +int vlan_mvrp_init_applicant(struct net_device *dev) +{ + return mrp_init_applicant(dev, &vlan_mrp_app); +} + +void vlan_mvrp_uninit_applicant(struct net_device *dev) +{ + mrp_uninit_applicant(dev, &vlan_mrp_app); +} + +int __init vlan_mvrp_init(void) +{ + return mrp_register_application(&vlan_mrp_app); +} + +void vlan_mvrp_uninit(void) +{ + mrp_unregister_application(&vlan_mrp_app); +} diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 708c80ea1874..309129732285 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -23,6 +23,7 @@ static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = { [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) }, [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED }, + [IFLA_VLAN_PROTOCOL] = { .type = NLA_U16 }, }; static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = { @@ -53,6 +54,16 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[]) if (!data) return -EINVAL; + if (data[IFLA_VLAN_PROTOCOL]) { + switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { + case __constant_htons(ETH_P_8021Q): + case __constant_htons(ETH_P_8021AD): + break; + default: + return -EPROTONOSUPPORT; + } + } + if (data[IFLA_VLAN_ID]) { id = nla_get_u16(data[IFLA_VLAN_ID]); if (id >= VLAN_VID_MASK) @@ -62,7 +73,7 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[]) flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING)) + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) return -EINVAL; } @@ -107,6 +118,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, { struct vlan_dev_priv 
*vlan = vlan_dev_priv(dev); struct net_device *real_dev; + __be16 proto; int err; if (!data[IFLA_VLAN_ID]) @@ -118,11 +130,17 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (!real_dev) return -ENODEV; - vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); - vlan->real_dev = real_dev; - vlan->flags = VLAN_FLAG_REORDER_HDR; + if (data[IFLA_VLAN_PROTOCOL]) + proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]); + else + proto = htons(ETH_P_8021Q); + + vlan->vlan_proto = proto; + vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); + vlan->real_dev = real_dev; + vlan->flags = VLAN_FLAG_REORDER_HDR; - err = vlan_check_real_dev(real_dev, vlan->vlan_id); + err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id); if (err < 0) return err; @@ -151,7 +169,8 @@ static size_t vlan_get_size(const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); - return nla_total_size(2) + /* IFLA_VLAN_ID */ + return nla_total_size(2) + /* IFLA_VLAN_PROTOCOL */ + nla_total_size(2) + /* IFLA_VLAN_ID */ sizeof(struct ifla_vlan_flags) + /* IFLA_VLAN_FLAGS */ vlan_qos_map_size(vlan->nr_ingress_mappings) + vlan_qos_map_size(vlan->nr_egress_mappings); @@ -166,7 +185,8 @@ static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev) struct nlattr *nest; unsigned int i; - if (nla_put_u16(skb, IFLA_VLAN_ID, vlan_dev_priv(dev)->vlan_id)) + if (nla_put_be16(skb, IFLA_VLAN_PROTOCOL, vlan->vlan_proto) || + nla_put_u16(skb, IFLA_VLAN_ID, vlan->vlan_id)) goto nla_put_failure; if (vlan->flags) { f.flags = vlan->flags; diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 4de77ea5fa37..1d0e89213a28 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -93,7 +93,7 @@ static const struct file_operations vlan_fops = { static int vlandev_seq_open(struct inode *inode, struct file *file) { - return single_open(file, vlandev_seq_show, PDE(inode)->data); + return single_open(file, vlandev_seq_show, PDE_DATA(inode)); } static const struct file_operations vlandev_fops = { @@ -131,7 +131,7 @@ void vlan_proc_cleanup(struct net *net) remove_proc_entry(name_conf, vn->proc_vlan_dir); if (vn->proc_vlan_dir) - proc_net_remove(net, name_root); + remove_proc_entry(name_root, net->proc_net); /* Dynamically added entries should be cleaned up as their vlan_device * is removed, so we should not have to take care of it here... @@ -184,14 +184,9 @@ int vlan_proc_add_dev(struct net_device *vlandev) */ int vlan_proc_rem_dev(struct net_device *vlandev) { - struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); - /** NOTE: This will consume the memory pointed to by dent, it seems. */ - if (vlan_dev_priv(vlandev)->dent) { - remove_proc_entry(vlan_dev_priv(vlandev)->dent->name, - vn->proc_vlan_dir); - vlan_dev_priv(vlandev)->dent = NULL; - } + proc_remove(vlan_dev_priv(vlandev)->dent); + vlan_dev_priv(vlandev)->dent = NULL; return 0; } diff --git a/net/9p/Kconfig b/net/9p/Kconfig index d9ea09b11cf8..a75174a33723 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -23,7 +23,7 @@ config NET_9P_VIRTIO guest partitions and a host partition. config NET_9P_RDMA - depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL + depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS tristate "9P RDMA Transport (Experimental)" help This builds support for an RDMA transport. 
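The new net/8021q/vlan_mvrp.c above doubles as a template for wiring any MRP application: everything protocol-specific sits in one static struct mrp_application, while the net/802/mrp.c core owns the applicant state machine and the join timer that batches MRPDUs. A minimal sketch of that wiring, with hypothetical demo_* names (the field values are copied from the MVRP definition purely for illustration; a real second application would need its own type, PDU ethertype and group address):

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <net/mrp.h>

static struct mrp_application demo_app __read_mostly = {
	.type		= MRP_APPLICATION_MVRP,
	.maxattr	= 1,
	.pkttype.type	= htons(ETH_P_MVRP),
	.group_address	= { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 },
	.version	= 0,
};

/* Assumes mrp_register_application(&demo_app) already succeeded at module
 * init, mirroring vlan_mvrp_init() above. */
static int demo_attach(struct net_device *dev, __be16 vid)
{
	int err;

	/* Per-device applicant state; pairs with mrp_uninit_applicant(). */
	err = mrp_init_applicant(dev, &demo_app);
	if (err)
		return err;
	/* Declare one attribute (attrtype 1, like MVRP_ATTR_VID above);
	 * the core's join timer takes care of emitting the actual PDUs. */
	return mrp_request_join(dev, &demo_app, &vid, sizeof(vid), 1);
}

The vlan_proto guards added to vlan_gvrp.c and vlan_mvrp.c belong to the same series: G/MVRP registration only makes sense for plain 802.1Q tags, so 802.1ad (QinQ) devices created via the new IFLA_VLAN_PROTOCOL attribute simply skip the join.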
diff --git a/net/9p/client.c b/net/9p/client.c index 34d417670935..8eb75425e6e6 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1100,7 +1100,7 @@ void p9_client_begin_disconnect(struct p9_client *clnt) EXPORT_SYMBOL(p9_client_begin_disconnect); struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, - char *uname, u32 n_uname, char *aname) + char *uname, kuid_t n_uname, char *aname) { int err = 0; struct p9_req_t *req; @@ -1117,7 +1117,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, goto error; } - req = p9_client_rpc(clnt, P9_TATTACH, "ddss?d", fid->fid, + req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid, afid ? afid->fid : P9_NOFID, uname, aname, n_uname); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1270,7 +1270,7 @@ error: EXPORT_SYMBOL(p9_client_open); int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode, - gid_t gid, struct p9_qid *qid) + kgid_t gid, struct p9_qid *qid) { int err = 0; struct p9_client *clnt; @@ -1279,13 +1279,14 @@ int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode, p9_debug(P9_DEBUG_9P, ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n", - ofid->fid, name, flags, mode, gid); + ofid->fid, name, flags, mode, + from_kgid(&init_user_ns, gid)); clnt = ofid->clnt; if (ofid->mode != -1) return -EINVAL; - req = p9_client_rpc(clnt, P9_TLCREATE, "dsddd", ofid->fid, name, flags, + req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags, mode, gid); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1358,7 +1359,7 @@ error: } EXPORT_SYMBOL(p9_client_fcreate); -int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid, +int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, kgid_t gid, struct p9_qid *qid) { int err = 0; @@ -1369,7 +1370,7 @@ int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid, dfid->fid, name, symtgt); clnt = dfid->clnt; - req = p9_client_rpc(clnt, P9_TSYMLINK, "dssd", dfid->fid, name, symtgt, + req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt, gid); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1710,7 +1711,9 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) (unsigned long long)ret->qid.path, ret->qid.version, ret->mode, ret->atime, ret->mtime, (unsigned long long)ret->length, ret->name, ret->uid, ret->gid, ret->muid, ret->extension, - ret->n_uid, ret->n_gid, ret->n_muid); + from_kuid(&init_user_ns, ret->n_uid), + from_kgid(&init_user_ns, ret->n_gid), + from_kuid(&init_user_ns, ret->n_muid)); p9_free_req(clnt, req); return ret; @@ -1764,8 +1767,10 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, "<<< st_btime_sec=%lld st_btime_nsec=%lld\n" "<<< st_gen=%lld st_data_version=%lld", ret->st_result_mask, ret->qid.type, ret->qid.path, - ret->qid.version, ret->st_mode, ret->st_nlink, ret->st_uid, - ret->st_gid, ret->st_rdev, ret->st_size, ret->st_blksize, + ret->qid.version, ret->st_mode, ret->st_nlink, + from_kuid(&init_user_ns, ret->st_uid), + from_kgid(&init_user_ns, ret->st_gid), + ret->st_rdev, ret->st_size, ret->st_blksize, ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec, ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec, ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec, @@ -1828,7 +1833,9 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) (unsigned long long)wst->qid.path, wst->qid.version, wst->mode, wst->atime, wst->mtime, (unsigned long long)wst->length, wst->name, wst->uid, wst->gid, 
wst->muid, wst->extension, - wst->n_uid, wst->n_gid, wst->n_muid); + from_kuid(&init_user_ns, wst->n_uid), + from_kgid(&init_user_ns, wst->n_gid), + from_kuid(&init_user_ns, wst->n_muid)); req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size+2, wst); if (IS_ERR(req)) { @@ -1857,7 +1864,9 @@ int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) " valid=%x mode=%x uid=%d gid=%d size=%lld\n" " atime_sec=%lld atime_nsec=%lld\n" " mtime_sec=%lld mtime_nsec=%lld\n", - p9attr->valid, p9attr->mode, p9attr->uid, p9attr->gid, + p9attr->valid, p9attr->mode, + from_kuid(&init_user_ns, p9attr->uid), + from_kgid(&init_user_ns, p9attr->gid), p9attr->size, p9attr->atime_sec, p9attr->atime_nsec, p9attr->mtime_sec, p9attr->mtime_nsec); @@ -2106,7 +2115,7 @@ error: EXPORT_SYMBOL(p9_client_readdir); int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode, - dev_t rdev, gid_t gid, struct p9_qid *qid) + dev_t rdev, kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; @@ -2116,7 +2125,7 @@ int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode, clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d " "minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev)); - req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddd", fid->fid, name, mode, + req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev), gid); if (IS_ERR(req)) return PTR_ERR(req); @@ -2137,7 +2146,7 @@ error: EXPORT_SYMBOL(p9_client_mknod_dotl); int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode, - gid_t gid, struct p9_qid *qid) + kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; @@ -2146,8 +2155,8 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode, err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n", - fid->fid, name, mode, gid); - req = p9_client_rpc(clnt, P9_TMKDIR, "dsdd", fid->fid, name, mode, + fid->fid, name, mode, from_kgid(&init_user_ns, gid)); + req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg", fid->fid, name, mode, gid); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/net/9p/error.c b/net/9p/error.c index 2ab2de76010f..126fd0dceea2 100644 --- a/net/9p/error.c +++ b/net/9p/error.c @@ -221,15 +221,13 @@ EXPORT_SYMBOL(p9_error_init); int p9_errstr2errno(char *errstr, int len) { int errno; - struct hlist_node *p; struct errormap *c; int bucket; errno = 0; - p = NULL; c = NULL; bucket = jhash(errstr, len, 0) % ERRHASHSZ; - hlist_for_each_entry(c, p, &hash_errmap[bucket], list) { + hlist_for_each_entry(c, &hash_errmap[bucket], list) { if (c->namelen == len && !memcmp(c->name, errstr, len)) { errno = c->val; break; diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 3d33ecf13327..ab9127ec5b7a 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -85,6 +85,8 @@ pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) d - int32_t q - int64_t s - string + u - numeric uid + g - numeric gid S - stat Q - qid D - data blob (int32_t size followed by void *, results are not freed) @@ -163,6 +165,26 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, (*sptr)[len] = 0; } break; + case 'u': { + kuid_t *uid = va_arg(ap, kuid_t *); + __le32 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *uid = make_kuid(&init_user_ns, + le32_to_cpu(le_val)); + } break; + case 'g': { + kgid_t *gid = va_arg(ap, kgid_t *); + __le32 le_val; + if (pdu_read(pdu, &le_val, 
sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *gid = make_kgid(&init_user_ns, + le32_to_cpu(le_val)); + } break; case 'Q':{ struct p9_qid *qid = va_arg(ap, struct p9_qid *); @@ -177,11 +199,12 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, va_arg(ap, struct p9_wstat *); memset(stbuf, 0, sizeof(struct p9_wstat)); - stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = - -1; + stbuf->n_uid = stbuf->n_muid = INVALID_UID; + stbuf->n_gid = INVALID_GID; + errcode = p9pdu_readf(pdu, proto_version, - "wwdQdddqssss?sddd", + "wwdQdddqssss?sugu", &stbuf->size, &stbuf->type, &stbuf->dev, &stbuf->qid, &stbuf->mode, &stbuf->atime, @@ -294,7 +317,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, memset(stbuf, 0, sizeof(struct p9_stat_dotl)); errcode = p9pdu_readf(pdu, proto_version, - "qQdddqqqqqqqqqqqqqqq", + "qQdugqqqqqqqqqqqqqqq", &stbuf->st_result_mask, &stbuf->qid, &stbuf->st_mode, @@ -377,6 +400,20 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, errcode = -EFAULT; } break; + case 'u': { + kuid_t uid = va_arg(ap, kuid_t); + __le32 val = cpu_to_le32( + from_kuid(&init_user_ns, uid)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } break; + case 'g': { + kgid_t gid = va_arg(ap, kgid_t); + __le32 val = cpu_to_le32( + from_kgid(&init_user_ns, gid)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } break; case 'Q':{ const struct p9_qid *qid = va_arg(ap, const struct p9_qid *); @@ -390,7 +427,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, va_arg(ap, const struct p9_wstat *); errcode = p9pdu_writef(pdu, proto_version, - "wwdQdddqssss?sddd", + "wwdQdddqssss?sugu", stbuf->size, stbuf->type, stbuf->dev, &stbuf->qid, stbuf->mode, stbuf->atime, @@ -468,7 +505,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, struct p9_iattr_dotl *); errcode = p9pdu_writef(pdu, proto_version, - "ddddqqqqq", + "ddugqqqqq", p9attr->valid, p9attr->mode, p9attr->uid, diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index fd05c81cb348..e1c26b101830 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -87,7 +87,7 @@ struct virtio_chan { /* This is global limit. Since we don't have a global structure, * will be placing it in each channel. */ - int p9_max_pages; + unsigned long p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; @@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start, if (s > count) s = count; BUG_ON(index > limit); + /* Make sure we don't terminate early. */ + sg_unmark_end(&sg[index]); sg_set_buf(&sg[index++], data, s); count -= s; data += s; } - + if (index-start) + sg_mark_end(&sg[index - 1]); return index-start; } @@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, s = rest_of_page(data); if (s > count) s = count; + /* Make sure we don't terminate early. 
*/ + sg_unmark_end(&sg[index]); sg_set_page(&sg[index++], pdata[i++], s, data_off); data_off = 0; data += s; count -= s; nr_pages--; } + + if (index-start) + sg_mark_end(&sg[index - 1]); return index - start; } @@ -256,9 +264,10 @@ static int p9_virtio_request(struct p9_client *client, struct p9_req_t *req) { int err; - int in, out; + int in, out, out_sgs, in_sgs; unsigned long flags; struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[2]; p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); @@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) req_retry: spin_lock_irqsave(&chan->lock, flags); + out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + if (out) + sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { @@ -289,7 +303,7 @@ req_retry: } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, - "virtio rpc add_buf returned failure\n"); + "virtio rpc add_sgs returned failure\n"); return -EIO; } } @@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, char *uidata, char *uodata, int inlen, int outlen, int in_hdr_len, int kern_buf) { - int in, out, err; + int in, out, err, out_sgs, in_sgs; unsigned long flags; int in_nr_pages = 0, out_nr_pages = 0; struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[4]; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); @@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, req->status = REQ_STATUS_SENT; req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); + + out_sgs = in_sgs = 0; + /* out data */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); - if (out_pages) + if (out) + sgs[out_sgs++] = chan->sg; + + if (out_pages) { + sgs[out_sgs++] = chan->sg + out; out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, out_pages, out_nr_pages, uodata, outlen); + } + /* * Take care of in data * For example TREAD have 11. 
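Two patterns in the 9p hunks above deserve a note. First, the new 'u' and 'g' format codes: the wire format still carries numeric ids as a little-endian u32, and translation to the kernel's namespace-aware kuid_t/kgid_t happens exactly once, at the (de)serialization boundary. A round-trip sketch with hypothetical demo_* helpers (the real conversions sit inline in p9pdu_vreadf()/p9pdu_vwritef()):

#include <linux/kernel.h>
#include <linux/uidgid.h>

static __le32 demo_uid_to_wire(kuid_t uid)
{
	/* Always resolves against init_user_ns, as the patch does. */
	return cpu_to_le32(from_kuid(&init_user_ns, uid));
}

static kuid_t demo_uid_from_wire(__le32 val)
{
	return make_kuid(&init_user_ns, le32_to_cpu(val));
}

Second, the sg_unmark_end()/sg_mark_end() pairing in pack_sg_list() and pack_sg_list_p(): the channel's scatterlist table is reused for every request, so an end mark left behind by a longer earlier request would make a shorter new list appear truncated. The rule, restated as a self-contained sketch (hypothetical fill_sg(), flattened to one linear buffer):

#include <linux/kernel.h>
#include <linux/scatterlist.h>

static int fill_sg(struct scatterlist *sg, int limit, char *buf, size_t len)
{
	int i = 0;

	while (len && i < limit) {
		size_t chunk = min_t(size_t, len, PAGE_SIZE);

		sg_unmark_end(&sg[i]);		/* clear any stale end mark */
		sg_set_buf(&sg[i], buf, chunk);
		buf += chunk;
		len -= chunk;
		i++;
	}
	if (i)
		sg_mark_end(&sg[i - 1]);	/* terminate this segment only */
	return i;
}

virtqueue_add_sgs() then takes the sgs[] array as out_sgs write segments followed by in_sgs read segments, which is why the request paths count the two separately before queuing.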
@@ -412,11 +436,17 @@ req_retry_pinned: */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); - if (in_pages) + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; + + if (in_pages) { + sgs[out_sgs + in_sgs++] = chan->sg + out + in; in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in_pages, in_nr_pages, uidata, inlen); + } - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { @@ -432,7 +462,7 @@ req_retry_pinned: } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, - "virtio rpc add_buf returned failure\n"); + "virtio rpc add_sgs returned failure\n"); err = -EIO; goto err_out; } diff --git a/net/9p/util.c b/net/9p/util.c index 6ceeeb384de7..59f278e64f58 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -87,23 +87,18 @@ EXPORT_SYMBOL(p9_idpool_destroy); int p9_idpool_get(struct p9_idpool *p) { - int i = 0; - int error; + int i; unsigned long flags; -retry: - if (idr_pre_get(&p->pool, GFP_NOFS) == 0) - return -1; - + idr_preload(GFP_NOFS); spin_lock_irqsave(&p->lock, flags); /* no need to store exactly p, we just need something non-null */ - error = idr_get_new(&p->pool, p, &i); - spin_unlock_irqrestore(&p->lock, flags); + i = idr_alloc(&p->pool, p, 0, 0, GFP_NOWAIT); - if (error == -EAGAIN) - goto retry; - else if (error) + spin_unlock_irqrestore(&p->lock, flags); + idr_preload_end(); + if (i < 0) return -1; p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", i, p); diff --git a/net/Kconfig b/net/Kconfig index 30b48f523135..2ddc9046868e 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -90,7 +90,6 @@ config NETWORK_SECMARK config NETWORK_PHY_TIMESTAMPING bool "Timestamping in PHY devices" - depends on EXPERIMENTAL help This allows timestamping of network packets by PHYs with hardware timestamping capabilities. This option adds some @@ -209,7 +208,6 @@ source "net/ipx/Kconfig" source "drivers/net/appletalk/Kconfig" source "net/x25/Kconfig" source "net/lapb/Kconfig" -source "net/wanrouter/Kconfig" source "net/phonet/Kconfig" source "net/ieee802154/Kconfig" source "net/mac802154/Kconfig" @@ -218,6 +216,8 @@ source "net/dcb/Kconfig" source "net/dns_resolver/Kconfig" source "net/batman-adv/Kconfig" source "net/openvswitch/Kconfig" +source "net/vmw_vsock/Kconfig" +source "net/netlink/Kconfig" config RPS boolean @@ -232,7 +232,7 @@ config RFS_ACCEL config XPS boolean - depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS + depends on SMP && USE_GENERIC_SMP_HELPERS default y config NETPRIO_CGROUP @@ -278,7 +278,7 @@ config NET_PKTGEN config NET_TCPPROBE tristate "TCP connection probing" - depends on INET && EXPERIMENTAL && PROC_FS && KPROBES + depends on INET && PROC_FS && KPROBES ---help--- This module allows for capturing the changes to TCP connection state in response to incoming packets. It is used for debugging @@ -295,7 +295,7 @@ config NET_TCPPROBE config NET_DROP_MONITOR tristate "Network packet drop alerting service" - depends on INET && EXPERIMENTAL && TRACEPOINTS + depends on INET && TRACEPOINTS ---help--- This feature provides an alerting service to userspace in the event that packets are discarded in the network stack. 
Alerts diff --git a/net/Makefile b/net/Makefile index 4f4ee083064c..091e7b04f301 100644 --- a/net/Makefile +++ b/net/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_BRIDGE) += bridge/ obj-$(CONFIG_NET_DSA) += dsa/ obj-$(CONFIG_IPX) += ipx/ obj-$(CONFIG_ATALK) += appletalk/ -obj-$(CONFIG_WAN_ROUTER) += wanrouter/ obj-$(CONFIG_X25) += x25/ obj-$(CONFIG_LAPB) += lapb/ obj-$(CONFIG_NETROM) += netrom/ @@ -70,3 +69,4 @@ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ +obj-$(CONFIG_VSOCKETS) += vmw_vsock/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 33475291c9c1..ef12839a7cfe 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -93,10 +93,9 @@ static struct sock *atalk_search_socket(struct sockaddr_at *to, struct atalk_iface *atif) { struct sock *s; - struct hlist_node *node; read_lock_bh(&atalk_sockets_lock); - sk_for_each(s, node, &atalk_sockets) { + sk_for_each(s, &atalk_sockets) { struct atalk_sock *at = at_sk(s); if (to->sat_port != at->src_port) @@ -141,11 +140,10 @@ static struct sock *atalk_find_or_insert_socket(struct sock *sk, struct sockaddr_at *sat) { struct sock *s; - struct hlist_node *node; struct atalk_sock *at; write_lock_bh(&atalk_sockets_lock); - sk_for_each(s, node, &atalk_sockets) { + sk_for_each(s, &atalk_sockets) { at = at_sk(s); if (at->src_net == sat->sat_addr.s_net && @@ -1084,9 +1082,8 @@ static int atalk_pick_and_bind_port(struct sock *sk, struct sockaddr_at *sat) sat->sat_port < ATPORT_LAST; sat->sat_port++) { struct sock *s; - struct hlist_node *node; - sk_for_each(s, node, &atalk_sockets) { + sk_for_each(s, &atalk_sockets) { struct atalk_sock *at = at_sk(s); if (at->src_net == sat->sat_addr.s_net && @@ -1256,7 +1253,7 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, goto out; *uaddr_len = sizeof(struct sockaddr_at); - memset(&sat.sat_zero, 0, sizeof(sat.sat_zero)); + memset(&sat, 0, sizeof(sat)); if (peer) { err = -ENOTCONN; diff --git a/net/atm/common.c b/net/atm/common.c index 806fc0a40051..737bef59ce89 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -270,11 +270,11 @@ void atm_dev_release_vccs(struct atm_dev *dev) write_lock_irq(&vcc_sklist_lock); for (i = 0; i < VCC_HTABLE_SIZE; i++) { struct hlist_head *head = &vcc_hash[i]; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; struct sock *s; struct atm_vcc *vcc; - sk_for_each_safe(s, node, tmp, head) { + sk_for_each_safe(s, tmp, head) { vcc = atm_sk(s); if (vcc->dev == dev) { vcc_release_async(vcc, -EPIPE); @@ -317,11 +317,10 @@ static int adjust_tp(struct atm_trafprm *tp, unsigned char aal) static int check_ci(const struct atm_vcc *vcc, short vpi, int vci) { struct hlist_head *head = &vcc_hash[vci & (VCC_HTABLE_SIZE - 1)]; - struct hlist_node *node; struct sock *s; struct atm_vcc *walk; - sk_for_each(s, node, head) { + sk_for_each(s, head) { walk = atm_sk(s); if (walk->dev != vcc->dev) continue; @@ -532,6 +531,8 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, struct sk_buff *skb; int copied, error = -EINVAL; + msg->msg_namelen = 0; + if (sock->state != SS_CONNECTED) return -ENOTCONN; diff --git a/net/atm/lec.c b/net/atm/lec.c index 2e3d942e77f1..f23916be18fb 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -842,7 +842,9 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, --*l; } - hlist_for_each_entry_from(tmp, e, next) { + tmp = container_of(e, struct lec_arp_table, next); + + hlist_for_each_entry_from(tmp, 
next) { if (--*l < 0) break; } @@ -1307,7 +1309,6 @@ lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry) static int lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove) { - struct hlist_node *node; struct lec_arp_table *entry; int i, remove_vcc = 1; @@ -1326,7 +1327,7 @@ lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove) * ESI_FLUSH_PENDING, ESI_FORWARD_DIRECT */ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry(entry, node, + hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (memcmp(to_remove->atm_addr, entry->atm_addr, ATM_ESA_LEN) == 0) { @@ -1364,14 +1365,13 @@ static const char *get_status_string(unsigned char st) static void dump_arp_table(struct lec_priv *priv) { - struct hlist_node *node; struct lec_arp_table *rulla; char buf[256]; int i, j, offset; pr_info("Dump %p:\n", priv); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry(rulla, node, + hlist_for_each_entry(rulla, &priv->lec_arp_tables[i], next) { offset = 0; offset += sprintf(buf, "%d: %p\n", i, rulla); @@ -1403,7 +1403,7 @@ static void dump_arp_table(struct lec_priv *priv) if (!hlist_empty(&priv->lec_no_forward)) pr_info("No forward\n"); - hlist_for_each_entry(rulla, node, &priv->lec_no_forward, next) { + hlist_for_each_entry(rulla, &priv->lec_no_forward, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); offset += sprintf(buf + offset, " Atm:"); @@ -1428,7 +1428,7 @@ static void dump_arp_table(struct lec_priv *priv) if (!hlist_empty(&priv->lec_arp_empty_ones)) pr_info("Empty ones\n"); - hlist_for_each_entry(rulla, node, &priv->lec_arp_empty_ones, next) { + hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); offset += sprintf(buf + offset, " Atm:"); @@ -1453,7 +1453,7 @@ static void dump_arp_table(struct lec_priv *priv) if (!hlist_empty(&priv->mcast_fwds)) pr_info("Multicast Forward VCCs\n"); - hlist_for_each_entry(rulla, node, &priv->mcast_fwds, next) { + hlist_for_each_entry(rulla, &priv->mcast_fwds, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); offset += sprintf(buf + offset, " Atm:"); @@ -1487,7 +1487,7 @@ static void dump_arp_table(struct lec_priv *priv) static void lec_arp_destroy(struct lec_priv *priv) { unsigned long flags; - struct hlist_node *node, *next; + struct hlist_node *next; struct lec_arp_table *entry; int i; @@ -1499,7 +1499,7 @@ static void lec_arp_destroy(struct lec_priv *priv) spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { lec_arp_remove(priv, entry); lec_arp_put(entry); @@ -1507,7 +1507,7 @@ static void lec_arp_destroy(struct lec_priv *priv) INIT_HLIST_HEAD(&priv->lec_arp_tables[i]); } - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { del_timer_sync(&entry->timer); lec_arp_clear_vccs(entry); @@ -1516,7 +1516,7 @@ static void lec_arp_destroy(struct lec_priv *priv) } INIT_HLIST_HEAD(&priv->lec_arp_empty_ones); - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_no_forward, next) { del_timer_sync(&entry->timer); lec_arp_clear_vccs(entry); @@ -1525,7 +1525,7 @@ static void lec_arp_destroy(struct lec_priv *priv) } INIT_HLIST_HEAD(&priv->lec_no_forward); - 
hlist_for_each_entry_safe(entry, node, next, &priv->mcast_fwds, next) { + hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) { /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ lec_arp_clear_vccs(entry); hlist_del(&entry->next); @@ -1542,14 +1542,13 @@ static void lec_arp_destroy(struct lec_priv *priv) static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, const unsigned char *mac_addr) { - struct hlist_node *node; struct hlist_head *head; struct lec_arp_table *entry; pr_debug("%pM\n", mac_addr); head = &priv->lec_arp_tables[HASH(mac_addr[ETH_ALEN - 1])]; - hlist_for_each_entry(entry, node, head, next) { + hlist_for_each_entry(entry, head, next) { if (ether_addr_equal(mac_addr, entry->mac_addr)) return entry; } @@ -1686,7 +1685,7 @@ static void lec_arp_check_expire(struct work_struct *work) unsigned long flags; struct lec_priv *priv = container_of(work, struct lec_priv, lec_arp_work.work); - struct hlist_node *node, *next; + struct hlist_node *next; struct lec_arp_table *entry; unsigned long now; int i; @@ -1696,7 +1695,7 @@ static void lec_arp_check_expire(struct work_struct *work) restart: spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (__lec_arp_check_expire(entry, now, priv)) { struct sk_buff *skb; @@ -1823,14 +1822,14 @@ lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long permanent) { unsigned long flags; - struct hlist_node *node, *next; + struct hlist_node *next; struct lec_arp_table *entry; int i; pr_debug("\n"); spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) && (permanent || @@ -1855,7 +1854,7 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, unsigned int targetless_le_arp) { unsigned long flags; - struct hlist_node *node, *next; + struct hlist_node *next; struct lec_arp_table *entry, *tmp; int i; @@ -1870,7 +1869,7 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, * we have no entry in the cache. 
7.1.30 */ if (!hlist_empty(&priv->lec_arp_empty_ones)) { - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) { hlist_del(&entry->next); @@ -1915,7 +1914,7 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN); del_timer(&entry->timer); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry(tmp, node, + hlist_for_each_entry(tmp, &priv->lec_arp_tables[i], next) { if (entry != tmp && !memcmp(tmp->atm_addr, atm_addr, ATM_ESA_LEN)) { @@ -1956,7 +1955,6 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb)) { unsigned long flags; - struct hlist_node *node; struct lec_arp_table *entry; int i, found_entry = 0; @@ -2026,7 +2024,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, ioc_data->atm_addr[16], ioc_data->atm_addr[17], ioc_data->atm_addr[18], ioc_data->atm_addr[19]); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry(entry, node, + hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (memcmp (ioc_data->atm_addr, entry->atm_addr, @@ -2103,7 +2101,6 @@ out: static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id) { unsigned long flags; - struct hlist_node *node; struct lec_arp_table *entry; int i; @@ -2111,7 +2108,7 @@ static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id) restart: spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry(entry, node, + hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (entry->flush_tran_id == tran_id && entry->status == ESI_FLUSH_PENDING) { @@ -2140,13 +2137,12 @@ lec_set_flush_tran_id(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long tran_id) { unsigned long flags; - struct hlist_node *node; struct lec_arp_table *entry; int i; spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) - hlist_for_each_entry(entry, node, + hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)) { entry->flush_tran_id = tran_id; @@ -2198,7 +2194,7 @@ out: static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) { unsigned long flags; - struct hlist_node *node, *next; + struct hlist_node *next; struct lec_arp_table *entry; int i; @@ -2208,7 +2204,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (vcc == entry->vcc) { lec_arp_remove(priv, entry); @@ -2219,7 +2215,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) } } - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (entry->vcc == vcc) { lec_arp_clear_vccs(entry); @@ -2229,7 +2225,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) } } - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_no_forward, next) { if (entry->recv_vcc == vcc) { lec_arp_clear_vccs(entry); @@ -2239,7 +2235,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) } } - 
hlist_for_each_entry_safe(entry, node, next, &priv->mcast_fwds, next) { + hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) { if (entry->recv_vcc == vcc) { lec_arp_clear_vccs(entry); /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ @@ -2257,13 +2253,13 @@ lec_arp_check_empties(struct lec_priv *priv, struct atm_vcc *vcc, struct sk_buff *skb) { unsigned long flags; - struct hlist_node *node, *next; + struct hlist_node *next; struct lec_arp_table *entry, *tmp; struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data; unsigned char *src = hdr->h_source; spin_lock_irqsave(&priv->lec_arp_lock, flags); - hlist_for_each_entry_safe(entry, node, next, + hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (vcc == entry->vcc) { del_timer(&entry->timer); diff --git a/net/atm/lec.h b/net/atm/lec.h index a86aff9a3c04..4149db1b7885 100644 --- a/net/atm/lec.h +++ b/net/atm/lec.h @@ -58,7 +58,7 @@ struct lane2_ops { * field in h_type field. Data follows immediately after header. * 2. LLC Data frames whose total length, including LLC field and data, * but not padding required to meet the minimum data frame length, - * is less than 1536(0x0600) MUST be encoded by placing that length + * is less than ETH_P_802_3_MIN MUST be encoded by placing that length * in the h_type field. The LLC field follows header immediately. * 3. LLC data frames longer than this maximum MUST be encoded by placing * the value 0 in the h_type field. diff --git a/net/atm/proc.c b/net/atm/proc.c index 0d020de8d233..bbb6461a4b7f 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -385,7 +385,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, page = get_zeroed_page(GFP_KERNEL); if (!page) return -ENOMEM; - dev = PDE(file->f_path.dentry->d_inode)->data; + dev = PDE_DATA(file_inode(file)); if (!dev->ops->proc_read) length = -EINVAL; else { @@ -460,7 +460,7 @@ static void atm_proc_dirs_remove(void) if (e->dirent) remove_proc_entry(e->name, atm_proc_root); } - proc_net_remove(&init_net, "atm"); + remove_proc_entry("atm", init_net.proc_net); } int __init atm_proc_init(void) diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 86767ca908a3..4176887e72eb 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -217,7 +217,6 @@ static void purge_vcc(struct atm_vcc *vcc) static void sigd_close(struct atm_vcc *vcc) { - struct hlist_node *node; struct sock *s; int i; @@ -231,7 +230,7 @@ static void sigd_close(struct atm_vcc *vcc) for (i = 0; i < VCC_HTABLE_SIZE; ++i) { struct hlist_head *head = &vcc_hash[i]; - sk_for_each(s, node, head) { + sk_for_each(s, head) { vcc = atm_sk(s); purge_vcc(vcc); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 779095ded689..e277e38f736b 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -81,14 +81,13 @@ static void ax25_kill_by_device(struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *s; - struct hlist_node *node; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; spin_lock_bh(&ax25_list_lock); again: - ax25_for_each(s, node, &ax25_list) { + ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { s->ax25_dev = NULL; spin_unlock_bh(&ax25_list_lock); @@ -158,10 +157,9 @@ struct sock *ax25_find_listener(ax25_address *addr, int digi, struct net_device *dev, int type) { ax25_cb *s; - struct hlist_node *node; spin_lock(&ax25_list_lock); - ax25_for_each(s, node, &ax25_list) { + ax25_for_each(s, &ax25_list) { if ((s->iamdigi && !digi) || (!s->iamdigi && digi)) continue; if (s->sk && !ax25cmp(&s->source_addr, addr) 
&& @@ -187,10 +185,9 @@ struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr, { struct sock *sk = NULL; ax25_cb *s; - struct hlist_node *node; spin_lock(&ax25_list_lock); - ax25_for_each(s, node, &ax25_list) { + ax25_for_each(s, &ax25_list) { if (s->sk && !ax25cmp(&s->source_addr, my_addr) && !ax25cmp(&s->dest_addr, dest_addr) && s->sk->sk_type == type) { @@ -213,10 +210,9 @@ ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr, ax25_digi *digi, struct net_device *dev) { ax25_cb *s; - struct hlist_node *node; spin_lock_bh(&ax25_list_lock); - ax25_for_each(s, node, &ax25_list) { + ax25_for_each(s, &ax25_list) { if (s->sk && s->sk->sk_type != SOCK_SEQPACKET) continue; if (s->ax25_dev == NULL) @@ -248,10 +244,9 @@ void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto) { ax25_cb *s; struct sk_buff *copy; - struct hlist_node *node; spin_lock(&ax25_list_lock); - ax25_for_each(s, node, &ax25_list) { + ax25_for_each(s, &ax25_list) { if (s->sk != NULL && ax25cmp(&s->source_addr, addr) == 0 && s->sk->sk_type == SOCK_RAW && s->sk->sk_protocol == proto && @@ -1647,6 +1642,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, ax25_address src; const unsigned char *mac = skb_mac_header(skb); + memset(sax, 0, sizeof(struct full_sockaddr_ax25)); ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL, &digi, NULL, NULL); sax->sax25_family = AF_AX25; @@ -1992,9 +1988,10 @@ static int __init ax25_init(void) dev_add_pack(&ax25_packet_type); register_netdevice_notifier(&ax25_dev_notifier); - proc_net_fops_create(&init_net, "ax25_route", S_IRUGO, &ax25_route_fops); - proc_net_fops_create(&init_net, "ax25", S_IRUGO, &ax25_info_fops); - proc_net_fops_create(&init_net, "ax25_calls", S_IRUGO, &ax25_uid_fops); + proc_create("ax25_route", S_IRUGO, init_net.proc_net, + &ax25_route_fops); + proc_create("ax25", S_IRUGO, init_net.proc_net, &ax25_info_fops); + proc_create("ax25_calls", S_IRUGO, init_net.proc_net, &ax25_uid_fops); out: return rc; } @@ -2008,9 +2005,9 @@ MODULE_ALIAS_NETPROTO(PF_AX25); static void __exit ax25_exit(void) { - proc_net_remove(&init_net, "ax25_route"); - proc_net_remove(&init_net, "ax25"); - proc_net_remove(&init_net, "ax25_calls"); + remove_proc_entry("ax25_route", init_net.proc_net); + remove_proc_entry("ax25", init_net.proc_net); + remove_proc_entry("ax25_calls", init_net.proc_net); unregister_netdevice_notifier(&ax25_dev_notifier); diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c index 5ea7fd3e2af9..e05bd57b5afd 100644 --- a/net/ax25/ax25_ds_subr.c +++ b/net/ax25/ax25_ds_subr.c @@ -39,7 +39,6 @@ void ax25_ds_nr_error_recovery(ax25_cb *ax25) void ax25_ds_enquiry_response(ax25_cb *ax25) { ax25_cb *ax25o; - struct hlist_node *node; /* Please note that neither DK4EG's nor DG2FEF's * DAMA spec mention the following behaviour as seen @@ -80,7 +79,7 @@ void ax25_ds_enquiry_response(ax25_cb *ax25) ax25_ds_set_timer(ax25->ax25_dev); spin_lock(&ax25_list_lock); - ax25_for_each(ax25o, node, &ax25_list) { + ax25_for_each(ax25o, &ax25_list) { if (ax25o == ax25) continue; @@ -159,10 +158,9 @@ static int ax25_check_dama_slave(ax25_dev *ax25_dev) { ax25_cb *ax25; int res = 0; - struct hlist_node *node; spin_lock(&ax25_list_lock); - ax25_for_each(ax25, node, &ax25_list) + ax25_for_each(ax25, &ax25_list) if (ax25->ax25_dev == ax25_dev && (ax25->condition & AX25_COND_DAMA_MODE) && ax25->state > AX25_STATE_1) { res = 1; break; diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 
993c439b4f71..951cd57bb07d 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -70,7 +70,6 @@ static void ax25_ds_timeout(unsigned long arg) { ax25_dev *ax25_dev = (struct ax25_dev *) arg; ax25_cb *ax25; - struct hlist_node *node; if (ax25_dev == NULL || !ax25_dev->dama.slave) return; /* Yikes! */ @@ -81,7 +80,7 @@ static void ax25_ds_timeout(unsigned long arg) } spin_lock(&ax25_list_lock); - ax25_for_each(ax25, node, &ax25_list) { + ax25_for_each(ax25, &ax25_list) { if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE)) continue; diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c index 7d5f24b82cc8..7f16e8a931b2 100644 --- a/net/ax25/ax25_iface.c +++ b/net/ax25/ax25_iface.c @@ -193,10 +193,9 @@ int ax25_listen_mine(ax25_address *callsign, struct net_device *dev) void ax25_link_failed(ax25_cb *ax25, int reason) { struct ax25_linkfail *lf; - struct hlist_node *node; spin_lock_bh(&linkfail_lock); - hlist_for_each_entry(lf, node, &ax25_linkfail_list, lf_node) + hlist_for_each_entry(lf, &ax25_linkfail_list, lf_node) lf->func(ax25, reason); spin_unlock_bh(&linkfail_lock); } diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index 957999e43ff7..71c4badbc807 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -54,10 +54,9 @@ EXPORT_SYMBOL(ax25_uid_policy); ax25_uid_assoc *ax25_findbyuid(kuid_t uid) { ax25_uid_assoc *ax25_uid, *res = NULL; - struct hlist_node *node; read_lock(&ax25_uid_lock); - ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + ax25_uid_for_each(ax25_uid, &ax25_uid_list) { if (uid_eq(ax25_uid->uid, uid)) { ax25_uid_hold(ax25_uid); res = ax25_uid; @@ -74,7 +73,6 @@ EXPORT_SYMBOL(ax25_findbyuid); int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) { ax25_uid_assoc *ax25_uid; - struct hlist_node *node; ax25_uid_assoc *user; unsigned long res; @@ -82,7 +80,7 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) case SIOCAX25GETUID: res = -ENOENT; read_lock(&ax25_uid_lock); - ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + ax25_uid_for_each(ax25_uid, &ax25_uid_list) { if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { res = from_kuid_munged(current_user_ns(), ax25_uid->uid); break; @@ -126,7 +124,7 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) ax25_uid = NULL; write_lock(&ax25_uid_lock); - ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + ax25_uid_for_each(ax25_uid, &ax25_uid_list) { if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) break; } @@ -212,11 +210,10 @@ const struct file_operations ax25_uid_fops = { void __exit ax25_uid_free(void) { ax25_uid_assoc *ax25_uid; - struct hlist_node *node; write_lock(&ax25_uid_lock); again: - ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + ax25_uid_for_each(ax25_uid, &ax25_uid_list) { hlist_del_init(&ax25_uid->uid_node); ax25_uid_put(ax25_uid); goto again; diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 8d8afb134b3a..fa780b76630e 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -36,6 +36,20 @@ config BATMAN_ADV_DAT mesh networks. If you think that your network does not need this option you can safely remove it and save some space. +config BATMAN_ADV_NC + bool "Network Coding" + depends on BATMAN_ADV + default n + help + This option enables network coding, a mechanism that aims to + increase the overall network throughput by fusing multiple + packets in one transmission. 
+ Note that interfaces controlled by batman-adv must be manually + configured to have promiscuous mode enabled in order to make + network coding work. + If you think that your network does not need this feature you + can safely disable it and save some space. + config BATMAN_ADV_DEBUG bool "B.A.T.M.A.N. debugging" depends on BATMAN_ADV diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index e45e3b4e32e3..acbac2a9c62f 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # @@ -30,6 +30,7 @@ batman-adv-y += hard-interface.o batman-adv-y += hash.o batman-adv-y += icmp_socket.o batman-adv-y += main.o +batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += ring_buffer.o batman-adv-y += routing.o diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index a0ba3bff9b36..a4808c29ea3d 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 7d02ebd11a7f..071f288b77a8 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -27,6 +27,7 @@ #include "hard-interface.h" #include "send.h" #include "bat_algo.h" +#include "network-coding.h" static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, @@ -123,7 +124,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += random32() % (2 * BATADV_JITTER); + msecs += prandom_u32() % (2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } @@ -131,7 +132,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { - return jiffies + msecs_to_jiffies(random32() % (BATADV_JITTER / 2)); + return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ @@ -183,7 +184,6 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, /* adjust all flags and log packets */ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len, batadv_ogm_packet->tt_num_changes)) { - /* we might have aggregated direct link packets with an * ordinary base packet */ @@ -261,7 +261,6 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) */ if ((directlink && (batadv_ogm_packet->header.ttl == 1)) || (forw_packet->own && (forw_packet->if_incoming != primary_if))) { - /* FIXME: what about aggregated packets ? 
*/ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s packet (originator %pM, seqno %u, TTL %d) on interface %s [%pM]\n", @@ -325,7 +324,6 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, if (time_before(send_time, forw_packet->send_time) && time_after_eq(aggregation_end_time, forw_packet->send_time) && (aggregated_bytes <= BATADV_MAX_AGGREGATION_BYTES)) { - /* check aggregation compatibility * -> direct link packets are broadcasted on * their interface only @@ -490,7 +488,6 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, */ struct batadv_forw_packet *forw_packet_aggr = NULL; struct batadv_forw_packet *forw_packet_pos = NULL; - struct hlist_node *tmp_node; struct batadv_ogm_packet *batadv_ogm_packet; bool direct_link; unsigned long max_aggregation_jiffies; @@ -503,7 +500,7 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) { - hlist_for_each_entry(forw_packet_pos, tmp_node, + hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, bat_priv, packet_len, @@ -658,7 +655,6 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_node_tmp; - struct hlist_node *node; int if_num; uint8_t sum_orig, sum_neigh; uint8_t *neigh_addr; @@ -668,7 +664,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, "update_originator(): Searching and updating originator entry of received packet\n"); rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_neigh_node, node, + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && @@ -804,7 +800,6 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; - struct hlist_node *node; uint8_t total_count; uint8_t orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; @@ -813,9 +808,8 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, /* find corresponding one hop neighbor */ rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_neigh_node, node, + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_neigh_node->neigh_list, list) { - if (!batadv_compare_eth(tmp_neigh_node->addr, orig_neigh_node->orig)) continue; @@ -924,7 +918,6 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_node; struct batadv_neigh_node *tmp_neigh_node; - struct hlist_node *node; int is_duplicate = 0; int32_t seq_diff; int need_update = 0; @@ -947,9 +940,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, goto out; rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_neigh_node, node, + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { - is_duplicate |= batadv_test_bit(tmp_neigh_node->real_bits, orig_node->last_real_seqno, seqno); @@ -1033,7 +1025,7 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, is_single_hop_neigh = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: 
%pM, via prev OG: %pM, seqno %u, ttvn %u, crc %u, changes %u, td %d, TTL %d, V %d, IDF %d)\n", + "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, ttvn %u, crc %#.4x, changes %u, tq %d, TTL %d, V %d, IDF %d)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, batadv_ogm_packet->orig, batadv_ogm_packet->prev_sender, @@ -1194,6 +1186,10 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, if (!orig_neigh_node) goto out; + /* Update nc_nodes of the originator */ + batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, + batadv_ogm_packet, is_single_hop_neigh); + orig_neigh_router = batadv_orig_node_get_router(orig_neigh_node); /* drop packet if sender is not a direct neighbor and if we @@ -1223,7 +1219,6 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, /* is single hop (direct) neighbor */ if (is_single_hop_neigh) { - /* mark direct link on incoming interface */ batadv_iv_ogm_forward(orig_node, ethhdr, batadv_ogm_packet, is_single_hop_neigh, @@ -1298,7 +1293,8 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; /* unpack the aggregated packets and process them one by one */ - do { + while (batadv_iv_ogm_aggr_packet(buff_pos, packet_len, + batadv_ogm_packet->tt_num_changes)) { tt_buff = packet_buff + buff_pos + BATADV_OGM_HLEN; batadv_iv_ogm_process(ethhdr, batadv_ogm_packet, tt_buff, @@ -1309,8 +1305,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, packet_pos = packet_buff + buff_pos; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; - } while (batadv_iv_ogm_aggr_packet(buff_pos, packet_len, - batadv_ogm_packet->tt_num_changes)); + } kfree_skb(skb); return NET_RX_SUCCESS; diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 5453b17d8df2..973982414d58 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2013 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index cebaae7e148b..a81b9322e382 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2013 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 5aebe9327d68..379061c72549 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2013 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich * @@ -34,13 +34,14 @@ static const uint8_t batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; static void batadv_bla_periodic_work(struct work_struct *work); -static void batadv_bla_send_announce(struct batadv_priv *bat_priv, - struct batadv_backbone_gw *backbone_gw); +static void +batadv_bla_send_announce(struct batadv_priv *bat_priv, + struct batadv_bla_backbone_gw *backbone_gw); /* return the index of the claim */ static inline uint32_t batadv_choose_claim(const void *data, uint32_t size) { - struct batadv_claim *claim = (struct batadv_claim *)data; + struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; uint32_t hash = 0; hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); @@ -57,7 +58,7 @@ static inline uint32_t batadv_choose_claim(const void *data, uint32_t size) static inline uint32_t batadv_choose_backbone_gw(const void *data, uint32_t size) { - struct batadv_claim *claim = (struct batadv_claim *)data; + const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; uint32_t hash = 0; hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); @@ -75,9 +76,9 @@ static inline uint32_t batadv_choose_backbone_gw(const void *data, static int batadv_compare_backbone_gw(const struct hlist_node *node, const void *data2) { - const void *data1 = container_of(node, struct batadv_backbone_gw, + const void *data1 = container_of(node, struct batadv_bla_backbone_gw, hash_entry); - const struct batadv_backbone_gw *gw1 = data1, *gw2 = data2; + const struct batadv_bla_backbone_gw *gw1 = data1, *gw2 = data2; if (!batadv_compare_eth(gw1->orig, gw2->orig)) return 0; @@ -92,9 +93,9 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, static int batadv_compare_claim(const struct hlist_node *node, const void *data2) { - const void *data1 = container_of(node, struct batadv_claim, + const void *data1 = container_of(node, struct batadv_bla_claim, hash_entry); - const struct batadv_claim *cl1 = data1, *cl2 = data2; + const struct batadv_bla_claim *cl1 = data1, *cl2 = data2; if (!batadv_compare_eth(cl1->addr, cl2->addr)) return 0; @@ -106,7 +107,8 @@ static int batadv_compare_claim(const struct hlist_node *node, } /* free a backbone gw */ -static void batadv_backbone_gw_free_ref(struct batadv_backbone_gw *backbone_gw) +static void +batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) { if (atomic_dec_and_test(&backbone_gw->refcount)) kfree_rcu(backbone_gw, rcu); @@ -115,16 +117,16 @@ static void batadv_backbone_gw_free_ref(struct batadv_backbone_gw *backbone_gw) /* finally deinitialize the claim */ static void batadv_claim_free_rcu(struct rcu_head *rcu) { - struct batadv_claim *claim; + struct batadv_bla_claim *claim; - claim = container_of(rcu, struct batadv_claim, rcu); + claim = container_of(rcu, struct batadv_bla_claim, rcu); batadv_backbone_gw_free_ref(claim->backbone_gw); kfree(claim); } /* free a claim, call claim_free_rcu if its the last reference */ -static void batadv_claim_free_ref(struct batadv_claim *claim) +static void batadv_claim_free_ref(struct batadv_bla_claim *claim) { if (atomic_dec_and_test(&claim->refcount)) call_rcu(&claim->rcu, batadv_claim_free_rcu); @@ -136,14 +138,14 @@ static void batadv_claim_free_ref(struct batadv_claim *claim) * looks for a claim in the hash, and returns it if found * or NULL otherwise. 
*/ -static struct batadv_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv, - struct batadv_claim *data) +static struct batadv_bla_claim +*batadv_claim_hash_find(struct batadv_priv *bat_priv, + struct batadv_bla_claim *data) { struct batadv_hashtable *hash = bat_priv->bla.claim_hash; struct hlist_head *head; - struct hlist_node *node; - struct batadv_claim *claim; - struct batadv_claim *claim_tmp = NULL; + struct batadv_bla_claim *claim; + struct batadv_bla_claim *claim_tmp = NULL; int index; if (!hash) @@ -153,7 +155,7 @@ static struct batadv_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv, head = &hash->table[index]; rcu_read_lock(); - hlist_for_each_entry_rcu(claim, node, head, hash_entry) { + hlist_for_each_entry_rcu(claim, head, hash_entry) { if (!batadv_compare_claim(&claim->hash_entry, data)) continue; @@ -176,15 +178,14 @@ static struct batadv_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv, * * Returns claim if found or NULL otherwise. */ -static struct batadv_backbone_gw * +static struct batadv_bla_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, uint8_t *addr, short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; - struct hlist_node *node; - struct batadv_backbone_gw search_entry, *backbone_gw; - struct batadv_backbone_gw *backbone_gw_tmp = NULL; + struct batadv_bla_backbone_gw search_entry, *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw_tmp = NULL; int index; if (!hash) @@ -197,7 +198,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, head = &hash->table[index]; rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, node, head, hash_entry) { + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (!batadv_compare_backbone_gw(&backbone_gw->hash_entry, &search_entry)) continue; @@ -215,12 +216,12 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, /* delete all claims for a backbone */ static void -batadv_bla_del_backbone_claims(struct batadv_backbone_gw *backbone_gw) +batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_hashtable *hash; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; - struct batadv_claim *claim; + struct batadv_bla_claim *claim; int i; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -233,14 +234,13 @@ batadv_bla_del_backbone_claims(struct batadv_backbone_gw *backbone_gw) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(claim, node, node_tmp, + hlist_for_each_entry_safe(claim, node_tmp, head, hash_entry) { - if (claim->backbone_gw != backbone_gw) continue; batadv_claim_free_ref(claim); - hlist_del_rcu(node); + hlist_del_rcu(&claim->hash_entry); } spin_unlock_bh(list_lock); } @@ -338,11 +338,10 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, "bla_send_claim(): REQUEST of %pM to %pMon vid %d\n", ethhdr->h_source, ethhdr->h_dest, vid); break; - } if (vid != -1) - skb = vlan_insert_tag(skb, vid); + skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid); skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, soft_iface); @@ -366,11 +365,11 @@ out: * searches for the backbone gw or creates a new one if it could not * be found. 
*/ -static struct batadv_backbone_gw * +static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig, short vid, bool own_backbone) { - struct batadv_backbone_gw *entry; + struct batadv_bla_backbone_gw *entry; struct batadv_orig_node *orig_node; int hash_added; @@ -437,7 +436,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, short vid) { - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, primary_if->net_dev->dev_addr, @@ -459,11 +458,10 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, short vid) { - struct hlist_node *node; struct hlist_head *head; struct batadv_hashtable *hash; - struct batadv_claim *claim; - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_claim *claim; + struct batadv_bla_backbone_gw *backbone_gw; int i; batadv_dbg(BATADV_DBG_BLA, bat_priv, @@ -480,7 +478,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(claim, node, head, hash_entry) { + hlist_for_each_entry_rcu(claim, head, hash_entry) { /* only own claims are interesting */ if (claim->backbone_gw != backbone_gw) continue; @@ -502,7 +500,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, * After the request, it will repeat all of its own claims and finally * send an announcement claim with which we can check again. */ -static void batadv_bla_send_request(struct batadv_backbone_gw *backbone_gw) +static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) { /* first, remove all old entries */ batadv_bla_del_backbone_claims(backbone_gw); @@ -528,7 +526,7 @@ static void batadv_bla_send_request(struct batadv_backbone_gw *backbone_gw) * places.
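The lookup helpers converted above (batadv_claim_hash_find, batadv_backbone_hash_find) all share one RCU idiom: walk the bucket under rcu_read_lock() and only hand out an entry whose refcount can still be raised from non-zero, so a reader can never resurrect an object that a concurrent free_ref() has already doomed. A minimal sketch of that idiom, with hypothetical names (entry, found, compare_fn, search_data):

	rcu_read_lock();
	hlist_for_each_entry_rcu(entry, head, hash_entry) {
		if (!compare_fn(&entry->hash_entry, search_data))
			continue;

		/* skip entries whose refcount already dropped to zero;
		 * they are on their way to being freed
		 */
		if (!atomic_inc_not_zero(&entry->refcount))
			continue;

		found = entry;
		break;
	}
	rcu_read_unlock();

The atomic_inc_not_zero() step is what makes plain RCU traversal safe to combine with reference counting; the caller owns one reference on the returned entry and must drop it with the matching free_ref helper.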
*/ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, - struct batadv_backbone_gw *backbone_gw) + struct batadv_bla_backbone_gw *backbone_gw) { uint8_t mac[ETH_ALEN]; __be16 crc; @@ -539,7 +537,6 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid, BATADV_CLAIM_TYPE_ANNOUNCE); - } /** @@ -551,10 +548,10 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const uint8_t *mac, const short vid, - struct batadv_backbone_gw *backbone_gw) + struct batadv_bla_backbone_gw *backbone_gw) { - struct batadv_claim *claim; - struct batadv_claim search_claim; + struct batadv_bla_claim *claim; + struct batadv_bla_claim search_claim; int hash_added; memcpy(search_claim.addr, mac, ETH_ALEN); @@ -598,7 +595,6 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); batadv_backbone_gw_free_ref(claim->backbone_gw); - } /* set (new) backbone gw */ atomic_inc(&backbone_gw->refcount); @@ -617,7 +613,7 @@ claim_free_ref: static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const uint8_t *mac, const short vid) { - struct batadv_claim search_claim, *claim; + struct batadv_bla_claim search_claim, *claim; memcpy(search_claim.addr, mac, ETH_ALEN); search_claim.vid = vid; @@ -643,7 +639,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, uint8_t *an_addr, uint8_t *backbone_addr, short vid) { - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; uint16_t crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) @@ -661,12 +657,12 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, crc = ntohs(*((__be16 *)(&an_addr[4]))); batadv_dbg(BATADV_DBG_BLA, bat_priv, - "handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %04x\n", + "handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", vid, backbone_gw->orig, crc); if (backbone_gw->crc != crc) { batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, - "handle_announce(): CRC FAILED for %pM/%d (my = %04x, sent = %04x)\n", + "handle_announce(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", backbone_gw->orig, backbone_gw->vid, backbone_gw->crc, crc); @@ -715,7 +711,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, uint8_t *backbone_addr, uint8_t *claim_addr, short vid) { - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; /* unclaim in any case if it is our own */ if (primary_if && batadv_compare_eth(backbone_addr, @@ -744,7 +740,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, uint8_t *backbone_addr, uint8_t *claim_addr, short vid) { - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; /* register the gateway if not yet available, and add the claim. */ @@ -835,7 +831,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, /* if our mesh friends mac is bigger, use it for ourselves. 
*/ if (ntohs(bla_dst->group) > ntohs(bla_dst_own->group)) { batadv_dbg(BATADV_DBG_BLA, bat_priv, - "taking other backbones claim group: %04x\n", + "taking other backbones claim group: %#.4x\n", ntohs(bla_dst->group)); bla_dst_own->group = bla_dst->group; } @@ -958,8 +954,8 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, */ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) { - struct batadv_backbone_gw *backbone_gw; - struct hlist_node *node, *node_tmp; + struct batadv_bla_backbone_gw *backbone_gw; + struct hlist_node *node_tmp; struct hlist_head *head; struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -974,7 +970,7 @@ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(backbone_gw, node, node_tmp, + hlist_for_each_entry_safe(backbone_gw, node_tmp, head, hash_entry) { if (now) goto purge_now; @@ -993,7 +989,7 @@ purge_now: batadv_bla_del_backbone_claims(backbone_gw); - hlist_del_rcu(node); + hlist_del_rcu(&backbone_gw->hash_entry); batadv_backbone_gw_free_ref(backbone_gw); } spin_unlock_bh(list_lock); @@ -1013,8 +1009,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { - struct batadv_claim *claim; - struct hlist_node *node; + struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; int i; @@ -1027,7 +1022,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(claim, node, head, hash_entry) { + hlist_for_each_entry_rcu(claim, head, hash_entry) { if (now) goto purge_now; if (!batadv_compare_eth(claim->backbone_gw->orig, @@ -1062,8 +1057,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif) { - struct batadv_backbone_gw *backbone_gw; - struct hlist_node *node; + struct batadv_bla_backbone_gw *backbone_gw; struct hlist_head *head; struct batadv_hashtable *hash; __be16 group; @@ -1087,7 +1081,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, node, head, hash_entry) { + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { /* own orig still holds the old value. 
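The purge loops above also show the shape that hash removal takes after the iterator conversion: with the separate struct hlist_node cursor gone, hlist_del_rcu() is fed the node embedded in the entry itself. A minimal sketch of the pattern under the per-bucket lock (should_remove and entry_free_ref are hypothetical):

	spin_lock_bh(list_lock);
	hlist_for_each_entry_safe(entry, node_tmp, head, hash_entry) {
		if (!should_remove(entry))
			continue;

		/* the hlist_node is derived from the entry, not a cursor */
		hlist_del_rcu(&entry->hash_entry);
		entry_free_ref(entry);
	}
	spin_unlock_bh(list_lock);

This matches the tree-wide hlist cleanup in this kernel cycle, where the iterator macros derive the node pointer from the entry type instead of taking it as an extra argument.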
*/ if (!batadv_compare_eth(backbone_gw->orig, oldif->net_dev->dev_addr)) @@ -1104,16 +1098,6 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, } } - - -/* (re)start the timer */ -static void batadv_bla_start_timer(struct batadv_priv *bat_priv) -{ - INIT_DELAYED_WORK(&bat_priv->bla.work, batadv_bla_periodic_work); - queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, - msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); -} - /* periodic work to do: * * purge structures when they are too old * * send announcements @@ -1123,9 +1107,8 @@ static void batadv_bla_periodic_work(struct work_struct *work) struct delayed_work *delayed_work; struct batadv_priv *bat_priv; struct batadv_priv_bla *priv_bla; - struct hlist_node *node; struct hlist_head *head; - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hashtable *hash; struct batadv_hard_iface *primary_if; int i; @@ -1151,7 +1134,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, node, head, hash_entry) { + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) continue; @@ -1184,7 +1167,8 @@ out: if (primary_if) batadv_hardif_free_ref(primary_if); - batadv_bla_start_timer(bat_priv); + queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, + msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); } /* The hash for claim and backbone hash receive the same key because they @@ -1242,7 +1226,10 @@ int batadv_bla_init(struct batadv_priv *bat_priv) batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hashes initialized\n"); - batadv_bla_start_timer(bat_priv); + INIT_DELAYED_WORK(&bat_priv->bla.work, batadv_bla_periodic_work); + + queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, + msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); return 0; } @@ -1329,8 +1316,7 @@ int batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; - struct hlist_node *node; - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; int i; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) @@ -1343,7 +1329,7 @@ int batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig) head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, node, head, hash_entry) { + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (batadv_compare_eth(backbone_gw->orig, orig)) { rcu_read_unlock(); return 1; @@ -1371,7 +1357,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, { struct ethhdr *ethhdr; struct vlan_ethhdr *vhdr; - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; short vid = -1; if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance)) @@ -1442,7 +1428,7 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, short vid, bool is_bcast) { struct ethhdr *ethhdr; - struct batadv_claim search_claim, *claim = NULL; + struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; int ret; @@ -1536,7 +1522,7 @@ out: int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, short vid) { struct ethhdr *ethhdr; - struct batadv_claim search_claim, *claim = NULL; + struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; int ret = 0; @@ -1612,9 
+1598,8 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->bla.claim_hash; - struct batadv_claim *claim; + struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; - struct hlist_node *node; struct hlist_head *head; uint32_t i; bool is_own; @@ -1626,19 +1611,19 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) primary_addr = primary_if->net_dev->dev_addr; seq_printf(seq, - "Claims announced for the mesh %s (orig %pM, group id %04x)\n", + "Claims announced for the mesh %s (orig %pM, group id %#.4x)\n", net_dev->name, primary_addr, ntohs(bat_priv->bla.claim_dest.group)); - seq_printf(seq, " %-17s %-5s %-17s [o] (%-4s)\n", + seq_printf(seq, " %-17s %-5s %-17s [o] (%-6s)\n", "Client", "VID", "Originator", "CRC"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(claim, node, head, hash_entry) { + hlist_for_each_entry_rcu(claim, head, hash_entry) { is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); - seq_printf(seq, " * %pM on % 5d by %pM [%c] (%04x)\n", + seq_printf(seq, " * %pM on % 5d by %pM [%c] (%#.4x)\n", claim->addr, claim->vid, claim->backbone_gw->orig, (is_own ? 'x' : ' '), @@ -1657,9 +1642,8 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; - struct batadv_backbone_gw *backbone_gw; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; - struct hlist_node *node; struct hlist_head *head; int secs, msecs; uint32_t i; @@ -1672,16 +1656,16 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) primary_addr = primary_if->net_dev->dev_addr; seq_printf(seq, - "Backbones announced for the mesh %s (orig %pM, group id %04x)\n", + "Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n", net_dev->name, primary_addr, ntohs(bat_priv->bla.claim_dest.group)); - seq_printf(seq, " %-17s %-5s %-9s (%-4s)\n", + seq_printf(seq, " %-17s %-5s %-9s (%-6s)\n", "Originator", "VID", "last seen", "CRC"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, node, head, hash_entry) { + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime); secs = msecs / 1000; @@ -1693,7 +1677,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) continue; seq_printf(seq, - " * %pM on % 5d % 4i.%03is (%04x)\n", + " * %pM on % 5d % 4i.%03is (%#.4x)\n", backbone_gw->orig, backbone_gw->vid, secs, msecs, backbone_gw->crc); } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 196d9a0254bc..dea2fbc5d98d 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2013 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 6f58ddd53bff..f186a55b23c3 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -32,6 +32,7 @@ #include "icmp_socket.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" +#include "network-coding.h" static struct dentry *batadv_debugfs; @@ -40,13 +41,14 @@ static struct dentry *batadv_debugfs; static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; -static char *batadv_log_char_addr(struct batadv_debug_log *debug_log, +static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log, size_t idx) { return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK]; } -static void batadv_emit_log_char(struct batadv_debug_log *debug_log, char c) +static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log, + char c) { char *char_addr; @@ -59,7 +61,7 @@ static void batadv_emit_log_char(struct batadv_debug_log *debug_log, char c) } __printf(2, 3) -static int batadv_fdebug_log(struct batadv_debug_log *debug_log, +static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, const char *fmt, ...) { va_list args; @@ -114,7 +116,7 @@ static int batadv_log_release(struct inode *inode, struct file *file) return 0; } -static int batadv_log_empty(struct batadv_debug_log *debug_log) +static int batadv_log_empty(struct batadv_priv_debug_log *debug_log) { return !(debug_log->log_start - debug_log->log_end); } @@ -123,7 +125,7 @@ static ssize_t batadv_log_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct batadv_priv *bat_priv = file->private_data; - struct batadv_debug_log *debug_log = bat_priv->debug_log; + struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; int error, i = 0; char *char_addr; char c; @@ -164,7 +166,6 @@ static ssize_t batadv_log_read(struct file *file, char __user *buf, buf++; i++; - } spin_unlock_bh(&debug_log->lock); @@ -178,7 +179,7 @@ static ssize_t batadv_log_read(struct file *file, char __user *buf, static unsigned int batadv_log_poll(struct file *file, poll_table *wait) { struct batadv_priv *bat_priv = file->private_data; - struct batadv_debug_log *debug_log = bat_priv->debug_log; + struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; poll_wait(file, &debug_log->queue_wait, wait); @@ -230,7 +231,6 @@ static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) #else /* CONFIG_BATMAN_ADV_DEBUG */ static int batadv_debug_log_setup(struct batadv_priv *bat_priv) { - bat_priv->debug_log = NULL; return 0; } @@ -311,6 +311,14 @@ struct batadv_debuginfo { const struct file_operations fops; }; +#ifdef CONFIG_BATMAN_ADV_NC +static int batadv_nc_nodes_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, batadv_nc_nodes_seq_print_text, net_dev); +} +#endif + #define BATADV_DEBUGINFO(_name, _mode, _open) \ struct batadv_debuginfo batadv_debuginfo_##_name = { \ .attr = { .name = __stringify(_name), \ @@ -349,6 +357,9 @@ static BATADV_DEBUGINFO(dat_cache, S_IRUGO, batadv_dat_cache_open); static BATADV_DEBUGINFO(transtable_local, S_IRUGO, batadv_transtable_local_open); static BATADV_DEBUGINFO(vis_data, S_IRUGO, batadv_vis_data_open); +#ifdef CONFIG_BATMAN_ADV_NC +static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open); +#endif static 
struct batadv_debuginfo *batadv_mesh_debuginfos[] = { &batadv_debuginfo_originators, @@ -363,6 +374,9 @@ static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { #endif &batadv_debuginfo_transtable_local, &batadv_debuginfo_vis_data, +#ifdef CONFIG_BATMAN_ADV_NC + &batadv_debuginfo_nc_nodes, +#endif NULL, }; @@ -397,10 +411,8 @@ err: void batadv_debugfs_destroy(void) { - if (batadv_debugfs) { - debugfs_remove_recursive(batadv_debugfs); - batadv_debugfs = NULL; - } + debugfs_remove_recursive(batadv_debugfs); + batadv_debugfs = NULL; } int batadv_debugfs_add_meshif(struct net_device *dev) @@ -434,6 +446,9 @@ int batadv_debugfs_add_meshif(struct net_device *dev) } } + if (batadv_nc_init_debugfs(bat_priv) < 0) + goto rem_attr; + return 0; rem_attr: debugfs_remove_recursive(bat_priv->debug_dir); diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 3319e1f21f55..f8c3849edff4 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 553921511e4e..8e15d966d9b0 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2013 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -83,7 +83,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, { spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_dat_entry *dat_entry; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; uint32_t i; @@ -95,7 +95,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, list_lock = &bat_priv->dat.hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(dat_entry, node, node_tmp, head, + hlist_for_each_entry_safe(dat_entry, node_tmp, head, hash_entry) { /* if an helper function has been passed as parameter, * ask it if the entry has to be purged or not @@ -103,7 +103,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, if (to_purge && !to_purge(dat_entry)) continue; - hlist_del_rcu(node); + hlist_del_rcu(&dat_entry->hash_entry); batadv_dat_entry_free_ref(dat_entry); } spin_unlock_bh(list_lock); @@ -235,7 +235,6 @@ static struct batadv_dat_entry * batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip) { struct hlist_head *head; - struct hlist_node *node; struct batadv_dat_entry *dat_entry, *dat_entry_tmp = NULL; struct batadv_hashtable *hash = bat_priv->dat.hash; uint32_t index; @@ -247,7 +246,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip) head = &hash->table[index]; rcu_read_lock(); - hlist_for_each_entry_rcu(dat_entry, node, head, hash_entry) { + hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { if (dat_entry->ip != ip) continue; @@ -465,7 +464,6 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, batadv_dat_addr_t max = 0, tmp_max = 0; struct batadv_orig_node *orig_node, *max_orig_node = NULL; struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node; struct hlist_head *head; int i; @@ -481,7 +479,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, 
head, hash_entry) { /* the dht space is a ring and addresses are unsigned */ tmp_max = BATADV_DAT_ADDR_MAX - orig_node->dat_addr + ip_key; @@ -686,7 +684,6 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) struct batadv_hashtable *hash = bat_priv->dat.hash; struct batadv_dat_entry *dat_entry; struct batadv_hard_iface *primary_if; - struct hlist_node *node; struct hlist_head *head; unsigned long last_seen_jiffies; int last_seen_msecs, last_seen_secs, last_seen_mins; @@ -704,7 +701,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(dat_entry, node, head, hash_entry) { + hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { last_seen_jiffies = jiffies - dat_entry->last_update; last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); last_seen_mins = last_seen_msecs / 60000; @@ -819,7 +816,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, bool ret = false; struct batadv_dat_entry *dat_entry = NULL; struct sk_buff *skb_new; - struct batadv_hard_iface *primary_if = NULL; if (!atomic_read(&bat_priv->distributed_arp_table)) goto out; @@ -841,22 +837,18 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst); if (dat_entry) { - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; - skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src, - primary_if->soft_iface, ip_dst, hw_src, + bat_priv->soft_iface, ip_dst, hw_src, dat_entry->mac_addr, hw_src); if (!skb_new) goto out; skb_reset_mac_header(skb_new); skb_new->protocol = eth_type_trans(skb_new, - primary_if->soft_iface); + bat_priv->soft_iface); bat_priv->stats.rx_packets++; bat_priv->stats.rx_bytes += skb->len + ETH_HLEN; - primary_if->soft_iface->last_rx = jiffies; + bat_priv->soft_iface->last_rx = jiffies; netif_rx(skb_new); batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n"); @@ -869,8 +861,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, out: if (dat_entry) batadv_dat_entry_free_ref(dat_entry); - if (primary_if) - batadv_hardif_free_ref(primary_if); return ret; } @@ -890,7 +880,6 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, __be32 ip_src, ip_dst; uint8_t *hw_src; struct sk_buff *skb_new; - struct batadv_hard_iface *primary_if = NULL; struct batadv_dat_entry *dat_entry = NULL; bool ret = false; int err; @@ -915,12 +904,8 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, if (!dat_entry) goto out; - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; - skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src, - primary_if->soft_iface, ip_dst, hw_src, + bat_priv->soft_iface, ip_dst, hw_src, dat_entry->mac_addr, hw_src); if (!skb_new) @@ -944,8 +929,6 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, out: if (dat_entry) batadv_dat_entry_free_ref(dat_entry); - if (primary_if) - batadv_hardif_free_ref(primary_if); if (ret) kfree_skb(skb); return ret; diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index d060c033e7de..125c8c6fcfad 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2013 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index dd07c7e3654f..f105219f4a4b 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -114,7 +114,6 @@ static struct batadv_gw_node * batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_neigh_node *router; - struct hlist_node *node; struct batadv_gw_node *gw_node, *curr_gw = NULL; uint32_t max_gw_factor = 0, tmp_gw_factor = 0; uint32_t gw_divisor; @@ -127,7 +126,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) gw_divisor *= 64; rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw.list, list) { + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { if (gw_node->deleted) continue; @@ -344,7 +343,6 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, uint8_t new_gwflags) { - struct hlist_node *node; struct batadv_gw_node *gw_node, *curr_gw; /* Note: We don't need a NULL check here, since curr_gw never gets @@ -355,7 +353,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw.list, list) { + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { if (gw_node->orig_node != orig_node) continue; @@ -403,7 +401,7 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, void batadv_gw_node_purge(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node, *curr_gw; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT); int do_deselect = 0; @@ -411,7 +409,7 @@ void batadv_gw_node_purge(struct batadv_priv *bat_priv) spin_lock_bh(&bat_priv->gw.list_lock); - hlist_for_each_entry_safe(gw_node, node, node_tmp, + hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { if (((!gw_node->deleted) || (time_before(jiffies, gw_node->deleted + timeout))) && @@ -476,7 +474,6 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hard_iface *primary_if; struct batadv_gw_node *gw_node; - struct hlist_node *node; int gw_count = 0; primary_if = batadv_seq_print_text_primary_if_get(seq); @@ -490,7 +487,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) primary_if->net_dev->dev_addr, net_dev->name); rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw.list, list) { + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { if (gw_node->deleted) continue; @@ -503,7 +500,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) rcu_read_unlock(); if (gw_count == 0) - seq_printf(seq, "No gateways in range ...\n"); + seq_puts(seq, "No gateways in range ...\n"); out: if (primary_if) diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index f0d129e323c8..039902dca4a6 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2013 B.A.T.M.A.N. 
contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 9001208d1752..84bb2b18d711 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 13697f6e7113..509b2bf8c2f4 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index f1d37cd81815..522243aff2f3 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -307,11 +307,35 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) batadv_update_min_mtu(hard_iface->soft_iface); } +/** + * batadv_master_del_slave - remove hard_iface from the current master interface + * @slave: the interface enslaved in another master + * @master: the master from which slave has to be removed + * + * Invoke ndo_del_slave on master passing slave as argument. In this way slave + * is free'd and master can correctly change its internal state. + * Return 0 on success, a negative value representing the error otherwise + */ +static int batadv_master_del_slave(struct batadv_hard_iface *slave, + struct net_device *master) +{ + int ret; + + if (!master) + return 0; + + ret = -EBUSY; + if (master->netdev_ops->ndo_del_slave) + ret = master->netdev_ops->ndo_del_slave(master, slave->net_dev); + + return ret; +} + int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, const char *iface_name) { struct batadv_priv *bat_priv; - struct net_device *soft_iface; + struct net_device *soft_iface, *master; __be16 ethertype = __constant_htons(ETH_P_BATMAN); int ret; @@ -321,11 +345,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (!atomic_inc_not_zero(&hard_iface->refcount)) goto out; - /* hard-interface is part of a bridge */ - if (hard_iface->net_dev->priv_flags & IFF_BRIDGE_PORT) - pr_err("You are about to enable batman-adv on '%s' which already is part of a bridge. 
Unless you know exactly what you are doing this is probably wrong and won't work the way you think it would.\n", - hard_iface->net_dev->name); - soft_iface = dev_get_by_name(&init_net, iface_name); if (!soft_iface) { @@ -347,12 +366,24 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, goto err_dev; } + /* check if the interface is enslaved in another virtual one and + * in that case unlink it first + */ + master = netdev_master_upper_dev_get(hard_iface->net_dev); + ret = batadv_master_del_slave(hard_iface, master); + if (ret) + goto err_dev; + hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); + ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface); + if (ret) + goto err_dev; + ret = bat_priv->bat_algo_ops->bat_iface_enable(hard_iface); if (ret < 0) - goto err_dev; + goto err_upper; hard_iface->if_num = bat_priv->num_ifaces; bat_priv->num_ifaces++; @@ -362,7 +393,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); bat_priv->num_ifaces--; hard_iface->if_status = BATADV_IF_NOT_IN_USE; - goto err_dev; + goto err_upper; } hard_iface->batman_adv_ptype.type = ethertype; @@ -401,14 +432,18 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, out: return 0; +err_upper: + netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface); err_dev: + hard_iface->soft_iface = NULL; dev_put(soft_iface); err: batadv_hardif_free_ref(hard_iface); return ret; } -void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) +void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, + enum batadv_hard_if_cleanup autodel) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; @@ -446,9 +481,10 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) dev_put(hard_iface->soft_iface); /* nobody uses this interface anymore */ - if (!bat_priv->num_ifaces) - batadv_softif_destroy(hard_iface->soft_iface); + if (!bat_priv->num_ifaces && autodel == BATADV_IF_CLEANUP_AUTO) + batadv_softif_destroy_sysfs(hard_iface->soft_iface); + netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); hard_iface->soft_iface = NULL; batadv_hardif_free_ref(hard_iface); @@ -457,6 +493,24 @@ out: batadv_hardif_free_ref(primary_if); } +/** + * batadv_hardif_remove_interface_finish - cleans up the remains of a hardif + * @work: work queue item + * + * Free the parts of the hard interface which can not be removed under + * rtnl lock (to prevent deadlock situations). 
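The new autodel parameter of batadv_hardif_disable_interface() lets a caller choose whether removing the last slave also destroys the soft interface. A short sketch of the two call styles (caller context hypothetical; the remove path above really does pass BATADV_IF_CLEANUP_AUTO):

	/* keep the soft interface alive even after its last slave is gone */
	batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_KEEP);

	/* teardown path: destroy the soft interface once it has no slaves,
	 * as batadv_hardif_remove_interface() does
	 */
	batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_AUTO);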
+ */ +static void batadv_hardif_remove_interface_finish(struct work_struct *work) +{ + struct batadv_hard_iface *hard_iface; + + hard_iface = container_of(work, struct batadv_hard_iface, + cleanup_work); + + batadv_sysfs_del_hardif(&hard_iface->hardif_obj); + batadv_hardif_free_ref(hard_iface); +} + static struct batadv_hard_iface * batadv_hardif_add_interface(struct net_device *net_dev) { @@ -484,6 +538,9 @@ batadv_hardif_add_interface(struct net_device *net_dev) hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; INIT_LIST_HEAD(&hard_iface->list); + INIT_WORK(&hard_iface->cleanup_work, + batadv_hardif_remove_interface_finish); + /* extra reference for return */ atomic_set(&hard_iface->refcount, 2); @@ -512,14 +569,14 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) /* first deactivate interface */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) - batadv_hardif_disable_interface(hard_iface); + batadv_hardif_disable_interface(hard_iface, + BATADV_IF_CLEANUP_AUTO); if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) return; hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; - batadv_sysfs_del_hardif(&hard_iface->hardif_obj); - batadv_hardif_free_ref(hard_iface); + queue_work(batadv_event_workqueue, &hard_iface->cleanup_work); } void batadv_hardif_remove_interfaces(void) @@ -543,6 +600,11 @@ static int batadv_hard_if_event(struct notifier_block *this, struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; + if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) { + batadv_sysfs_add_meshif(net_dev); + return NOTIFY_DONE; + } + hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && event == NETDEV_REGISTER) hard_iface = batadv_hardif_add_interface(net_dev); diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 3732366e7445..49892881a7c5 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -29,13 +29,24 @@ enum batadv_hard_if_state { BATADV_IF_I_WANT_YOU, }; +/** + * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal + * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface + * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed + */ +enum batadv_hard_if_cleanup { + BATADV_IF_CLEANUP_KEEP, + BATADV_IF_CLEANUP_AUTO, +}; + extern struct notifier_block batadv_hard_if_notifier; struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, const char *iface_name); -void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface); +void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, + enum batadv_hard_if_cleanup autodel); void batadv_hardif_remove_interfaces(void); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 15a849c2d414..7198dafd3bf3 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2013 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index e05333905afd..1b4da72f2093 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2013 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -89,7 +89,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, * * Returns the new hash value. */ -static inline uint32_t batadv_hash_bytes(uint32_t hash, void *data, +static inline uint32_t batadv_hash_bytes(uint32_t hash, const void *data, uint32_t size) { const unsigned char *key = data; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 87ca8095b011..0ba6c899b2d3 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 29443a1dbb5c..1fcca37b6223 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index f65a222b7b83..3e30a0f1b908 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -35,6 +35,7 @@ #include "vis.h" #include "hash.h" #include "bat_algo.h" +#include "network-coding.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, @@ -70,6 +71,7 @@ static int __init batadv_init(void) batadv_debugfs_init(); register_netdevice_notifier(&batadv_hard_if_notifier); + rtnl_link_register(&batadv_link_ops); pr_info("B.A.T.M.A.N. 
advanced %s (compatibility version %i) loaded\n", BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); @@ -80,6 +82,7 @@ static int __init batadv_init(void) static void __exit batadv_exit(void) { batadv_debugfs_destroy(); + rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); batadv_hardif_remove_interfaces(); @@ -135,6 +138,10 @@ int batadv_mesh_init(struct net_device *soft_iface) if (ret < 0) goto err; + ret = batadv_nc_init(bat_priv); + if (ret < 0) + goto err; + atomic_set(&bat_priv->gw.reselect, 0); atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE); @@ -157,6 +164,7 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_gw_node_purge(bat_priv); batadv_originator_free(bat_priv); + batadv_nc_free(bat_priv); batadv_tt_free(bat_priv); @@ -169,7 +177,13 @@ void batadv_mesh_free(struct net_device *soft_iface) atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); } -int batadv_is_my_mac(const uint8_t *addr) +/** + * batadv_is_my_mac - check if the given mac address belongs to any of the real + * interfaces in the current mesh + * @bat_priv: the bat priv with all the soft interface information + * @addr: the address to check + */ +int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) { const struct batadv_hard_iface *hard_iface; @@ -178,6 +192,9 @@ int batadv_is_my_mac(const uint8_t *addr) if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { rcu_read_unlock(); return 1; @@ -345,9 +362,8 @@ void batadv_recv_handler_unregister(uint8_t packet_type) static struct batadv_algo_ops *batadv_algo_get(char *name) { struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; - struct hlist_node *node; - hlist_for_each_entry(bat_algo_ops_tmp, node, &batadv_algo_list, list) { + hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { if (strcmp(bat_algo_ops_tmp->name, name) != 0) continue; @@ -411,11 +427,10 @@ out: int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) { struct batadv_algo_ops *bat_algo_ops; - struct hlist_node *node; - seq_printf(seq, "Available routing algorithms:\n"); + seq_puts(seq, "Available routing algorithms:\n"); - hlist_for_each_entry(bat_algo_ops, node, &batadv_algo_list, list) { + hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { seq_printf(seq, "%s\n", bat_algo_ops->name); } diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2f85577086a7..59a0d6af15c8 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -26,7 +26,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2012.5.0" +#define BATADV_SOURCE_VERSION "2013.2.0" #endif /* B.A.T.M.A.N. 
parameters */ @@ -41,9 +41,11 @@ * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE */ #define BATADV_PURGE_TIMEOUT 200000 /* 200 seconds */ -#define BATADV_TT_LOCAL_TIMEOUT 3600000 /* in milliseconds */ +#define BATADV_TT_LOCAL_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_ROAM_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ +#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ +#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ #define BATADV_DAT_ENTRY_TIMEOUT (5*60000) /* 5 mins in milliseconds */ /* sliding packet range of received originator messages in sequence numbers * (should be a multiple of our word size) @@ -103,6 +105,8 @@ #define BATADV_RESET_PROTECTION_MS 30000 #define BATADV_EXPECTED_SEQNO_RANGE 65536 +#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ + enum batadv_mesh_state { BATADV_MESH_INACTIVE, BATADV_MESH_ACTIVE, @@ -148,6 +152,7 @@ enum batadv_uev_type { #include <linux/percpu.h> #include <linux/slab.h> #include <net/sock.h> /* struct sock */ +#include <net/rtnetlink.h> #include <linux/jiffies.h> #include <linux/seq_file.h> #include "types.h" @@ -160,7 +165,7 @@ extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); -int batadv_is_my_mac(const uint8_t *addr); +int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq); int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, @@ -183,6 +188,7 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); * @BATADV_DBG_TT: translation table messages * @BATADV_DBG_BLA: bridge loop avoidance messages * @BATADV_DBG_DAT: ARP snooping and DAT related messages + * @BATADV_DBG_NC: network coding related messages * @BATADV_DBG_ALL: the union of all the above log levels */ enum batadv_dbg_level { @@ -191,7 +197,8 @@ enum batadv_dbg_level { BATADV_DBG_TT = BIT(2), BATADV_DBG_BLA = BIT(3), BATADV_DBG_DAT = BIT(4), - BATADV_DBG_ALL = 31, + BATADV_DBG_NC = BIT(5), + BATADV_DBG_ALL = 63, }; #ifdef CONFIG_BATMAN_ADV_DEBUG @@ -276,9 +283,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, size_t count) { - int cpu = get_cpu(); - per_cpu_ptr(bat_priv->bat_counters, cpu)[idx] += count; - put_cpu(); + this_cpu_add(bat_priv->bat_counters[idx], count); } #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) @@ -298,4 +303,10 @@ static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv, return sum; } +/* Define a macro to reach the control buffer of the skb. The members of the + * control buffer are defined in struct batadv_skb_cb in types.h. + * The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h. + */ +#define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) + #endif /* _NET_BATMAN_ADV_MAIN_H_ */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c new file mode 100644 index 000000000000..f7c54305a918 --- /dev/null +++ b/net/batman-adv/network-coding.c @@ -0,0 +1,1822 @@ +/* Copyright (C) 2012-2013 B.A.T.M.A.N. contributors: + * + * Martin Hundebøll, Jeppe Ledet-Pedersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/debugfs.h> + +#include "main.h" +#include "hash.h" +#include "network-coding.h" +#include "send.h" +#include "originator.h" +#include "hard-interface.h" +#include "routing.h" + +static struct lock_class_key batadv_nc_coding_hash_lock_class_key; +static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; + +static void batadv_nc_worker(struct work_struct *work); +static int batadv_nc_recv_coded_packet(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); + +/** + * batadv_nc_start_timer - initialise the nc periodic worker + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_nc_start_timer(struct batadv_priv *bat_priv) +{ + queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work, + msecs_to_jiffies(10)); +} + +/** + * batadv_nc_init - initialise coding hash table and start housekeeping + * @bat_priv: the bat priv with all the soft interface information + */ +int batadv_nc_init(struct batadv_priv *bat_priv) +{ + bat_priv->nc.timestamp_fwd_flush = jiffies; + bat_priv->nc.timestamp_sniffed_purge = jiffies; + + if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash) + return 0; + + bat_priv->nc.coding_hash = batadv_hash_new(128); + if (!bat_priv->nc.coding_hash) + goto err; + + batadv_hash_set_lock_class(bat_priv->nc.coding_hash, + &batadv_nc_coding_hash_lock_class_key); + + bat_priv->nc.decoding_hash = batadv_hash_new(128); + if (!bat_priv->nc.decoding_hash) + goto err; + + batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, + &batadv_nc_decoding_hash_lock_class_key); + + /* Register our packet type */ + if (batadv_recv_handler_register(BATADV_CODED, + batadv_nc_recv_coded_packet) < 0) + goto err; + + INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker); + batadv_nc_start_timer(bat_priv); + + return 0; + +err: + return -ENOMEM; +} + +/** + * batadv_nc_init_bat_priv - initialise the nc specific bat_priv variables + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) +{ + atomic_set(&bat_priv->network_coding, 1); + bat_priv->nc.min_tq = 200; + bat_priv->nc.max_fwd_delay = 10; + bat_priv->nc.max_buffer_time = 200; +} + +/** + * batadv_nc_init_orig - initialise the nc fields of an orig_node + * @orig_node: the orig_node which is going to be initialised + */ +void batadv_nc_init_orig(struct batadv_orig_node *orig_node) +{ + INIT_LIST_HEAD(&orig_node->in_coding_list); + INIT_LIST_HEAD(&orig_node->out_coding_list); + spin_lock_init(&orig_node->in_coding_list_lock); + spin_lock_init(&orig_node->out_coding_list_lock); +} + +/** + * batadv_nc_node_free_rcu - rcu callback to free an nc node and drop + * its reference on the orig_node + * @rcu: rcu pointer of the nc node + */ +static void batadv_nc_node_free_rcu(struct rcu_head *rcu) +{ + struct batadv_nc_node *nc_node; + + nc_node = container_of(rcu, struct batadv_nc_node, rcu); + batadv_orig_node_free_ref(nc_node->orig_node); + kfree(nc_node); +} + +/** + * batadv_nc_node_free_ref - decrements the nc node refcounter
and possibly + * frees it + * @nc_node: the nc node to free + */ +static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) +{ + if (atomic_dec_and_test(&nc_node->refcount)) + call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu); +} + +/** + * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly + * frees it + * @nc_path: the nc path to free + */ +static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) +{ + if (atomic_dec_and_test(&nc_path->refcount)) + kfree_rcu(nc_path, rcu); +} + +/** + * batadv_nc_packet_free - frees nc packet + * @nc_packet: the nc packet to free + */ +static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) +{ + if (nc_packet->skb) + kfree_skb(nc_packet->skb); + + batadv_nc_path_free_ref(nc_packet->nc_path); + kfree(nc_packet); +} + +/** + * batadv_nc_to_purge_nc_node - checks whether an nc node has to be purged + * @bat_priv: the bat priv with all the soft interface information + * @nc_node: the nc node to check + * + * Returns true if the entry has to be purged now, false otherwise + */ +static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, + struct batadv_nc_node *nc_node) +{ + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + return true; + + return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT); +} + +/** + * batadv_nc_to_purge_nc_path_coding - checks whether an nc path has timed out + * @bat_priv: the bat priv with all the soft interface information + * @nc_path: the nc path to check + * + * Returns true if the entry has to be purged now, false otherwise + */ +static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, + struct batadv_nc_path *nc_path) +{ + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + return true; + + /* purge the path when no packet has been added for 10 times the + * max_fwd_delay time + */ + return batadv_has_timed_out(nc_path->last_valid, + bat_priv->nc.max_fwd_delay * 10); +} + +/** + * batadv_nc_to_purge_nc_path_decoding - checks whether an nc path has timed out + * @bat_priv: the bat priv with all the soft interface information + * @nc_path: the nc path to check + * + * Returns true if the entry has to be purged now, false otherwise + */ +static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, + struct batadv_nc_path *nc_path) +{ + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + return true; + + /* purge the path when no packet has been added for 10 times the + * max_buffer time + */ + return batadv_has_timed_out(nc_path->last_valid, + bat_priv->nc.max_buffer_time * 10); +} + +/** + * batadv_nc_purge_orig_nc_nodes - go through list of nc nodes and purge stale + * entries + * @bat_priv: the bat priv with all the soft interface information + * @list: list of nc nodes + * @lock: nc node list lock + * @to_purge: function in charge of deciding whether an entry has to be purged or + * not.
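All of the to_purge predicates above reduce to the same jiffies comparison; batadv_has_timed_out() in main.h is essentially the following (a sketch with a hypothetical name, timeout given in milliseconds):

	static inline bool example_has_timed_out(unsigned long timestamp,
						 unsigned int timeout_ms)
	{
		return time_is_before_jiffies(timestamp +
					      msecs_to_jiffies(timeout_ms));
	}

So, for instance, a coding-side nc_path is considered stale once nc_path->last_valid is older than 10 * max_fwd_delay milliseconds, i.e. 100 ms with the defaults set in batadv_nc_init_bat_priv().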
This function takes the nc node as argument and has to return + * a boolean value: true if the entry has to be deleted, false + * otherwise + */ +static void +batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, + struct list_head *list, + spinlock_t *lock, + bool (*to_purge)(struct batadv_priv *, + struct batadv_nc_node *)) +{ + struct batadv_nc_node *nc_node, *nc_node_tmp; + + /* For each nc_node in list */ + spin_lock_bh(lock); + list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) { + /* if a helper function has been passed as parameter, + * ask it if the entry has to be purged or not + */ + if (to_purge && !to_purge(bat_priv, nc_node)) + continue; + + batadv_dbg(BATADV_DBG_NC, bat_priv, + "Removing nc_node %pM -> %pM\n", + nc_node->addr, nc_node->orig_node->orig); + list_del_rcu(&nc_node->list); + batadv_nc_node_free_ref(nc_node); + } + spin_unlock_bh(lock); +} + +/** + * batadv_nc_purge_orig - purges all nc node data attached to the given + * originator + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig_node with the nc node entries to be purged + * @to_purge: function in charge of deciding whether an entry has to be purged or + * not. This function takes the nc node as argument and has to return + * a boolean value: true if the entry has to be deleted, false + * otherwise + */ +void batadv_nc_purge_orig(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + bool (*to_purge)(struct batadv_priv *, + struct batadv_nc_node *)) +{ + /* Check incoming nc_nodes of this orig_node */ + batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list, + &orig_node->in_coding_list_lock, + to_purge); + + /* Check outgoing nc_nodes of this orig_node */ + batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list, + &orig_node->out_coding_list_lock, + to_purge); +} + +/** + * batadv_nc_purge_orig_hash - traverse the entire originator hash and purge + * timed out nc nodes + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct hlist_head *head; + struct batadv_orig_node *orig_node; + uint32_t i; + + if (!hash) + return; + + /* For each orig_node */ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) + batadv_nc_purge_orig(bat_priv, orig_node, + batadv_nc_to_purge_nc_node); + rcu_read_unlock(); + } +} + +/** + * batadv_nc_purge_paths - traverse all nc paths in the hash and remove + * unused ones + * @bat_priv: the bat priv with all the soft interface information + * @hash: hash table containing the nc paths to check + * @to_purge: function in charge of deciding whether an entry has to be purged or + * not.
This function takes the nc path as argument and has to return + * a boolean value: true if the entry has to be deleted, false + * otherwise + */ +static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, + struct batadv_hashtable *hash, + bool (*to_purge)(struct batadv_priv *, + struct batadv_nc_path *)) +{ + struct hlist_head *head; + struct hlist_node *node_tmp; + struct batadv_nc_path *nc_path; + spinlock_t *lock; /* Protects lists in hash */ + uint32_t i; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + lock = &hash->list_locks[i]; + + /* For each nc_path in this bin */ + spin_lock_bh(lock); + hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) { + /* if a helper function has been passed as parameter, + * ask it if the entry has to be purged or not + */ + if (to_purge && !to_purge(bat_priv, nc_path)) + continue; + + /* purging a non-empty nc_path should never happen, but + * is observed under high CPU load. Delay the purging + * until next iteration to allow the packet_list to be + * emptied first. + */ + if (unlikely(!list_empty(&nc_path->packet_list))) { + net_ratelimited_function(printk, + KERN_WARNING + "Skipping free of non-empty nc_path (%pM -> %pM)!\n", + nc_path->prev_hop, + nc_path->next_hop); + continue; + } + + /* nc_path is unused, so remove it */ + batadv_dbg(BATADV_DBG_NC, bat_priv, + "Remove nc_path %pM -> %pM\n", + nc_path->prev_hop, nc_path->next_hop); + hlist_del_rcu(&nc_path->hash_entry); + batadv_nc_path_free_ref(nc_path); + } + spin_unlock_bh(lock); + } +} + +/** + * batadv_nc_hash_key_gen - computes the nc_path hash key + * @key: buffer to hold the final hash key + * @src: source ethernet mac address going into the hash key + * @dst: destination ethernet mac address going into the hash key + */ +static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, + const char *dst) +{ + memcpy(key->prev_hop, src, sizeof(key->prev_hop)); + memcpy(key->next_hop, dst, sizeof(key->next_hop)); +} + +/** + * batadv_nc_hash_choose - compute the hash value for an nc path + * @data: data to hash + * @size: size of the hash table + * + * Returns the selected index in the hash table for the given data.
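Since an nc_path is keyed purely by its (prev_hop, next_hop) MAC pair, a lookup needs nothing more than a stack-allocated search key. A hypothetical caller of batadv_nc_hash_key_gen() and of batadv_nc_hash_find() defined just below (ethhdr and neigh_addr are assumed to exist in the caller):

	struct batadv_nc_path search_key, *nc_path;

	batadv_nc_hash_key_gen(&search_key, ethhdr->h_source, neigh_addr);
	nc_path = batadv_nc_hash_find(bat_priv->nc.coding_hash, &search_key);
	if (nc_path) {
		/* ... use the path ... */
		batadv_nc_path_free_ref(nc_path);	/* drop lookup ref */
	}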
+static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size)
+{
+	const struct batadv_nc_path *nc_path = data;
+	uint32_t hash = 0;
+
+	hash = batadv_hash_bytes(hash, &nc_path->prev_hop,
+				 sizeof(nc_path->prev_hop));
+	hash = batadv_hash_bytes(hash, &nc_path->next_hop,
+				 sizeof(nc_path->next_hop));
+
+	hash += (hash << 3);
+	hash ^= (hash >> 11);
+	hash += (hash << 15);
+
+	return hash % size;
+}
+
+/**
+ * batadv_nc_hash_compare - comparing function used in the network coding hash
+ *  tables
+ * @node: node in the local table
+ * @data2: second object to compare the node to
+ *
+ * Returns 1 if the two entries are the same, 0 otherwise
+ */
+static int batadv_nc_hash_compare(const struct hlist_node *node,
+				  const void *data2)
+{
+	const struct batadv_nc_path *nc_path1, *nc_path2;
+
+	nc_path1 = container_of(node, struct batadv_nc_path, hash_entry);
+	nc_path2 = data2;
+
+	/* Return 1 if the two keys are identical */
+	if (memcmp(nc_path1->prev_hop, nc_path2->prev_hop,
+		   sizeof(nc_path1->prev_hop)) != 0)
+		return 0;
+
+	if (memcmp(nc_path1->next_hop, nc_path2->next_hop,
+		   sizeof(nc_path1->next_hop)) != 0)
+		return 0;
+
+	return 1;
+}
+
+/**
+ * batadv_nc_hash_find - search for an existing nc path and return it
+ * @hash: hash table containing the nc path
+ * @data: search key
+ *
+ * Returns the nc_path if found, NULL otherwise.
+ */
+static struct batadv_nc_path *
+batadv_nc_hash_find(struct batadv_hashtable *hash,
+		    void *data)
+{
+	struct hlist_head *head;
+	struct batadv_nc_path *nc_path, *nc_path_tmp = NULL;
+	int index;
+
+	if (!hash)
+		return NULL;
+
+	index = batadv_nc_hash_choose(data, hash->size);
+	head = &hash->table[index];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
+		if (!batadv_nc_hash_compare(&nc_path->hash_entry, data))
+			continue;
+
+		if (!atomic_inc_not_zero(&nc_path->refcount))
+			continue;
+
+		nc_path_tmp = nc_path;
+		break;
+	}
+	rcu_read_unlock();
+
+	return nc_path_tmp;
+}
+
+/**
+ * batadv_nc_send_packet - send non-coded packet and free nc_packet struct
+ * @nc_packet: the nc packet to send
+ */
+static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
+{
+	batadv_send_skb_packet(nc_packet->skb,
+			       nc_packet->neigh_node->if_incoming,
+			       nc_packet->nc_path->next_hop);
+	nc_packet->skb = NULL;
+	batadv_nc_packet_free(nc_packet);
+}
+
+/**
+ * batadv_nc_sniffed_purge - Checks timestamp of given sniffed nc_packet.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path the packet belongs to
+ * @nc_packet: the nc packet to be checked
+ *
+ * Checks whether the given sniffed (overheard) nc_packet has hit its buffering
+ * timeout. If so, the packet is no longer kept and the entry deleted from the
+ * queue. Has to be called with the appropriate locks.
+ *
+ * Returns false if the entry in the FIFO queue has not yet timed out,
+ * true otherwise.
+ */
+static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
+				    struct batadv_nc_path *nc_path,
+				    struct batadv_nc_packet *nc_packet)
+{
+	unsigned long timeout = bat_priv->nc.max_buffer_time;
+	bool res = false;
+
+	/* Packets are added to tail, so the remaining packets did not time
+	 * out and we can stop processing the current queue
+	 */
+	if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
+	    !batadv_has_timed_out(nc_packet->timestamp, timeout))
+		goto out;
+
+	/* purge nc packet */
+	list_del(&nc_packet->list);
+	batadv_nc_packet_free(nc_packet);
+
+	res = true;
+
+out:
+	return res;
+}
+
+/**
+ * batadv_nc_fwd_flush - Checks the timestamp of the given nc packet.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path the packet belongs to
+ * @nc_packet: the nc packet to be checked
+ *
+ * Checks whether the given nc packet has hit its forward timeout. If so, the
+ * packet is no longer delayed, immediately sent and the entry deleted from the
+ * queue. Has to be called with the appropriate locks.
+ *
+ * Returns false if the entry in the FIFO queue has not yet timed out,
+ * true otherwise.
+ */
+static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
+				struct batadv_nc_path *nc_path,
+				struct batadv_nc_packet *nc_packet)
+{
+	unsigned long timeout = bat_priv->nc.max_fwd_delay;
+
+	/* Packets are added to tail, so the remaining packets did not time
+	 * out and we can stop processing the current queue
+	 */
+	if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
+	    !batadv_has_timed_out(nc_packet->timestamp, timeout))
+		return false;
+
+	/* Send packet */
+	batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
+	batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
+			   nc_packet->skb->len + ETH_HLEN);
+	list_del(&nc_packet->list);
+	batadv_nc_send_packet(nc_packet);
+
+	return true;
+}
+
+/**
+ * batadv_nc_process_nc_paths - traverse given nc packet pool and free timed
+ *  out nc packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: to be processed hash table
+ * @process_fn: function called to process given nc packet. Should return true
+ *	       if this function is to proceed with the next packet; otherwise
+ *	       the rest of the current queue is skipped.
+ */
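Both flush helpers above rely on the same queue invariant: packets are appended at the tail, so entries are ordered by age and a walk from the head can stop at the first entry that has not yet timed out. A minimal user-space model of that walk (list and timing helpers are simplified stand-ins, not kernel API):

	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	struct pkt {
		time_t timestamp;	/* time of enqueue */
		struct pkt *next;
	};

	static bool has_timed_out(time_t timestamp, time_t timeout)
	{
		return time(NULL) > timestamp + timeout;
	}

	/* Pop expired packets from the head; stop at the first fresh entry,
	 * because everything behind it is newer.
	 */
	static struct pkt *flush_expired(struct pkt *head, time_t timeout)
	{
		while (head && has_timed_out(head->timestamp, timeout)) {
			struct pkt *expired = head;

			head = head->next;
			printf("flushing packet queued at %ld\n",
			       (long)expired->timestamp);
			/* the kernel code would send or free the packet here */
		}
		return head;
	}

	int main(void)
	{
		struct pkt p2 = { time(NULL), NULL };
		struct pkt p1 = { time(NULL) - 10, &p2 };	/* older, at head */

		flush_expired(&p1, 5);	/* flushes p1, keeps p2 */
		return 0;
	}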
+static void
+batadv_nc_process_nc_paths(struct batadv_priv *bat_priv,
+			   struct batadv_hashtable *hash,
+			   bool (*process_fn)(struct batadv_priv *,
+					      struct batadv_nc_path *,
+					      struct batadv_nc_packet *))
+{
+	struct hlist_head *head;
+	struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
+	struct batadv_nc_path *nc_path;
+	bool ret;
+	int i;
+
+	if (!hash)
+		return;
+
+	/* Loop hash table bins */
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+
+		/* Loop coding paths */
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
+			/* Loop packets */
+			spin_lock_bh(&nc_path->packet_list_lock);
+			list_for_each_entry_safe(nc_packet, nc_packet_tmp,
+						 &nc_path->packet_list, list) {
+				ret = process_fn(bat_priv, nc_path, nc_packet);
+				if (!ret)
+					break;
+			}
+			spin_unlock_bh(&nc_path->packet_list_lock);
+		}
+		rcu_read_unlock();
+	}
+}
+
+/**
+ * batadv_nc_worker - periodic task for housekeeping related to network coding
+ * @work: kernel work struct
+ */
+static void batadv_nc_worker(struct work_struct *work)
+{
+	struct delayed_work *delayed_work;
+	struct batadv_priv_nc *priv_nc;
+	struct batadv_priv *bat_priv;
+	unsigned long timeout;
+
+	delayed_work = container_of(work, struct delayed_work, work);
+	priv_nc = container_of(delayed_work, struct batadv_priv_nc, work);
+	bat_priv = container_of(priv_nc, struct batadv_priv, nc);
+
+	batadv_nc_purge_orig_hash(bat_priv);
+	batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash,
+			      batadv_nc_to_purge_nc_path_coding);
+	batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash,
+			      batadv_nc_to_purge_nc_path_decoding);
+
+	timeout = bat_priv->nc.max_fwd_delay;
+
+	if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) {
+		batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash,
+					   batadv_nc_fwd_flush);
+		bat_priv->nc.timestamp_fwd_flush = jiffies;
+	}
+
+	if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge,
+				 bat_priv->nc.max_buffer_time)) {
+		batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash,
+					   batadv_nc_sniffed_purge);
+		bat_priv->nc.timestamp_sniffed_purge = jiffies;
+	}
+
+	/* Schedule a new check */
+	batadv_nc_start_timer(bat_priv);
+}
+
+/**
+ * batadv_can_nc_with_orig - checks whether the given orig node is suitable for
+ *  coding or not
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: neighboring orig node which may be used as nc candidate
+ * @ogm_packet: incoming ogm packet also used for the checks
+ *
+ * Returns true if all of the following conditions hold:
+ * 1) the OGM has the most recent sequence number;
+ * 2) the TTL was decremented by exactly one;
+ * 3) the OGM was received from the first hop from orig_node;
+ * 4) the TQ value of the OGM is above bat_priv->nc.min_tq.
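Written as a stand-alone predicate, the four acceptance checks look like this (the struct fields are simplified stand-ins for the kernel types; byte-order conversion is omitted):

	#include <stdbool.h>
	#include <stdint.h>
	#include <string.h>

	struct ogm {
		uint32_t seqno;		/* already in host byte order here */
		uint8_t ttl;
		uint8_t tq;
		uint8_t orig[6];
		uint8_t prev_sender[6];
	};

	struct orig_info {
		uint32_t last_real_seqno;
		uint8_t last_ttl;
	};

	static bool can_code_with(const struct orig_info *o, const struct ogm *p,
				  uint8_t min_tq)
	{
		if (o->last_real_seqno != p->seqno)		/* condition 1 */
			return false;
		if (o->last_ttl != p->ttl + 1)			/* condition 2 */
			return false;
		if (memcmp(p->orig, p->prev_sender, 6) != 0)	/* condition 3 */
			return false;
		return p->tq >= min_tq;				/* condition 4 */
	}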
+ */ +static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_ogm_packet *ogm_packet) +{ + if (orig_node->last_real_seqno != ntohl(ogm_packet->seqno)) + return false; + if (orig_node->last_ttl != ogm_packet->header.ttl + 1) + return false; + if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender)) + return false; + if (ogm_packet->tq < bat_priv->nc.min_tq) + return false; + + return true; +} + +/** + * batadv_nc_find_nc_node - search for an existing nc node and return it + * @orig_node: orig node originating the ogm packet + * @orig_neigh_node: neighboring orig node from which we received the ogm packet + * (can be equal to orig_node) + * @in_coding: traverse incoming or outgoing network coding list + * + * Returns the nc_node if found, NULL otherwise. + */ +static struct batadv_nc_node +*batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + bool in_coding) +{ + struct batadv_nc_node *nc_node, *nc_node_out = NULL; + struct list_head *list; + + if (in_coding) + list = &orig_neigh_node->in_coding_list; + else + list = &orig_neigh_node->out_coding_list; + + /* Traverse list of nc_nodes to orig_node */ + rcu_read_lock(); + list_for_each_entry_rcu(nc_node, list, list) { + if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) + continue; + + if (!atomic_inc_not_zero(&nc_node->refcount)) + continue; + + /* Found a match */ + nc_node_out = nc_node; + break; + } + rcu_read_unlock(); + + return nc_node_out; +} + +/** + * batadv_nc_get_nc_node - retrieves an nc node or creates the entry if it was + * not found + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node originating the ogm packet + * @orig_neigh_node: neighboring orig node from which we received the ogm packet + * (can be equal to orig_node) + * @in_coding: traverse incoming or outgoing network coding list + * + * Returns the nc_node if found or created, NULL in case of an error. 
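Both the lookup in batadv_nc_find_nc_node() above and the creation path below follow the usual refcount discipline: under rcu_read_lock(), a reference may only be taken via atomic_inc_not_zero(), i.e. only while nobody is concurrently tearing the object down. The same pattern in portable C11 (a sketch, not the kernel implementation):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct obj {
		atomic_int refcount;	/* zero means: being destroyed */
	};

	/* Take a reference only while the object is still alive; mirrors
	 * atomic_inc_not_zero() as used in the RCU lookup above.
	 */
	static bool obj_get(struct obj *o)
	{
		int old = atomic_load(&o->refcount);

		while (old != 0) {
			if (atomic_compare_exchange_weak(&o->refcount, &old,
							 old + 1))
				return true;
			/* old was reloaded by the failed exchange; retry */
		}
		return false;
	}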
+ */ +static struct batadv_nc_node +*batadv_nc_get_nc_node(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + bool in_coding) +{ + struct batadv_nc_node *nc_node; + spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ + struct list_head *list; + + /* Check if nc_node is already added */ + nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); + + /* Node found */ + if (nc_node) + return nc_node; + + nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); + if (!nc_node) + return NULL; + + if (!atomic_inc_not_zero(&orig_neigh_node->refcount)) + goto free; + + /* Initialize nc_node */ + INIT_LIST_HEAD(&nc_node->list); + memcpy(nc_node->addr, orig_node->orig, ETH_ALEN); + nc_node->orig_node = orig_neigh_node; + atomic_set(&nc_node->refcount, 2); + + /* Select ingoing or outgoing coding node */ + if (in_coding) { + lock = &orig_neigh_node->in_coding_list_lock; + list = &orig_neigh_node->in_coding_list; + } else { + lock = &orig_neigh_node->out_coding_list_lock; + list = &orig_neigh_node->out_coding_list; + } + + batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", + nc_node->addr, nc_node->orig_node->orig); + + /* Add nc_node to orig_node */ + spin_lock_bh(lock); + list_add_tail_rcu(&nc_node->list, list); + spin_unlock_bh(lock); + + return nc_node; + +free: + kfree(nc_node); + return NULL; +} + +/** + * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node structs + * (best called on incoming OGMs) + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node originating the ogm packet + * @orig_neigh_node: neighboring orig node from which we received the ogm packet + * (can be equal to orig_node) + * @ogm_packet: incoming ogm packet + * @is_single_hop_neigh: orig_node is a single hop neighbor + */ +void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + struct batadv_ogm_packet *ogm_packet, + int is_single_hop_neigh) +{ + struct batadv_nc_node *in_nc_node = NULL, *out_nc_node = NULL; + + /* Check if network coding is enabled */ + if (!atomic_read(&bat_priv->network_coding)) + goto out; + + /* accept ogms from 'good' neighbors and single hop neighbors */ + if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) && + !is_single_hop_neigh) + goto out; + + /* Add orig_node as in_nc_node on hop */ + in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node, + orig_neigh_node, true); + if (!in_nc_node) + goto out; + + in_nc_node->last_seen = jiffies; + + /* Add hop as out_nc_node on orig_node */ + out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node, + orig_node, false); + if (!out_nc_node) + goto out; + + out_nc_node->last_seen = jiffies; + +out: + if (in_nc_node) + batadv_nc_node_free_ref(in_nc_node); + if (out_nc_node) + batadv_nc_node_free_ref(out_nc_node); +} + +/** + * batadv_nc_get_path - get existing nc_path or allocate a new one + * @bat_priv: the bat priv with all the soft interface information + * @hash: hash table containing the nc path + * @src: ethernet source address - first half of the nc path search key + * @dst: ethernet destination address - second half of the nc path search key + * + * Returns pointer to nc_path if the path was found or created, returns NULL + * on error. 
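batadv_nc_update_nc_node() above records every accepted OGM twice: the sender orig_node lands in the neighbor's ingoing list, and the neighbor lands in the sender's outgoing list, each entry stamped with last_seen. A compact model of that symmetric bookkeeping, using fixed arrays instead of the kernel's RCU lists (all names and types here are illustrative only):

	#include <stdint.h>
	#include <string.h>
	#include <time.h>

	#define MAX_CODING_NEIGHBORS 8

	struct nc_peer {
		uint8_t addr[6];
		time_t last_seen;
	};

	struct node {
		uint8_t addr[6];
		struct nc_peer in[MAX_CODING_NEIGHBORS];  /* who codes towards us */
		struct nc_peer out[MAX_CODING_NEIGHBORS]; /* whom we code towards */
		int n_in, n_out;
	};

	static void record(struct nc_peer *list, int *n, const uint8_t *addr)
	{
		int i;

		for (i = 0; i < *n; i++)
			if (memcmp(list[i].addr, addr, 6) == 0)
				break;
		if (i == *n) {
			if (*n == MAX_CODING_NEIGHBORS)
				return;	/* table full; the kernel lists grow instead */
			memcpy(list[i].addr, addr, 6);
			(*n)++;
		}
		list[i].last_seen = time(NULL);	/* refresh, as done for nc_node */
	}

	/* OGM from `orig`, heard via direct neighbor `hop` */
	static void update_nc_node(struct node *orig, struct node *hop)
	{
		record(hop->in, &hop->n_in, orig->addr);
		record(orig->out, &orig->n_out, hop->addr);
	}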
+ */
+static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
+						 struct batadv_hashtable *hash,
+						 uint8_t *src,
+						 uint8_t *dst)
+{
+	int hash_added;
+	struct batadv_nc_path *nc_path, nc_path_key;
+
+	batadv_nc_hash_key_gen(&nc_path_key, src, dst);
+
+	/* Search for existing nc_path */
+	nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key);
+
+	if (nc_path) {
+		/* Set timestamp to delay removal of nc_path */
+		nc_path->last_valid = jiffies;
+		return nc_path;
+	}
+
+	/* No existing nc_path was found; create a new one */
+	nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC);
+
+	if (!nc_path)
+		return NULL;
+
+	/* Initialize nc_path */
+	INIT_LIST_HEAD(&nc_path->packet_list);
+	spin_lock_init(&nc_path->packet_list_lock);
+	atomic_set(&nc_path->refcount, 2);
+	nc_path->last_valid = jiffies;
+	memcpy(nc_path->next_hop, dst, ETH_ALEN);
+	memcpy(nc_path->prev_hop, src, ETH_ALEN);
+
+	batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n",
+		   nc_path->prev_hop,
+		   nc_path->next_hop);
+
+	/* Add nc_path to hash table */
+	hash_added = batadv_hash_add(hash, batadv_nc_hash_compare,
+				     batadv_nc_hash_choose, &nc_path_key,
+				     &nc_path->hash_entry);
+
+	if (hash_added < 0) {
+		kfree(nc_path);
+		return NULL;
+	}
+
+	return nc_path;
+}
+
+/**
+ * batadv_nc_random_weight_tq - scale the receiver's TQ-value to avoid unfair
+ *  selection of a receiver with slightly lower TQ than the other
+ * @tq: to be weighted tq value
+ */
+static uint8_t batadv_nc_random_weight_tq(uint8_t tq)
+{
+	uint8_t rand_val;
+	uint32_t rand_tq;
+
+	get_random_bytes(&rand_val, sizeof(rand_val));
+
+	/* randomize the estimated packet loss (max TQ - estimated TQ);
+	 * the product needs more than 8 bits, so it must not be stored
+	 * in a uint8_t or it is truncated
+	 */
+	rand_tq = rand_val * (BATADV_TQ_MAX_VALUE - tq);
+
+	/* normalize the randomized packet loss */
+	rand_tq /= BATADV_TQ_MAX_VALUE;
+
+	/* convert to (randomized) estimated tq again */
+	return BATADV_TQ_MAX_VALUE - rand_tq;
+}
+
+/**
+ * batadv_nc_memxor - XOR destination with source
+ * @dst: byte array to XOR into
+ * @src: byte array to XOR from
+ * @len: length of destination array
+ */
+static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
+{
+	unsigned int i;
+
+	for (i = 0; i < len; ++i)
+		dst[i] ^= src[i];
+}
+
+/**
+ * batadv_nc_code_packets - code a received unicast_packet with an nc packet
+ *  into a coded_packet and send it
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to forward
+ * @ethhdr: pointer to the ethernet header inside the skb
+ * @nc_packet: structure containing the packet the skb can be coded with
+ * @neigh_node: next hop to forward packet to
+ *
+ * Returns true if both packets are consumed, false otherwise.
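The weighting keeps the result in the interval [tq, 255]: the random byte scales only the loss part (255 - tq), so a receiver with a higher TQ still tends to win, just not deterministically. On average the weighted TQ comes out near (tq + 255) / 2. A quick user-space check of that expectation (rand() stands in for get_random_bytes(); the wide intermediate type matters, as noted above):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define TQ_MAX_VALUE 255

	static uint8_t random_weight_tq(uint8_t tq)
	{
		uint8_t rand_val = rand() & 0xff;
		uint32_t rand_tq = rand_val * (TQ_MAX_VALUE - tq); /* > 8 bits */

		rand_tq /= TQ_MAX_VALUE;
		return TQ_MAX_VALUE - rand_tq;
	}

	int main(void)
	{
		unsigned long sum = 0;
		int i, n = 1000000;

		for (i = 0; i < n; i++)
			sum += random_weight_tq(200);

		/* expected average: (200 + 255) / 2 = 227.5 */
		printf("average weighted tq: %.1f\n", (double)sum / n);
		return 0;
	}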
+ */ +static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, + struct sk_buff *skb, + struct ethhdr *ethhdr, + struct batadv_nc_packet *nc_packet, + struct batadv_neigh_node *neigh_node) +{ + uint8_t tq_weighted_neigh, tq_weighted_coding; + struct sk_buff *skb_dest, *skb_src; + struct batadv_unicast_packet *packet1; + struct batadv_unicast_packet *packet2; + struct batadv_coded_packet *coded_packet; + struct batadv_neigh_node *neigh_tmp, *router_neigh; + struct batadv_neigh_node *router_coding = NULL; + uint8_t *first_source, *first_dest, *second_source, *second_dest; + __be32 packet_id1, packet_id2; + size_t count; + bool res = false; + int coding_len; + int unicast_size = sizeof(*packet1); + int coded_size = sizeof(*coded_packet); + int header_add = coded_size - unicast_size; + + router_neigh = batadv_orig_node_get_router(neigh_node->orig_node); + if (!router_neigh) + goto out; + + neigh_tmp = nc_packet->neigh_node; + router_coding = batadv_orig_node_get_router(neigh_tmp->orig_node); + if (!router_coding) + goto out; + + tq_weighted_neigh = batadv_nc_random_weight_tq(router_neigh->tq_avg); + tq_weighted_coding = batadv_nc_random_weight_tq(router_coding->tq_avg); + + /* Select one destination for the MAC-header dst-field based on + * weighted TQ-values. + */ + if (tq_weighted_neigh >= tq_weighted_coding) { + /* Destination from nc_packet is selected for MAC-header */ + first_dest = nc_packet->nc_path->next_hop; + first_source = nc_packet->nc_path->prev_hop; + second_dest = neigh_node->addr; + second_source = ethhdr->h_source; + packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; + packet2 = (struct batadv_unicast_packet *)skb->data; + packet_id1 = nc_packet->packet_id; + packet_id2 = batadv_skb_crc32(skb, + skb->data + sizeof(*packet2)); + } else { + /* Destination for skb is selected for MAC-header */ + first_dest = neigh_node->addr; + first_source = ethhdr->h_source; + second_dest = nc_packet->nc_path->next_hop; + second_source = nc_packet->nc_path->prev_hop; + packet1 = (struct batadv_unicast_packet *)skb->data; + packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; + packet_id1 = batadv_skb_crc32(skb, + skb->data + sizeof(*packet1)); + packet_id2 = nc_packet->packet_id; + } + + /* Instead of zero padding the smallest data buffer, we + * code into the largest. 
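Coding into the larger of the two buffers (chosen right below) works because XOR against implicit zero padding is the identity: the tail of the longer packet passes through unmodified, and applying the same XOR again recovers the original data. A round-trip check with a user-space copy of the memxor helper:

	#include <assert.h>
	#include <stdio.h>
	#include <string.h>

	static void memxor(char *dst, const char *src, unsigned int len)
	{
		unsigned int i;

		for (i = 0; i < len; i++)
			dst[i] ^= src[i];
	}

	int main(void)
	{
		char longer[8] = "ABCDEFG";	/* 7 bytes + NUL */
		char shorter[4] = "xyz";	/* 3 bytes + NUL */
		char coded[8];

		memcpy(coded, longer, sizeof(coded));
		memxor(coded, shorter, sizeof(shorter));	/* code */

		memxor(coded, shorter, sizeof(shorter));	/* decode again */
		assert(memcmp(coded, longer, sizeof(longer)) == 0);
		puts("decoded OK");
		return 0;
	}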
+	 */
+	if (skb->len <= nc_packet->skb->len) {
+		skb_dest = nc_packet->skb;
+		skb_src = skb;
+	} else {
+		skb_dest = skb;
+		skb_src = nc_packet->skb;
+	}
+
+	/* coding_len is used when decoding the shorter packet */
+	coding_len = skb_src->len - unicast_size;
+
+	if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0)
+		goto out;
+
+	skb_push(skb_dest, header_add);
+
+	coded_packet = (struct batadv_coded_packet *)skb_dest->data;
+	skb_reset_mac_header(skb_dest);
+
+	coded_packet->header.packet_type = BATADV_CODED;
+	coded_packet->header.version = BATADV_COMPAT_VERSION;
+	coded_packet->header.ttl = packet1->header.ttl;
+
+	/* Info about first unicast packet */
+	memcpy(coded_packet->first_source, first_source, ETH_ALEN);
+	memcpy(coded_packet->first_orig_dest, packet1->dest, ETH_ALEN);
+	coded_packet->first_crc = packet_id1;
+	coded_packet->first_ttvn = packet1->ttvn;
+
+	/* Info about second unicast packet */
+	memcpy(coded_packet->second_dest, second_dest, ETH_ALEN);
+	memcpy(coded_packet->second_source, second_source, ETH_ALEN);
+	memcpy(coded_packet->second_orig_dest, packet2->dest, ETH_ALEN);
+	coded_packet->second_crc = packet_id2;
+	coded_packet->second_ttl = packet2->header.ttl;
+	coded_packet->second_ttvn = packet2->ttvn;
+	coded_packet->coded_len = htons(coding_len);
+
+	/* This is where the magic happens: Code skb_src into skb_dest */
+	batadv_nc_memxor(skb_dest->data + coded_size,
+			 skb_src->data + unicast_size, coding_len);
+
+	/* Update counters accordingly */
+	if (BATADV_SKB_CB(skb_src)->decoded &&
+	    BATADV_SKB_CB(skb_dest)->decoded) {
+		/* Both packets are recoded */
+		count = skb_src->len + ETH_HLEN;
+		count += skb_dest->len + ETH_HLEN;
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2);
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count);
+	} else if (!BATADV_SKB_CB(skb_src)->decoded &&
+		   !BATADV_SKB_CB(skb_dest)->decoded) {
+		/* Both packets are newly coded */
+		count = skb_src->len + ETH_HLEN;
+		count += skb_dest->len + ETH_HLEN;
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2);
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count);
+	} else if (BATADV_SKB_CB(skb_src)->decoded &&
+		   !BATADV_SKB_CB(skb_dest)->decoded) {
+		/* skb_src recoded and skb_dest is newly coded */
+		batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
+				   skb_src->len + ETH_HLEN);
+		batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
+				   skb_dest->len + ETH_HLEN);
+	} else if (!BATADV_SKB_CB(skb_src)->decoded &&
+		   BATADV_SKB_CB(skb_dest)->decoded) {
+		/* skb_src is newly coded and skb_dest is recoded */
+		batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
+				   skb_src->len + ETH_HLEN);
+		batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
+		batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
+				   skb_dest->len + ETH_HLEN);
+	}
+
+	/* skb_src is now coded into skb_dest, so free it */
+	kfree_skb(skb_src);
+
+	/* avoid duplicate free of skb from nc_packet */
+	nc_packet->skb = NULL;
+	batadv_nc_packet_free(nc_packet);
+
+	/* Send the coded packet and return true */
+	batadv_send_skb_packet(skb_dest, neigh_node->if_incoming, first_dest);
+	res = true;
+out:
+	if (router_neigh)
+		batadv_neigh_node_free_ref(router_neigh);
+	if (router_coding)
+		batadv_neigh_node_free_ref(router_coding);
+	return res;
+}
+
+/**
+ * batadv_nc_skb_coding_possible - true if a decoded skb is available at dst.
+ * @skb: data skb to forward
+ * @dst: destination mac address of the other skb to code with
+ * @src: source mac address of skb
+ *
+ * Whenever we network code a packet we have to check whether we received it in
+ * a network coded form. If so, we may not be able to use it for coding because
+ * some neighbors may also have received (overheard) the packet in the network
+ * coded form without being able to decode it. It is hard to know which of the
+ * neighboring nodes was able to decode the packet, therefore we can only
+ * re-code the packet if the source of the previous encoded packet is involved.
+ * Since the source encoded the packet we can be certain it has all necessary
+ * decode information.
+ *
+ * Returns true if coding of a decoded packet is allowed.
+ */
+static bool batadv_nc_skb_coding_possible(struct sk_buff *skb,
+					  uint8_t *dst, uint8_t *src)
+{
+	if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src))
+		return false;
+	else
+		return true;
+}
+
+/**
+ * batadv_nc_path_search - Find the coding path matching in_nc_node and
+ *  out_nc_node to retrieve a buffered packet that can be used for coding.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @in_nc_node: pointer to skb next hop's neighbor nc node
+ * @out_nc_node: pointer to skb source's neighbor nc node
+ * @skb: data skb to forward
+ * @eth_dst: next hop mac address of skb
+ *
+ * Returns a buffered nc packet that can be coded with the skb, NULL otherwise.
+ */
+static struct batadv_nc_packet *
+batadv_nc_path_search(struct batadv_priv *bat_priv,
+		      struct batadv_nc_node *in_nc_node,
+		      struct batadv_nc_node *out_nc_node,
+		      struct sk_buff *skb,
+		      uint8_t *eth_dst)
+{
+	struct batadv_nc_path *nc_path, nc_path_key;
+	struct batadv_nc_packet *nc_packet_out = NULL;
+	struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
+	struct batadv_hashtable *hash = bat_priv->nc.coding_hash;
+	int idx;
+
+	if (!hash)
+		return NULL;
+
+	/* Create the search key out of the ethernet addresses */
+	batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr,
+			       out_nc_node->addr);
+	idx = batadv_nc_hash_choose(&nc_path_key, hash->size);
+
+	/* Check for coding opportunities in this nc_path */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) {
+		if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr))
+			continue;
+
+		if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr))
+			continue;
+
+		spin_lock_bh(&nc_path->packet_list_lock);
+		if (list_empty(&nc_path->packet_list)) {
+			spin_unlock_bh(&nc_path->packet_list_lock);
+			continue;
+		}
+
+		list_for_each_entry_safe(nc_packet, nc_packet_tmp,
+					 &nc_path->packet_list, list) {
+			if (!batadv_nc_skb_coding_possible(nc_packet->skb,
+							   eth_dst,
+							   in_nc_node->addr))
+				continue;
+
+			/* Coding opportunity is found! */
+			list_del(&nc_packet->list);
+			nc_packet_out = nc_packet;
+			break;
+		}
+
+		spin_unlock_bh(&nc_path->packet_list_lock);
+		break;
+	}
+	rcu_read_unlock();
+
+	return nc_packet_out;
+}
+
+/**
+ * batadv_nc_skb_src_search - Loops through the list of neighboring nodes of
+ *  the skb's sender (may be equal to the originator).
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to forward
+ * @eth_dst: next hop mac address of skb
+ * @eth_src: source mac address of skb
+ * @in_nc_node: pointer to skb next hop's neighbor nc node
+ *
+ * Returns an nc packet if a suitable coding packet was found, NULL otherwise.
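The re-coding rule explained above reduces to a two-clause predicate: a packet that had to be decoded on this node may only take part in coding again if the other packet's destination is exactly the node that produced the earlier coded packet. In isolation:

	#include <stdbool.h>
	#include <stdint.h>
	#include <string.h>

	/* decoded: packet arrived in coded form and was decoded locally
	 * dst:     destination of the other packet we want to code with
	 * src:     source of this packet (the earlier encoder)
	 */
	static bool coding_possible(bool decoded, const uint8_t *dst,
				    const uint8_t *src)
	{
		return !decoded || memcmp(dst, src, 6) == 0;
	}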
+ */ +static struct batadv_nc_packet * +batadv_nc_skb_src_search(struct batadv_priv *bat_priv, + struct sk_buff *skb, + uint8_t *eth_dst, + uint8_t *eth_src, + struct batadv_nc_node *in_nc_node) +{ + struct batadv_orig_node *orig_node; + struct batadv_nc_node *out_nc_node; + struct batadv_nc_packet *nc_packet = NULL; + + orig_node = batadv_orig_hash_find(bat_priv, eth_src); + if (!orig_node) + return NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(out_nc_node, + &orig_node->out_coding_list, list) { + /* Check if the skb is decoded and if recoding is possible */ + if (!batadv_nc_skb_coding_possible(skb, + out_nc_node->addr, eth_src)) + continue; + + /* Search for an opportunity in this nc_path */ + nc_packet = batadv_nc_path_search(bat_priv, in_nc_node, + out_nc_node, skb, eth_dst); + if (nc_packet) + break; + } + rcu_read_unlock(); + + batadv_orig_node_free_ref(orig_node); + return nc_packet; +} + +/** + * batadv_nc_skb_store_before_coding - set the ethernet src and dst of the + * unicast skb before it is stored for use in later decoding + * @bat_priv: the bat priv with all the soft interface information + * @skb: data skb to store + * @eth_dst_new: new destination mac address of skb + */ +static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, + struct sk_buff *skb, + uint8_t *eth_dst_new) +{ + struct ethhdr *ethhdr; + + /* Copy skb header to change the mac header */ + skb = pskb_copy(skb, GFP_ATOMIC); + if (!skb) + return; + + /* Set the mac header as if we actually sent the packet uncoded */ + ethhdr = (struct ethhdr *)skb_mac_header(skb); + memcpy(ethhdr->h_source, ethhdr->h_dest, ETH_ALEN); + memcpy(ethhdr->h_dest, eth_dst_new, ETH_ALEN); + + /* Set data pointer to MAC header to mimic packets from our tx path */ + skb_push(skb, ETH_HLEN); + + /* Add the packet to the decoding packet pool */ + batadv_nc_skb_store_for_decoding(bat_priv, skb); + + /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free + * our ref + */ + kfree_skb(skb); +} + +/** + * batadv_nc_skb_dst_search - Loops through list of neighboring nodes to dst. + * @skb: data skb to forward + * @neigh_node: next hop to forward packet to + * @ethhdr: pointer to the ethernet header inside the skb + * + * Loops through list of neighboring nodes the next hop has a good connection to + * (receives OGMs with a sufficient quality). We need to find a neighbor of our + * next hop that potentially sent a packet which our next hop also received + * (overheard) and has stored for later decoding. 
+ * + * Returns true if the skb was consumed (encoded packet sent) or false otherwise + */ +static bool batadv_nc_skb_dst_search(struct sk_buff *skb, + struct batadv_neigh_node *neigh_node, + struct ethhdr *ethhdr) +{ + struct net_device *netdev = neigh_node->if_incoming->soft_iface; + struct batadv_priv *bat_priv = netdev_priv(netdev); + struct batadv_orig_node *orig_node = neigh_node->orig_node; + struct batadv_nc_node *nc_node; + struct batadv_nc_packet *nc_packet = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) { + /* Search for coding opportunity with this in_nc_node */ + nc_packet = batadv_nc_skb_src_search(bat_priv, skb, + neigh_node->addr, + ethhdr->h_source, nc_node); + + /* Opportunity was found, so stop searching */ + if (nc_packet) + break; + } + rcu_read_unlock(); + + if (!nc_packet) + return false; + + /* Save packets for later decoding */ + batadv_nc_skb_store_before_coding(bat_priv, skb, + neigh_node->addr); + batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb, + nc_packet->neigh_node->addr); + + /* Code and send packets */ + if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet, + neigh_node)) + return true; + + /* out of mem ? Coding failed - we have to free the buffered packet + * to avoid memleaks. The skb passed as argument will be dealt with + * by the calling function. + */ + batadv_nc_send_packet(nc_packet); + return false; +} + +/** + * batadv_nc_skb_add_to_path - buffer skb for later encoding / decoding + * @skb: skb to add to path + * @nc_path: path to add skb to + * @neigh_node: next hop to forward packet to + * @packet_id: checksum to identify packet + * + * Returns true if the packet was buffered or false in case of an error. + */ +static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, + struct batadv_nc_path *nc_path, + struct batadv_neigh_node *neigh_node, + __be32 packet_id) +{ + struct batadv_nc_packet *nc_packet; + + nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC); + if (!nc_packet) + return false; + + /* Initialize nc_packet */ + nc_packet->timestamp = jiffies; + nc_packet->packet_id = packet_id; + nc_packet->skb = skb; + nc_packet->neigh_node = neigh_node; + nc_packet->nc_path = nc_path; + + /* Add coding packet to list */ + spin_lock_bh(&nc_path->packet_list_lock); + list_add_tail(&nc_packet->list, &nc_path->packet_list); + spin_unlock_bh(&nc_path->packet_list_lock); + + return true; +} + +/** + * batadv_nc_skb_forward - try to code a packet or add it to the coding packet + * buffer + * @skb: data skb to forward + * @neigh_node: next hop to forward packet to + * @ethhdr: pointer to the ethernet header inside the skb + * + * Returns true if the skb was consumed (encoded packet sent) or false otherwise + */ +bool batadv_nc_skb_forward(struct sk_buff *skb, + struct batadv_neigh_node *neigh_node, + struct ethhdr *ethhdr) +{ + const struct net_device *netdev = neigh_node->if_incoming->soft_iface; + struct batadv_priv *bat_priv = netdev_priv(netdev); + struct batadv_unicast_packet *packet; + struct batadv_nc_path *nc_path; + __be32 packet_id; + u8 *payload; + + /* Check if network coding is enabled */ + if (!atomic_read(&bat_priv->network_coding)) + goto out; + + /* We only handle unicast packets */ + payload = skb_network_header(skb); + packet = (struct batadv_unicast_packet *)payload; + if (packet->header.packet_type != BATADV_UNICAST) + goto out; + + /* Try to find a coding opportunity and send the skb if one is found */ + if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr)) + return 
true; + + /* Find or create a nc_path for this src-dst pair */ + nc_path = batadv_nc_get_path(bat_priv, + bat_priv->nc.coding_hash, + ethhdr->h_source, + neigh_node->addr); + + if (!nc_path) + goto out; + + /* Add skb to nc_path */ + packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); + if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id)) + goto free_nc_path; + + /* Packet is consumed */ + return true; + +free_nc_path: + batadv_nc_path_free_ref(nc_path); +out: + /* Packet is not consumed */ + return false; +} + +/** + * batadv_nc_skb_store_for_decoding - save a clone of the skb which can be used + * when decoding coded packets + * @bat_priv: the bat priv with all the soft interface information + * @skb: data skb to store + */ +void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + struct batadv_unicast_packet *packet; + struct batadv_nc_path *nc_path; + struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + __be32 packet_id; + u8 *payload; + + /* Check if network coding is enabled */ + if (!atomic_read(&bat_priv->network_coding)) + goto out; + + /* Check for supported packet type */ + payload = skb_network_header(skb); + packet = (struct batadv_unicast_packet *)payload; + if (packet->header.packet_type != BATADV_UNICAST) + goto out; + + /* Find existing nc_path or create a new */ + nc_path = batadv_nc_get_path(bat_priv, + bat_priv->nc.decoding_hash, + ethhdr->h_source, + ethhdr->h_dest); + + if (!nc_path) + goto out; + + /* Clone skb and adjust skb->data to point at batman header */ + skb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto free_nc_path; + + if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) + goto free_skb; + + if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN))) + goto free_skb; + + /* Add skb to nc_path */ + packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); + if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id)) + goto free_skb; + + batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER); + return; + +free_skb: + kfree_skb(skb); +free_nc_path: + batadv_nc_path_free_ref(nc_path); +out: + return; +} + +/** + * batadv_nc_skb_store_sniffed_unicast - check if a received unicast packet + * should be saved in the decoding buffer and, if so, store it there + * @bat_priv: the bat priv with all the soft interface information + * @skb: unicast skb to store + */ +void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + + if (batadv_is_my_mac(bat_priv, ethhdr->h_dest)) + return; + + /* Set data pointer to MAC header to mimic packets from our tx path */ + skb_push(skb, ETH_HLEN); + + batadv_nc_skb_store_for_decoding(bat_priv, skb); +} + +/** + * batadv_nc_skb_decode_packet - decode given skb using the decode data stored + * in nc_packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: unicast skb to decode + * @nc_packet: decode data needed to decode the skb + * + * Returns pointer to decoded unicast packet if the packet was decoded or NULL + * in case of an error. 
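The packet id used here comes from batadv_skb_crc32(), which (defined elsewhere in this patchset) computes a CRC32C over the skb contents behind the batman header. Assuming that, the same id can be reproduced in user space with a plain bitwise CRC32C (Castagnoli polynomial, reflected form):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static uint32_t crc32c(uint32_t crc, const uint8_t *buf, size_t len)
	{
		int k;

		crc = ~crc;
		while (len--) {
			crc ^= *buf++;
			for (k = 0; k < 8; k++)
				crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
		}
		return ~crc;
	}

	int main(void)
	{
		const char *payload = "example payload";
		uint32_t packet_id = crc32c(0, (const uint8_t *)payload,
					    strlen(payload));

		printf("packet id: 0x%08x\n", packet_id);
		return 0;
	}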
+ */
+static struct batadv_unicast_packet *
+batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
+			    struct batadv_nc_packet *nc_packet)
+{
+	const int h_size = sizeof(struct batadv_unicast_packet);
+	const int h_diff = sizeof(struct batadv_coded_packet) - h_size;
+	struct batadv_unicast_packet *unicast_packet;
+	struct batadv_coded_packet coded_packet_tmp;
+	struct ethhdr *ethhdr, ethhdr_tmp;
+	uint8_t *orig_dest, ttl, ttvn;
+	unsigned int coding_len;
+
+	/* Save headers temporarily */
+	memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp));
+	memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp));
+
+	if (skb_cow(skb, 0) < 0)
+		return NULL;
+
+	if (unlikely(!skb_pull_rcsum(skb, h_diff)))
+		return NULL;
+
+	/* Data points to batman header, so set mac header 14 bytes before
+	 * and network to data
+	 */
+	skb_set_mac_header(skb, -ETH_HLEN);
+	skb_reset_network_header(skb);
+
+	/* Reconstruct original mac header */
+	ethhdr = (struct ethhdr *)skb_mac_header(skb);
+	memcpy(ethhdr, &ethhdr_tmp, sizeof(*ethhdr));
+
+	/* Select the correct unicast header information based on the location
+	 * of our mac address in the coded_packet header
+	 */
+	if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) {
+		/* If we are the second destination the packet was overheard,
+		 * so the Ethernet address must be copied to h_dest and
+		 * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST
+		 */
+		memcpy(ethhdr->h_dest, coded_packet_tmp.second_dest, ETH_ALEN);
+		skb->pkt_type = PACKET_HOST;
+
+		orig_dest = coded_packet_tmp.second_orig_dest;
+		ttl = coded_packet_tmp.second_ttl;
+		ttvn = coded_packet_tmp.second_ttvn;
+	} else {
+		orig_dest = coded_packet_tmp.first_orig_dest;
+		ttl = coded_packet_tmp.header.ttl;
+		ttvn = coded_packet_tmp.first_ttvn;
+	}
+
+	coding_len = ntohs(coded_packet_tmp.coded_len);
+
+	if (coding_len > skb->len)
+		return NULL;
+
+	/* Here the magic is reversed:
+	 * extract the missing packet from the received coded packet
+	 */
+	batadv_nc_memxor(skb->data + h_size,
+			 nc_packet->skb->data + h_size,
+			 coding_len);
+
+	/* Resize decoded skb if decoded with a larger packet */
+	if (nc_packet->skb->len > coding_len + h_size)
+		pskb_trim_rcsum(skb, coding_len + h_size);
+
+	/* Create decoded unicast packet */
+	unicast_packet = (struct batadv_unicast_packet *)skb->data;
+	unicast_packet->header.packet_type = BATADV_UNICAST;
+	unicast_packet->header.version = BATADV_COMPAT_VERSION;
+	unicast_packet->header.ttl = ttl;
+	memcpy(unicast_packet->dest, orig_dest, ETH_ALEN);
+	unicast_packet->ttvn = ttvn;
+
+	batadv_nc_packet_free(nc_packet);
+	return unicast_packet;
+}
+
+/**
+ * batadv_nc_find_decoding_packet - search through buffered decoding data to
+ *  find the data needed to decode the coded packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: pointer to the ethernet header inside the coded packet
+ * @coded: coded packet we try to find decode data for
+ *
+ * Returns pointer to nc packet if the needed data was found or NULL otherwise.
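The lookup below first selects which of the two coded halves this node must reconstruct (depending on whether it is the second destination), then searches the matching (source, destination) path for a buffered packet carrying the right CRC id. The queue scan itself is a plain id match with an unlink, sketched here outside the kernel list API:

	#include <stddef.h>
	#include <stdint.h>

	struct buffered_pkt {
		uint32_t packet_id;
		struct buffered_pkt *next;
	};

	/* Unlink and return the first buffered packet matching `packet_id`,
	 * or NULL when no decode data is available.
	 */
	static struct buffered_pkt *
	find_decoding_packet(struct buffered_pkt **head, uint32_t packet_id)
	{
		struct buffered_pkt **pp, *p;

		for (pp = head; (p = *pp) != NULL; pp = &p->next) {
			if (p->packet_id == packet_id) {
				*pp = p->next;	/* unlink; caller consumes it */
				return p;
			}
		}
		return NULL;
	}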
+ */ +static struct batadv_nc_packet * +batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, + struct ethhdr *ethhdr, + struct batadv_coded_packet *coded) +{ + struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; + struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; + struct batadv_nc_path *nc_path, nc_path_key; + uint8_t *dest, *source; + __be32 packet_id; + int index; + + if (!hash) + return NULL; + + /* Select the correct packet id based on the location of our mac-addr */ + dest = ethhdr->h_source; + if (!batadv_is_my_mac(bat_priv, coded->second_dest)) { + source = coded->second_source; + packet_id = coded->second_crc; + } else { + source = coded->first_source; + packet_id = coded->first_crc; + } + + batadv_nc_hash_key_gen(&nc_path_key, source, dest); + index = batadv_nc_hash_choose(&nc_path_key, hash->size); + + /* Search for matching coding path */ + rcu_read_lock(); + hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) { + /* Find matching nc_packet */ + spin_lock_bh(&nc_path->packet_list_lock); + list_for_each_entry(tmp_nc_packet, + &nc_path->packet_list, list) { + if (packet_id == tmp_nc_packet->packet_id) { + list_del(&tmp_nc_packet->list); + + nc_packet = tmp_nc_packet; + break; + } + } + spin_unlock_bh(&nc_path->packet_list_lock); + + if (nc_packet) + break; + } + rcu_read_unlock(); + + if (!nc_packet) + batadv_dbg(BATADV_DBG_NC, bat_priv, + "No decoding packet found for %u\n", packet_id); + + return nc_packet; +} + +/** + * batadv_nc_recv_coded_packet - try to decode coded packet and enqueue the + * resulting unicast packet + * @skb: incoming coded packet + * @recv_if: pointer to interface this packet was received on + */ +static int batadv_nc_recv_coded_packet(struct sk_buff *skb, + struct batadv_hard_iface *recv_if) +{ + struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_unicast_packet *unicast_packet; + struct batadv_coded_packet *coded_packet; + struct batadv_nc_packet *nc_packet; + struct ethhdr *ethhdr; + int hdr_size = sizeof(*coded_packet); + + /* Check if network coding is enabled */ + if (!atomic_read(&bat_priv->network_coding)) + return NET_RX_DROP; + + /* Make sure we can access (and remove) header */ + if (unlikely(!pskb_may_pull(skb, hdr_size))) + return NET_RX_DROP; + + coded_packet = (struct batadv_coded_packet *)skb->data; + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + /* Verify frame is destined for us */ + if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && + !batadv_is_my_mac(bat_priv, coded_packet->second_dest)) + return NET_RX_DROP; + + /* Update stat counter */ + if (batadv_is_my_mac(bat_priv, coded_packet->second_dest)) + batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED); + + nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr, + coded_packet); + if (!nc_packet) { + batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); + return NET_RX_DROP; + } + + /* Make skb's linear, because decoding accesses the entire buffer */ + if (skb_linearize(skb) < 0) + goto free_nc_packet; + + if (skb_linearize(nc_packet->skb) < 0) + goto free_nc_packet; + + /* Decode the packet */ + unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet); + if (!unicast_packet) { + batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); + goto free_nc_packet; + } + + /* Mark packet as decoded to do correct recoding when forwarding */ + BATADV_SKB_CB(skb)->decoded = true; + batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE); + batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES, + 
skb->len + ETH_HLEN); + return batadv_recv_unicast_packet(skb, recv_if); + +free_nc_packet: + batadv_nc_packet_free(nc_packet); + return NET_RX_DROP; +} + +/** + * batadv_nc_free - clean up network coding memory + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_nc_free(struct batadv_priv *bat_priv) +{ + batadv_recv_handler_unregister(BATADV_CODED); + cancel_delayed_work_sync(&bat_priv->nc.work); + + batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL); + batadv_hash_destroy(bat_priv->nc.coding_hash); + batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL); + batadv_hash_destroy(bat_priv->nc.decoding_hash); +} + +/** + * batadv_nc_nodes_seq_print_text - print the nc node information + * @seq: seq file to print on + * @offset: not used + */ +int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_priv *bat_priv = netdev_priv(net_dev); + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct batadv_hard_iface *primary_if; + struct hlist_head *head; + struct batadv_orig_node *orig_node; + struct batadv_nc_node *nc_node; + int i; + + primary_if = batadv_seq_print_text_primary_if_get(seq); + if (!primary_if) + goto out; + + /* Traverse list of originators */ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + /* For each orig_node in this bin */ + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + seq_printf(seq, "Node: %pM\n", orig_node->orig); + + seq_puts(seq, " Ingoing: "); + /* For each in_nc_node to this orig_node */ + list_for_each_entry_rcu(nc_node, + &orig_node->in_coding_list, + list) + seq_printf(seq, "%pM ", + nc_node->addr); + seq_puts(seq, "\n"); + + seq_puts(seq, " Outgoing: "); + /* For out_nc_node to this orig_node */ + list_for_each_entry_rcu(nc_node, + &orig_node->out_coding_list, + list) + seq_printf(seq, "%pM ", + nc_node->addr); + seq_puts(seq, "\n\n"); + } + rcu_read_unlock(); + } + +out: + if (primary_if) + batadv_hardif_free_ref(primary_if); + return 0; +} + +/** + * batadv_nc_init_debugfs - create nc folder and related files in debugfs + * @bat_priv: the bat priv with all the soft interface information + */ +int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) +{ + struct dentry *nc_dir, *file; + + nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir); + if (!nc_dir) + goto out; + + file = debugfs_create_u8("min_tq", S_IRUGO | S_IWUSR, nc_dir, + &bat_priv->nc.min_tq); + if (!file) + goto out; + + file = debugfs_create_u32("max_fwd_delay", S_IRUGO | S_IWUSR, nc_dir, + &bat_priv->nc.max_fwd_delay); + if (!file) + goto out; + + file = debugfs_create_u32("max_buffer_time", S_IRUGO | S_IWUSR, nc_dir, + &bat_priv->nc.max_buffer_time); + if (!file) + goto out; + + return 0; + +out: + return -ENOMEM; +} diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h new file mode 100644 index 000000000000..4fa6d0caddbd --- /dev/null +++ b/net/batman-adv/network-coding.h @@ -0,0 +1,123 @@ +/* Copyright (C) 2012-2013 B.A.T.M.A.N. contributors: + * + * Martin Hundebøll, Jeppe Ledet-Pedersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ +#define _NET_BATMAN_ADV_NETWORK_CODING_H_ + +#ifdef CONFIG_BATMAN_ADV_NC + +int batadv_nc_init(struct batadv_priv *bat_priv); +void batadv_nc_free(struct batadv_priv *bat_priv); +void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + struct batadv_ogm_packet *ogm_packet, + int is_single_hop_neigh); +void batadv_nc_purge_orig(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + bool (*to_purge)(struct batadv_priv *, + struct batadv_nc_node *)); +void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv); +void batadv_nc_init_orig(struct batadv_orig_node *orig_node); +bool batadv_nc_skb_forward(struct sk_buff *skb, + struct batadv_neigh_node *neigh_node, + struct ethhdr *ethhdr); +void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, + struct sk_buff *skb); +void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, + struct sk_buff *skb); +int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset); +int batadv_nc_init_debugfs(struct batadv_priv *bat_priv); + +#else /* ifdef CONFIG_BATMAN_ADV_NC */ + +static inline int batadv_nc_init(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_nc_free(struct batadv_priv *bat_priv) +{ + return; +} + +static inline void +batadv_nc_update_nc_node(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + struct batadv_ogm_packet *ogm_packet, + int is_single_hop_neigh) +{ + return; +} + +static inline void +batadv_nc_purge_orig(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + bool (*to_purge)(struct batadv_priv *, + struct batadv_nc_node *)) +{ + return; +} + +static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) +{ + return; +} + +static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node) +{ + return; +} + +static inline bool batadv_nc_skb_forward(struct sk_buff *skb, + struct batadv_neigh_node *neigh_node, + struct ethhdr *ethhdr) +{ + return false; +} + +static inline void +batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + return; +} + +static inline void +batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + return; +} + +static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq, + void *offset) +{ + return 0; +} + +static inline int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) +{ + return 0; +} + +#endif /* ifdef CONFIG_BATMAN_ADV_NC */ + +#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8c32cf1c2dec..2f3452546636 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2013 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -28,15 +28,12 @@ #include "unicast.h" #include "soft-interface.h" #include "bridge_loop_avoidance.h" +#include "network-coding.h" -static void batadv_purge_orig(struct work_struct *work); +/* hash class keys */ +static struct lock_class_key batadv_orig_hash_lock_class_key; -static void batadv_start_purge_timer(struct batadv_priv *bat_priv) -{ - INIT_DELAYED_WORK(&bat_priv->orig_work, batadv_purge_orig); - queue_delayed_work(batadv_event_workqueue, - &bat_priv->orig_work, msecs_to_jiffies(1000)); -} +static void batadv_purge_orig(struct work_struct *work); /* returns 1 if they are the same originator */ static int batadv_compare_orig(const struct hlist_node *node, const void *data2) @@ -57,7 +54,14 @@ int batadv_originator_init(struct batadv_priv *bat_priv) if (!bat_priv->orig_hash) goto err; - batadv_start_purge_timer(bat_priv); + batadv_hash_set_lock_class(bat_priv->orig_hash, + &batadv_orig_hash_lock_class_key); + + INIT_DELAYED_WORK(&bat_priv->orig_work, batadv_purge_orig); + queue_delayed_work(batadv_event_workqueue, + &bat_priv->orig_work, + msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); + return 0; err: @@ -115,7 +119,7 @@ out: static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node, *tmp_neigh_node; struct batadv_orig_node *orig_node; @@ -131,7 +135,7 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) } /* for all neighbors towards this originator ... */ - hlist_for_each_entry_safe(neigh_node, node, node_tmp, + hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); batadv_neigh_node_free_ref(neigh_node); @@ -139,6 +143,9 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) spin_unlock_bh(&orig_node->neigh_list_lock); + /* Free nc_nodes */ + batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); + batadv_frag_list_free(&orig_node->frag_list); batadv_tt_global_del_orig(orig_node->bat_priv, orig_node, "originator timed out"); @@ -158,7 +165,7 @@ void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) void batadv_originator_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; @@ -176,10 +183,9 @@ void batadv_originator_free(struct batadv_priv *bat_priv) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(orig_node, node, node_tmp, + hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { - - hlist_del_rcu(node); + hlist_del_rcu(&orig_node->hash_entry); batadv_orig_node_free_ref(orig_node); } spin_unlock_bh(list_lock); @@ -217,6 +223,8 @@ struct batadv_orig_node *batadv_get_orig_node(struct batadv_priv *bat_priv, spin_lock_init(&orig_node->neigh_list_lock); spin_lock_init(&orig_node->tt_buff_lock); + batadv_nc_init_orig(orig_node); + /* extra reference for return */ atomic_set(&orig_node->refcount, 2); @@ -272,7 +280,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_neigh_node **best_neigh_node) { - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; bool neigh_purged = false; unsigned long last_seen; @@ -283,9 +291,8 @@ 
batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ - hlist_for_each_entry_safe(neigh_node, node, node_tmp, + hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { - last_seen = neigh_node->last_seen; if_incoming = neigh_node->if_incoming; @@ -293,7 +300,6 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, (if_incoming->if_status == BATADV_IF_INACTIVE) || (if_incoming->if_status == BATADV_IF_NOT_IN_USE) || (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) { - if ((if_incoming->if_status == BATADV_IF_INACTIVE) || (if_incoming->if_status == BATADV_IF_NOT_IN_USE) || (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) @@ -348,7 +354,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, static void _batadv_purge_orig(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; @@ -363,13 +369,13 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(orig_node, node, node_tmp, + hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { if (batadv_purge_orig_node(bat_priv, orig_node)) { if (orig_node->gw_flags) batadv_gw_node_delete(bat_priv, orig_node); - hlist_del_rcu(node); + hlist_del_rcu(&orig_node->hash_entry); batadv_orig_node_free_ref(orig_node); continue; } @@ -393,7 +399,9 @@ static void batadv_purge_orig(struct work_struct *work) delayed_work = container_of(work, struct delayed_work, work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); _batadv_purge_orig(bat_priv); - batadv_start_purge_timer(bat_priv); + queue_delayed_work(batadv_event_workqueue, + &bat_priv->orig_work, + msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } void batadv_purge_orig_ref(struct batadv_priv *bat_priv) @@ -406,7 +414,6 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node, *node_tmp; struct hlist_head *head; struct batadv_hard_iface *primary_if; struct batadv_orig_node *orig_node; @@ -432,7 +439,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { neigh_node = batadv_orig_node_get_router(orig_node); if (!neigh_node) continue; @@ -451,14 +458,14 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) neigh_node->addr, neigh_node->if_incoming->net_dev->name); - hlist_for_each_entry_rcu(neigh_node_tmp, node_tmp, + hlist_for_each_entry_rcu(neigh_node_tmp, &orig_node->neigh_list, list) { seq_printf(seq, " %pM (%3i)", neigh_node_tmp->addr, neigh_node_tmp->tq_avg); } - seq_printf(seq, "\n"); + seq_puts(seq, "\n"); batman_count++; next: @@ -468,7 +475,7 @@ next: } if (batman_count == 0) - seq_printf(seq, "No batman nodes in range ...\n"); + seq_puts(seq, "No batman nodes in range ...\n"); out: if (primary_if) @@ -509,7 +516,6 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, { struct batadv_priv *bat_priv = 
netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node; struct hlist_head *head; struct batadv_orig_node *orig_node; uint32_t i; @@ -522,7 +528,7 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { spin_lock_bh(&orig_node->ogm_cnt_lock); ret = batadv_orig_node_add_if(orig_node, max_if_num); spin_unlock_bh(&orig_node->ogm_cnt_lock); @@ -593,7 +599,6 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node; struct hlist_head *head; struct batadv_hard_iface *hard_iface_tmp; struct batadv_orig_node *orig_node; @@ -607,7 +612,7 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { spin_lock_bh(&orig_node->ogm_cnt_lock); ret = batadv_orig_node_del_if(orig_node, max_if_num, hard_iface->if_num); diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 9778e656dec7..7df48fa7669d 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -68,7 +68,6 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; - struct hlist_node *node; struct batadv_orig_node *orig_node, *orig_node_tmp = NULL; int index; @@ -79,7 +78,7 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) head = &hash->table[index]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (!batadv_compare_eth(orig_node, data)) continue; diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index cb6405bf755c..a51ccfc39da4 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -30,6 +30,7 @@ enum batadv_packettype { BATADV_TT_QUERY = 0x07, BATADV_ROAM_ADV = 0x08, BATADV_UNICAST_4ADDR = 0x09, + BATADV_CODED = 0x0a, }; /** @@ -278,4 +279,36 @@ struct batadv_tt_change { uint8_t addr[ETH_ALEN]; } __packed; +/** + * struct batadv_coded_packet - network coded packet + * @header: common batman packet header and ttl of first included packet + * @reserved: Align following fields to 2-byte boundaries + * @first_source: original source of first included packet + * @first_orig_dest: original destination of first included packet + * @first_crc: checksum of first included packet + * @first_ttvn: tt-version number of first included packet + * @second_ttl: ttl of second packet + * @second_dest: second receiver of this coded packet + * @second_source: original source of second included packet + * @second_orig_dest: original destination of second included packet + * @second_crc: checksum of second included packet + * @second_ttvn: tt version number of second included packet + * @coded_len: length of network coded part of the payload + */ +struct batadv_coded_packet { + struct batadv_header header; + uint8_t first_ttvn; + /* uint8_t first_dest[ETH_ALEN]; - saved in mac header destination */ + uint8_t first_source[ETH_ALEN]; + uint8_t first_orig_dest[ETH_ALEN]; + __be32 first_crc; + uint8_t second_ttl; + uint8_t second_ttvn; + uint8_t second_dest[ETH_ALEN]; + uint8_t second_source[ETH_ALEN]; + uint8_t second_orig_dest[ETH_ALEN]; + __be32 second_crc; + __be16 coded_len; +}; + #endif /* _NET_BATMAN_ADV_PACKET_H_ */ diff --git a/net/batman-adv/ring_buffer.c b/net/batman-adv/ring_buffer.c index c8f61e395b74..ccab0bbdbb59 100644 --- a/net/batman-adv/ring_buffer.c +++ b/net/batman-adv/ring_buffer.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/ring_buffer.h b/net/batman-adv/ring_buffer.h index fda8c17df273..3f92ae248e83 100644 --- a/net/batman-adv/ring_buffer.h +++ b/net/batman-adv/ring_buffer.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 1aa1722d0187..b27a4d792d15 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N.
contributors: * * Marek Lindner, Simon Wunderlich * @@ -29,6 +29,7 @@ #include "unicast.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" +#include "network-coding.h" static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); @@ -37,7 +38,6 @@ void batadv_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node; struct hlist_head *head; struct batadv_orig_node *orig_node; unsigned long *word; @@ -49,7 +49,7 @@ void batadv_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { spin_lock_bh(&orig_node->ogm_cnt_lock); word_index = hard_iface->if_num * BATADV_NUM_WORDS; word = &(orig_node->bcast_own[word_index]); @@ -80,7 +80,6 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, /* route added */ } else if ((!curr_router) && (neigh_node)) { - batadv_dbg(BATADV_DBG_ROUTES, bat_priv, "Adding route towards: %pM (via %pM)\n", orig_node->orig, neigh_node->addr); @@ -147,7 +146,6 @@ out: void batadv_bonding_candidate_add(struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node) { - struct hlist_node *node; struct batadv_neigh_node *tmp_neigh_node, *router = NULL; uint8_t interference_candidate = 0; @@ -170,9 +168,8 @@ void batadv_bonding_candidate_add(struct batadv_orig_node *orig_node, * interface. If we do, we won't select this candidate because of * possible interference. */ - hlist_for_each_entry_rcu(tmp_neigh_node, node, + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { - if (tmp_neigh_node == neigh_node) continue; @@ -406,7 +403,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, goto out; /* not for me */ - if (!batadv_is_my_mac(ethhdr->h_dest)) + if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest)) goto out; icmp_packet = (struct batadv_icmp_packet_rr *)skb->data; @@ -420,7 +417,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, } /* packet for me */ - if (batadv_is_my_mac(icmp_packet->dst)) + if (batadv_is_my_mac(bat_priv, icmp_packet->dst)) return batadv_recv_my_icmp_packet(bat_priv, skb, hdr_size); /* TTL exceeded */ @@ -552,27 +549,39 @@ batadv_find_ifalter_router(struct batadv_orig_node *primary_orig, return router; } -static int batadv_check_unicast_packet(struct sk_buff *skb, int hdr_size) +/** + * batadv_check_unicast_packet - Check for malformed unicast packets + * @bat_priv: the bat priv with all the soft interface information + * @skb: packet to check + * @hdr_size: size of header to pull + * + * Check for short header and bad addresses in given packet. Returns negative + * value when check fails and 0 otherwise. The negative value depends on the + * reason: -ENODATA for bad header, -EBADR for broadcast destination or source, + * and -EREMOTE for non-local (other host) destination. 
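For callers, the point of the distinct codes is that a failed check is no longer all-or-nothing; a minimal sketch of the consumption pattern, mirroring the batadv_recv_unicast_packet() hunk further below (all names as they appear in this diff):

	int check;

	check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);

	/* not addressed to us, but possibly useful later for decoding
	 * a network-coded packet
	 */
	if (check == -EREMOTE)
		batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);

	/* any negative code (-ENODATA, -EBADR, -EREMOTE) still means:
	 * do not process this packet further
	 */
	if (check < 0)
		return NET_RX_DROP;
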
+ */ +static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, int hdr_size) { struct ethhdr *ethhdr; /* drop packet if it does not have the necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) - return -1; + return -ENODATA; ethhdr = (struct ethhdr *)skb_mac_header(skb); /* packet with unicast indication but broadcast recipient */ if (is_broadcast_ether_addr(ethhdr->h_dest)) - return -1; + return -EBADR; /* packet with broadcast sender address */ if (is_broadcast_ether_addr(ethhdr->h_source)) - return -1; + return -EBADR; /* not for me */ - if (!batadv_is_my_mac(ethhdr->h_dest)) - return -1; + if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest)) + return -EREMOTE; return 0; } @@ -586,7 +595,7 @@ int batadv_recv_tt_query(struct sk_buff *skb, struct batadv_hard_iface *recv_if) char tt_flag; size_t packet_size; - if (batadv_check_unicast_packet(skb, hdr_size) < 0) + if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0) return NET_RX_DROP; /* I might need to modify it */ @@ -618,7 +627,7 @@ int batadv_recv_tt_query(struct sk_buff *skb, struct batadv_hard_iface *recv_if) case BATADV_TT_RESPONSE: batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_RX); - if (batadv_is_my_mac(tt_query->dst)) { + if (batadv_is_my_mac(bat_priv, tt_query->dst)) { /* packet needs to be linearized to access the TT * changes */ @@ -661,14 +670,15 @@ int batadv_recv_roam_adv(struct sk_buff *skb, struct batadv_hard_iface *recv_if) struct batadv_roam_adv_packet *roam_adv_packet; struct batadv_orig_node *orig_node; - if (batadv_check_unicast_packet(skb, sizeof(*roam_adv_packet)) < 0) + if (batadv_check_unicast_packet(bat_priv, skb, + sizeof(*roam_adv_packet)) < 0) goto out; batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX); roam_adv_packet = (struct batadv_roam_adv_packet *)skb->data; - if (!batadv_is_my_mac(roam_adv_packet->dst)) + if (!batadv_is_my_mac(bat_priv, roam_adv_packet->dst)) return batadv_route_unicast_packet(skb, recv_if); /* check if it is a backbone gateway.
we don't accept @@ -836,7 +846,6 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, if (unicast_packet->header.packet_type == BATADV_UNICAST_FRAG && batadv_frag_can_reassemble(skb, neigh_node->if_incoming->net_dev->mtu)) { - ret = batadv_frag_reassemble_skb(skb, bat_priv, &new_skb); if (ret == NET_RX_DROP) @@ -855,14 +864,17 @@ /* decrement ttl */ unicast_packet->header.ttl--; - /* Update stats counter */ - batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); - batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, - skb->len + ETH_HLEN); - - /* route it */ - if (batadv_send_skb_to_orig(skb, orig_node, recv_if)) + /* network code packet if possible */ + if (batadv_nc_skb_forward(skb, neigh_node, ethhdr)) { ret = NET_RX_SUCCESS; + } else if (batadv_send_skb_to_orig(skb, orig_node, recv_if)) { + ret = NET_RX_SUCCESS; + + /* Update stats counter */ + batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); + batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, + skb->len + ETH_HLEN); + } out: if (neigh_node) @@ -927,7 +939,7 @@ out: } static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, - struct sk_buff *skb) { + struct sk_buff *skb, int hdr_len) { uint8_t curr_ttvn, old_ttvn; struct batadv_orig_node *orig_node; struct ethhdr *ethhdr; @@ -936,7 +948,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, int is_old_ttvn; /* check if there is enough data before accessing it */ - if (pskb_may_pull(skb, sizeof(*unicast_packet) + ETH_HLEN) < 0) + if (pskb_may_pull(skb, hdr_len + ETH_HLEN) < 0) return 0; /* create a copy of the skb (in case of re-routing) to modify it. */ if (skb_cow(skb, sizeof(*unicast_packet)) < 0) return 0; unicast_packet = (struct batadv_unicast_packet *)skb->data; - ethhdr = (struct ethhdr *)(skb->data + sizeof(*unicast_packet)); + ethhdr = (struct ethhdr *)(skb->data + hdr_len); /* check if the destination client was served by this node and it is now * roaming.
In this case, it means that the node has got a ROAM_ADV @@ -972,7 +984,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * last time) the packet had an updated information or not */ curr_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); - if (!batadv_is_my_mac(unicast_packet->dest)) { + if (!batadv_is_my_mac(bat_priv, unicast_packet->dest)) { orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest); /* if it is not possible to find the orig_node representing the @@ -1038,7 +1050,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, struct batadv_unicast_4addr_packet *unicast_4addr_packet; uint8_t *orig_addr; struct batadv_orig_node *orig_node = NULL; - int hdr_size = sizeof(*unicast_packet); + int check, hdr_size = sizeof(*unicast_packet); bool is4addr; unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -1049,14 +1061,22 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, if (is4addr) hdr_size = sizeof(*unicast_4addr_packet); - if (batadv_check_unicast_packet(skb, hdr_size) < 0) - return NET_RX_DROP; + /* function returns -EREMOTE for promiscuous packets */ + check = batadv_check_unicast_packet(bat_priv, skb, hdr_size); - if (!batadv_check_unicast_ttvn(bat_priv, skb)) + /* Even though the packet is not for us, we might save it to use for + * decoding a later received coded packet + */ + if (check == -EREMOTE) + batadv_nc_skb_store_sniffed_unicast(bat_priv, skb); + + if (check < 0) + return NET_RX_DROP; + if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) return NET_RX_DROP; /* packet for me */ - if (batadv_is_my_mac(unicast_packet->dest)) { + if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) { if (is4addr) { batadv_dat_inc_counter(bat_priv, unicast_4addr_packet->subtype); @@ -1093,17 +1113,16 @@ int batadv_recv_ucast_frag_packet(struct sk_buff *skb, struct sk_buff *new_skb = NULL; int ret; - if (batadv_check_unicast_packet(skb, hdr_size) < 0) + if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0) return NET_RX_DROP; - if (!batadv_check_unicast_ttvn(bat_priv, skb)) + if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) return NET_RX_DROP; unicast_packet = (struct batadv_unicast_frag_packet *)skb->data; /* packet for me */ - if (batadv_is_my_mac(unicast_packet->dest)) { - + if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) { ret = batadv_frag_reassemble_skb(skb, bat_priv, &new_skb); if (ret == NET_RX_DROP) @@ -1157,13 +1176,13 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, goto out; /* ignore broadcasts sent by myself */ - if (batadv_is_my_mac(ethhdr->h_source)) + if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto out; bcast_packet = (struct batadv_bcast_packet *)skb->data; /* ignore broadcasts originated by myself */ - if (batadv_is_my_mac(bcast_packet->orig)) + if (batadv_is_my_mac(bat_priv, bcast_packet->orig)) goto out; if (bcast_packet->header.ttl < 2) @@ -1249,14 +1268,14 @@ int batadv_recv_vis_packet(struct sk_buff *skb, ethhdr = (struct ethhdr *)skb_mac_header(skb); /* not for me */ - if (!batadv_is_my_mac(ethhdr->h_dest)) + if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest)) return NET_RX_DROP; /* ignore own packets */ - if (batadv_is_my_mac(vis_packet->vis_orig)) + if (batadv_is_my_mac(bat_priv, vis_packet->vis_orig)) return NET_RX_DROP; - if (batadv_is_my_mac(vis_packet->sender_orig)) + if (batadv_is_my_mac(bat_priv, vis_packet->sender_orig)) return NET_RX_DROP; switch (vis_packet->vis_type) { diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 9262279ea667..99eeafaba407 100644 --- 
a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 4425af9dad40..263cfd1ccee7 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -27,6 +27,7 @@ #include "vis.h" #include "gateway_common.h" #include "originator.h" +#include "network-coding.h" #include <linux/if_ether.h> @@ -39,6 +40,7 @@ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const uint8_t *dst_addr) { + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct ethhdr *ethhdr; if (hard_iface->if_status != BATADV_IF_ACTIVE) @@ -70,6 +72,9 @@ int batadv_send_skb_packet(struct sk_buff *skb, skb->dev = hard_iface->net_dev; + /* Save a clone of the skb to use when decoding coded packets */ + batadv_nc_skb_store_for_decoding(bat_priv, skb); + /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. @@ -155,8 +160,6 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, spin_unlock_bh(&bat_priv->forw_bcast_list_lock); /* start timer for this packet */ - INIT_DELAYED_WORK(&forw_packet->delayed_work, - batadv_send_outstanding_bcast_packet); queue_delayed_work(batadv_event_workqueue, &forw_packet->delayed_work, send_time); } @@ -210,6 +213,9 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, /* how often did we send the bcast packet ? */ forw_packet->num_packets = 0; + INIT_DELAYED_WORK(&forw_packet->delayed_work, + batadv_send_outstanding_bcast_packet); + _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay); return NETDEV_TX_OK; @@ -315,7 +321,7 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface) { struct batadv_forw_packet *forw_packet; - struct hlist_node *tmp_node, *safe_tmp_node; + struct hlist_node *safe_tmp_node; bool pending; if (hard_iface) @@ -328,9 +334,8 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, /* free bcast list */ spin_lock_bh(&bat_priv->forw_bcast_list_lock); - hlist_for_each_entry_safe(forw_packet, tmp_node, safe_tmp_node, + hlist_for_each_entry_safe(forw_packet, safe_tmp_node, &bat_priv->forw_bcast_list, list) { - /* if purge_outstanding_packets() was called with an argument * we delete only packets belonging to the given interface */ @@ -355,9 +360,8 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, /* free batman packet list */ spin_lock_bh(&bat_priv->forw_bat_list_lock); - hlist_for_each_entry_safe(forw_packet, tmp_node, safe_tmp_node, + hlist_for_each_entry_safe(forw_packet, safe_tmp_node, &bat_priv->forw_bat_list, list) { - /* if purge_outstanding_packets() was called with an argument * we delete only packets belonging to the given interface */ diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 0078dece1abc..38e662f619ac 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 6b548fde8e04..6f20d339e33a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -37,6 +37,7 @@ #include <linux/if_ether.h> #include "unicast.h" #include "bridge_loop_avoidance.h" +#include "network-coding.h" static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd); @@ -124,7 +125,6 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) batadv_tt_local_add(dev, addr->sa_data, BATADV_NULL_IFINDEX); } - dev->addr_assign_type &= ~NET_ADDR_RANDOM; return 0; } @@ -181,7 +181,8 @@ static int batadv_interface_tx(struct sk_buff *skb, goto dropped; /* Register the client MAC in the transtable */ - batadv_tt_local_add(soft_iface, ethhdr->h_source, skb->skb_iif); + if (!is_multicast_ether_addr(ethhdr->h_source)) + batadv_tt_local_add(soft_iface, ethhdr->h_source, skb->skb_iif); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... @@ -401,86 +402,58 @@ static void batadv_set_lockdep_class(struct net_device *dev) } /** - * batadv_softif_init - Late stage initialization of soft interface - * @dev: registered network device to modify + * batadv_softif_destroy_finish - cleans up the remains of a softif + * @work: work queue item * - * Returns error code on failures + * Free the parts of the soft interface which can not be removed under + * rtnl lock (to prevent deadlock situations). */ -static int batadv_softif_init(struct net_device *dev) -{ - batadv_set_lockdep_class(dev); - - return 0; -} - -static const struct net_device_ops batadv_netdev_ops = { - .ndo_init = batadv_softif_init, - .ndo_open = batadv_interface_open, - .ndo_stop = batadv_interface_release, - .ndo_get_stats = batadv_interface_stats, - .ndo_set_mac_address = batadv_interface_set_mac_addr, - .ndo_change_mtu = batadv_interface_change_mtu, - .ndo_start_xmit = batadv_interface_tx, - .ndo_validate_addr = eth_validate_addr -}; - -static void batadv_interface_setup(struct net_device *dev) +static void batadv_softif_destroy_finish(struct work_struct *work) { - struct batadv_priv *priv = netdev_priv(dev); - - ether_setup(dev); - - dev->netdev_ops = &batadv_netdev_ops; - dev->destructor = free_netdev; - dev->tx_queue_len = 0; + struct batadv_priv *bat_priv; + struct net_device *soft_iface; - /* can't call min_mtu, because the needed variables - * have not been initialized yet - */ - dev->mtu = ETH_DATA_LEN; - /* reserve more space in the skbuff for our header */ - dev->hard_header_len = BATADV_HEADER_LEN; + bat_priv = container_of(work, struct batadv_priv, + cleanup_work); + soft_iface = bat_priv->soft_iface; - /* generate random address */ - eth_hw_addr_random(dev); - - SET_ETHTOOL_OPS(dev, &batadv_ethtool_ops); + batadv_sysfs_del_meshif(soft_iface); - memset(priv, 0, sizeof(*priv)); + rtnl_lock(); + unregister_netdevice(soft_iface); + rtnl_unlock(); } -struct net_device *batadv_softif_create(const char *name) +/** + * batadv_softif_init_late - late stage initialization of soft interface + * @dev: registered network device to modify + * + * Returns error code on failures + */ +static int batadv_softif_init_late(struct net_device *dev) { - struct net_device *soft_iface; struct batadv_priv *bat_priv; int ret; size_t cnt_len = 
sizeof(uint64_t) * BATADV_CNT_NUM; - soft_iface = alloc_netdev(sizeof(*bat_priv), name, - batadv_interface_setup); - - if (!soft_iface) - goto out; + batadv_set_lockdep_class(dev); - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(dev); + bat_priv->soft_iface = dev; + INIT_WORK(&bat_priv->cleanup_work, batadv_softif_destroy_finish); /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(uint64_t)); if (!bat_priv->bat_counters) - goto free_soft_iface; - - ret = register_netdevice(soft_iface); - if (ret < 0) { - pr_err("Unable to register the batman interface '%s': %i\n", - name, ret); - goto free_bat_counters; - } + return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); +#ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bridge_loop_avoidance, 0); +#endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif @@ -491,7 +464,9 @@ struct net_device *batadv_softif_create(const char *name) atomic_set(&bat_priv->gw_bandwidth, 41); atomic_set(&bat_priv->orig_interval, 1000); atomic_set(&bat_priv->hop_penalty, 30); +#ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); +#endif atomic_set(&bat_priv->fragmentation, 1); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); @@ -510,47 +485,194 @@ struct net_device *batadv_softif_create(const char *name) bat_priv->primary_if = NULL; bat_priv->num_ifaces = 0; - ret = batadv_algo_select(bat_priv, batadv_routing_algo); - if (ret < 0) - goto unreg_soft_iface; + batadv_nc_init_bat_priv(bat_priv); - ret = batadv_sysfs_add_meshif(soft_iface); + ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) - goto unreg_soft_iface; + goto free_bat_counters; - ret = batadv_debugfs_add_meshif(soft_iface); + ret = batadv_debugfs_add_meshif(dev); if (ret < 0) - goto unreg_sysfs; + goto free_bat_counters; - ret = batadv_mesh_init(soft_iface); + ret = batadv_mesh_init(dev); if (ret < 0) goto unreg_debugfs; - return soft_iface; + return 0; unreg_debugfs: - batadv_debugfs_del_meshif(soft_iface); -unreg_sysfs: - batadv_sysfs_del_meshif(soft_iface); -unreg_soft_iface: - free_percpu(bat_priv->bat_counters); - unregister_netdevice(soft_iface); - return NULL; - + batadv_debugfs_del_meshif(dev); free_bat_counters: free_percpu(bat_priv->bat_counters); -free_soft_iface: - free_netdev(soft_iface); + + return ret; +} + +/** + * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface + * @dev: batadv_soft_interface used as master interface + * @slave_dev: net_device which should become the slave interface + * + * Return 0 if successful or error otherwise. 
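With ndo_add_slave/ndo_del_slave wired up (see the batadv_netdev_ops table below), a hard interface can be enslaved from userspace, e.g. with iproute2's "ip link set dev eth0 master bat0". A rough sketch of how such a handler is reached; the device names, error handling and locking shown here are illustrative assumptions, not part of this diff:

	/* roughly what the rtnl "set master" path does */
	struct net_device *master = dev_get_by_name(&init_net, "bat0");
	struct net_device *slave = dev_get_by_name(&init_net, "eth0");
	int err = -EOPNOTSUPP;

	rtnl_lock();
	if (master && slave && master->netdev_ops->ndo_add_slave)
		err = master->netdev_ops->ndo_add_slave(master, slave);
	rtnl_unlock();

	/* dev_get_by_name() took a reference on each device */
	if (master)
		dev_put(master);
	if (slave)
		dev_put(slave);
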
+ */ +static int batadv_softif_slave_add(struct net_device *dev, + struct net_device *slave_dev) +{ + struct batadv_hard_iface *hard_iface; + int ret = -EINVAL; + + hard_iface = batadv_hardif_get_by_netdev(slave_dev); + if (!hard_iface || hard_iface->soft_iface != NULL) + goto out; + + ret = batadv_hardif_enable_interface(hard_iface, dev->name); + out: - return NULL; + if (hard_iface) + batadv_hardif_free_ref(hard_iface); + return ret; } -void batadv_softif_destroy(struct net_device *soft_iface) +/** + * batadv_softif_slave_del - Delete a slave iface from a batadv_soft_interface + * @dev: batadv_soft_interface used as master interface + * @slave_dev: net_device which should be removed from the master interface + * + * Return 0 if successful or error otherwise. + */ +static int batadv_softif_slave_del(struct net_device *dev, + struct net_device *slave_dev) { - batadv_debugfs_del_meshif(soft_iface); + struct batadv_hard_iface *hard_iface; + int ret = -EINVAL; + + hard_iface = batadv_hardif_get_by_netdev(slave_dev); + + if (!hard_iface || hard_iface->soft_iface != dev) + goto out; + + batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_KEEP); + ret = 0; + +out: + if (hard_iface) + batadv_hardif_free_ref(hard_iface); + return ret; +} + +static const struct net_device_ops batadv_netdev_ops = { + .ndo_init = batadv_softif_init_late, + .ndo_open = batadv_interface_open, + .ndo_stop = batadv_interface_release, + .ndo_get_stats = batadv_interface_stats, + .ndo_set_mac_address = batadv_interface_set_mac_addr, + .ndo_change_mtu = batadv_interface_change_mtu, + .ndo_start_xmit = batadv_interface_tx, + .ndo_validate_addr = eth_validate_addr, + .ndo_add_slave = batadv_softif_slave_add, + .ndo_del_slave = batadv_softif_slave_del, +}; + +/** + * batadv_softif_free - Deconstructor of batadv_soft_interface + * @dev: Device to cleanup and remove + */ +static void batadv_softif_free(struct net_device *dev) +{ + batadv_debugfs_del_meshif(dev); + batadv_mesh_free(dev); + + /* some scheduled RCU callbacks need the bat_priv struct to accomplish + * their tasks. 
Wait for them all to be finished before freeing the + * netdev and its private data (bat_priv) + */ + rcu_barrier(); + + free_netdev(dev); +} + +/** + * batadv_softif_init_early - early stage initialization of soft interface + * @dev: registered network device to modify + */ +static void batadv_softif_init_early(struct net_device *dev) +{ + struct batadv_priv *priv = netdev_priv(dev); + + ether_setup(dev); + + dev->netdev_ops = &batadv_netdev_ops; + dev->destructor = batadv_softif_free; + dev->tx_queue_len = 0; + + /* can't call min_mtu, because the needed variables + * have not been initialized yet + */ + dev->mtu = ETH_DATA_LEN; + /* reserve more space in the skbuff for our header */ + dev->hard_header_len = BATADV_HEADER_LEN; + + /* generate random address */ + eth_hw_addr_random(dev); + + SET_ETHTOOL_OPS(dev, &batadv_ethtool_ops); + + memset(priv, 0, sizeof(*priv)); +} + +struct net_device *batadv_softif_create(const char *name) +{ + struct net_device *soft_iface; + int ret; + + soft_iface = alloc_netdev(sizeof(struct batadv_priv), name, + batadv_softif_init_early); + if (!soft_iface) + return NULL; + + soft_iface->rtnl_link_ops = &batadv_link_ops; + + ret = register_netdevice(soft_iface); + if (ret < 0) { + pr_err("Unable to register the batman interface '%s': %i\n", + name, ret); + free_netdev(soft_iface); + return NULL; + } + + return soft_iface; +} + +/** + * batadv_softif_destroy_sysfs - deletion of batadv_soft_interface via sysfs + * @soft_iface: the to-be-removed batman-adv interface + */ +void batadv_softif_destroy_sysfs(struct net_device *soft_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(soft_iface); + + queue_work(batadv_event_workqueue, &bat_priv->cleanup_work); +} + +/** + * batadv_softif_destroy_netlink - deletion of batadv_soft_interface via netlink + * @soft_iface: the to-be-removed batman-adv interface + * @head: list pointer + */ +static void batadv_softif_destroy_netlink(struct net_device *soft_iface, + struct list_head *head) +{ + struct batadv_hard_iface *hard_iface; + + list_for_each_entry(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface == soft_iface) + batadv_hardif_disable_interface(hard_iface, + BATADV_IF_CLEANUP_KEEP); + } + batadv_sysfs_del_meshif(soft_iface); - batadv_mesh_free(soft_iface); - unregister_netdevice(soft_iface); + unregister_netdevice_queue(soft_iface, head); } int batadv_softif_is_valid(const struct net_device *net_dev) @@ -561,6 +683,13 @@ int batadv_softif_is_valid(const struct net_device *net_dev) return 0; } +struct rtnl_link_ops batadv_link_ops __read_mostly = { + .kind = "batadv", + .priv_size = sizeof(struct batadv_priv), + .setup = batadv_softif_init_early, + .dellink = batadv_softif_destroy_netlink, +}; + /* ethtool */ static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { @@ -581,10 +710,10 @@ static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strcpy(info->driver, "B.A.T.M.A.N. advanced"); - strcpy(info->version, BATADV_SOURCE_VERSION); - strcpy(info->fw_version, "N/A"); - strcpy(info->bus_info, "batman"); + strlcpy(info->driver, "B.A.T.M.A.N. 
advanced", sizeof(info->driver)); + strlcpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); + strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); + strlcpy(info->bus_info, "batman", sizeof(info->bus_info)); } static u32 batadv_get_msglevel(struct net_device *dev) @@ -632,6 +761,17 @@ static const struct { { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif +#ifdef CONFIG_BATMAN_ADV_NC + { "nc_code" }, + { "nc_code_bytes" }, + { "nc_recode" }, + { "nc_recode_bytes" }, + { "nc_buffer" }, + { "nc_decode" }, + { "nc_decode_bytes" }, + { "nc_decode_failed" }, + { "nc_sniffed" }, +#endif }; static void batadv_get_strings(struct net_device *dev, uint32_t stringset, diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 07a08fed28b9..2f2472c2ea0d 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -25,7 +25,8 @@ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, struct batadv_hard_iface *recv_if, int hdr_size, struct batadv_orig_node *orig_node); struct net_device *batadv_softif_create(const char *name); -void batadv_softif_destroy(struct net_device *soft_iface); +void batadv_softif_destroy_sysfs(struct net_device *soft_iface); int batadv_softif_is_valid(const struct net_device *net_dev); +extern struct rtnl_link_ops batadv_link_ops; #endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 84a55cb19b0b..15a22efa9a67 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2013 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -442,6 +442,9 @@ static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, #ifdef CONFIG_BATMAN_ADV_DEBUG BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL); #endif +#ifdef CONFIG_BATMAN_ADV_NC +BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR, NULL); +#endif static struct batadv_attribute *batadv_mesh_attrs[] = { &batadv_attr_aggregated_ogms, @@ -464,6 +467,9 @@ static struct batadv_attribute *batadv_mesh_attrs[] = { #ifdef CONFIG_BATMAN_ADV_DEBUG &batadv_attr_log_level, #endif +#ifdef CONFIG_BATMAN_ADV_NC + &batadv_attr_network_coding, +#endif NULL, }; @@ -582,13 +588,15 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, } if (status_tmp == BATADV_IF_NOT_IN_USE) { - batadv_hardif_disable_interface(hard_iface); + batadv_hardif_disable_interface(hard_iface, + BATADV_IF_CLEANUP_AUTO); goto unlock; } /* if the interface already is in use */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) - batadv_hardif_disable_interface(hard_iface); + batadv_hardif_disable_interface(hard_iface, + BATADV_IF_CLEANUP_AUTO); ret = batadv_hardif_enable_interface(hard_iface, buff); @@ -688,15 +696,10 @@ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data) { int ret = -ENOMEM; - struct batadv_hard_iface *primary_if; struct kobject *bat_kobj; char *uevent_env[4] = { NULL, NULL, NULL, NULL }; - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; - - bat_kobj = &primary_if->soft_iface->dev.kobj; + bat_kobj = &bat_priv->soft_iface->dev.kobj; uevent_env[0] = kmalloc(strlen(BATADV_UEV_TYPE_VAR) + strlen(batadv_uev_type_str[type]) + 1, @@ -732,9 +735,6 @@ out: kfree(uevent_env[1]); kfree(uevent_env[2]); - if (primary_if) - batadv_hardif_free_ref(primary_if); - if (ret) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n", diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 3fd1412b0620..479acf4c16f4 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 22457a7952ba..5e89deeb9542 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -29,6 +29,10 @@ #include <linux/crc16.h> +/* hash class keys */ +static struct lock_class_key batadv_tt_local_hash_lock_class_key; +static struct lock_class_key batadv_tt_global_hash_lock_class_key; + static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, struct batadv_orig_node *orig_node); static void batadv_tt_purge(struct work_struct *work); @@ -48,18 +52,10 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) return (memcmp(data1, data2, ETH_ALEN) == 0 ? 
1 : 0); } -static void batadv_tt_start_timer(struct batadv_priv *bat_priv) -{ - INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge); - queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, - msecs_to_jiffies(5000)); -} - static struct batadv_tt_common_entry * batadv_tt_hash_find(struct batadv_hashtable *hash, const void *data) { struct hlist_head *head; - struct hlist_node *node; struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_common_entry *tt_common_entry_tmp = NULL; uint32_t index; @@ -71,7 +67,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const void *data) head = &hash->table[index]; rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common_entry, node, head, hash_entry) { + hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (!batadv_compare_eth(tt_common_entry, data)) continue; @@ -112,7 +108,6 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const void *data) struct batadv_tt_global_entry, common); return tt_global_entry; - } static void @@ -235,6 +230,9 @@ static int batadv_tt_local_init(struct batadv_priv *bat_priv) if (!bat_priv->tt.local_hash) return -ENOMEM; + batadv_hash_set_lock_class(bat_priv->tt.local_hash, + &batadv_tt_local_hash_lock_class_key); + return 0; } @@ -249,7 +247,6 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_orig, tt_global->common.addr); batadv_tt_global_entry_free_ref(tt_global); - } void batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, @@ -259,7 +256,6 @@ void batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; - struct hlist_node *node; struct batadv_tt_orig_list_entry *orig_entry; int hash_added; bool roamed_back = false; @@ -305,7 +301,11 @@ void batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, (uint8_t)atomic_read(&bat_priv->tt.vn)); memcpy(tt_local->common.addr, addr, ETH_ALEN); - tt_local->common.flags = BATADV_NO_FLAGS; + /* The local entry has to be marked as NEW to avoid to send it in + * a full table response going out before the next ttvn increment + * (consistency check) + */ + tt_local->common.flags = BATADV_TT_CLIENT_NEW; if (batadv_is_wifi_iface(ifindex)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; atomic_set(&tt_local->common.refcount, 2); @@ -316,12 +316,6 @@ void batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, if (batadv_compare_eth(addr, soft_iface->dev_addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; - /* The local entry has to be marked as NEW to avoid to send it in - * a full table response going out before the next ttvn increment - * (consistency check) - */ - tt_local->common.flags |= BATADV_TT_CLIENT_NEW; - hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_orig, &tt_local->common, &tt_local->common.hash_entry); @@ -343,7 +337,7 @@ check_roaming: /* These nodes are probably going to update their tt table */ head = &tt_global->orig_list; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_entry, node, + hlist_for_each_entry_rcu(orig_entry, head, list) { batadv_send_roam_adv(bat_priv, tt_global->common.addr, orig_entry->orig_node); } @@ -391,25 +385,19 @@ static void batadv_tt_prepare_packet_buff(struct batadv_priv *bat_priv, int *packet_buff_len, int min_packet_len) { - struct batadv_hard_iface *primary_if; int req_len; -
primary_if = batadv_primary_if_get_selected(bat_priv); - req_len = min_packet_len; req_len += batadv_tt_len(atomic_read(&bat_priv->tt.local_changes)); /* if we have too many changes for one packet don't send any * and wait for the tt table request which will be fragmented */ - if ((!primary_if) || (req_len > primary_if->soft_iface->mtu)) + if (req_len > bat_priv->soft_iface->mtu) req_len = min_packet_len; batadv_tt_realloc_packet_buff(packet_buff, packet_buff_len, min_packet_len, req_len); - - if (primary_if) - batadv_hardif_free_ref(primary_if); } static int batadv_tt_changes_fill_buff(struct batadv_priv *bat_priv, @@ -472,37 +460,56 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common_entry; + struct batadv_tt_local_entry *tt_local; struct batadv_hard_iface *primary_if; - struct hlist_node *node; struct hlist_head *head; uint32_t i; + int last_seen_secs; + int last_seen_msecs; + unsigned long last_seen_jiffies; + bool no_purge; + uint16_t np_flag = BATADV_TT_CLIENT_NOPURGE; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) goto out; seq_printf(seq, - "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n", - net_dev->name, (uint8_t)atomic_read(&bat_priv->tt.vn)); + "Locally retrieved addresses (from %s) announced via TT (TTVN: %u CRC: %#.4x):\n", + net_dev->name, (uint8_t)atomic_read(&bat_priv->tt.vn), + bat_priv->tt.local_crc); + seq_printf(seq, " %-13s %-7s %-10s\n", "Client", "Flags", + "Last seen"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common_entry, node, + hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { - seq_printf(seq, " * %pM [%c%c%c%c%c]\n", + tt_local = container_of(tt_common_entry, + struct batadv_tt_local_entry, + common); + last_seen_jiffies = jiffies - tt_local->last_seen; + last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); + last_seen_secs = last_seen_msecs / 1000; + last_seen_msecs = last_seen_msecs % 1000; + + no_purge = tt_common_entry->flags & np_flag; + + seq_printf(seq, " * %pM [%c%c%c%c%c] %3u.%03u\n", tt_common_entry->addr, (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_NOPURGE ? 'P' : '.'), + no_purge ? 'P' : '.', (tt_common_entry->flags & BATADV_TT_CLIENT_NEW ? 'N' : '.'), (tt_common_entry->flags & BATADV_TT_CLIENT_PENDING ? 'X' : '.'), (tt_common_entry->flags & - BATADV_TT_CLIENT_WIFI ? 'W' : '.')); + BATADV_TT_CLIENT_WIFI ? 'W' : '.'), + no_purge ? 0 : last_seen_secs, + no_purge ? 
0 : last_seen_msecs); } rcu_read_unlock(); } @@ -589,9 +596,9 @@ static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv, { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_common_entry *tt_common_entry; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; - hlist_for_each_entry_safe(tt_common_entry, node, node_tmp, head, + hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { tt_local_entry = container_of(tt_common_entry, struct batadv_tt_local_entry, @@ -627,7 +634,6 @@ static void batadv_tt_local_purge(struct batadv_priv *bat_priv) batadv_tt_local_purge_list(bat_priv, head); spin_unlock_bh(list_lock); } - } static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) @@ -636,7 +642,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; uint32_t i; @@ -650,9 +656,9 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(tt_common_entry, node, node_tmp, + hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { - hlist_del_rcu(node); + hlist_del_rcu(&tt_common_entry->hash_entry); tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); @@ -676,6 +682,9 @@ static int batadv_tt_global_init(struct batadv_priv *bat_priv) if (!bat_priv->tt.global_hash) return -ENOMEM; + batadv_hash_set_lock_class(bat_priv->tt.global_hash, + &batadv_tt_global_hash_lock_class_key); + return 0; } @@ -706,11 +715,10 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, { struct batadv_tt_orig_list_entry *tmp_orig_entry, *orig_entry = NULL; const struct hlist_head *head; - struct hlist_node *node; rcu_read_lock(); head = &entry->orig_list; - hlist_for_each_entry_rcu(tmp_orig_entry, node, head, list) { + hlist_for_each_entry_rcu(tmp_orig_entry, head, list) { if (tmp_orig_entry->orig_node != orig_node) continue; if (!atomic_inc_not_zero(&tmp_orig_entry->refcount)) @@ -894,7 +902,7 @@ out_remove: /* remove address from local hash if present */ local_flags = batadv_tt_local_remove(bat_priv, tt_addr, "global tt received", - !!(flags & BATADV_TT_CLIENT_ROAM)); + flags & BATADV_TT_CLIENT_ROAM); tt_global_entry->common.flags |= local_flags & BATADV_TT_CLIENT_WIFI; if (!(flags & BATADV_TT_CLIENT_ROAM)) @@ -922,12 +930,11 @@ batadv_transtable_best_orig(struct batadv_tt_global_entry *tt_global_entry) { struct batadv_neigh_node *router = NULL; struct hlist_head *head; - struct hlist_node *node; struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL; int best_tq = 0; head = &tt_global_entry->orig_list; - hlist_for_each_entry_rcu(orig_entry, node, head, list) { + hlist_for_each_entry_rcu(orig_entry, head, list) { router = batadv_orig_node_get_router(orig_entry->orig_node); if (!router) continue; @@ -955,7 +962,6 @@ batadv_tt_global_print_entry(struct batadv_tt_global_entry *tt_global_entry, struct seq_file *seq) { struct hlist_head *head; - struct hlist_node *node; struct batadv_tt_orig_list_entry *orig_entry, *best_entry; struct batadv_tt_common_entry *tt_common_entry; uint16_t flags; @@ -967,10 +973,11 @@ batadv_tt_global_print_entry(struct batadv_tt_global_entry *tt_global_entry, best_entry = 
batadv_transtable_best_orig(tt_global_entry); if (best_entry) { last_ttvn = atomic_read(&best_entry->orig_node->last_ttvn); - seq_printf(seq, " %c %pM (%3u) via %pM (%3u) [%c%c%c]\n", + seq_printf(seq, + " %c %pM (%3u) via %pM (%3u) (%#.4x) [%c%c%c]\n", '*', tt_global_entry->common.addr, best_entry->ttvn, best_entry->orig_node->orig, - last_ttvn, + last_ttvn, best_entry->orig_node->tt_crc, (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); @@ -978,7 +985,7 @@ batadv_tt_global_print_entry(struct batadv_tt_global_entry *tt_global_entry, head = &tt_global_entry->orig_list; - hlist_for_each_entry_rcu(orig_entry, node, head, list) { + hlist_for_each_entry_rcu(orig_entry, head, list) { if (best_entry == orig_entry) continue; @@ -1001,7 +1008,6 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global; struct batadv_hard_iface *primary_if; - struct hlist_node *node; struct hlist_head *head; uint32_t i; @@ -1012,14 +1018,15 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Globally announced TT entries received via the mesh %s\n", net_dev->name); - seq_printf(seq, " %-13s %s %-15s %s %s\n", - "Client", "(TTVN)", "Originator", "(Curr TTVN)", "Flags"); + seq_printf(seq, " %-13s %s %-15s %s (%-6s) %s\n", + "Client", "(TTVN)", "Originator", "(Curr TTVN)", "CRC", + "Flags"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common_entry, node, + hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, @@ -1039,17 +1046,16 @@ static void batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) { struct hlist_head *head; - struct hlist_node *node, *safe; + struct hlist_node *safe; struct batadv_tt_orig_list_entry *orig_entry; spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; - hlist_for_each_entry_safe(orig_entry, node, safe, head, list) { - hlist_del_rcu(node); + hlist_for_each_entry_safe(orig_entry, safe, head, list) { + hlist_del_rcu(&orig_entry->list); batadv_tt_orig_list_entry_free_ref(orig_entry); } spin_unlock_bh(&tt_global_entry->list_lock); - } static void @@ -1059,18 +1065,18 @@ batadv_tt_global_del_orig_entry(struct batadv_priv *bat_priv, const char *message) { struct hlist_head *head; - struct hlist_node *node, *safe; + struct hlist_node *safe; struct batadv_tt_orig_list_entry *orig_entry; spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; - hlist_for_each_entry_safe(orig_entry, node, safe, head, list) { + hlist_for_each_entry_safe(orig_entry, safe, head, list) { if (orig_entry->orig_node == orig_node) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting %pM from global tt entry %pM: %s\n", orig_node->orig, tt_global_entry->common.addr, message); - hlist_del_rcu(node); + hlist_del_rcu(&orig_entry->list); batadv_tt_orig_list_entry_free_ref(orig_entry); } } @@ -1089,7 +1095,6 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, { bool last_entry = true; struct hlist_head *head; - struct hlist_node *node; struct batadv_tt_orig_list_entry *orig_entry; /* no local entry exists, case 1: @@ -1098,7 +1103,7 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, rcu_read_lock(); head = &tt_global_entry->orig_list; - hlist_for_each_entry_rcu(orig_entry, 
node, head, list) { + hlist_for_each_entry_rcu(orig_entry, head, list) { if (orig_entry->orig_node != orig_node) { last_entry = false; break; @@ -1183,7 +1188,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_tt_common_entry *tt_common_entry; uint32_t i; struct batadv_hashtable *hash = bat_priv->tt.global_hash; - struct hlist_node *node, *safe; + struct hlist_node *safe; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -1195,7 +1200,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(tt_common_entry, node, safe, + hlist_for_each_entry_safe(tt_common_entry, safe, head, hash_entry) { tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, @@ -1208,7 +1213,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM: %s\n", tt_global->common.addr, message); - hlist_del_rcu(node); + hlist_del_rcu(&tt_common_entry->hash_entry); batadv_tt_global_entry_free_ref(tt_global); } } @@ -1243,7 +1248,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_head *head; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; spinlock_t *list_lock; /* protects write access to the hash lists */ uint32_t i; char *msg = NULL; @@ -1255,7 +1260,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(tt_common, node, node_tmp, head, + hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, @@ -1268,7 +1273,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) "Deleting global tt entry (%pM): %s\n", tt_global->common.addr, msg); - hlist_del_rcu(node); + hlist_del_rcu(&tt_common->hash_entry); batadv_tt_global_entry_free_ref(tt_global); } @@ -1282,7 +1287,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; uint32_t i; @@ -1296,9 +1301,9 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(tt_common_entry, node, node_tmp, + hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { - hlist_del_rcu(node); + hlist_del_rcu(&tt_common_entry->hash_entry); tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); @@ -1378,7 +1383,6 @@ static uint16_t batadv_tt_global_crc(struct batadv_priv *bat_priv, struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; - struct hlist_node *node; struct hlist_head *head; uint32_t i; int j; @@ -1387,7 +1391,7 @@ static uint16_t batadv_tt_global_crc(struct batadv_priv *bat_priv, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common, node, head, hash_entry) { + hlist_for_each_entry_rcu(tt_common, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, common); @@ -1430,7 +1434,6 @@ static 
uint16_t batadv_tt_local_crc(struct batadv_priv *bat_priv) uint16_t total = 0, total_one; struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; - struct hlist_node *node; struct hlist_head *head; uint32_t i; int j; @@ -1439,7 +1442,7 @@ static uint16_t batadv_tt_local_crc(struct batadv_priv *bat_priv) head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common, node, head, hash_entry) { + hlist_for_each_entry_rcu(tt_common, head, hash_entry) { /* not yet committed clients have not to be taken into * account while computing the CRC */ @@ -1571,14 +1574,13 @@ static int batadv_tt_global_valid(const void *entry_ptr, static struct sk_buff * batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn, struct batadv_hashtable *hash, - struct batadv_hard_iface *primary_if, + struct batadv_priv *bat_priv, int (*valid_cb)(const void *, const void *), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_query_packet *tt_response; struct batadv_tt_change *tt_change; - struct hlist_node *node; struct hlist_head *head; struct sk_buff *skb = NULL; uint16_t tt_tot, tt_count; @@ -1586,8 +1588,8 @@ batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn, uint32_t i; size_t len; - if (tt_query_size + tt_len > primary_if->soft_iface->mtu) { - tt_len = primary_if->soft_iface->mtu - tt_query_size; + if (tt_query_size + tt_len > bat_priv->soft_iface->mtu) { + tt_len = bat_priv->soft_iface->mtu - tt_query_size; tt_len -= tt_len % sizeof(struct batadv_tt_change); } tt_tot = tt_len / sizeof(struct batadv_tt_change); @@ -1608,7 +1610,7 @@ batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn, for (i = 0; i < hash->size; i++) { head = &hash->table[i]; - hlist_for_each_entry_rcu(tt_common_entry, node, + hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (tt_count == tt_tot) break; @@ -1707,7 +1709,6 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv, { struct batadv_orig_node *req_dst_orig_node; struct batadv_orig_node *res_dst_orig_node = NULL; - struct batadv_hard_iface *primary_if = NULL; uint8_t orig_ttvn, req_ttvn, ttvn; int ret = false; unsigned char *tt_buff; @@ -1732,10 +1733,6 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv, if (!res_dst_orig_node) goto out; - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; - orig_ttvn = (uint8_t)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_request->ttvn; @@ -1783,7 +1780,7 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv, skb = batadv_tt_response_fill_table(tt_len, ttvn, bat_priv->tt.global_hash, - primary_if, + bat_priv, batadv_tt_global_valid, req_dst_orig_node); if (!skb) @@ -1820,12 +1817,9 @@ out: batadv_orig_node_free_ref(res_dst_orig_node); if (req_dst_orig_node) batadv_orig_node_free_ref(req_dst_orig_node); - if (primary_if) - batadv_hardif_free_ref(primary_if); if (!ret) kfree_skb(skb); return ret; - } static bool @@ -1900,7 +1894,7 @@ batadv_send_my_tt_response(struct batadv_priv *bat_priv, skb = batadv_tt_response_fill_table(tt_len, ttvn, bat_priv->tt.local_hash, - primary_if, + bat_priv, batadv_tt_local_valid_entry, NULL); if (!skb) @@ -1946,7 +1940,7 @@ out: bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tt_query_packet *tt_request) { - if (batadv_is_my_mac(tt_request->dst)) { + if (batadv_is_my_mac(bat_priv, tt_request->dst)) { /* don't answer backbone gws! 
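Throughout this diff batadv_is_my_mac() gains a bat_priv argument, so "is this my MAC?" is answered per mesh instance rather than across all batman-adv interfaces on the host. A plausible shape of such a helper; its body is not part of this diff, and ether_addr_equal() stands in for whatever comparison helper the real code uses:

bool batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr)
{
	const struct batadv_hard_iface *hard_iface;
	bool my_mac = false;

	rcu_read_lock();
	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
		if (hard_iface->if_status != BATADV_IF_ACTIVE)
			continue;

		/* only interfaces assigned to this mesh may match */
		if (hard_iface->soft_iface != bat_priv->soft_iface)
			continue;

		if (ether_addr_equal(hard_iface->net_dev->dev_addr, addr)) {
			my_mac = true;
			break;
		}
	}
	rcu_read_unlock();

	return my_mac;
}
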
*/ if (batadv_bla_is_backbone_gw_orig(bat_priv, tt_request->src)) return true; @@ -2111,7 +2105,9 @@ int batadv_tt_init(struct batadv_priv *bat_priv) if (ret < 0) return ret; - batadv_tt_start_timer(bat_priv); + INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge); + queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, + msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); return 1; } @@ -2261,7 +2257,8 @@ static void batadv_tt_purge(struct work_struct *work) batadv_tt_req_purge(bat_priv); batadv_tt_roam_purge(bat_priv); - batadv_tt_start_timer(bat_priv); + queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, + msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); } void batadv_tt_free(struct batadv_priv *bat_priv) @@ -2286,7 +2283,6 @@ static uint16_t batadv_tt_set_flags(struct batadv_hashtable *hash, uint32_t i; uint16_t changed_num = 0; struct hlist_head *head; - struct hlist_node *node; struct batadv_tt_common_entry *tt_common_entry; if (!hash) @@ -2296,7 +2292,7 @@ static uint16_t batadv_tt_set_flags(struct batadv_hashtable *hash, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common_entry, node, + hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (enable) { if ((tt_common_entry->flags & flags) == flags) @@ -2321,7 +2317,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ uint32_t i; @@ -2334,7 +2330,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); - hlist_for_each_entry_safe(tt_common, node, node_tmp, head, + hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { if (!(tt_common->flags & BATADV_TT_CLIENT_PENDING)) continue; @@ -2344,7 +2340,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) tt_common->addr); atomic_dec(&bat_priv->tt.local_entry_num); - hlist_del_rcu(node); + hlist_del_rcu(&tt_common->hash_entry); tt_local = container_of(tt_common, struct batadv_tt_local_entry, common); @@ -2352,7 +2348,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) } spin_unlock_bh(list_lock); } - } static int batadv_tt_commit_changes(struct batadv_priv *bat_priv, @@ -2496,7 +2491,7 @@ void batadv_tt_update_orig(struct batadv_priv *bat_priv, orig_node->tt_crc != tt_crc) { request_table: batadv_dbg(BATADV_DBG_TT, bat_priv, - "TT inconsistency for %pM. Need to retrieve the correct information (ttvn: %u last_ttvn: %u crc: %u last_crc: %u num_changes: %u)\n", + "TT inconsistency for %pM. 
Need to retrieve the correct information (ttvn: %u last_ttvn: %u crc: %#.4x last_crc: %#.4x num_changes: %u)\n", orig_node->orig, ttvn, orig_ttvn, tt_crc, orig_node->tt_crc, tt_num_changes); batadv_send_tt_request(bat_priv, orig_node, ttvn, @@ -2520,7 +2515,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, if (!tt_global_entry) goto out; - ret = !!(tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM); + ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM; batadv_tt_global_entry_free_ref(tt_global_entry); out: return ret; @@ -2549,7 +2544,6 @@ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, batadv_tt_local_entry_free_ref(tt_local_entry); out: return ret; - } bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 46d4451a59ee..ab8e683b402f 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ae9ac9aca8c5..aba8364c3689 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -24,6 +24,9 @@ #include "bitarray.h" #include <linux/kernel.h> +/** + * Maximum overhead for the encapsulation for a payload packet + */ #define BATADV_HEADER_LEN \ (ETH_HLEN + max(sizeof(struct batadv_unicast_packet), \ sizeof(struct batadv_bcast_packet))) @@ -51,6 +54,22 @@ struct batadv_hard_iface_bat_iv { atomic_t ogm_seqno; }; +/** + * struct batadv_hard_iface - network device known to batman-adv + * @list: list node for batadv_hardif_list + * @if_num: identifier of the interface + * @if_status: status of the interface for batman-adv + * @net_dev: pointer to the net_device + * @frag_seqno: last fragment sequence number sent by this interface + * @hardif_obj: kobject of the per interface sysfs "mesh" directory + * @refcount: number of contexts the object is used + * @batman_adv_ptype: packet type describing packets that should be processed by + * batman-adv for this interface + * @soft_iface: the batman-adv interface which uses this network interface + * @rcu: struct used for freeing in an RCU-safe manner + * @bat_iv: BATMAN IV specific per hard interface data + * @cleanup_work: work queue callback item for hard interface deinit + */ struct batadv_hard_iface { struct list_head list; int16_t if_num; @@ -63,22 +82,56 @@ struct batadv_hard_iface { struct net_device *soft_iface; struct rcu_head rcu; struct batadv_hard_iface_bat_iv bat_iv; + struct work_struct cleanup_work; }; /** - * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh - * @primary_addr: host's primary interface address - * @last_seen: when last packet from this node was received - * @bcast_seqno_reset: time when the broadcast seqno window was reset - * @batman_seqno_reset: time when the batman seqno window was reset - * @gw_flags: flags related to gateway class - * @flags: for now only VIS_SERVER flag - * @last_real_seqno: last and best known sequence number - * @last_ttl: ttl of last received packet - * @last_bcast_seqno: last broadcast sequence number received by this host - * - * @candidates: how many candidates
are available - * @selected: next bonding candidate + * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh + * @orig: originator ethernet address + * @primary_addr: hosts primary interface address + * @router: router that should be used to reach this originator + * @batadv_dat_addr_t: address of the orig node in the distributed hash + * @bcast_own: bitfield containing the number of our OGMs this orig_node + * rebroadcasted "back" to us (relative to last_real_seqno) + * @bcast_own_sum: counted result of bcast_own + * @last_seen: time when last packet from this node was received + * @bcast_seqno_reset: time when the broadcast seqno window was reset + * @batman_seqno_reset: time when the batman seqno window was reset + * @gw_flags: flags related to gateway class + * @flags: for now only VIS_SERVER flag + * @last_ttvn: last seen translation table version number + * @tt_crc: CRC of the translation table + * @tt_buff: last tt changeset this node received from the orig node + * @tt_buff_len: length of the last tt changeset this node received from the + * orig node + * @tt_buff_lock: lock that protects tt_buff and tt_buff_len + * @tt_size: number of global TT entries announced by the orig node + * @tt_initialised: bool keeping track of whether or not this node have received + * any translation table information from the orig node yet + * @last_real_seqno: last and best known sequence number + * @last_ttl: ttl of last received packet + * @bcast_bits: bitfield containing the info which payload broadcast originated + * from this orig node this host already has seen (relative to + * last_bcast_seqno) + * @last_bcast_seqno: last broadcast sequence number received by this host + * @neigh_list: list of potential next hop neighbor towards this orig node + * @frag_list: fragmentation buffer list for fragment re-assembly + * @last_frag_packet: time when last fragmented packet from this node was + * received + * @neigh_list_lock: lock protecting neigh_list, router and bonding_list + * @hash_entry: hlist node for batadv_priv::orig_hash + * @bat_priv: pointer to soft_iface this orig node belongs to + * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, + * neigh_node->real_bits & neigh_node->real_packet_count + * @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno + * @bond_candidates: how many candidates are available + * @bond_list: list of bonding candidates + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner + * @in_coding_list: list of nodes this orig can hear + * @out_coding_list: list of nodes that can hear this orig + * @in_coding_list_lock: protects in_coding_list + * @out_coding_list_lock: protects out_coding_list */ struct batadv_orig_node { uint8_t orig[ETH_ALEN]; @@ -94,11 +147,11 @@ struct batadv_orig_node { unsigned long batman_seqno_reset; uint8_t gw_flags; uint8_t flags; - atomic_t last_ttvn; /* last seen translation table version number */ + atomic_t last_ttvn; uint16_t tt_crc; unsigned char *tt_buff; int16_t tt_buff_len; - spinlock_t tt_buff_lock; /* protects tt_buff */ + spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */ atomic_t tt_size; bool tt_initialised; uint32_t last_real_seqno; @@ -107,23 +160,37 @@ struct batadv_orig_node { uint32_t last_bcast_seqno; struct hlist_head neigh_list; struct list_head frag_list; - spinlock_t neigh_list_lock; /* protects neigh_list and router */ - atomic_t refcount; - struct rcu_head rcu; + unsigned long last_frag_packet; + /* neigh_list_lock 
protects: neigh_list, router & bonding_list */ + spinlock_t neigh_list_lock; struct hlist_node hash_entry; struct batadv_priv *bat_priv; - unsigned long last_frag_packet; /* ogm_cnt_lock protects: bcast_own, bcast_own_sum, - * neigh_node->real_bits, neigh_node->real_packet_count + * neigh_node->real_bits & neigh_node->real_packet_count */ spinlock_t ogm_cnt_lock; - /* bcast_seqno_lock protects bcast_bits, last_bcast_seqno */ + /* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */ spinlock_t bcast_seqno_lock; - spinlock_t tt_list_lock; /* protects tt_list */ atomic_t bond_candidates; struct list_head bond_list; + atomic_t refcount; + struct rcu_head rcu; +#ifdef CONFIG_BATMAN_ADV_NC + struct list_head in_coding_list; + struct list_head out_coding_list; + spinlock_t in_coding_list_lock; /* Protects in_coding_list */ + spinlock_t out_coding_list_lock; /* Protects out_coding_list */ +#endif }; +/** + * struct batadv_gw_node - structure for orig nodes announcing gw capabilities + * @list: list node for batadv_priv_gw::list + * @orig_node: pointer to corresponding orig node + * @deleted: this struct is scheduled for deletion + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ struct batadv_gw_node { struct hlist_node list; struct batadv_orig_node *orig_node; @@ -132,13 +199,28 @@ struct batadv_gw_node { struct rcu_head rcu; }; -/* batadv_neigh_node - * @last_seen: when last packet via this neighbor was received +/** + * struct batadv_neigh_node - structure for single hop neighbors + * @list: list node for batadv_orig_node::neigh_list + * @addr: mac address of neigh node + * @tq_recv: ring buffer of received TQ values from this neigh node + * @tq_index: ring buffer index + * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv) + * @last_ttl: last received ttl from this neigh node + * @bonding_list: list node for batadv_orig_node::bond_list + * @last_seen: when last packet via this neighbor was received + * @real_bits: bitfield containing the number of OGMs received from this neigh + * node (relative to orig_node->last_real_seqno) + * @real_packet_count: counted result of real_bits + * @orig_node: pointer to corresponding orig_node + * @if_incoming: pointer to incoming hard interface + * @lq_update_lock: lock protecting tq_recv & tq_index + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_neigh_node { struct hlist_node list; uint8_t addr[ETH_ALEN]; - uint8_t real_packet_count; uint8_t tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE]; uint8_t tq_index; uint8_t tq_avg; @@ -146,13 +228,20 @@ struct batadv_neigh_node { struct list_head bonding_list; unsigned long last_seen; DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); - atomic_t refcount; - struct rcu_head rcu; + uint8_t real_packet_count; struct batadv_orig_node *orig_node; struct batadv_hard_iface *if_incoming; - spinlock_t lq_update_lock; /* protects: tq_recv, tq_index */ + spinlock_t lq_update_lock; /* protects tq_recv & tq_index */ + atomic_t refcount; + struct rcu_head rcu; }; +/** + * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression + * @orig[ETH_ALEN]: mac address of orig node orginating the broadcast + * @crc: crc32 checksum of broadcast payload + * @entrytime: time when the broadcast packet was received + */ #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bcast_duplist_entry { uint8_t orig[ETH_ALEN]; @@ -161,6 +250,44 @@ struct batadv_bcast_duplist_entry { }; 
#endif +/** + * enum batadv_counters - indices for traffic counters + * @BATADV_CNT_TX: transmitted payload traffic packet counter + * @BATADV_CNT_TX_BYTES: transmitted payload traffic bytes counter + * @BATADV_CNT_TX_DROPPED: dropped transmission payload traffic packet counter + * @BATADV_CNT_RX: received payload traffic packet counter + * @BATADV_CNT_RX_BYTES: received payload traffic bytes counter + * @BATADV_CNT_FORWARD: forwarded payload traffic packet counter + * @BATADV_CNT_FORWARD_BYTES: forwarded payload traffic bytes counter + * @BATADV_CNT_MGMT_TX: transmitted routing protocol traffic packet counter + * @BATADV_CNT_MGMT_TX_BYTES: transmitted routing protocol traffic bytes counter + * @BATADV_CNT_MGMT_RX: received routing protocol traffic packet counter + * @BATADV_CNT_MGMT_RX_BYTES: received routing protocol traffic bytes counter + * @BATADV_CNT_TT_REQUEST_TX: transmitted tt req traffic packet counter + * @BATADV_CNT_TT_REQUEST_RX: received tt req traffic packet counter + * @BATADV_CNT_TT_RESPONSE_TX: transmitted tt resp traffic packet counter + * @BATADV_CNT_TT_RESPONSE_RX: received tt resp traffic packet counter + * @BATADV_CNT_TT_ROAM_ADV_TX: transmitted tt roam traffic packet counter + * @BATADV_CNT_TT_ROAM_ADV_RX: received tt roam traffic packet counter + * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter + * @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter + * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter + * @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter + * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic packet + * counter + * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter + * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes counter + * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet counter + * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes counter + * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc decoding + * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter + * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes counter + * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic packet + * counter + * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in promisc + * mode. + * @BATADV_CNT_NUM: number of traffic counters + */ enum batadv_counters { BATADV_CNT_TX, BATADV_CNT_TX_BYTES, @@ -186,20 +313,40 @@ enum batadv_counters { BATADV_CNT_DAT_PUT_RX, BATADV_CNT_DAT_CACHED_REPLY_TX, #endif +#ifdef CONFIG_BATMAN_ADV_NC + BATADV_CNT_NC_CODE, + BATADV_CNT_NC_CODE_BYTES, + BATADV_CNT_NC_RECODE, + BATADV_CNT_NC_RECODE_BYTES, + BATADV_CNT_NC_BUFFER, + BATADV_CNT_NC_DECODE, + BATADV_CNT_NC_DECODE_BYTES, + BATADV_CNT_NC_DECODE_FAILED, + BATADV_CNT_NC_SNIFFED, +#endif BATADV_CNT_NUM, }; /** * struct batadv_priv_tt - per mesh interface translation table data * @vn: translation table version number + * @ogm_append_cnt: counter of number of OGMs containing the local tt diff * @local_changes: changes registered in an originator interval - * @poss_change: Detect an ongoing roaming phase. If true, then this node - * received a roaming_adv and has to inspect every packet directed to it to - * check whether it still is the true destination or not. 
This flag will be - * reset to false as soon as this node's ttvn is increased * @changes_list: tracks tt local changes within an originator interval - * @req_list: list of pending tt_requests + * @local_hash: local translation table hash table + * @global_hash: global translation table hash table + * @req_list: list of pending & unanswered tt_requests + * @roam_list: list of the last roaming events of each client limiting the + * number of roaming events to avoid route flapping + * @changes_list_lock: lock protecting changes_list + * @req_list_lock: lock protecting req_list + * @roam_list_lock: lock protecting roam_list + * @local_entry_num: number of entries in the local hash table * @local_crc: Checksum of the local table, recomputed before sending a new OGM + * @last_changeset: last tt changeset this host has generated + * @last_changeset_len: length of last tt changeset this host has generated + * @last_changeset_lock: lock protecting last_changeset & last_changeset_len + * @work: work queue callback item for translation table purging */ struct batadv_priv_tt { atomic_t vn; @@ -217,36 +364,83 @@ struct batadv_priv_tt { uint16_t local_crc; unsigned char *last_changeset; int16_t last_changeset_len; - spinlock_t last_changeset_lock; /* protects last_changeset */ + /* protects last_changeset & last_changeset_len */ + spinlock_t last_changeset_lock; struct delayed_work work; }; +/** + * struct batadv_priv_bla - per mesh interface bridge loop avoidance data + * @num_requests: number of bla requests in flight + * @claim_hash: hash table containing mesh nodes this host has claimed + * @backbone_hash: hash table containing all detected backbone gateways + * @bcast_duplist: recently received broadcast packets array (for broadcast + * duplicate suppression) + * @bcast_duplist_curr: index of last broadcast packet added to bcast_duplist + * @bcast_duplist_lock: lock protecting bcast_duplist & bcast_duplist_curr + * @claim_dest: local claim data (e.g. claim group) + * @work: work queue callback item for cleanups & bla announcements + */ #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_priv_bla { - atomic_t num_requests; /* number of bla requests in flight */ + atomic_t num_requests; struct batadv_hashtable *claim_hash; struct batadv_hashtable *backbone_hash; struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE]; int bcast_duplist_curr; - /* protects bcast_duplist and bcast_duplist_curr */ + /* protects bcast_duplist & bcast_duplist_curr */ spinlock_t bcast_duplist_lock; struct batadv_bla_claim_dst claim_dest; struct delayed_work work; }; #endif +/** + * struct batadv_priv_debug_log - debug logging data + * @log_buff: buffer holding the logs (ring buffer) + * @log_start: index of next character to read + * @log_end: index of next character to write + * @lock: lock protecting log_buff, log_start & log_end + * @queue_wait: log reader's wait queue + */ +#ifdef CONFIG_BATMAN_ADV_DEBUG +struct batadv_priv_debug_log { + char log_buff[BATADV_LOG_BUF_LEN]; + unsigned long log_start; + unsigned long log_end; + spinlock_t lock; /* protects log_buff, log_start and log_end */ + wait_queue_head_t queue_wait; +}; +#endif + +/** + * struct batadv_priv_gw - per mesh interface gateway data + * @list: list of available gateway nodes + * @list_lock: lock protecting gw_list & curr_gw + * @curr_gw: pointer to currently selected gateway node + * @reselect: bool indicating a gateway re-selection is in progress + */ struct batadv_priv_gw { struct hlist_head list; - spinlock_t list_lock; /* protects gw_list and curr_gw */ + spinlock_t list_lock; /* protects gw_list & curr_gw */ struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */ atomic_t reselect; }; +/** + * struct batadv_priv_vis - per mesh interface vis data + * @send_list: list of batadv_vis_info packets to be sent + * @hash: hash table containing vis data from other nodes in the network + * @hash_lock: lock protecting the hash table + * @list_lock: lock protecting my_info::recv_list + * @work: work queue callback item for vis packet sending + * @my_info: holds this node's vis data sent on a regular basis */ struct batadv_priv_vis { struct list_head send_list; struct batadv_hashtable *hash; spinlock_t hash_lock; /* protects hash */ - spinlock_t list_lock; /* protects info::recv_list */ + spinlock_t list_lock; /* protects my_info::recv_list */ struct delayed_work work; struct batadv_vis_info *my_info; }; @@ -265,30 +459,109 @@ struct batadv_priv_dat { }; #endif
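
Several of the structures here (batadv_priv_tt, batadv_priv_bla, batadv_priv_vis) embed a struct delayed_work, and the hunks at the top of this section drop the batadv_tt_start_timer()/batadv_start_vis_timer() helpers in favour of open-coded INIT_DELAYED_WORK() plus queue_delayed_work(), with the handler re-arming itself. A minimal sketch of that self-rearming pattern; foo_priv and the period constant are illustrative, not taken from this patch:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define FOO_WORK_PERIOD_MS 1000

struct foo_priv {
	struct delayed_work work;
};

static void foo_periodic(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct foo_priv *priv = container_of(dwork, struct foo_priv, work);

	/* ... periodic purging would happen here ... */

	/* re-arm from inside the handler, as batadv_tt_purge() and
	 * batadv_send_vis_packets() now do directly
	 */
	queue_delayed_work(system_wq, &priv->work,
			   msecs_to_jiffies(FOO_WORK_PERIOD_MS));
}

static void foo_init(struct foo_priv *priv)
{
	INIT_DELAYED_WORK(&priv->work, foo_periodic);
	queue_delayed_work(system_wq, &priv->work,
			   msecs_to_jiffies(FOO_WORK_PERIOD_MS));
}

static void foo_cleanup(struct foo_priv *priv)
{
	/* cancels a pending run and waits for a running one; safe
	 * against the self-requeueing handler above
	 */
	cancel_delayed_work_sync(&priv->work);
}
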
+/** + * struct batadv_priv_nc - per mesh interface network coding private data + * @work: work queue callback item for cleanup + * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs + * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq + * @max_fwd_delay: maximum packet forward delay to allow coding of packets + * @max_buffer_time: buffer time for sniffed packets used for decoding + * @timestamp_fwd_flush: timestamp of last forward packet queue flush + * @timestamp_sniffed_purge: timestamp of last sniffed packet queue purge + * @coding_hash: Hash table used to buffer skbs while waiting for another + * incoming skb to code it with. Skbs are added to the buffer just before being + * forwarded in routing.c + * @decoding_hash: Hash table used to buffer skbs that might be needed to decode + * a received coded skb. The buffer is used for 1) skbs arriving on the + * soft-interface; 2) skbs overheard on the hard-interface; and 3) skbs + * forwarded by batman-adv. + */ +struct batadv_priv_nc { + struct delayed_work work; + struct dentry *debug_dir; + u8 min_tq; + u32 max_fwd_delay; + u32 max_buffer_time; + unsigned long timestamp_fwd_flush; + unsigned long timestamp_sniffed_purge; + struct batadv_hashtable *coding_hash; + struct batadv_hashtable *decoding_hash; +};
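
The comment blocks added above and below follow the kernel-doc convention: a /** opener directly above the definition, one @member: line per field, and continuation lines aligned under their tag. In miniature (a hypothetical struct, not part of this patch):

#include <linux/types.h>
#include <linux/if_ether.h>
#include <linux/atomic.h>

/**
 * struct foo_entry - one tracked peer
 * @addr: mac address of the peer
 * @last_seen: jiffies timestamp of the last packet from @addr, used for
 *  purging stale entries
 * @refcount: number of contexts the object is used
 */
struct foo_entry {
	uint8_t addr[ETH_ALEN];
	unsigned long last_seen;
	atomic_t refcount;
};
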
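struct batadv_priv below keeps its traffic statistics in @bat_counters, a __percpu array with one u64 slot per index of enum batadv_counters, so the hot path can count without locks or cache-line bouncing. Roughly how such counters are allocated, bumped, and folded for reporting; the foo_* names are illustrative, not the batman-adv helpers themselves:

#include <linux/percpu.h>
#include <linux/cpumask.h>

/* one u64 slot per counter index, replicated for every CPU */
static u64 __percpu *foo_alloc_counters(size_t num_counters)
{
	return __alloc_percpu(num_counters * sizeof(u64),
			      __alignof__(u64));
}

/* lock-less: only this CPU's replica is touched */
static void foo_add_counter(u64 __percpu *counters, size_t idx, size_t count)
{
	this_cpu_add(counters[idx], count);
}

/* fold all per-CPU replicas into a single total for reporting */
static u64 foo_sum_counter(u64 __percpu *counters, size_t idx)
{
	u64 sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(counters, cpu)[idx];

	return sum;
}
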
+/** + * struct batadv_priv - per mesh interface data + * @mesh_state: current status of the mesh (inactive/active/deactivating) + * @soft_iface: net device which holds this struct as private data + * @stats: structure holding the data for the ndo_get_stats() call + * @bat_counters: mesh internal traffic statistic counters (see batadv_counters) + * @aggregated_ogms: bool indicating whether OGM aggregation is enabled + * @bonding: bool indicating whether traffic bonding is enabled + * @fragmentation: bool indicating whether traffic fragmentation is enabled + * @ap_isolation: bool indicating whether ap isolation is enabled + * @bridge_loop_avoidance: bool indicating whether bridge loop avoidance is + * enabled + * @distributed_arp_table: bool indicating whether distributed ARP table is + * enabled + * @vis_mode: vis operation: client or server (see batadv_vis_packettype) + * @gw_mode: gateway operation: off, client or server (see batadv_gw_modes) + * @gw_sel_class: gateway selection class (applies if gw_mode client) + * @gw_bandwidth: gateway announced bandwidth (applies if gw_mode server) + * @orig_interval: OGM broadcast interval in milliseconds + * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop + * @log_level: configured log level (see batadv_dbg_level) + * @bcast_seqno: last sent broadcast packet sequence number + * @bcast_queue_left: number of remaining buffered broadcast packet slots + * @batman_queue_left: number of remaining OGM packet slots + * @num_ifaces: number of interfaces assigned to this mesh interface + * @mesh_obj: kobject for sysfs mesh subdirectory + * @debug_dir: dentry for debugfs batman-adv subdirectory + * @forw_bat_list: list of aggregated OGMs that will be forwarded + * @forw_bcast_list: list of broadcast packets that will be rebroadcasted + * @orig_hash: hash table containing mesh participants (orig nodes) + * @forw_bat_list_lock: lock protecting forw_bat_list + * @forw_bcast_list_lock: lock protecting forw_bcast_list + * @orig_work: work queue callback item for orig node purging + * @cleanup_work: work queue callback item for soft interface deinit + * @primary_if: one of the hard interfaces assigned to this mesh interface + * becomes the primary interface + * @bat_algo_ops: routing algorithm used by this mesh interface + * @bla: bridge loop avoidance data + * @debug_log: debug logging data + * @gw: gateway data + * @tt: translation table data + * @vis: vis data + * @dat: distributed arp table data + * @network_coding: bool indicating whether network coding is enabled + * @nc: network coding data + */ struct batadv_priv { atomic_t mesh_state; + struct net_device *soft_iface; struct net_device_stats stats; uint64_t __percpu *bat_counters; /* Per cpu counters */ - atomic_t aggregated_ogms; /* boolean */ - atomic_t bonding; /* boolean */ - atomic_t fragmentation; /* boolean */ - atomic_t ap_isolation; /* boolean */ - atomic_t bridge_loop_avoidance; /* boolean */ + atomic_t aggregated_ogms; + atomic_t bonding; + atomic_t fragmentation; + atomic_t ap_isolation; +#ifdef CONFIG_BATMAN_ADV_BLA + atomic_t bridge_loop_avoidance; +#endif #ifdef CONFIG_BATMAN_ADV_DAT - atomic_t distributed_arp_table; /* boolean */ + atomic_t distributed_arp_table; +#endif + atomic_t vis_mode; + atomic_t gw_mode; + atomic_t gw_sel_class; + atomic_t gw_bandwidth; + atomic_t orig_interval; + atomic_t hop_penalty; +#ifdef CONFIG_BATMAN_ADV_DEBUG + atomic_t log_level; +#endif - atomic_t vis_mode; /* VIS_TYPE_* */ - atomic_t gw_mode; /* GW_MODE_* */ - atomic_t gw_sel_class; /* uint */ - atomic_t gw_bandwidth; /* gw bandwidth */ - atomic_t orig_interval; /* uint */ - atomic_t hop_penalty; /* uint */ - atomic_t log_level; /* uint */ atomic_t bcast_seqno; atomic_t bcast_queue_left; atomic_t batman_queue_left; char num_ifaces; - struct batadv_debug_log *debug_log; struct kobject *mesh_obj; struct dentry *debug_dir; struct hlist_head forw_bat_list; @@ -297,34 +570,118 @@ struct batadv_priv { spinlock_t forw_bat_list_lock; /* protects forw_bat_list */ spinlock_t forw_bcast_list_lock; /* protects forw_bcast_list */ struct delayed_work orig_work; + struct work_struct cleanup_work; struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ struct batadv_algo_ops *bat_algo_ops; #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_priv_bla bla; #endif +#ifdef CONFIG_BATMAN_ADV_DEBUG + struct batadv_priv_debug_log *debug_log; +#endif struct batadv_priv_gw gw; struct batadv_priv_tt tt; struct batadv_priv_vis vis; #ifdef CONFIG_BATMAN_ADV_DAT struct batadv_priv_dat dat; #endif +#ifdef CONFIG_BATMAN_ADV_NC + atomic_t network_coding; + struct batadv_priv_nc nc; +#endif /* CONFIG_BATMAN_ADV_NC */ }; +/** + * struct batadv_socket_client - layer2 icmp socket client data + * @queue_list: packet queue for packets destined for this socket client + * @queue_len: number of packets in the packet queue (queue_list) + * @index: socket client's index in the batadv_socket_client_hash + * @lock: lock protecting queue_list, queue_len & index + * @queue_wait: socket client's wait queue + * @bat_priv: pointer to soft_iface this client belongs to + */ struct batadv_socket_client { struct list_head queue_list; unsigned int queue_len; unsigned char index; - spinlock_t lock; /* protects queue_list, queue_len, index */ + spinlock_t lock; /* protects queue_list, queue_len & index */ wait_queue_head_t queue_wait; struct batadv_priv *bat_priv; }; +/** + * struct batadv_socket_packet - layer2 icmp packet for socket client + * @list: list node for batadv_socket_client::queue_list + * @icmp_len: size of the layer2 icmp packet + * @icmp_packet: layer2 icmp packet + */ struct batadv_socket_packet { struct list_head list; size_t icmp_len; struct batadv_icmp_packet_rr icmp_packet; }; +/** + * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN + * @orig: originator address of backbone node (mac address of primary iface) + * @vid: vlan id this gateway was detected on + * @hash_entry: hlist node for batadv_priv_bla::backbone_hash + * @bat_priv: pointer to soft_iface this backbone gateway belongs to + * @lasttime: last time we heard of this backbone gw + * @wait_periods: grace time for bridge forward delays and bla group forming at + * bootup phase - no bcast traffic is forwarded until it has elapsed + * @request_sent: if this bool is set to true we are out of sync with this + * backbone gateway - no bcast traffic is forwarded until the situation is + * resolved + * @crc: crc16 checksum over all claims + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ +#ifdef CONFIG_BATMAN_ADV_BLA +struct batadv_bla_backbone_gw { + uint8_t orig[ETH_ALEN]; + short vid; + struct hlist_node hash_entry; +
struct batadv_priv *bat_priv; + unsigned long lasttime; + atomic_t wait_periods; + atomic_t request_sent; + uint16_t crc; + atomic_t refcount; + struct rcu_head rcu; +}; + +/** + * struct batadv_bla_claim - claimed non-mesh client structure + * @addr: mac address of claimed non-mesh client + * @vid: vlan id this client was detected on + * @batadv_bla_backbone_gw: pointer to backbone gw claiming this client + * @lasttime: last time we heard of claim (locals only) + * @hash_entry: hlist node for batadv_priv_bla::claim_hash + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ +struct batadv_bla_claim { + uint8_t addr[ETH_ALEN]; + short vid; + struct batadv_bla_backbone_gw *backbone_gw; + unsigned long lasttime; + struct hlist_node hash_entry; + struct rcu_head rcu; + atomic_t refcount; +}; +#endif + +/** + * struct batadv_tt_common_entry - tt local & tt global common data + * @addr: mac address of non-mesh client + * @hash_entry: hlist node for batadv_priv_tt::local_hash or for + * batadv_priv_tt::global_hash + * @flags: various state handling flags (see batadv_tt_client_flags) + * @added_at: timestamp used for purging stale tt common entries + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ struct batadv_tt_common_entry { uint8_t addr[ETH_ALEN]; struct hlist_node hash_entry; @@ -334,62 +691,76 @@ struct batadv_tt_common_entry { struct rcu_head rcu; }; +/** + * struct batadv_tt_local_entry - translation table local entry data + * @common: general translation table data + * @last_seen: timestamp used for purging stale tt local entries + */ struct batadv_tt_local_entry { struct batadv_tt_common_entry common; unsigned long last_seen; }; +/** + * struct batadv_tt_global_entry - translation table global entry data + * @common: general translation table data + * @orig_list: list of orig nodes announcing this non-mesh client + * @list_lock: lock protecting orig_list + * @roam_at: time at which TT_GLOBAL_ROAM was set + */ struct batadv_tt_global_entry { struct batadv_tt_common_entry common; struct hlist_head orig_list; - spinlock_t list_lock; /* protects the list */ - unsigned long roam_at; /* time at which TT_GLOBAL_ROAM was set */ + spinlock_t list_lock; /* protects orig_list */ + unsigned long roam_at; }; +/** + * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client + * @orig_node: pointer to orig node announcing this non-mesh client + * @ttvn: translation table version number which added the non-mesh client + * @list: list node for batadv_tt_global_entry::orig_list + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; uint8_t ttvn; - atomic_t refcount; - struct rcu_head rcu; struct hlist_node list; -}; - -#ifdef CONFIG_BATMAN_ADV_BLA -struct batadv_backbone_gw { - uint8_t orig[ETH_ALEN]; - short vid; /* used VLAN ID */ - struct hlist_node hash_entry; - struct batadv_priv *bat_priv; - unsigned long lasttime; /* last time we heard of this backbone gw */ - atomic_t wait_periods; - atomic_t request_sent; atomic_t refcount; struct rcu_head rcu; - uint16_t crc; /* crc checksum over all claims */ }; -struct batadv_claim { - uint8_t addr[ETH_ALEN]; - short vid; - struct batadv_backbone_gw *backbone_gw; - unsigned long lasttime; /* last time we heard of claim (locals only) */ - struct rcu_head rcu; - atomic_t refcount; - struct 
hlist_node hash_entry; -}; -#endif - +/** + * struct batadv_tt_change_node - structure for tt changes that occurred + * @list: list node for batadv_priv_tt::changes_list + * @change: holds the actual translation table diff data + */ struct batadv_tt_change_node { struct list_head list; struct batadv_tt_change change; }; +/** + * struct batadv_tt_req_node - data to keep track of the tt requests in flight + * @addr: mac address of the originator this request was sent to + * @issued_at: timestamp used for purging stale tt requests + * @list: list node for batadv_priv_tt::req_list + */ struct batadv_tt_req_node { uint8_t addr[ETH_ALEN]; unsigned long issued_at; struct list_head list; }; +/** + * struct batadv_tt_roam_node - roaming client data + * @addr: mac address of the client in the roaming phase + * @counter: number of allowed roaming events per client within a single + * OGM interval (changes are committed with each OGM) + * @first_time: timestamp used for purging stale roaming node entries + * @list: list node for batadv_priv_tt::roam_list + */ struct batadv_tt_roam_node { uint8_t addr[ETH_ALEN]; atomic_t counter; @@ -397,8 +768,88 @@ struct batadv_tt_roam_node { struct list_head list; }; -/* forw_packet - structure for forw_list maintaining packets to be - * send/forwarded +/** + * struct batadv_nc_node - network coding node + * @list: next and prev pointer for the list handling + * @addr: the node's mac address + * @refcount: number of contexts the object is used by + * @rcu: struct used for freeing in an RCU-safe manner + * @orig_node: pointer to corresponding orig node struct + * @last_seen: timestamp of last ogm received from this node + */ +struct batadv_nc_node { + struct list_head list; + uint8_t addr[ETH_ALEN]; + atomic_t refcount; + struct rcu_head rcu; + struct batadv_orig_node *orig_node; + unsigned long last_seen; +}; + +/** + * struct batadv_nc_path - network coding path + * @hash_entry: next and prev pointer for the list handling + * @rcu: struct used for freeing in an RCU-safe manner + * @refcount: number of contexts the object is used by + * @packet_list: list of buffered packets for this path + * @packet_list_lock: access lock for packet list + * @next_hop: next hop (destination) of path + * @prev_hop: previous hop (source) of path + * @last_valid: timestamp for last validation of path + */ +struct batadv_nc_path { + struct hlist_node hash_entry; + struct rcu_head rcu; + atomic_t refcount; + struct list_head packet_list; + spinlock_t packet_list_lock; /* Protects packet_list */ + uint8_t next_hop[ETH_ALEN]; + uint8_t prev_hop[ETH_ALEN]; + unsigned long last_valid; +}; + +/** + * struct batadv_nc_packet - network coding packet used when coding and + * decoding packets + * @list: next and prev pointer for the list handling + * @packet_id: crc32 checksum of skb data + * @timestamp: field containing the info when the packet was added to path + * @neigh_node: pointer to original next hop neighbor of skb + * @skb: skb which can be encoded or used for decoding + * @nc_path: pointer to path this nc packet is attached to + */ +struct batadv_nc_packet { + struct list_head list; + __be32 packet_id; + unsigned long timestamp; + struct batadv_neigh_node *neigh_node; + struct sk_buff *skb; + struct batadv_nc_path *nc_path; +}; +
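
batadv_nc_node and batadv_nc_path above combine an atomic_t refcount with a struct rcu_head, the same lifetime scheme the existing tt and orig structures use: lookups under rcu_read_lock() take a reference only while the count is still non-zero, and the final put defers the kfree() past a grace period so concurrent RCU walkers never touch freed memory. A generic sketch of the pattern; foo_node is illustrative, not from this patch:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo_node {
	struct hlist_node hash_entry;
	atomic_t refcount;
	struct rcu_head rcu;
};

/* called under rcu_read_lock(): must not revive an object whose
 * count has already dropped to zero
 */
static struct foo_node *foo_node_get(struct foo_node *node)
{
	if (!atomic_inc_not_zero(&node->refcount))
		return NULL;
	return node;
}

static void foo_node_put(struct foo_node *node)
{
	/* last put: free only after a grace period, so pending
	 * hlist_for_each_entry_rcu() walkers can finish safely
	 */
	if (atomic_dec_and_test(&node->refcount))
		kfree_rcu(node, rcu);
}
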
+/** + * batadv_skb_cb - control buffer structure used to store private data relevant + * to batman-adv in the skb->cb buffer in skbs. + * @decoded: Marks a skb as decoded, which is checked when searching for coding + * opportunities in network-coding.c + */ +struct batadv_skb_cb { + bool decoded; +}; + +/** + * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded + * @list: list node for batadv_socket_client::queue_list + * @send_time: execution time for delayed_work (packet sending) + * @own: bool for locally generated packets (local OGMs are re-scheduled after + * sending) + * @skb: bcast packet's skb buffer + * @packet_len: size of aggregated OGM packet inside the skb buffer + * @direct_link_flags: direct link flags for aggregated OGM packets + * @num_packets: counter for bcast packet retransmission + * @delayed_work: work queue callback item for packet sending + * @if_incoming: pointer to incoming hard-iface or primary iface if locally + * generated packet */ struct batadv_forw_packet { struct hlist_node list; @@ -412,72 +863,98 @@ struct batadv_forw_packet { struct batadv_hard_iface *if_incoming; }; -/* While scanning for vis-entries of a particular vis-originator - * this list collects its interfaces to create a subgraph/cluster - * out of them later +/** + * struct batadv_frag_packet_list_entry - storage for fragment packet + * @list: list node for orig_node::frag_list + * @seqno: sequence number of the fragment + * @skb: fragment's skb buffer */ -struct batadv_if_list_entry { - uint8_t addr[ETH_ALEN]; - bool primary; - struct hlist_node list; -}; - -struct batadv_debug_log { - char log_buff[BATADV_LOG_BUF_LEN]; - unsigned long log_start; - unsigned long log_end; - spinlock_t lock; /* protects log_buff, log_start and log_end */ - wait_queue_head_t queue_wait; -}; - struct batadv_frag_packet_list_entry { struct list_head list; uint16_t seqno; struct sk_buff *skb; }; +/** + * struct batadv_vis_info - local data for vis information + * @first_seen: timestamp used for purging stale vis info entries + * @recv_list: List of server-neighbors we have received this packet from. This + * packet should not be re-forwarded to them again. List elements are struct + * batadv_vis_recvlist_node + * @send_list: list of packets to be forwarded + * @refcount: number of contexts the object is used + * @hash_entry: hlist node for batadv_priv_vis::hash + * @bat_priv: pointer to soft_iface this orig node belongs to + * @skb_packet: contains the vis packet + */ struct batadv_vis_info { unsigned long first_seen; - /* list of server-neighbors we received a vis-packet - * from. we should not reply to them. - */ struct list_head recv_list; struct list_head send_list; struct kref refcount; struct hlist_node hash_entry; struct batadv_priv *bat_priv; - /* this packet might be part of the vis send queue. */ struct sk_buff *skb_packet; - /* vis_info may follow here */ } __packed; +/** + * struct batadv_vis_info_entry - contains link information for vis + * @src: source MAC of the link, all zero for local TT entry + * @dest: destination MAC of the link, client mac address for local TT entry + * @quality: transmission quality of the link, or 0 for local TT entry + */ struct batadv_vis_info_entry { uint8_t src[ETH_ALEN]; uint8_t dest[ETH_ALEN]; - uint8_t quality; /* quality = 0 client */ + uint8_t quality; } __packed; -struct batadv_recvlist_node { +/** + * struct batadv_vis_recvlist_node - list entry for batadv_vis_info::recv_list + * @list: list node for batadv_vis_info::recv_list + * @mac: MAC address of the originator from where the vis_info was received + */ +struct batadv_vis_recvlist_node { struct list_head list; uint8_t mac[ETH_ALEN]; }; +/** + * struct batadv_vis_if_list_entry - auxiliary data for vis data generation + * @addr: MAC address of the interface + * @primary: true if this interface is the primary interface + * @list: list node for the interface list + * + * While scanning for vis-entries of a particular vis-originator + * this list collects its interfaces to create a subgraph/cluster + * out of them later + */ struct batadv_vis_if_list_entry { uint8_t addr[ETH_ALEN]; bool primary; struct hlist_node list; }; +/** + * struct batadv_algo_ops - mesh algorithm callbacks + * @list: list node for the batadv_algo_list + * @name: name of the algorithm + * @bat_iface_enable: init routing info when hard-interface is enabled + * @bat_iface_disable: de-init routing info when hard-interface is disabled + * @bat_iface_update_mac: (re-)init mac addresses of the protocol information + * belonging to this hard-interface + * @bat_primary_iface_set: called when primary interface is selected / changed + * @bat_ogm_schedule: prepare a new outgoing OGM for the send queue + * @bat_ogm_emit: send scheduled OGM + */ struct batadv_algo_ops { struct hlist_node list; char *name; - /* init routing info when hard-interface is enabled */ int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface); - /* de-init routing info when hard-interface is disabled */ void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface); - /* (re-)init mac addresses of the protocol information - * belonging to this hard-interface - */ void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface); - /* called when primary interface is selected / changed */ void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface); - /* prepare a new outgoing OGM for the send queue */ void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); - /* send scheduled OGM */ void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet); }; diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c index 10aff49fcf25..0bb3b5982f94 100644 --- a/net/batman-adv/unicast.c +++ b/net/batman-adv/unicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2013 B.A.T.M.A.N.
contributors: * * Andreas Langer * @@ -122,7 +122,7 @@ batadv_frag_search_packet(struct list_head *head, { struct batadv_frag_packet_list_entry *tfp; struct batadv_unicast_frag_packet *tmp_up = NULL; - int is_head_tmp, is_head; + bool is_head_tmp, is_head; uint16_t search_seqno; if (up->flags & BATADV_UNI_FRAG_HEAD) @@ -130,10 +130,9 @@ batadv_frag_search_packet(struct list_head *head, else search_seqno = ntohs(up->seqno)-1; - is_head = !!(up->flags & BATADV_UNI_FRAG_HEAD); + is_head = up->flags & BATADV_UNI_FRAG_HEAD; list_for_each_entry(tfp, head, list) { - if (!tfp->skb) continue; @@ -143,7 +142,7 @@ batadv_frag_search_packet(struct list_head *head, tmp_up = (struct batadv_unicast_frag_packet *)tfp->skb->data; if (tfp->seqno == search_seqno) { - is_head_tmp = !!(tmp_up->flags & BATADV_UNI_FRAG_HEAD); + is_head_tmp = tmp_up->flags & BATADV_UNI_FRAG_HEAD; if (is_head_tmp != is_head) return tfp; else @@ -162,7 +161,6 @@ void batadv_frag_list_free(struct list_head *head) struct batadv_frag_packet_list_entry *pf, *tmp_pf; if (!list_empty(head)) { - list_for_each_entry_safe(pf, tmp_pf, head, list) { kfree_skb(pf->skb); list_del(&pf->list); diff --git a/net/batman-adv/unicast.h b/net/batman-adv/unicast.h index 61abba58bd8f..429cf8a4a31e 100644 --- a/net/batman-adv/unicast.h +++ b/net/batman-adv/unicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2013 B.A.T.M.A.N. contributors: * * Andreas Langer * diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c index 0f65a9de5f74..1625e5793a89 100644 --- a/net/batman-adv/vis.c +++ b/net/batman-adv/vis.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2008-2013 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -28,14 +28,15 @@ #define BATADV_MAX_VIS_PACKET_SIZE 1000 -static void batadv_start_vis_timer(struct batadv_priv *bat_priv); +/* hash class keys */ +static struct lock_class_key batadv_vis_hash_lock_class_key; /* free the info */ static void batadv_free_info(struct kref *ref) { struct batadv_vis_info *info; struct batadv_priv *bat_priv; - struct batadv_recvlist_node *entry, *tmp; + struct batadv_vis_recvlist_node *entry, *tmp; info = container_of(ref, struct batadv_vis_info, refcount); bat_priv = info->bat_priv; @@ -96,7 +97,6 @@ batadv_vis_hash_find(struct batadv_priv *bat_priv, const void *data) { struct batadv_hashtable *hash = bat_priv->vis.hash; struct hlist_head *head; - struct hlist_node *node; struct batadv_vis_info *vis_info, *vis_info_tmp = NULL; uint32_t index; @@ -107,8 +107,8 @@ batadv_vis_hash_find(struct batadv_priv *bat_priv, const void *data) head = &hash->table[index]; rcu_read_lock(); - hlist_for_each_entry_rcu(vis_info, node, head, hash_entry) { - if (!batadv_vis_info_cmp(node, data)) + hlist_for_each_entry_rcu(vis_info, head, hash_entry) { + if (!batadv_vis_info_cmp(&vis_info->hash_entry, data)) continue; vis_info_tmp = vis_info; @@ -126,10 +126,9 @@ static void batadv_vis_data_insert_interface(const uint8_t *interface, struct hlist_head *if_list, bool primary) { - struct batadv_if_list_entry *entry; - struct hlist_node *pos; + struct batadv_vis_if_list_entry *entry; - hlist_for_each_entry(entry, pos, if_list, list) { + hlist_for_each_entry(entry, if_list, list) { if (batadv_compare_eth(entry->addr, interface)) return; } @@ -146,12 +145,11 @@ static void batadv_vis_data_insert_interface(const uint8_t *interface, static void batadv_vis_data_read_prim_sec(struct seq_file *seq, const struct hlist_head *if_list) { - struct 
batadv_if_list_entry *entry; - struct hlist_node *pos; + struct batadv_vis_if_list_entry *entry; - hlist_for_each_entry(entry, pos, if_list, list) { + hlist_for_each_entry(entry, if_list, list) { if (entry->primary) - seq_printf(seq, "PRIMARY, "); + seq_puts(seq, "PRIMARY, "); else seq_printf(seq, "SEC %pM, ", entry->addr); } @@ -196,10 +194,9 @@ static void batadv_vis_data_read_entries(struct seq_file *seq, struct batadv_vis_info_entry *entries) { int i; - struct batadv_if_list_entry *entry; - struct hlist_node *pos; + struct batadv_vis_if_list_entry *entry; - hlist_for_each_entry(entry, pos, list, list) { + hlist_for_each_entry(entry, list, list) { seq_printf(seq, "%pM,", entry->addr); for (i = 0; i < packet->entries; i++) @@ -210,24 +207,23 @@ static void batadv_vis_data_read_entries(struct seq_file *seq, if (batadv_compare_eth(entry->addr, packet->vis_orig)) batadv_vis_data_read_prim_sec(seq, list); - seq_printf(seq, "\n"); + seq_puts(seq, "\n"); } } static void batadv_vis_seq_print_text_bucket(struct seq_file *seq, const struct hlist_head *head) { - struct hlist_node *node; struct batadv_vis_info *info; struct batadv_vis_packet *packet; uint8_t *entries_pos; struct batadv_vis_info_entry *entries; - struct batadv_if_list_entry *entry; - struct hlist_node *pos, *n; + struct batadv_vis_if_list_entry *entry; + struct hlist_node *n; HLIST_HEAD(vis_if_list); - hlist_for_each_entry_rcu(info, node, head, hash_entry) { + hlist_for_each_entry_rcu(info, head, hash_entry) { packet = (struct batadv_vis_packet *)info->skb_packet->data; entries_pos = (uint8_t *)packet + sizeof(*packet); entries = (struct batadv_vis_info_entry *)entries_pos; @@ -239,7 +235,7 @@ static void batadv_vis_seq_print_text_bucket(struct seq_file *seq, batadv_vis_data_read_entries(seq, &vis_if_list, packet, entries); - hlist_for_each_entry_safe(entry, pos, n, &vis_if_list, list) { + hlist_for_each_entry_safe(entry, n, &vis_if_list, list) { hlist_del(&entry->list); kfree(entry); } @@ -304,7 +300,7 @@ static void batadv_send_list_del(struct batadv_vis_info *info) static void batadv_recv_list_add(struct batadv_priv *bat_priv, struct list_head *recv_list, const char *mac) { - struct batadv_recvlist_node *entry; + struct batadv_vis_recvlist_node *entry; entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) @@ -321,7 +317,7 @@ static int batadv_recv_list_is_in(struct batadv_priv *bat_priv, const struct list_head *recv_list, const char *mac) { - const struct batadv_recvlist_node *entry; + const struct batadv_vis_recvlist_node *entry; spin_lock_bh(&bat_priv->vis.list_lock); list_for_each_entry(entry, recv_list, list) { @@ -481,7 +477,7 @@ void batadv_receive_client_update_packet(struct batadv_priv *bat_priv, /* Are we the target for this VIS packet? */ if (vis_server == BATADV_VIS_TYPE_SERVER_SYNC && - batadv_is_my_mac(vis_packet->target_orig)) + batadv_is_my_mac(bat_priv, vis_packet->target_orig)) are_target = 1; spin_lock_bh(&bat_priv->vis.hash_lock); @@ -500,7 +496,7 @@ void batadv_receive_client_update_packet(struct batadv_priv *bat_priv, batadv_send_list_add(bat_priv, info); /* ... we're not the recipient (and thus need to forward). 
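
Most of the mechanical churn in vis.c here (and in translation-table.c earlier) is the hlist iterator API change of this kernel generation: the macros now derive the next element from the entry itself, so the separate struct hlist_node cursor argument and its local variable disappear, and deletions name the embedded node explicitly. New-style usage in a self-contained form; foo_entry is illustrative, not from this patch:

#include <linux/rculist.h>

struct foo_entry {
	struct hlist_node hash_entry;
	int key;
};

static struct foo_entry *foo_find_rcu(struct hlist_head *head, int key)
{
	struct foo_entry *entry;

	/* three arguments now: entry cursor, head, member name */
	hlist_for_each_entry_rcu(entry, head, hash_entry) {
		if (entry->key == key)
			return entry;
	}

	return NULL;
}

static void foo_del(struct foo_entry *entry)
{
	/* no cursor to pass: deletion takes the embedded node */
	hlist_del_rcu(&entry->hash_entry);
}
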
*/ - } else if (!batadv_is_my_mac(packet->target_orig)) { + } else if (!batadv_is_my_mac(bat_priv, packet->target_orig)) { batadv_send_list_add(bat_priv, info); } @@ -518,7 +514,6 @@ static int batadv_find_best_vis_server(struct batadv_priv *bat_priv, { struct batadv_hashtable *hash = bat_priv->orig_hash; struct batadv_neigh_node *router; - struct hlist_node *node; struct hlist_head *head; struct batadv_orig_node *orig_node; struct batadv_vis_packet *packet; @@ -531,7 +526,7 @@ static int batadv_find_best_vis_server(struct batadv_priv *bat_priv, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { router = batadv_orig_node_get_router(orig_node); if (!router) continue; @@ -570,7 +565,6 @@ static bool batadv_vis_packet_full(const struct batadv_vis_info *info) static int batadv_generate_vis_packet(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node; struct hlist_head *head; struct batadv_orig_node *orig_node; struct batadv_neigh_node *router; @@ -604,7 +598,7 @@ static int batadv_generate_vis_packet(struct batadv_priv *bat_priv) head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { router = batadv_orig_node_get_router(orig_node); if (!router) continue; @@ -643,7 +637,7 @@ next: head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(tt_common_entry, node, head, + hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { packet_pos = skb_put(info->skb_packet, sizeof(*entry)); entry = (struct batadv_vis_info_entry *)packet_pos; @@ -672,14 +666,14 @@ static void batadv_purge_vis_packets(struct batadv_priv *bat_priv) { uint32_t i; struct batadv_hashtable *hash = bat_priv->vis.hash; - struct hlist_node *node, *node_tmp; + struct hlist_node *node_tmp; struct hlist_head *head; struct batadv_vis_info *info; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; - hlist_for_each_entry_safe(info, node, node_tmp, + hlist_for_each_entry_safe(info, node_tmp, head, hash_entry) { /* never purge own data. */ if (info == bat_priv->vis.my_info) @@ -687,7 +681,7 @@ static void batadv_purge_vis_packets(struct batadv_priv *bat_priv) if (batadv_has_timed_out(info->first_seen, BATADV_VIS_TIMEOUT)) { - hlist_del(node); + hlist_del(&info->hash_entry); batadv_send_list_del(info); kref_put(&info->refcount, batadv_free_info); } @@ -699,7 +693,6 @@ static void batadv_broadcast_vis_packet(struct batadv_priv *bat_priv, struct batadv_vis_info *info) { struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_node *node; struct hlist_head *head; struct batadv_orig_node *orig_node; struct batadv_vis_packet *packet; @@ -714,7 +707,7 @@ static void batadv_broadcast_vis_packet(struct batadv_priv *bat_priv, head = &hash->table[i]; rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { /* if it's a vis server and reachable, send it. */ if (!(orig_node->flags & BATADV_VIS_SERVER)) continue; @@ -827,7 +820,9 @@ static void batadv_send_vis_packets(struct work_struct *work) kref_put(&info->refcount, batadv_free_info); } spin_unlock_bh(&bat_priv->vis.hash_lock); - batadv_start_vis_timer(bat_priv); + + queue_delayed_work(batadv_event_workqueue, &bat_priv->vis.work, + msecs_to_jiffies(BATADV_VIS_INTERVAL)); } /* init the vis server. 
this may only be called when if_list is already @@ -852,6 +847,9 @@ int batadv_vis_init(struct batadv_priv *bat_priv) goto err; } + batadv_hash_set_lock_class(bat_priv->vis.hash, + &batadv_vis_hash_lock_class_key); + bat_priv->vis.my_info = kmalloc(BATADV_MAX_VIS_PACKET_SIZE, GFP_ATOMIC); if (!bat_priv->vis.my_info) goto err; @@ -894,7 +892,11 @@ int batadv_vis_init(struct batadv_priv *bat_priv) } spin_unlock_bh(&bat_priv->vis.hash_lock); - batadv_start_vis_timer(bat_priv); + + INIT_DELAYED_WORK(&bat_priv->vis.work, batadv_send_vis_packets); + queue_delayed_work(batadv_event_workqueue, &bat_priv->vis.work, + msecs_to_jiffies(BATADV_VIS_INTERVAL)); + return 0; free_info: @@ -931,11 +933,3 @@ void batadv_vis_quit(struct batadv_priv *bat_priv) bat_priv->vis.my_info = NULL; spin_unlock_bh(&bat_priv->vis.hash_lock); } - -/* schedule packets for (re)transmission */ -static void batadv_start_vis_timer(struct batadv_priv *bat_priv) -{ - INIT_DELAYED_WORK(&bat_priv->vis.work, batadv_send_vis_packets); - queue_delayed_work(batadv_event_workqueue, &bat_priv->vis.work, - msecs_to_jiffies(BATADV_VIS_INTERVAL)); -} diff --git a/net/batman-adv/vis.h b/net/batman-adv/vis.h index 873282fa86da..ad92b0e3c230 100644 --- a/net/batman-adv/vis.h +++ b/net/batman-adv/vis.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2012 B.A.T.M.A.N. contributors: +/* Copyright (C) 2008-2013 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 2f67d5ecc907..17f33a62f6db 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -290,7 +290,7 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, goto done; } - mgr->state = READ_LOC_AMP_INFO; + set_bit(READ_LOC_AMP_INFO, &mgr->state); hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); done: @@ -397,13 +397,12 @@ static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb, if (ctrl) { u8 *assoc; - assoc = kzalloc(assoc_len, GFP_KERNEL); + assoc = kmemdup(rsp->amp_assoc, assoc_len, GFP_KERNEL); if (!assoc) { amp_ctrl_put(ctrl); return -ENOMEM; } - memcpy(assoc, rsp->amp_assoc, assoc_len); ctrl->assoc = assoc; ctrl->assoc_len = assoc_len; ctrl->assoc_rem_len = assoc_len; @@ -472,13 +471,12 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, size_t assoc_len = le16_to_cpu(hdr->len) - sizeof(*req); u8 *assoc; - assoc = kzalloc(assoc_len, GFP_KERNEL); + assoc = kmemdup(req->amp_assoc, assoc_len, GFP_KERNEL); if (!assoc) { amp_ctrl_put(ctrl); return -ENOMEM; } - memcpy(assoc, req->amp_assoc, assoc_len); ctrl->assoc = assoc; ctrl->assoc_len = assoc_len; ctrl->assoc_rem_len = assoc_len; @@ -499,8 +497,16 @@ send_rsp: if (hdev) hci_dev_put(hdev); - a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, hdr->ident, sizeof(rsp), - &rsp); + /* Reply error now and success after HCI Write Remote AMP Assoc + command complete with success status + */ + if (rsp.status != A2MP_STATUS_SUCCESS) { + a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, hdr->ident, + sizeof(rsp), &rsp); + } else { + set_bit(WRITE_REMOTE_AMP_ASSOC, &mgr->state); + mgr->ident = hdr->ident; + } skb_pull(skb, le16_to_cpu(hdr->len)); return 0; @@ -840,7 +846,7 @@ struct amp_mgr *amp_mgr_lookup_by_state(u8 state) mutex_lock(&_mgr_list_lock); list_for_each_entry(mgr, &_mgr_list, list) { - if (mgr->state == state) { + if (test_and_clear_bit(state, &mgr->state)) { amp_mgr_get(mgr); mutex_unlock(&_mgr_list_lock); return mgr; @@ -949,6 +955,32 @@ clean: kfree(req); } +void a2mp_send_create_phy_link_rsp(struct hci_dev 
*hdev, u8 status) +{ + struct amp_mgr *mgr; + struct a2mp_physlink_rsp rsp; + struct hci_conn *hs_hcon; + + mgr = amp_mgr_lookup_by_state(WRITE_REMOTE_AMP_ASSOC); + if (!mgr) + return; + + hs_hcon = hci_conn_hash_lookup_state(hdev, AMP_LINK, BT_CONNECT); + if (!hs_hcon) { + rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION; + } else { + rsp.remote_id = hs_hcon->remote_id; + rsp.status = A2MP_STATUS_SUCCESS; + } + + BT_DBG("%s mgr %p hs_hcon %p status %u", hdev->name, mgr, hs_hcon, + status); + + rsp.local_id = hdev->id; + a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, mgr->ident, sizeof(rsp), &rsp); + amp_mgr_put(mgr); +} + void a2mp_discover_amp(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 5355df63d39b..9096137c889c 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -92,23 +92,14 @@ int bt_sock_register(int proto, const struct net_proto_family *ops) } EXPORT_SYMBOL(bt_sock_register); -int bt_sock_unregister(int proto) +void bt_sock_unregister(int proto) { - int err = 0; - if (proto < 0 || proto >= BT_MAX_PROTO) - return -EINVAL; + return; write_lock(&bt_proto_lock); - - if (!bt_proto[proto]) - err = -ENOENT; - else - bt_proto[proto] = NULL; - + bt_proto[proto] = NULL; write_unlock(&bt_proto_lock); - - return err; } EXPORT_SYMBOL(bt_sock_unregister); @@ -230,6 +221,8 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, if (flags & (MSG_OOB)) return -EOPNOTSUPP; + msg->msg_namelen = 0; + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) { if (sk->sk_shutdown & RCV_SHUTDOWN) @@ -237,8 +230,6 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, return err; } - msg->msg_namelen = 0; - copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; @@ -422,7 +413,8 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock, return bt_accept_poll(sk); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= POLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? 
POLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP | POLLIN | POLLRDNORM; @@ -617,7 +609,7 @@ static int bt_seq_open(struct inode *inode, struct file *file) struct bt_sock_list *sk_list; struct bt_seq_state *s; - sk_list = PDE(inode)->data; + sk_list = PDE_DATA(inode); s = __seq_open_private(file, &bt_seq_ops, sizeof(struct bt_seq_state)); if (!s) @@ -627,35 +619,30 @@ static int bt_seq_open(struct inode *inode, struct file *file) return 0; } -int bt_procfs_init(struct module* module, struct net *net, const char *name, +static const struct file_operations bt_fops = { + .open = bt_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list* sk_list, int (* seq_show)(struct seq_file *, void *)) { - struct proc_dir_entry * pde; - sk_list->custom_seq_show = seq_show; - sk_list->fops.owner = module; - sk_list->fops.open = bt_seq_open; - sk_list->fops.read = seq_read; - sk_list->fops.llseek = seq_lseek; - sk_list->fops.release = seq_release_private; - - pde = proc_net_fops_create(net, name, 0, &sk_list->fops); - if (!pde) + if (!proc_create_data(name, 0, net->proc_net, &bt_fops, sk_list)) return -ENOMEM; - - pde->data = sk_list; - return 0; } void bt_procfs_cleanup(struct net *net, const char *name) { - proc_net_remove(net, name); + remove_proc_entry(name, net->proc_net); } #else -int bt_procfs_init(struct module* module, struct net *net, const char *name, +int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list* sk_list, int (* seq_show)(struct seq_file *, void *)) { diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 1b0d92c0643a..d459ed43c779 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -236,7 +236,7 @@ void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) cp.max_len = cpu_to_le16(hdev->amp_assoc_size); - mgr->state = READ_LOC_AMP_ASSOC; + set_bit(READ_LOC_AMP_ASSOC, &mgr->state); hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); } @@ -250,7 +250,7 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, cp.len_so_far = cpu_to_le16(0); cp.max_len = cpu_to_le16(hdev->amp_assoc_size); - mgr->state = READ_LOC_AMP_ASSOC_FINAL; + set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state); /* Read Local AMP Assoc final link information data */ hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); @@ -317,7 +317,9 @@ void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle) if (!hcon) return; - amp_write_rem_assoc_frag(hdev, hcon); + /* Send A2MP create phylink rsp when all fragments are written */ + if (amp_write_rem_assoc_frag(hdev, hcon)) + a2mp_send_create_phy_link_rsp(hdev, 0); } void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle) @@ -403,26 +405,20 @@ void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon) void amp_create_logical_link(struct l2cap_chan *chan) { + struct hci_conn *hs_hcon = chan->hs_hcon; struct hci_cp_create_accept_logical_link cp; - struct hci_conn *hcon; struct hci_dev *hdev; - BT_DBG("chan %p", chan); + BT_DBG("chan %p hs_hcon %p dst %pMR", chan, hs_hcon, chan->conn->dst); - if (!chan->hs_hcon) + if (!hs_hcon) return; hdev = hci_dev_hold(chan->hs_hcon->hdev); if (!hdev) return; - BT_DBG("chan %p dst %pMR", chan, chan->conn->dst); - - hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK, chan->conn->dst); - if (!hcon) - goto done; - - cp.phy_handle = hcon->handle; + cp.phy_handle = hs_hcon->handle; cp.tx_flow_spec.id = chan->local_id; 
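
The a2mp.c and amp.c hunks in this area stop assigning a single enum value to mgr->state and instead treat it as an unsigned long bitfield manipulated with set_bit(), while amp_mgr_lookup_by_state() now uses test_and_clear_bit() so a matching manager is found and its flag consumed in one atomic step. The pattern in isolation; foo_mgr and the bit name are illustrative, not from this patch:

#include <linux/bitops.h>
#include <linux/types.h>

#define FOO_READ_INFO_PENDING	0	/* bit number, not a mask */

struct foo_mgr {
	unsigned long state;
};

static void foo_mark_pending(struct foo_mgr *mgr)
{
	/* atomic; leaves any other pending bits in state untouched */
	set_bit(FOO_READ_INFO_PENDING, &mgr->state);
}

static bool foo_take_pending(struct foo_mgr *mgr)
{
	/* atomically test and clear: exactly one caller can win,
	 * which is what makes the lookup-and-consume idiom safe
	 */
	return test_and_clear_bit(FOO_READ_INFO_PENDING, &mgr->state);
}
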
cp.tx_flow_spec.stype = chan->local_stype; @@ -438,14 +434,13 @@ void amp_create_logical_link(struct l2cap_chan *chan) cp.rx_flow_spec.acc_lat = cpu_to_le32(chan->remote_acc_lat); cp.rx_flow_spec.flush_to = cpu_to_le32(chan->remote_flush_to); - if (hcon->out) + if (hs_hcon->out) hci_send_cmd(hdev, HCI_OP_CREATE_LOGICAL_LINK, sizeof(cp), &cp); else hci_send_cmd(hdev, HCI_OP_ACCEPT_LOGICAL_LINK, sizeof(cp), &cp); -done: hci_dev_put(hdev); } diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index a5b639702637..e430b1abcd2f 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -33,7 +33,6 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> -#include <net/bluetooth/l2cap.h> #include "bnep.h" diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index e58c8b32589c..4b488ec26105 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -136,7 +136,7 @@ static u16 bnep_net_eth_proto(struct sk_buff *skb) struct ethhdr *eh = (void *) skb->data; u16 proto = ntohs(eh->h_proto); - if (proto >= 1536) + if (proto >= ETH_P_802_3_MIN) return proto; if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF)) diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index e7154a58465f..5f051290daba 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -234,7 +234,7 @@ int __init bnep_sock_init(void) goto error; } - err = bt_procfs_init(THIS_MODULE, &init_net, "bnep", &bnep_sk_list, NULL); + err = bt_procfs_init(&init_net, "bnep", &bnep_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create BNEP proc file"); bt_sock_unregister(BTPROTO_BNEP); @@ -253,8 +253,6 @@ error: void __exit bnep_sock_cleanup(void) { bt_procfs_cleanup(&init_net, "bnep"); - if (bt_sock_unregister(BTPROTO_BNEP) < 0) - BT_ERR("Can't unregister BNEP socket"); - + bt_sock_unregister(BTPROTO_BNEP); proto_unregister(&bnep_proto); } diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index a4a9d4b6816c..cd75e4d64b90 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -539,7 +539,7 @@ static int cmtp_proc_show(struct seq_file *m, void *v) static int cmtp_proc_open(struct inode *inode, struct file *file) { - return single_open(file, cmtp_proc_show, PDE(inode)->data); + return single_open(file, cmtp_proc_show, PDE_DATA(inode)); } static const struct file_operations cmtp_proc_fops = { diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index 1c57482112b6..d82787d417bd 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -245,7 +245,7 @@ int cmtp_init_sockets(void) goto error; } - err = bt_procfs_init(THIS_MODULE, &init_net, "cmtp", &cmtp_sk_list, NULL); + err = bt_procfs_init(&init_net, "cmtp", &cmtp_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create CMTP proc file"); bt_sock_unregister(BTPROTO_HIDP); @@ -264,8 +264,6 @@ error: void cmtp_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "cmtp"); - if (bt_sock_unregister(BTPROTO_CMTP) < 0) - BT_ERR("Can't unregister CMTP socket"); - + bt_sock_unregister(BTPROTO_CMTP); proto_unregister(&cmtp_proto); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 4925a02ae7e4..6c7f36379722 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -117,7 +117,17 @@ static void hci_acl_create_connection_cancel(struct hci_conn *conn) hci_send_cmd(conn->hdev, HCI_OP_CREATE_CONN_CANCEL, sizeof(cp), &cp); } -void hci_acl_disconn(struct hci_conn *conn, __u8 reason) +static void 
hci_reject_sco(struct hci_conn *conn) +{ + struct hci_cp_reject_sync_conn_req cp; + + cp.reason = HCI_ERROR_REMOTE_USER_TERM; + bacpy(&cp.bdaddr, &conn->dst); + + hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp); +} + +void hci_disconnect(struct hci_conn *conn, __u8 reason) { struct hci_cp_disconnect cp; @@ -253,7 +263,7 @@ static void hci_conn_disconnect(struct hci_conn *conn) hci_amp_disconn(conn, reason); break; default: - hci_acl_disconn(conn, reason); + hci_disconnect(conn, reason); break; } } @@ -276,6 +286,8 @@ static void hci_conn_timeout(struct work_struct *work) hci_acl_create_connection_cancel(conn); else if (conn->type == LE_LINK) hci_le_create_connection_cancel(conn); + } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { + hci_reject_sco(conn); } break; case BT_CONFIG: @@ -398,8 +410,6 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); - atomic_set(&conn->devref, 0); - hci_conn_init_sysfs(conn); return conn; @@ -433,7 +443,7 @@ int hci_conn_del(struct hci_conn *conn) struct hci_conn *acl = conn->link; if (acl) { acl->link = NULL; - hci_conn_put(acl); + hci_conn_drop(acl); } } @@ -448,12 +458,11 @@ int hci_conn_del(struct hci_conn *conn) skb_queue_purge(&conn->data_q); - hci_conn_put_device(conn); + hci_conn_del_sysfs(conn); hci_dev_put(hdev); - if (conn->handle == 0) - kfree(conn); + hci_conn_put(conn); return 0; } @@ -565,7 +574,7 @@ static struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, if (!sco) { sco = hci_conn_add(hdev, type, dst); if (!sco) { - hci_conn_put(acl); + hci_conn_drop(acl); return ERR_PTR(-ENOMEM); } } @@ -835,19 +844,6 @@ void hci_conn_check_pending(struct hci_dev *hdev) hci_dev_unlock(hdev); } -void hci_conn_hold_device(struct hci_conn *conn) -{ - atomic_inc(&conn->devref); -} -EXPORT_SYMBOL(hci_conn_hold_device); - -void hci_conn_put_device(struct hci_conn *conn) -{ - if (atomic_dec_and_test(&conn->devref)) - hci_conn_del_sysfs(conn); -} -EXPORT_SYMBOL(hci_conn_put_device); - int hci_get_conn_list(void __user *arg) { struct hci_conn *c; @@ -980,7 +976,7 @@ void hci_chan_del(struct hci_chan *chan) synchronize_rcu(); - hci_conn_put(conn); + hci_conn_drop(conn); skb_queue_purge(&chan->data_q); kfree(chan); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0f78e34220c9..33843c5c4939 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -57,36 +57,9 @@ static void hci_notify(struct hci_dev *hdev, int event) /* ---- HCI requests ---- */ -void hci_req_complete(struct hci_dev *hdev, __u16 cmd, int result) +static void hci_req_sync_complete(struct hci_dev *hdev, u8 result) { - BT_DBG("%s command 0x%4.4x result 0x%2.2x", hdev->name, cmd, result); - - /* If this is the init phase check if the completed command matches - * the last init command, and if not just return. - */ - if (test_bit(HCI_INIT, &hdev->flags) && hdev->init_last_cmd != cmd) { - struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data; - u16 opcode = __le16_to_cpu(sent->opcode); - struct sk_buff *skb; - - /* Some CSR based controllers generate a spontaneous - * reset complete event during init and any pending - * command will never be completed. In such a case we - * need to resend whatever was the last sent - * command. 
- */ - - if (cmd != HCI_OP_RESET || opcode == HCI_OP_RESET) - return; - - skb = skb_clone(hdev->sent_cmd, GFP_ATOMIC); - if (skb) { - skb_queue_head(&hdev->cmd_q, skb); - queue_work(hdev->workqueue, &hdev->cmd_work); - } - - return; - } + BT_DBG("%s result 0x%2.2x", hdev->name, result); if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = result; @@ -106,22 +79,158 @@ static void hci_req_cancel(struct hci_dev *hdev, int err) } } +static struct sk_buff *hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, + u8 event) +{ + struct hci_ev_cmd_complete *ev; + struct hci_event_hdr *hdr; + struct sk_buff *skb; + + hci_dev_lock(hdev); + + skb = hdev->recv_evt; + hdev->recv_evt = NULL; + + hci_dev_unlock(hdev); + + if (!skb) + return ERR_PTR(-ENODATA); + + if (skb->len < sizeof(*hdr)) { + BT_ERR("Too short HCI event"); + goto failed; + } + + hdr = (void *) skb->data; + skb_pull(skb, HCI_EVENT_HDR_SIZE); + + if (event) { + if (hdr->evt != event) + goto failed; + return skb; + } + + if (hdr->evt != HCI_EV_CMD_COMPLETE) { + BT_DBG("Last event is not cmd complete (0x%2.2x)", hdr->evt); + goto failed; + } + + if (skb->len < sizeof(*ev)) { + BT_ERR("Too short cmd_complete event"); + goto failed; + } + + ev = (void *) skb->data; + skb_pull(skb, sizeof(*ev)); + + if (opcode == __le16_to_cpu(ev->opcode)) + return skb; + + BT_DBG("opcode doesn't match (0x%2.2x != 0x%2.2x)", opcode, + __le16_to_cpu(ev->opcode)); + +failed: + kfree_skb(skb); + return ERR_PTR(-ENODATA); +} + +struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u8 event, u32 timeout) +{ + DECLARE_WAITQUEUE(wait, current); + struct hci_request req; + int err = 0; + + BT_DBG("%s", hdev->name); + + hci_req_init(&req, hdev); + + hci_req_add_ev(&req, opcode, plen, param, event); + + hdev->req_status = HCI_REQ_PEND; + + err = hci_req_run(&req, hci_req_sync_complete); + if (err < 0) + return ERR_PTR(err); + + add_wait_queue(&hdev->req_wait_q, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + schedule_timeout(timeout); + + remove_wait_queue(&hdev->req_wait_q, &wait); + + if (signal_pending(current)) + return ERR_PTR(-EINTR); + + switch (hdev->req_status) { + case HCI_REQ_DONE: + err = -bt_to_errno(hdev->req_result); + break; + + case HCI_REQ_CANCELED: + err = -hdev->req_result; + break; + + default: + err = -ETIMEDOUT; + break; + } + + hdev->req_status = hdev->req_result = 0; + + BT_DBG("%s end: err %d", hdev->name, err); + + if (err < 0) + return ERR_PTR(err); + + return hci_get_cmd_complete(hdev, opcode, event); +} +EXPORT_SYMBOL(__hci_cmd_sync_ev); + +struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout) +{ + return __hci_cmd_sync_ev(hdev, opcode, plen, param, 0, timeout); +} +EXPORT_SYMBOL(__hci_cmd_sync); + /* Execute request and wait for completion. */ -static int __hci_request(struct hci_dev *hdev, - void (*req)(struct hci_dev *hdev, unsigned long opt), - unsigned long opt, __u32 timeout) +static int __hci_req_sync(struct hci_dev *hdev, + void (*func)(struct hci_request *req, + unsigned long opt), + unsigned long opt, __u32 timeout) { + struct hci_request req; DECLARE_WAITQUEUE(wait, current); int err = 0; BT_DBG("%s start", hdev->name); + hci_req_init(&req, hdev); + hdev->req_status = HCI_REQ_PEND; + func(&req, opt); + + err = hci_req_run(&req, hci_req_sync_complete); + if (err < 0) { + hdev->req_status = 0; + + /* ENODATA means the HCI request command queue is empty. 
+ * This can happen when a request with conditionals doesn't + * trigger any commands to be sent. This is normal behavior + * and should not trigger an error return. + */ + if (err == -ENODATA) + return 0; + + return err; + } + add_wait_queue(&hdev->req_wait_q, &wait); set_current_state(TASK_INTERRUPTIBLE); - req(hdev, opt); schedule_timeout(timeout); remove_wait_queue(&hdev->req_wait_q, &wait); @@ -150,9 +259,10 @@ static int __hci_request(struct hci_dev *hdev, return err; } -static int hci_request(struct hci_dev *hdev, - void (*req)(struct hci_dev *hdev, unsigned long opt), - unsigned long opt, __u32 timeout) +static int hci_req_sync(struct hci_dev *hdev, + void (*req)(struct hci_request *req, + unsigned long opt), + unsigned long opt, __u32 timeout) { int ret; @@ -161,75 +271,66 @@ static int hci_request(struct hci_dev *hdev, /* Serialize all requests */ hci_req_lock(hdev); - ret = __hci_request(hdev, req, opt, timeout); + ret = __hci_req_sync(hdev, req, opt, timeout); hci_req_unlock(hdev); return ret; } -static void hci_reset_req(struct hci_dev *hdev, unsigned long opt) +static void hci_reset_req(struct hci_request *req, unsigned long opt) { - BT_DBG("%s %ld", hdev->name, opt); + BT_DBG("%s %ld", req->hdev->name, opt); /* Reset device */ - set_bit(HCI_RESET, &hdev->flags); - hci_send_cmd(hdev, HCI_OP_RESET, 0, NULL); + set_bit(HCI_RESET, &req->hdev->flags); + hci_req_add(req, HCI_OP_RESET, 0, NULL); } -static void bredr_init(struct hci_dev *hdev) +static void bredr_init(struct hci_request *req) { - hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_PACKET_BASED; + req->hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_PACKET_BASED; /* Read Local Supported Features */ - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL); + hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL); /* Read Local Version */ - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL); + hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL); + + /* Read BD Address */ + hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL); } -static void amp_init(struct hci_dev *hdev) +static void amp_init(struct hci_request *req) { - hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_BLOCK_BASED; + req->hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_BLOCK_BASED; /* Read Local Version */ - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL); + hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL); /* Read Local AMP Info */ - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); + hci_req_add(req, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); /* Read Data Blk size */ - hci_send_cmd(hdev, HCI_OP_READ_DATA_BLOCK_SIZE, 0, NULL); + hci_req_add(req, HCI_OP_READ_DATA_BLOCK_SIZE, 0, NULL); } -static void hci_init_req(struct hci_dev *hdev, unsigned long opt) +static void hci_init1_req(struct hci_request *req, unsigned long opt) { - struct sk_buff *skb; + struct hci_dev *hdev = req->hdev; BT_DBG("%s %ld", hdev->name, opt); - /* Driver initialization */ - - /* Special commands */ - while ((skb = skb_dequeue(&hdev->driver_init))) { - bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; - skb->dev = (void *) hdev; - - skb_queue_tail(&hdev->cmd_q, skb); - queue_work(hdev->workqueue, &hdev->cmd_work); - } - skb_queue_purge(&hdev->driver_init); - /* Reset */ if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) - hci_reset_req(hdev, 0); + hci_reset_req(req, 0); switch (hdev->dev_type) { case HCI_BREDR: - bredr_init(hdev); + bredr_init(req); break; case HCI_AMP: - amp_init(hdev); + amp_init(req); break; default: @@ -238,44 +339,347 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt) } 
} -static void hci_scan_req(struct hci_dev *hdev, unsigned long opt) +static void bredr_setup(struct hci_request *req) +{ + struct hci_cp_delete_stored_link_key cp; + __le16 param; + __u8 flt_type; + + /* Read Buffer Size (ACL mtu, max pkt, etc.) */ + hci_req_add(req, HCI_OP_READ_BUFFER_SIZE, 0, NULL); + + /* Read Class of Device */ + hci_req_add(req, HCI_OP_READ_CLASS_OF_DEV, 0, NULL); + + /* Read Local Name */ + hci_req_add(req, HCI_OP_READ_LOCAL_NAME, 0, NULL); + + /* Read Voice Setting */ + hci_req_add(req, HCI_OP_READ_VOICE_SETTING, 0, NULL); + + /* Clear Event Filters */ + flt_type = HCI_FLT_CLEAR_ALL; + hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &flt_type); + + /* Connection accept timeout ~20 secs */ + param = __constant_cpu_to_le16(0x7d00); + hci_req_add(req, HCI_OP_WRITE_CA_TIMEOUT, 2, &param); + + bacpy(&cp.bdaddr, BDADDR_ANY); + cp.delete_all = 0x01; + hci_req_add(req, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(cp), &cp); + + /* Read page scan parameters */ + if (req->hdev->hci_ver > BLUETOOTH_VER_1_1) { + hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL); + hci_req_add(req, HCI_OP_READ_PAGE_SCAN_TYPE, 0, NULL); + } +} + +static void le_setup(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + + /* Read LE Buffer Size */ + hci_req_add(req, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL); + + /* Read LE Local Supported Features */ + hci_req_add(req, HCI_OP_LE_READ_LOCAL_FEATURES, 0, NULL); + + /* Read LE Advertising Channel TX Power */ + hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); + + /* Read LE White List Size */ + hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL); + + /* Read LE Supported States */ + hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL); + + /* LE-only controllers have LE implicitly enabled */ + if (!lmp_bredr_capable(hdev)) + set_bit(HCI_LE_ENABLED, &hdev->dev_flags); +} + +static u8 hci_get_inquiry_mode(struct hci_dev *hdev) +{ + if (lmp_ext_inq_capable(hdev)) + return 0x02; + + if (lmp_inq_rssi_capable(hdev)) + return 0x01; + + if (hdev->manufacturer == 11 && hdev->hci_rev == 0x00 && + hdev->lmp_subver == 0x0757) + return 0x01; + + if (hdev->manufacturer == 15) { + if (hdev->hci_rev == 0x03 && hdev->lmp_subver == 0x6963) + return 0x01; + if (hdev->hci_rev == 0x09 && hdev->lmp_subver == 0x6963) + return 0x01; + if (hdev->hci_rev == 0x00 && hdev->lmp_subver == 0x6965) + return 0x01; + } + + if (hdev->manufacturer == 31 && hdev->hci_rev == 0x2005 && + hdev->lmp_subver == 0x1805) + return 0x01; + + return 0x00; +} + +static void hci_setup_inquiry_mode(struct hci_request *req) +{ + u8 mode; + + mode = hci_get_inquiry_mode(req->hdev); + + hci_req_add(req, HCI_OP_WRITE_INQUIRY_MODE, 1, &mode); +} + +static void hci_setup_event_mask(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + + /* The second byte is 0xff instead of 0x9f (two reserved bits + * disabled) since a Broadcom 1.2 dongle doesn't respond to the + * command otherwise. + */ + u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 }; + + /* CSR 1.1 dongles do not accept any bitfield so don't try to set + * any event mask for pre 1.2 devices. 
+ */ + if (hdev->hci_ver < BLUETOOTH_VER_1_2) + return; + + if (lmp_bredr_capable(hdev)) { + events[4] |= 0x01; /* Flow Specification Complete */ + events[4] |= 0x02; /* Inquiry Result with RSSI */ + events[4] |= 0x04; /* Read Remote Extended Features Complete */ + events[5] |= 0x08; /* Synchronous Connection Complete */ + events[5] |= 0x10; /* Synchronous Connection Changed */ + } + + if (lmp_inq_rssi_capable(hdev)) + events[4] |= 0x02; /* Inquiry Result with RSSI */ + + if (lmp_sniffsubr_capable(hdev)) + events[5] |= 0x20; /* Sniff Subrating */ + + if (lmp_pause_enc_capable(hdev)) + events[5] |= 0x80; /* Encryption Key Refresh Complete */ + + if (lmp_ext_inq_capable(hdev)) + events[5] |= 0x40; /* Extended Inquiry Result */ + + if (lmp_no_flush_capable(hdev)) + events[7] |= 0x01; /* Enhanced Flush Complete */ + + if (lmp_lsto_capable(hdev)) + events[6] |= 0x80; /* Link Supervision Timeout Changed */ + + if (lmp_ssp_capable(hdev)) { + events[6] |= 0x01; /* IO Capability Request */ + events[6] |= 0x02; /* IO Capability Response */ + events[6] |= 0x04; /* User Confirmation Request */ + events[6] |= 0x08; /* User Passkey Request */ + events[6] |= 0x10; /* Remote OOB Data Request */ + events[6] |= 0x20; /* Simple Pairing Complete */ + events[7] |= 0x04; /* User Passkey Notification */ + events[7] |= 0x08; /* Keypress Notification */ + events[7] |= 0x10; /* Remote Host Supported + * Features Notification + */ + } + + if (lmp_le_capable(hdev)) + events[7] |= 0x20; /* LE Meta-Event */ + + hci_req_add(req, HCI_OP_SET_EVENT_MASK, sizeof(events), events); + + if (lmp_le_capable(hdev)) { + memset(events, 0, sizeof(events)); + events[0] = 0x1f; + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, + sizeof(events), events); + } +} + +static void hci_init2_req(struct hci_request *req, unsigned long opt) +{ + struct hci_dev *hdev = req->hdev; + + if (lmp_bredr_capable(hdev)) + bredr_setup(req); + + if (lmp_le_capable(hdev)) + le_setup(req); + + hci_setup_event_mask(req); + + if (hdev->hci_ver > BLUETOOTH_VER_1_1) + hci_req_add(req, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL); + + if (lmp_ssp_capable(hdev)) { + if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + u8 mode = 0x01; + hci_req_add(req, HCI_OP_WRITE_SSP_MODE, + sizeof(mode), &mode); + } else { + struct hci_cp_write_eir cp; + + memset(hdev->eir, 0, sizeof(hdev->eir)); + memset(&cp, 0, sizeof(cp)); + + hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); + } + } + + if (lmp_inq_rssi_capable(hdev)) + hci_setup_inquiry_mode(req); + + if (lmp_inq_tx_pwr_capable(hdev)) + hci_req_add(req, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL); + + if (lmp_ext_feat_capable(hdev)) { + struct hci_cp_read_local_ext_features cp; + + cp.page = 0x01; + hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES, + sizeof(cp), &cp); + } + + if (test_bit(HCI_LINK_SECURITY, &hdev->dev_flags)) { + u8 enable = 1; + hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, sizeof(enable), + &enable); + } +} + +static void hci_setup_link_policy(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_write_def_link_policy cp; + u16 link_policy = 0; + + if (lmp_rswitch_capable(hdev)) + link_policy |= HCI_LP_RSWITCH; + if (lmp_hold_capable(hdev)) + link_policy |= HCI_LP_HOLD; + if (lmp_sniff_capable(hdev)) + link_policy |= HCI_LP_SNIFF; + if (lmp_park_capable(hdev)) + link_policy |= HCI_LP_PARK; + + cp.policy = cpu_to_le16(link_policy); + hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, sizeof(cp), &cp); +} + +static void hci_set_le_support(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; 
+ struct hci_cp_write_le_host_supported cp; + + /* LE-only devices do not support explicit enablement */ + if (!lmp_bredr_capable(hdev)) + return; + + memset(&cp, 0, sizeof(cp)); + + if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { + cp.le = 0x01; + cp.simul = lmp_le_br_capable(hdev); + } + + if (cp.le != lmp_host_le_capable(hdev)) + hci_req_add(req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), + &cp); +} + +static void hci_init3_req(struct hci_request *req, unsigned long opt) +{ + struct hci_dev *hdev = req->hdev; + u8 p; + + if (hdev->commands[5] & 0x10) + hci_setup_link_policy(req); + + if (lmp_le_capable(hdev)) { + hci_set_le_support(req); + hci_update_ad(req); + } + + /* Read features beyond page 1 if available */ + for (p = 2; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) { + struct hci_cp_read_local_ext_features cp; + + cp.page = p; + hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES, + sizeof(cp), &cp); + } +} + +static int __hci_init(struct hci_dev *hdev) +{ + int err; + + err = __hci_req_sync(hdev, hci_init1_req, 0, HCI_INIT_TIMEOUT); + if (err < 0) + return err; + + /* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode + * BR/EDR/LE type controllers. AMP controllers only need the + * first stage init. + */ + if (hdev->dev_type != HCI_BREDR) + return 0; + + err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); + if (err < 0) + return err; + + return __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT); +} + +static void hci_scan_req(struct hci_request *req, unsigned long opt) { __u8 scan = opt; - BT_DBG("%s %x", hdev->name, scan); + BT_DBG("%s %x", req->hdev->name, scan); /* Inquiry and Page scans */ - hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } -static void hci_auth_req(struct hci_dev *hdev, unsigned long opt) +static void hci_auth_req(struct hci_request *req, unsigned long opt) { __u8 auth = opt; - BT_DBG("%s %x", hdev->name, auth); + BT_DBG("%s %x", req->hdev->name, auth); /* Authentication */ - hci_send_cmd(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth); + hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth); } -static void hci_encrypt_req(struct hci_dev *hdev, unsigned long opt) +static void hci_encrypt_req(struct hci_request *req, unsigned long opt) { __u8 encrypt = opt; - BT_DBG("%s %x", hdev->name, encrypt); + BT_DBG("%s %x", req->hdev->name, encrypt); /* Encryption */ - hci_send_cmd(hdev, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt); + hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt); } -static void hci_linkpol_req(struct hci_dev *hdev, unsigned long opt) +static void hci_linkpol_req(struct hci_request *req, unsigned long opt) { __le16 policy = cpu_to_le16(opt); - BT_DBG("%s %x", hdev->name, policy); + BT_DBG("%s %x", req->hdev->name, policy); /* Default link policy */ - hci_send_cmd(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy); + hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy); } /* Get HCI device by index. 
@@ -512,9 +916,10 @@ static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) return copied; } -static void hci_inq_req(struct hci_dev *hdev, unsigned long opt) +static void hci_inq_req(struct hci_request *req, unsigned long opt) { struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt; + struct hci_dev *hdev = req->hdev; struct hci_cp_inquiry cp; BT_DBG("%s", hdev->name); @@ -526,7 +931,13 @@ static void hci_inq_req(struct hci_dev *hdev, unsigned long opt) memcpy(&cp.lap, &ir->lap, 3); cp.length = ir->length; cp.num_rsp = ir->num_rsp; - hci_send_cmd(hdev, HCI_OP_INQUIRY, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); +} + +static int wait_inquiry(void *word) +{ + schedule(); + return signal_pending(current); } int hci_inquiry(void __user *arg) @@ -556,9 +967,17 @@ int hci_inquiry(void __user *arg) timeo = ir.length * msecs_to_jiffies(2000); if (do_inquiry) { - err = hci_request(hdev, hci_inq_req, (unsigned long)&ir, timeo); + err = hci_req_sync(hdev, hci_inq_req, (unsigned long) &ir, + timeo); if (err < 0) goto done; + + /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is + * cleared). If it is interrupted by a signal, return -EINTR. + */ + if (wait_on_bit(&hdev->flags, HCI_INQUIRY, wait_inquiry, + TASK_INTERRUPTIBLE)) + return -EINTR; } /* for unlimited number of responses we will use buffer with @@ -654,39 +1073,29 @@ static u8 create_ad(struct hci_dev *hdev, u8 *ptr) return ad_len; } -int hci_update_ad(struct hci_dev *hdev) +void hci_update_ad(struct hci_request *req) { + struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_adv_data cp; u8 len; - int err; - hci_dev_lock(hdev); - - if (!lmp_le_capable(hdev)) { - err = -EINVAL; - goto unlock; - } + if (!lmp_le_capable(hdev)) + return; memset(&cp, 0, sizeof(cp)); len = create_ad(hdev, cp.data); if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) { - err = 0; - goto unlock; - } + memcmp(cp.data, hdev->adv_data, len) == 0) + return; memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); hdev->adv_data_len = len; cp.length = len; - err = hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); - -unlock: - hci_dev_unlock(hdev); - return err; + hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); } /* ---- HCI ioctl helpers ---- */ @@ -719,34 +1128,37 @@ int hci_dev_open(__u16 dev) goto done; } - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) - set_bit(HCI_RAW, &hdev->flags); - - /* Treat all non BR/EDR controllers as raw devices if - enable_hs is not set */ - if (hdev->dev_type != HCI_BREDR && !enable_hs) - set_bit(HCI_RAW, &hdev->flags); - if (hdev->open(hdev)) { ret = -EIO; goto done; } - if (!test_bit(HCI_RAW, &hdev->flags)) { - atomic_set(&hdev->cmd_cnt, 1); - set_bit(HCI_INIT, &hdev->flags); - hdev->init_last_cmd = 0; + atomic_set(&hdev->cmd_cnt, 1); + set_bit(HCI_INIT, &hdev->flags); - ret = __hci_request(hdev, hci_init_req, 0, HCI_INIT_TIMEOUT); + if (hdev->setup && test_bit(HCI_SETUP, &hdev->dev_flags)) + ret = hdev->setup(hdev); - clear_bit(HCI_INIT, &hdev->flags); + if (!ret) { + /* Treat all non BR/EDR controllers as raw devices if + * enable_hs is not set. 
+ */ + if (hdev->dev_type != HCI_BREDR && !enable_hs) + set_bit(HCI_RAW, &hdev->flags); + + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + set_bit(HCI_RAW, &hdev->flags); + + if (!test_bit(HCI_RAW, &hdev->flags)) + ret = __hci_init(hdev); } + clear_bit(HCI_INIT, &hdev->flags); + if (!ret) { hci_dev_hold(hdev); set_bit(HCI_UP, &hdev->flags); hci_notify(hdev, HCI_DEV_UP); - hci_update_ad(hdev); if (!test_bit(HCI_SETUP, &hdev->dev_flags) && mgmt_valid_hdev(hdev)) { hci_dev_lock(hdev); @@ -828,7 +1240,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) if (!test_bit(HCI_RAW, &hdev->flags) && test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { set_bit(HCI_INIT, &hdev->flags); - __hci_request(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); + __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); clear_bit(HCI_INIT, &hdev->flags); } @@ -847,10 +1259,17 @@ static int hci_dev_do_close(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + kfree_skb(hdev->recv_evt); + hdev->recv_evt = NULL; + /* After this point our queues are empty * and no tasks are scheduled. */ hdev->close(hdev); + /* Clear flags */ + hdev->flags = 0; + hdev->dev_flags &= ~HCI_PERSISTENT_MASK; + if (!test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags) && mgmt_valid_hdev(hdev)) { hci_dev_lock(hdev); @@ -858,9 +1277,6 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_dev_unlock(hdev); } - /* Clear flags */ - hdev->flags = 0; - /* Controller radio is available but is currently powered down */ hdev->amp_status = 0; @@ -921,7 +1337,7 @@ int hci_dev_reset(__u16 dev) hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; if (!test_bit(HCI_RAW, &hdev->flags)) - ret = __hci_request(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT); + ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT); done: hci_req_unlock(hdev); @@ -960,8 +1376,8 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) switch (cmd) { case HCISETAUTH: - err = hci_request(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, + HCI_INIT_TIMEOUT); break; case HCISETENCRYPT: @@ -972,24 +1388,24 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ - err = hci_request(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, + HCI_INIT_TIMEOUT); if (err) break; } - err = hci_request(hdev, hci_encrypt_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt, + HCI_INIT_TIMEOUT); break; case HCISETSCAN: - err = hci_request(hdev, hci_scan_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt, + HCI_INIT_TIMEOUT); break; case HCISETLINKPOL: - err = hci_request(hdev, hci_linkpol_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt, + HCI_INIT_TIMEOUT); break; case HCISETLINKMODE: @@ -1146,7 +1562,8 @@ static void hci_power_on(struct work_struct *work) return; if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) - schedule_delayed_work(&hdev->power_off, HCI_AUTO_OFF_TIMEOUT); + queue_delayed_work(hdev->req_workqueue, &hdev->power_off, + HCI_AUTO_OFF_TIMEOUT); if (test_and_clear_bit(HCI_SETUP, &hdev->dev_flags)) mgmt_index_added(hdev); @@ -1182,14 +1599,10 @@ static void hci_discov_off(struct work_struct *work) int hci_uuids_clear(struct hci_dev *hdev) { - struct list_head *p, *n; - - list_for_each_safe(p, n, &hdev->uuids) { - struct bt_uuid *uuid; + struct bt_uuid *uuid, *tmp; - uuid = list_entry(p, 
struct bt_uuid, list); - - list_del(p); + list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) { + list_del(&uuid->list); kfree(uuid); } @@ -1569,7 +1982,7 @@ int hci_blacklist_del(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) return mgmt_device_unblocked(hdev, bdaddr, type); } -static void le_scan_param_req(struct hci_dev *hdev, unsigned long opt) +static void le_scan_param_req(struct hci_request *req, unsigned long opt) { struct le_scan_params *param = (struct le_scan_params *) opt; struct hci_cp_le_set_scan_param cp; @@ -1579,18 +1992,18 @@ static void le_scan_param_req(struct hci_dev *hdev, unsigned long opt) cp.interval = cpu_to_le16(param->interval); cp.window = cpu_to_le16(param->window); - hci_send_cmd(hdev, HCI_OP_LE_SET_SCAN_PARAM, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(cp), &cp); } -static void le_scan_enable_req(struct hci_dev *hdev, unsigned long opt) +static void le_scan_enable_req(struct hci_request *req, unsigned long opt) { struct hci_cp_le_set_scan_enable cp; memset(&cp, 0, sizeof(cp)); - cp.enable = 1; - cp.filter_dup = 1; + cp.enable = LE_SCAN_ENABLE; + cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_send_cmd(hdev, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); } static int hci_do_le_scan(struct hci_dev *hdev, u8 type, u16 interval, @@ -1611,18 +2024,18 @@ static int hci_do_le_scan(struct hci_dev *hdev, u8 type, u16 interval, hci_req_lock(hdev); - err = __hci_request(hdev, le_scan_param_req, (unsigned long) &param, - timeo); + err = __hci_req_sync(hdev, le_scan_param_req, (unsigned long) &param, + timeo); if (!err) - err = __hci_request(hdev, le_scan_enable_req, 0, timeo); + err = __hci_req_sync(hdev, le_scan_enable_req, 0, timeo); hci_req_unlock(hdev); if (err < 0) return err; - schedule_delayed_work(&hdev->le_scan_disable, - msecs_to_jiffies(timeout)); + queue_delayed_work(hdev->workqueue, &hdev->le_scan_disable, + timeout); return 0; } @@ -1732,7 +2145,6 @@ struct hci_dev *hci_alloc_dev(void) INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); - skb_queue_head_init(&hdev->driver_init); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); skb_queue_head_init(&hdev->raw_q); @@ -1751,8 +2163,6 @@ EXPORT_SYMBOL(hci_alloc_dev); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) { - skb_queue_purge(&hdev->driver_init); - /* will free via device release */ put_device(&hdev->dev); } @@ -1799,6 +2209,15 @@ int hci_register_dev(struct hci_dev *hdev) goto err; } + hdev->req_workqueue = alloc_workqueue(hdev->name, + WQ_HIGHPRI | WQ_UNBOUND | + WQ_MEM_RECLAIM, 1); + if (!hdev->req_workqueue) { + destroy_workqueue(hdev->workqueue); + error = -ENOMEM; + goto err; + } + error = hci_add_sysfs(hdev); if (error < 0) goto err_wqueue; @@ -1821,12 +2240,13 @@ int hci_register_dev(struct hci_dev *hdev) hci_notify(hdev, HCI_DEV_REG); hci_dev_hold(hdev); - schedule_work(&hdev->power_on); + queue_work(hdev->req_workqueue, &hdev->power_on); return id; err_wqueue: destroy_workqueue(hdev->workqueue); + destroy_workqueue(hdev->req_workqueue); err: ida_simple_remove(&hci_index_ida, hdev->id); write_lock(&hci_dev_list_lock); @@ -1880,6 +2300,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_del_sysfs(hdev); destroy_workqueue(hdev->workqueue); + destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); hci_blacklist_clear(hdev); @@ -1921,7 +2342,7 @@ int hci_recv_frame(struct sk_buff *skb) return -ENXIO; } - 
/* Incomming skb */ + /* Incoming skb */ bt_cb(skb)->incoming = 1; /* Time stamp */ @@ -2152,20 +2573,55 @@ static int hci_send_frame(struct sk_buff *skb) return hdev->send(skb); } -/* Send HCI command */ -int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, void *param) +void hci_req_init(struct hci_request *req, struct hci_dev *hdev) +{ + skb_queue_head_init(&req->cmd_q); + req->hdev = hdev; + req->err = 0; +} + +int hci_req_run(struct hci_request *req, hci_req_complete_t complete) +{ + struct hci_dev *hdev = req->hdev; + struct sk_buff *skb; + unsigned long flags; + + BT_DBG("length %u", skb_queue_len(&req->cmd_q)); + + /* If an error occurred during request building, remove all HCI + * commands queued on the HCI request queue. + */ + if (req->err) { + skb_queue_purge(&req->cmd_q); + return req->err; + } + + /* Do not allow empty requests */ + if (skb_queue_empty(&req->cmd_q)) + return -ENODATA; + + skb = skb_peek_tail(&req->cmd_q); + bt_cb(skb)->req.complete = complete; + + spin_lock_irqsave(&hdev->cmd_q.lock, flags); + skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q); + spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); + + queue_work(hdev->workqueue, &hdev->cmd_work); + + return 0; +} + +static struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, + u32 plen, const void *param) { int len = HCI_COMMAND_HDR_SIZE + plen; struct hci_command_hdr *hdr; struct sk_buff *skb; - BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); - skb = bt_skb_alloc(len, GFP_ATOMIC); - if (!skb) { - BT_ERR("%s no memory for command", hdev->name); - return -ENOMEM; - } + if (!skb) + return NULL; hdr = (struct hci_command_hdr *) skb_put(skb, HCI_COMMAND_HDR_SIZE); hdr->opcode = cpu_to_le16(opcode); @@ -2179,8 +2635,27 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, void *param) bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; skb->dev = (void *) hdev; - if (test_bit(HCI_INIT, &hdev->flags)) - hdev->init_last_cmd = opcode; + return skb; +} + +/* Send HCI command */ +int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, + const void *param) +{ + struct sk_buff *skb; + + BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); + + skb = hci_prepare_cmd(hdev, opcode, plen, param); + if (!skb) { + BT_ERR("%s no memory for command", hdev->name); + return -ENOMEM; + } + + /* Stand-alone HCI commands must be flagged as + * single-command requests. + */ + bt_cb(skb)->req.start = true; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); @@ -2188,6 +2663,43 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, void *param) return 0; } +/* Queue a command to an asynchronous HCI request */ +void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, + const void *param, u8 event) +{ + struct hci_dev *hdev = req->hdev; + struct sk_buff *skb; + + BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); + + /* If an error occurred during request building, there is no point in + * queueing the HCI command. We can simply return. 
+ */ + if (req->err) + return; + + skb = hci_prepare_cmd(hdev, opcode, plen, param); + if (!skb) { + BT_ERR("%s no memory for command (opcode 0x%4.4x)", + hdev->name, opcode); + req->err = -ENOMEM; + return; + } + + if (skb_queue_empty(&req->cmd_q)) + bt_cb(skb)->req.start = true; + + bt_cb(skb)->req.event = event; + + skb_queue_tail(&req->cmd_q, skb); +} + +void hci_req_add(struct hci_request *req, u16 opcode, u32 plen, + const void *param) +{ + hci_req_add_ev(req, opcode, plen, param, 0); +} + /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { @@ -2390,7 +2902,7 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) if (c->type == type && c->sent) { BT_ERR("%s killing stalled connection %pMR", hdev->name, &c->dst); - hci_acl_disconn(c, HCI_ERROR_REMOTE_USER_TERM); + hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); } } @@ -2852,6 +3364,97 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb); } +static bool hci_req_is_complete(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + skb = skb_peek(&hdev->cmd_q); + if (!skb) + return true; + + return bt_cb(skb)->req.start; +} + +static void hci_resend_last(struct hci_dev *hdev) +{ + struct hci_command_hdr *sent; + struct sk_buff *skb; + u16 opcode; + + if (!hdev->sent_cmd) + return; + + sent = (void *) hdev->sent_cmd->data; + opcode = __le16_to_cpu(sent->opcode); + if (opcode == HCI_OP_RESET) + return; + + skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); + if (!skb) + return; + + skb_queue_head(&hdev->cmd_q, skb); + queue_work(hdev->workqueue, &hdev->cmd_work); +} + +void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status) +{ + hci_req_complete_t req_complete = NULL; + struct sk_buff *skb; + unsigned long flags; + + BT_DBG("opcode 0x%04x status 0x%02x", opcode, status); + + /* If the completed command doesn't match the last one that was + * sent we need to do special handling of it. + */ + if (!hci_sent_cmd_data(hdev, opcode)) { + /* Some CSR based controllers generate a spontaneous + * reset complete event during init and any pending + * command will never be completed. In such a case we + * need to resend whatever was the last sent + * command. + */ + if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET) + hci_resend_last(hdev); + + return; + } + + /* If the command succeeded and there's still more commands in + * this request the request is not yet complete. + */ + if (!status && !hci_req_is_complete(hdev)) + return; + + /* If this was the last command in a request the complete + * callback would be found in hdev->sent_cmd instead of the + * command queue (hdev->cmd_q). 
+ */ + if (hdev->sent_cmd) { + req_complete = bt_cb(hdev->sent_cmd)->req.complete; + if (req_complete) + goto call_complete; + } + + /* Remove all pending commands belonging to this request */ + spin_lock_irqsave(&hdev->cmd_q.lock, flags); + while ((skb = __skb_dequeue(&hdev->cmd_q))) { + if (bt_cb(skb)->req.start) { + __skb_queue_head(&hdev->cmd_q, skb); + break; + } + + req_complete = bt_cb(skb)->req.complete; + kfree_skb(skb); + } + spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); + +call_complete: + if (req_complete) + req_complete(hdev, status); +} + static void hci_rx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 81b44481d0d9..b93cd2eb5d58 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -48,13 +48,13 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) } clear_bit(HCI_INQUIRY, &hdev->flags); + smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + wake_up_bit(&hdev->flags, HCI_INQUIRY); hci_dev_lock(hdev); hci_discovery_set_state(hdev, DISCOVERY_STOPPED); hci_dev_unlock(hdev); - hci_req_complete(hdev, HCI_OP_INQUIRY_CANCEL, status); - hci_conn_check_pending(hdev); } @@ -183,8 +183,6 @@ static void hci_cc_write_def_link_policy(struct hci_dev *hdev, if (!status) hdev->link_policy = get_unaligned_le16(sent); - - hci_req_complete(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, status); } static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) @@ -195,11 +193,8 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_RESET, &hdev->flags); - hci_req_complete(hdev, HCI_OP_RESET, status); - /* Reset all non-persistent flags */ - hdev->dev_flags &= ~(BIT(HCI_LE_SCAN) | BIT(HCI_PENDING_CLASS) | - BIT(HCI_PERIODIC_INQ)); + hdev->dev_flags &= ~HCI_PERSISTENT_MASK; hdev->discovery.state = DISCOVERY_STOPPED; hdev->inq_tx_power = HCI_TX_POWER_INVALID; @@ -228,11 +223,6 @@ static void hci_cc_write_local_name(struct hci_dev *hdev, struct sk_buff *skb) memcpy(hdev->dev_name, sent, HCI_MAX_NAME_LENGTH); hci_dev_unlock(hdev); - - if (!status && !test_bit(HCI_INIT, &hdev->flags)) - hci_update_ad(hdev); - - hci_req_complete(hdev, HCI_OP_WRITE_LOCAL_NAME, status); } static void hci_cc_read_local_name(struct hci_dev *hdev, struct sk_buff *skb) @@ -270,8 +260,6 @@ static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb) if (test_bit(HCI_MGMT, &hdev->dev_flags)) mgmt_auth_enable_complete(hdev, status); - - hci_req_complete(hdev, HCI_OP_WRITE_AUTH_ENABLE, status); } static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb) @@ -293,8 +281,6 @@ static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb) else clear_bit(HCI_ENCRYPT, &hdev->flags); } - - hci_req_complete(hdev, HCI_OP_WRITE_ENCRYPT_MODE, status); } static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb) @@ -343,7 +329,6 @@ static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb) done: hci_dev_unlock(hdev); - hci_req_complete(hdev, HCI_OP_WRITE_SCAN_ENABLE, status); } static void hci_cc_read_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb) @@ -435,15 +420,6 @@ static void hci_cc_write_voice_setting(struct hci_dev *hdev, hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING); } -static void hci_cc_host_buffer_size(struct hci_dev *hdev, struct sk_buff *skb) -{ - __u8 status = *((__u8 *) skb->data); - - BT_DBG("%s 
status 0x%2.2x", hdev->name, status); - - hci_req_complete(hdev, HCI_OP_HOST_BUFFER_SIZE, status); -} - static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb) { __u8 status = *((__u8 *) skb->data); @@ -457,9 +433,9 @@ static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb) if (!status) { if (sent->mode) - hdev->host_features[0] |= LMP_HOST_SSP; + hdev->features[1][0] |= LMP_HOST_SSP; else - hdev->host_features[0] &= ~LMP_HOST_SSP; + hdev->features[1][0] &= ~LMP_HOST_SSP; } if (test_bit(HCI_MGMT, &hdev->dev_flags)) @@ -472,202 +448,6 @@ static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb) } } -static u8 hci_get_inquiry_mode(struct hci_dev *hdev) -{ - if (lmp_ext_inq_capable(hdev)) - return 2; - - if (lmp_inq_rssi_capable(hdev)) - return 1; - - if (hdev->manufacturer == 11 && hdev->hci_rev == 0x00 && - hdev->lmp_subver == 0x0757) - return 1; - - if (hdev->manufacturer == 15) { - if (hdev->hci_rev == 0x03 && hdev->lmp_subver == 0x6963) - return 1; - if (hdev->hci_rev == 0x09 && hdev->lmp_subver == 0x6963) - return 1; - if (hdev->hci_rev == 0x00 && hdev->lmp_subver == 0x6965) - return 1; - } - - if (hdev->manufacturer == 31 && hdev->hci_rev == 0x2005 && - hdev->lmp_subver == 0x1805) - return 1; - - return 0; -} - -static void hci_setup_inquiry_mode(struct hci_dev *hdev) -{ - u8 mode; - - mode = hci_get_inquiry_mode(hdev); - - hci_send_cmd(hdev, HCI_OP_WRITE_INQUIRY_MODE, 1, &mode); -} - -static void hci_setup_event_mask(struct hci_dev *hdev) -{ - /* The second byte is 0xff instead of 0x9f (two reserved bits - * disabled) since a Broadcom 1.2 dongle doesn't respond to the - * command otherwise */ - u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 }; - - /* CSR 1.1 dongles does not accept any bitfield so don't try to set - * any event mask for pre 1.2 devices */ - if (hdev->hci_ver < BLUETOOTH_VER_1_2) - return; - - if (lmp_bredr_capable(hdev)) { - events[4] |= 0x01; /* Flow Specification Complete */ - events[4] |= 0x02; /* Inquiry Result with RSSI */ - events[4] |= 0x04; /* Read Remote Extended Features Complete */ - events[5] |= 0x08; /* Synchronous Connection Complete */ - events[5] |= 0x10; /* Synchronous Connection Changed */ - } - - if (lmp_inq_rssi_capable(hdev)) - events[4] |= 0x02; /* Inquiry Result with RSSI */ - - if (lmp_sniffsubr_capable(hdev)) - events[5] |= 0x20; /* Sniff Subrating */ - - if (lmp_pause_enc_capable(hdev)) - events[5] |= 0x80; /* Encryption Key Refresh Complete */ - - if (lmp_ext_inq_capable(hdev)) - events[5] |= 0x40; /* Extended Inquiry Result */ - - if (lmp_no_flush_capable(hdev)) - events[7] |= 0x01; /* Enhanced Flush Complete */ - - if (lmp_lsto_capable(hdev)) - events[6] |= 0x80; /* Link Supervision Timeout Changed */ - - if (lmp_ssp_capable(hdev)) { - events[6] |= 0x01; /* IO Capability Request */ - events[6] |= 0x02; /* IO Capability Response */ - events[6] |= 0x04; /* User Confirmation Request */ - events[6] |= 0x08; /* User Passkey Request */ - events[6] |= 0x10; /* Remote OOB Data Request */ - events[6] |= 0x20; /* Simple Pairing Complete */ - events[7] |= 0x04; /* User Passkey Notification */ - events[7] |= 0x08; /* Keypress Notification */ - events[7] |= 0x10; /* Remote Host Supported - * Features Notification */ - } - - if (lmp_le_capable(hdev)) - events[7] |= 0x20; /* LE Meta-Event */ - - hci_send_cmd(hdev, HCI_OP_SET_EVENT_MASK, sizeof(events), events); - - if (lmp_le_capable(hdev)) { - memset(events, 0, sizeof(events)); - events[0] = 0x1f; - hci_send_cmd(hdev, 
HCI_OP_LE_SET_EVENT_MASK, - sizeof(events), events); - } -} - -static void bredr_setup(struct hci_dev *hdev) -{ - struct hci_cp_delete_stored_link_key cp; - __le16 param; - __u8 flt_type; - - /* Read Buffer Size (ACL mtu, max pkt, etc.) */ - hci_send_cmd(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL); - - /* Read Class of Device */ - hci_send_cmd(hdev, HCI_OP_READ_CLASS_OF_DEV, 0, NULL); - - /* Read Local Name */ - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_NAME, 0, NULL); - - /* Read Voice Setting */ - hci_send_cmd(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL); - - /* Clear Event Filters */ - flt_type = HCI_FLT_CLEAR_ALL; - hci_send_cmd(hdev, HCI_OP_SET_EVENT_FLT, 1, &flt_type); - - /* Connection accept timeout ~20 secs */ - param = __constant_cpu_to_le16(0x7d00); - hci_send_cmd(hdev, HCI_OP_WRITE_CA_TIMEOUT, 2, &param); - - bacpy(&cp.bdaddr, BDADDR_ANY); - cp.delete_all = 1; - hci_send_cmd(hdev, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(cp), &cp); -} - -static void le_setup(struct hci_dev *hdev) -{ - /* Read LE Buffer Size */ - hci_send_cmd(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL); - - /* Read LE Advertising Channel TX Power */ - hci_send_cmd(hdev, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); -} - -static void hci_setup(struct hci_dev *hdev) -{ - if (hdev->dev_type != HCI_BREDR) - return; - - /* Read BD Address */ - hci_send_cmd(hdev, HCI_OP_READ_BD_ADDR, 0, NULL); - - if (lmp_bredr_capable(hdev)) - bredr_setup(hdev); - - if (lmp_le_capable(hdev)) - le_setup(hdev); - - hci_setup_event_mask(hdev); - - if (hdev->hci_ver > BLUETOOTH_VER_1_1) - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL); - - if (lmp_ssp_capable(hdev)) { - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { - u8 mode = 0x01; - hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, - sizeof(mode), &mode); - } else { - struct hci_cp_write_eir cp; - - memset(hdev->eir, 0, sizeof(hdev->eir)); - memset(&cp, 0, sizeof(cp)); - - hci_send_cmd(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp); - } - } - - if (lmp_inq_rssi_capable(hdev)) - hci_setup_inquiry_mode(hdev); - - if (lmp_inq_tx_pwr_capable(hdev)) - hci_send_cmd(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL); - - if (lmp_ext_feat_capable(hdev)) { - struct hci_cp_read_local_ext_features cp; - - cp.page = 0x01; - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, sizeof(cp), - &cp); - } - - if (test_bit(HCI_LINK_SECURITY, &hdev->dev_flags)) { - u8 enable = 1; - hci_send_cmd(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(enable), - &enable); - } -} - static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_local_version *rp = (void *) skb->data; @@ -675,7 +455,7 @@ static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); if (rp->status) - goto done; + return; hdev->hci_ver = rp->hci_ver; hdev->hci_rev = __le16_to_cpu(rp->hci_rev); @@ -685,30 +465,6 @@ static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s manufacturer 0x%4.4x hci ver %d:%d", hdev->name, hdev->manufacturer, hdev->hci_ver, hdev->hci_rev); - - if (test_bit(HCI_INIT, &hdev->flags)) - hci_setup(hdev); - -done: - hci_req_complete(hdev, HCI_OP_READ_LOCAL_VERSION, rp->status); -} - -static void hci_setup_link_policy(struct hci_dev *hdev) -{ - struct hci_cp_write_def_link_policy cp; - u16 link_policy = 0; - - if (lmp_rswitch_capable(hdev)) - link_policy |= HCI_LP_RSWITCH; - if (lmp_hold_capable(hdev)) - link_policy |= HCI_LP_HOLD; - if (lmp_sniff_capable(hdev)) - link_policy |= HCI_LP_SNIFF; - if 
(lmp_park_capable(hdev)) - link_policy |= HCI_LP_PARK; - - cp.policy = cpu_to_le16(link_policy); - hci_send_cmd(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, sizeof(cp), &cp); } static void hci_cc_read_local_commands(struct hci_dev *hdev, @@ -718,16 +474,8 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (rp->status) - goto done; - - memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); - - if (test_bit(HCI_INIT, &hdev->flags) && (hdev->commands[5] & 0x10)) - hci_setup_link_policy(hdev); - -done: - hci_req_complete(hdev, HCI_OP_READ_LOCAL_COMMANDS, rp->status); + if (!rp->status) + memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); } static void hci_cc_read_local_features(struct hci_dev *hdev, @@ -745,18 +493,18 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, /* Adjust default settings according to features * supported by device. */ - if (hdev->features[0] & LMP_3SLOT) + if (hdev->features[0][0] & LMP_3SLOT) hdev->pkt_type |= (HCI_DM3 | HCI_DH3); - if (hdev->features[0] & LMP_5SLOT) + if (hdev->features[0][0] & LMP_5SLOT) hdev->pkt_type |= (HCI_DM5 | HCI_DH5); - if (hdev->features[1] & LMP_HV2) { + if (hdev->features[0][1] & LMP_HV2) { hdev->pkt_type |= (HCI_HV2); hdev->esco_type |= (ESCO_HV2); } - if (hdev->features[1] & LMP_HV3) { + if (hdev->features[0][1] & LMP_HV3) { hdev->pkt_type |= (HCI_HV3); hdev->esco_type |= (ESCO_HV3); } @@ -764,42 +512,26 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, if (lmp_esco_capable(hdev)) hdev->esco_type |= (ESCO_EV3); - if (hdev->features[4] & LMP_EV4) + if (hdev->features[0][4] & LMP_EV4) hdev->esco_type |= (ESCO_EV4); - if (hdev->features[4] & LMP_EV5) + if (hdev->features[0][4] & LMP_EV5) hdev->esco_type |= (ESCO_EV5); - if (hdev->features[5] & LMP_EDR_ESCO_2M) + if (hdev->features[0][5] & LMP_EDR_ESCO_2M) hdev->esco_type |= (ESCO_2EV3); - if (hdev->features[5] & LMP_EDR_ESCO_3M) + if (hdev->features[0][5] & LMP_EDR_ESCO_3M) hdev->esco_type |= (ESCO_3EV3); - if (hdev->features[5] & LMP_EDR_3S_ESCO) + if (hdev->features[0][5] & LMP_EDR_3S_ESCO) hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5); BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name, - hdev->features[0], hdev->features[1], - hdev->features[2], hdev->features[3], - hdev->features[4], hdev->features[5], - hdev->features[6], hdev->features[7]); -} - -static void hci_set_le_support(struct hci_dev *hdev) -{ - struct hci_cp_write_le_host_supported cp; - - memset(&cp, 0, sizeof(cp)); - - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { - cp.le = 1; - cp.simul = lmp_le_br_capable(hdev); - } - - if (cp.le != lmp_host_le_capable(hdev)) - hci_send_cmd(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), - &cp); + hdev->features[0][0], hdev->features[0][1], + hdev->features[0][2], hdev->features[0][3], + hdev->features[0][4], hdev->features[0][5], + hdev->features[0][6], hdev->features[0][7]); } static void hci_cc_read_local_ext_features(struct hci_dev *hdev, @@ -810,22 +542,12 @@ static void hci_cc_read_local_ext_features(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); if (rp->status) - goto done; - - switch (rp->page) { - case 0: - memcpy(hdev->features, rp->features, 8); - break; - case 1: - memcpy(hdev->host_features, rp->features, 8); - break; - } + return; - if (test_bit(HCI_INIT, &hdev->flags) && lmp_le_capable(hdev)) - hci_set_le_support(hdev); + hdev->max_page = rp->max_page; -done: - hci_req_complete(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, 
rp->status); + if (rp->page < HCI_MAX_PAGES) + memcpy(hdev->features[rp->page], rp->features, 8); } static void hci_cc_read_flow_control_mode(struct hci_dev *hdev, @@ -835,12 +557,8 @@ static void hci_cc_read_flow_control_mode(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (rp->status) - return; - - hdev->flow_ctl_mode = rp->mode; - - hci_req_complete(hdev, HCI_OP_READ_FLOW_CONTROL_MODE, rp->status); + if (!rp->status) + hdev->flow_ctl_mode = rp->mode; } static void hci_cc_read_buffer_size(struct hci_dev *hdev, struct sk_buff *skb) @@ -877,8 +595,65 @@ static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) if (!rp->status) bacpy(&hdev->bdaddr, &rp->bdaddr); +} + +static void hci_cc_read_page_scan_activity(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_page_scan_activity *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - hci_req_complete(hdev, HCI_OP_READ_BD_ADDR, rp->status); + if (test_bit(HCI_INIT, &hdev->flags) && !rp->status) { + hdev->page_scan_interval = __le16_to_cpu(rp->interval); + hdev->page_scan_window = __le16_to_cpu(rp->window); + } +} + +static void hci_cc_write_page_scan_activity(struct hci_dev *hdev, + struct sk_buff *skb) +{ + u8 status = *((u8 *) skb->data); + struct hci_cp_write_page_scan_activity *sent; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY); + if (!sent) + return; + + hdev->page_scan_interval = __le16_to_cpu(sent->interval); + hdev->page_scan_window = __le16_to_cpu(sent->window); +} + +static void hci_cc_read_page_scan_type(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_page_scan_type *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (test_bit(HCI_INIT, &hdev->flags) && !rp->status) + hdev->page_scan_type = rp->type; +} + +static void hci_cc_write_page_scan_type(struct hci_dev *hdev, + struct sk_buff *skb) +{ + u8 status = *((u8 *) skb->data); + u8 *type; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + type = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE); + if (type) + hdev->page_scan_type = *type; } static void hci_cc_read_data_block_size(struct hci_dev *hdev, @@ -899,17 +674,6 @@ static void hci_cc_read_data_block_size(struct hci_dev *hdev, BT_DBG("%s blk mtu %d cnt %d len %d", hdev->name, hdev->block_mtu, hdev->block_cnt, hdev->block_len); - - hci_req_complete(hdev, HCI_OP_READ_DATA_BLOCK_SIZE, rp->status); -} - -static void hci_cc_write_ca_timeout(struct hci_dev *hdev, struct sk_buff *skb) -{ - __u8 status = *((__u8 *) skb->data); - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - hci_req_complete(hdev, HCI_OP_WRITE_CA_TIMEOUT, status); } static void hci_cc_read_local_amp_info(struct hci_dev *hdev, @@ -933,8 +697,6 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev, hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to); hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to); - hci_req_complete(hdev, HCI_OP_READ_LOCAL_AMP_INFO, rp->status); - a2mp_rsp: a2mp_send_getinfo_rsp(hdev); } @@ -976,35 +738,6 @@ a2mp_rsp: a2mp_send_create_phy_link_req(hdev, rp->status); } -static void hci_cc_delete_stored_link_key(struct hci_dev *hdev, - struct sk_buff *skb) -{ - __u8 status = *((__u8 *) skb->data); - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - hci_req_complete(hdev, HCI_OP_DELETE_STORED_LINK_KEY, status); -} - -static void 
hci_cc_set_event_mask(struct hci_dev *hdev, struct sk_buff *skb) -{ - __u8 status = *((__u8 *) skb->data); - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - hci_req_complete(hdev, HCI_OP_SET_EVENT_MASK, status); -} - -static void hci_cc_write_inquiry_mode(struct hci_dev *hdev, - struct sk_buff *skb) -{ - __u8 status = *((__u8 *) skb->data); - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - hci_req_complete(hdev, HCI_OP_WRITE_INQUIRY_MODE, status); -} - static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, struct sk_buff *skb) { @@ -1014,17 +747,6 @@ static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, if (!rp->status) hdev->inq_tx_power = rp->tx_power; - - hci_req_complete(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, rp->status); -} - -static void hci_cc_set_event_flt(struct hci_dev *hdev, struct sk_buff *skb) -{ - __u8 status = *((__u8 *) skb->data); - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - hci_req_complete(hdev, HCI_OP_SET_EVENT_FLT, status); } static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb) @@ -1086,33 +808,28 @@ static void hci_cc_le_read_buffer_size(struct hci_dev *hdev, hdev->le_cnt = hdev->le_pkts; BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts); - - hci_req_complete(hdev, HCI_OP_LE_READ_BUFFER_SIZE, rp->status); } -static void hci_cc_le_read_adv_tx_power(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_read_local_features(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_rp_le_read_adv_tx_power *rp = (void *) skb->data; + struct hci_rp_le_read_local_features *rp = (void *) skb->data; BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) { - hdev->adv_tx_power = rp->tx_power; - if (!test_bit(HCI_INIT, &hdev->flags)) - hci_update_ad(hdev); - } - - hci_req_complete(hdev, HCI_OP_LE_READ_ADV_TX_POWER, rp->status); + if (!rp->status) + memcpy(hdev->le_features, rp->features, 8); } -static void hci_cc_le_set_event_mask(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cc_le_read_adv_tx_power(struct hci_dev *hdev, + struct sk_buff *skb) { - __u8 status = *((__u8 *) skb->data); + struct hci_rp_le_read_adv_tx_power *rp = (void *) skb->data; - BT_DBG("%s status 0x%2.2x", hdev->name, status); + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - hci_req_complete(hdev, HCI_OP_LE_SET_EVENT_MASK, status); + if (!rp->status) + hdev->adv_tx_power = rp->tx_power; } static void hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb) @@ -1209,12 +926,15 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_LE_PERIPHERAL, &hdev->dev_flags); } - hci_dev_unlock(hdev); + if (!test_bit(HCI_INIT, &hdev->flags)) { + struct hci_request req; - if (!test_bit(HCI_INIT, &hdev->flags)) - hci_update_ad(hdev); + hci_req_init(&req, hdev); + hci_update_ad(&req); + hci_req_run(&req, NULL); + } - hci_req_complete(hdev, HCI_OP_LE_SET_ADV_ENABLE, status); + hci_dev_unlock(hdev); } static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) @@ -1223,8 +943,6 @@ static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, status); - hci_req_complete(hdev, HCI_OP_LE_SET_SCAN_PARAM, status); - if (status) { hci_dev_lock(hdev); mgmt_start_discovery_failed(hdev, status); @@ -1246,9 +964,7 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, return; switch (cp->enable) { - case LE_SCANNING_ENABLED: - hci_req_complete(hdev, 
HCI_OP_LE_SET_SCAN_ENABLE, status); - + case LE_SCAN_ENABLE: if (status) { hci_dev_lock(hdev); mgmt_start_discovery_failed(hdev, status); @@ -1263,7 +979,7 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, hci_dev_unlock(hdev); break; - case LE_SCANNING_DISABLED: + case LE_SCAN_DISABLE: if (status) { hci_dev_lock(hdev); mgmt_stop_discovery_failed(hdev, status); @@ -1290,28 +1006,26 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, } } -static void hci_cc_le_ltk_reply(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_rp_le_ltk_reply *rp = (void *) skb->data; + struct hci_rp_le_read_white_list_size *rp = (void *) skb->data; - BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size); - if (rp->status) - return; - - hci_req_complete(hdev, HCI_OP_LE_LTK_REPLY, rp->status); + if (!rp->status) + hdev->le_white_list_size = rp->size; } -static void hci_cc_le_ltk_neg_reply(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cc_le_read_supported_states(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_rp_le_ltk_neg_reply *rp = (void *) skb->data; + struct hci_rp_le_read_supported_states *rp = (void *) skb->data; BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (rp->status) - return; - - hci_req_complete(hdev, HCI_OP_LE_LTK_NEG_REPLY, rp->status); + if (!rp->status) + memcpy(hdev->le_states, rp->le_states, 8); } static void hci_cc_write_le_host_supported(struct hci_dev *hdev, @@ -1328,21 +1042,19 @@ static void hci_cc_write_le_host_supported(struct hci_dev *hdev, if (!status) { if (sent->le) - hdev->host_features[0] |= LMP_HOST_LE; + hdev->features[1][0] |= LMP_HOST_LE; else - hdev->host_features[0] &= ~LMP_HOST_LE; + hdev->features[1][0] &= ~LMP_HOST_LE; if (sent->simul) - hdev->host_features[0] |= LMP_HOST_LE_BREDR; + hdev->features[1][0] |= LMP_HOST_LE_BREDR; else - hdev->host_features[0] &= ~LMP_HOST_LE_BREDR; + hdev->features[1][0] &= ~LMP_HOST_LE_BREDR; } if (test_bit(HCI_MGMT, &hdev->dev_flags) && !test_bit(HCI_INIT, &hdev->flags)) mgmt_le_enable_complete(hdev, sent->le, status); - - hci_req_complete(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, status); } static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev, @@ -1364,7 +1076,6 @@ static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status) BT_DBG("%s status 0x%2.2x", hdev->name, status); if (status) { - hci_req_complete(hdev, HCI_OP_INQUIRY, status); hci_conn_check_pending(hdev); hci_dev_lock(hdev); if (test_bit(HCI_MGMT, &hdev->dev_flags)) @@ -1475,7 +1186,7 @@ static void hci_cs_auth_requested(struct hci_dev *hdev, __u8 status) if (conn) { if (conn->state == BT_CONFIG) { hci_proto_connect_cfm(conn, status); - hci_conn_put(conn); + hci_conn_drop(conn); } } @@ -1502,7 +1213,7 @@ static void hci_cs_set_conn_encrypt(struct hci_dev *hdev, __u8 status) if (conn) { if (conn->state == BT_CONFIG) { hci_proto_connect_cfm(conn, status); - hci_conn_put(conn); + hci_conn_drop(conn); } } @@ -1664,7 +1375,7 @@ static void hci_cs_read_remote_features(struct hci_dev *hdev, __u8 status) if (conn) { if (conn->state == BT_CONFIG) { hci_proto_connect_cfm(conn, status); - hci_conn_put(conn); + hci_conn_drop(conn); } } @@ -1691,7 +1402,7 @@ static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status) if (conn) { if (conn->state == BT_CONFIG) { hci_proto_connect_cfm(conn, status); - hci_conn_put(conn); + hci_conn_drop(conn); } } 
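The hunks above drop the per-opcode hci_req_complete() calls from the command-complete handlers: each handler now only records the controller's reply, and completion is reported centrally once hci_cmd_complete_evt()/hci_cmd_status_evt() invoke hci_req_cmd_complete() (see the dispatch changes further below). For callers, the pattern visible in hci_cc_le_set_adv_enable() generalizes to the sketch below. It is illustrative only: it uses just the helpers shown in this patch (hci_req_init(), hci_update_ad(), hci_req_run()), and the completion-callback signature is inferred from the NULL argument passed to hci_req_run() in the hunk above.

static void adv_update_complete(struct hci_dev *hdev, u8 status)
{
	/* Runs once hci_req_cmd_complete() has matched the Command
	 * Complete event of the last queued command. */
	BT_DBG("%s status 0x%2.2x", hdev->name, status);
}

static void adv_update(struct hci_dev *hdev)
{
	struct hci_request req;

	/* Collect commands on a request object instead of sending them
	 * individually and hooking per-opcode completion handlers. */
	hci_req_init(&req, hdev);
	hci_update_ad(&req);

	/* Submit the queued commands; pass NULL instead of a callback
	 * for fire-and-forget, as hci_cc_le_set_adv_enable() does. */
	hci_req_run(&req, adv_update_complete);
}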
@@ -1836,11 +1547,6 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, __u8 status) } } -static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status) -{ - BT_DBG("%s status 0x%2.2x", hdev->name, status); -} - static void hci_cs_create_phylink(struct hci_dev *hdev, u8 status) { struct hci_cp_create_phy_link *cp; @@ -1882,11 +1588,6 @@ static void hci_cs_accept_phylink(struct hci_dev *hdev, u8 status) amp_write_remote_assoc(hdev, cp->phy_handle); } -static void hci_cs_create_logical_link(struct hci_dev *hdev, u8 status) -{ - BT_DBG("%s status 0x%2.2x", hdev->name, status); -} - static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { __u8 status = *((__u8 *) skb->data); @@ -1895,13 +1596,14 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, status); - hci_req_complete(hdev, HCI_OP_INQUIRY, status); - hci_conn_check_pending(hdev); if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) return; + smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + wake_up_bit(&hdev->flags, HCI_INQUIRY); + if (!test_bit(HCI_MGMT, &hdev->dev_flags)) return; @@ -2000,7 +1702,6 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) } else conn->state = BT_CONNECTED; - hci_conn_hold_device(conn); hci_conn_add_sysfs(conn); if (test_bit(HCI_AUTH, &hdev->flags)) @@ -2047,42 +1748,6 @@ unlock: hci_conn_check_pending(hdev); } -void hci_conn_accept(struct hci_conn *conn, int mask) -{ - struct hci_dev *hdev = conn->hdev; - - BT_DBG("conn %p", conn); - - conn->state = BT_CONFIG; - - if (!lmp_esco_capable(hdev)) { - struct hci_cp_accept_conn_req cp; - - bacpy(&cp.bdaddr, &conn->dst); - - if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER)) - cp.role = 0x00; /* Become master */ - else - cp.role = 0x01; /* Remain slave */ - - hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); - } else /* lmp_esco_capable(hdev)) */ { - struct hci_cp_accept_sync_conn_req cp; - - bacpy(&cp.bdaddr, &conn->dst); - cp.pkt_type = cpu_to_le16(conn->pkt_type); - - cp.tx_bandwidth = __constant_cpu_to_le32(0x00001f40); - cp.rx_bandwidth = __constant_cpu_to_le32(0x00001f40); - cp.max_latency = __constant_cpu_to_le16(0xffff); - cp.content_format = cpu_to_le16(hdev->voice_setting); - cp.retrans_effort = 0xff; - - hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, - sizeof(cp), &cp); - } -} - static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_conn_request *ev = (void *) skb->data; @@ -2154,7 +1819,6 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) } else { conn->state = BT_CONNECT2; hci_proto_connect_cfm(conn, 0); - hci_conn_put(conn); } } else { /* Connection rejected */ @@ -2261,14 +1925,14 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) } else { conn->state = BT_CONNECTED; hci_proto_connect_cfm(conn, ev->status); - hci_conn_put(conn); + hci_conn_drop(conn); } } else { hci_auth_cfm(conn, ev->status); hci_conn_hold(conn); conn->disc_timeout = HCI_DISCONN_TIMEOUT; - hci_conn_put(conn); + hci_conn_drop(conn); } if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) { @@ -2351,8 +2015,8 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); if (ev->status && conn->state == BT_CONNECTED) { - hci_acl_disconn(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_put(conn); + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); 
goto unlock; } @@ -2361,7 +2025,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->state = BT_CONNECTED; hci_proto_connect_cfm(conn, ev->status); - hci_conn_put(conn); + hci_conn_drop(conn); } else hci_encrypt_cfm(conn, ev->status, ev->encrypt); } @@ -2408,7 +2072,7 @@ static void hci_remote_features_evt(struct hci_dev *hdev, goto unlock; if (!ev->status) - memcpy(conn->features, ev->features, 8); + memcpy(conn->features[0], ev->features, 8); if (conn->state != BT_CONFIG) goto unlock; @@ -2436,27 +2100,17 @@ static void hci_remote_features_evt(struct hci_dev *hdev, if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; hci_proto_connect_cfm(conn, ev->status); - hci_conn_put(conn); + hci_conn_drop(conn); } unlock: hci_dev_unlock(hdev); } -static void hci_remote_version_evt(struct hci_dev *hdev, struct sk_buff *skb) -{ - BT_DBG("%s", hdev->name); -} - -static void hci_qos_setup_complete_evt(struct hci_dev *hdev, - struct sk_buff *skb) -{ - BT_DBG("%s", hdev->name); -} - static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_cmd_complete *ev = (void *) skb->data; + u8 status = skb->data[sizeof(*ev)]; __u16 opcode; skb_pull(skb, sizeof(*ev)); @@ -2540,10 +2194,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_write_voice_setting(hdev, skb); break; - case HCI_OP_HOST_BUFFER_SIZE: - hci_cc_host_buffer_size(hdev, skb); - break; - case HCI_OP_WRITE_SSP_MODE: hci_cc_write_ssp_mode(hdev, skb); break; @@ -2572,46 +2222,42 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_read_bd_addr(hdev, skb); break; - case HCI_OP_READ_DATA_BLOCK_SIZE: - hci_cc_read_data_block_size(hdev, skb); + case HCI_OP_READ_PAGE_SCAN_ACTIVITY: + hci_cc_read_page_scan_activity(hdev, skb); break; - case HCI_OP_WRITE_CA_TIMEOUT: - hci_cc_write_ca_timeout(hdev, skb); + case HCI_OP_WRITE_PAGE_SCAN_ACTIVITY: + hci_cc_write_page_scan_activity(hdev, skb); break; - case HCI_OP_READ_FLOW_CONTROL_MODE: - hci_cc_read_flow_control_mode(hdev, skb); + case HCI_OP_READ_PAGE_SCAN_TYPE: + hci_cc_read_page_scan_type(hdev, skb); break; - case HCI_OP_READ_LOCAL_AMP_INFO: - hci_cc_read_local_amp_info(hdev, skb); + case HCI_OP_WRITE_PAGE_SCAN_TYPE: + hci_cc_write_page_scan_type(hdev, skb); break; - case HCI_OP_READ_LOCAL_AMP_ASSOC: - hci_cc_read_local_amp_assoc(hdev, skb); + case HCI_OP_READ_DATA_BLOCK_SIZE: + hci_cc_read_data_block_size(hdev, skb); break; - case HCI_OP_DELETE_STORED_LINK_KEY: - hci_cc_delete_stored_link_key(hdev, skb); + case HCI_OP_READ_FLOW_CONTROL_MODE: + hci_cc_read_flow_control_mode(hdev, skb); break; - case HCI_OP_SET_EVENT_MASK: - hci_cc_set_event_mask(hdev, skb); + case HCI_OP_READ_LOCAL_AMP_INFO: + hci_cc_read_local_amp_info(hdev, skb); break; - case HCI_OP_WRITE_INQUIRY_MODE: - hci_cc_write_inquiry_mode(hdev, skb); + case HCI_OP_READ_LOCAL_AMP_ASSOC: + hci_cc_read_local_amp_assoc(hdev, skb); break; case HCI_OP_READ_INQ_RSP_TX_POWER: hci_cc_read_inq_rsp_tx_power(hdev, skb); break; - case HCI_OP_SET_EVENT_FLT: - hci_cc_set_event_flt(hdev, skb); - break; - case HCI_OP_PIN_CODE_REPLY: hci_cc_pin_code_reply(hdev, skb); break; @@ -2628,12 +2274,12 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_le_read_buffer_size(hdev, skb); break; - case HCI_OP_LE_READ_ADV_TX_POWER: - hci_cc_le_read_adv_tx_power(hdev, skb); + case HCI_OP_LE_READ_LOCAL_FEATURES: + hci_cc_le_read_local_features(hdev, skb); break; - case 
HCI_OP_LE_SET_EVENT_MASK: - hci_cc_le_set_event_mask(hdev, skb); + case HCI_OP_LE_READ_ADV_TX_POWER: + hci_cc_le_read_adv_tx_power(hdev, skb); break; case HCI_OP_USER_CONFIRM_REPLY: @@ -2664,12 +2310,12 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_le_set_scan_enable(hdev, skb); break; - case HCI_OP_LE_LTK_REPLY: - hci_cc_le_ltk_reply(hdev, skb); + case HCI_OP_LE_READ_WHITE_LIST_SIZE: + hci_cc_le_read_white_list_size(hdev, skb); break; - case HCI_OP_LE_LTK_NEG_REPLY: - hci_cc_le_ltk_neg_reply(hdev, skb); + case HCI_OP_LE_READ_SUPPORTED_STATES: + hci_cc_le_read_supported_states(hdev, skb); break; case HCI_OP_WRITE_LE_HOST_SUPPORTED: @@ -2685,9 +2331,11 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) break; } - if (ev->opcode != HCI_OP_NOP) + if (opcode != HCI_OP_NOP) del_timer(&hdev->cmd_timer); + hci_req_cmd_complete(hdev, opcode, status); + if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) { atomic_set(&hdev->cmd_cnt, 1); if (!skb_queue_empty(&hdev->cmd_q)) @@ -2757,10 +2405,6 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cs_le_create_conn(hdev, ev->status); break; - case HCI_OP_LE_START_ENC: - hci_cs_le_start_enc(hdev, ev->status); - break; - case HCI_OP_CREATE_PHY_LINK: hci_cs_create_phylink(hdev, ev->status); break; @@ -2769,18 +2413,18 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cs_accept_phylink(hdev, ev->status); break; - case HCI_OP_CREATE_LOGICAL_LINK: - hci_cs_create_logical_link(hdev, ev->status); - break; - default: BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode); break; } - if (ev->opcode != HCI_OP_NOP) + if (opcode != HCI_OP_NOP) del_timer(&hdev->cmd_timer); + if (ev->status || + (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->req.event)) + hci_req_cmd_complete(hdev, opcode, ev->status); + if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) { atomic_set(&hdev->cmd_cnt, 1); if (!skb_queue_empty(&hdev->cmd_q)) @@ -2996,7 +2640,7 @@ static void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn->state == BT_CONNECTED) { hci_conn_hold(conn); conn->disc_timeout = HCI_PAIRING_TIMEOUT; - hci_conn_put(conn); + hci_conn_drop(conn); } if (!test_bit(HCI_PAIRABLE, &hdev->dev_flags)) @@ -3099,7 +2743,7 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) if (ev->key_type != HCI_LK_CHANGED_COMBINATION) conn->key_type = ev->key_type; - hci_conn_put(conn); + hci_conn_drop(conn); } if (test_bit(HCI_LINK_KEYS, &hdev->dev_flags)) @@ -3240,6 +2884,9 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, if (!conn) goto unlock; + if (ev->page < HCI_MAX_PAGES) + memcpy(conn->features[ev->page], ev->features, 8); + if (!ev->status && ev->page == 0x01) { struct inquiry_entry *ie; @@ -3247,8 +2894,19 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, if (ie) ie->data.ssp_mode = (ev->features[0] & LMP_HOST_SSP); - if (ev->features[0] & LMP_HOST_SSP) + if (ev->features[0] & LMP_HOST_SSP) { set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); + } else { + /* It is mandatory by the Bluetooth specification that + * Extended Inquiry Results are only used when Secure + * Simple Pairing is enabled, but some devices violate + * this. 
+ * + * To make these devices work, the internal SSP + * enabled flag needs to be cleared if the remote host + * features do not indicate SSP support */ + clear_bit(HCI_CONN_SSP_ENABLED, &conn->flags); + } } if (conn->state != BT_CONFIG) @@ -3268,7 +2926,7 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; hci_proto_connect_cfm(conn, ev->status); - hci_conn_put(conn); + hci_conn_drop(conn); } unlock: @@ -3302,7 +2960,6 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; - hci_conn_hold_device(conn); hci_conn_add_sysfs(conn); break; @@ -3331,18 +2988,6 @@ unlock: hci_dev_unlock(hdev); } -static void hci_sync_conn_changed_evt(struct hci_dev *hdev, struct sk_buff *skb) -{ - BT_DBG("%s", hdev->name); -} - -static void hci_sniff_subrate_evt(struct hci_dev *hdev, struct sk_buff *skb) -{ - struct hci_ev_sniff_subrate *ev = (void *) skb->data; - - BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); -} - static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3412,8 +3057,8 @@ static void hci_key_refresh_complete_evt(struct hci_dev *hdev, clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); if (ev->status && conn->state == BT_CONNECTED) { - hci_acl_disconn(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_put(conn); + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); goto unlock; } @@ -3422,13 +3067,13 @@ static void hci_key_refresh_complete_evt(struct hci_dev *hdev, conn->state = BT_CONNECTED; hci_proto_connect_cfm(conn, ev->status); - hci_conn_put(conn); + hci_conn_drop(conn); } else { hci_auth_cfm(conn, ev->status); hci_conn_hold(conn); conn->disc_timeout = HCI_DISCONN_TIMEOUT; - hci_conn_put(conn); + hci_conn_drop(conn); } unlock: @@ -3689,7 +3334,7 @@ static void hci_simple_pair_complete_evt(struct hci_dev *hdev, mgmt_auth_failed(hdev, &conn->dst, conn->type, conn->dst_type, ev->status); - hci_conn_put(conn); + hci_conn_drop(conn); unlock: hci_dev_unlock(hdev); @@ -3700,11 +3345,16 @@ static void hci_remote_host_features_evt(struct hci_dev *hdev, { struct hci_ev_remote_host_features *ev = (void *) skb->data; struct inquiry_entry *ie; + struct hci_conn *conn; BT_DBG("%s", hdev->name); hci_dev_lock(hdev); + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) + memcpy(conn->features[1], ev->features, 8); + ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); if (ie) ie->data.ssp_mode = (ev->features[0] & LMP_HOST_SSP); @@ -3777,9 +3427,8 @@ static void hci_phy_link_complete_evt(struct hci_dev *hdev, hci_conn_hold(hcon); hcon->disc_timeout = HCI_DISCONN_TIMEOUT; - hci_conn_put(hcon); + hci_conn_drop(hcon); - hci_conn_hold_device(hcon); hci_conn_add_sysfs(hcon); amp_physical_cfm(bredr_hcon, hcon); @@ -3913,7 +3562,6 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; - hci_conn_hold_device(conn); hci_conn_add_sysfs(conn); hci_proto_connect_cfm(conn, ev->status); @@ -3928,8 +3576,6 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) void *ptr = &skb->data[1]; s8 rssi; - hci_dev_lock(hdev); - while (num_reports--) { struct hci_ev_le_advertising_info *ev = ptr; @@ -3939,8 +3585,6 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) ptr += sizeof(*ev) + ev->length + 1; } - - hci_dev_unlock(hdev); } static void 
hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -4031,8 +3675,27 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) struct hci_event_hdr *hdr = (void *) skb->data; __u8 event = hdr->evt; + hci_dev_lock(hdev); + + /* Received events are (currently) only needed when a request is + * ongoing so avoid unnecessary memory allocation. + */ + if (hdev->req_status == HCI_REQ_PEND) { + kfree_skb(hdev->recv_evt); + hdev->recv_evt = skb_clone(skb, GFP_KERNEL); + } + + hci_dev_unlock(hdev); + skb_pull(skb, HCI_EVENT_HDR_SIZE); + if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->req.event == event) { + struct hci_command_hdr *hdr = (void *) hdev->sent_cmd->data; + u16 opcode = __le16_to_cpu(hdr->opcode); + + hci_req_cmd_complete(hdev, opcode, 0); + } + switch (event) { case HCI_EV_INQUIRY_COMPLETE: hci_inquiry_complete_evt(hdev, skb); @@ -4074,14 +3737,6 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_remote_features_evt(hdev, skb); break; - case HCI_EV_REMOTE_VERSION: - hci_remote_version_evt(hdev, skb); - break; - - case HCI_EV_QOS_SETUP_COMPLETE: - hci_qos_setup_complete_evt(hdev, skb); - break; - case HCI_EV_CMD_COMPLETE: hci_cmd_complete_evt(hdev, skb); break; @@ -4138,14 +3793,6 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_sync_conn_complete_evt(hdev, skb); break; - case HCI_EV_SYNC_CONN_CHANGED: - hci_sync_conn_changed_evt(hdev, skb); - break; - - case HCI_EV_SNIFF_SUBRATE: - hci_sniff_subrate_evt(hdev, skb); - break; - case HCI_EV_EXTENDED_INQUIRY_RESULT: hci_extended_inquiry_result_evt(hdev, skb); break; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 07f073935811..9bd7d959e384 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -70,14 +70,13 @@ static struct bt_sock_list hci_sk_list = { void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) { struct sock *sk; - struct hlist_node *node; struct sk_buff *skb_copy = NULL; BT_DBG("hdev %p len %d", hdev, skb->len); read_lock(&hci_sk_list.lock); - sk_for_each(sk, node, &hci_sk_list.head) { + sk_for_each(sk, &hci_sk_list.head) { struct hci_filter *flt; struct sk_buff *nskb; @@ -142,13 +141,12 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) void hci_send_to_control(struct sk_buff *skb, struct sock *skip_sk) { struct sock *sk; - struct hlist_node *node; BT_DBG("len %d", skb->len); read_lock(&hci_sk_list.lock); - sk_for_each(sk, node, &hci_sk_list.head) { + sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; /* Skip the original socket */ @@ -176,7 +174,6 @@ void hci_send_to_control(struct sk_buff *skb, struct sock *skip_sk) void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) { struct sock *sk; - struct hlist_node *node; struct sk_buff *skb_copy = NULL; __le16 opcode; @@ -210,7 +207,7 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) read_lock(&hci_sk_list.lock); - sk_for_each(sk, node, &hci_sk_list.head) { + sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; if (sk->sk_state != BT_BOUND) @@ -251,13 +248,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) static void send_monitor_event(struct sk_buff *skb) { struct sock *sk; - struct hlist_node *node; BT_DBG("len %d", skb->len); read_lock(&hci_sk_list.lock); - sk_for_each(sk, node, &hci_sk_list.head) { + sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; if (sk->sk_state != BT_BOUND) @@ -393,11 +389,10 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) if 
(event == HCI_DEV_UNREG) { struct sock *sk; - struct hlist_node *node; /* Detach sockets from device */ read_lock(&hci_sk_list.lock); - sk_for_each(sk, node, &hci_sk_list.head) { + sk_for_each(sk, &hci_sk_list.head) { bh_lock_sock_nested(sk); if (hci_pi(sk)->hdev == hdev) { hci_pi(sk)->hdev = NULL; @@ -859,6 +854,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else { + /* Stand-alone HCI commands must be flagged as + * single-command requests. + */ + bt_cb(skb)->req.start = true; + skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } @@ -1107,7 +1107,7 @@ int __init hci_sock_init(void) goto error; } - err = bt_procfs_init(THIS_MODULE, &init_net, "hci", &hci_sk_list, NULL); + err = bt_procfs_init(&init_net, "hci", &hci_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create HCI proc file"); bt_sock_unregister(BTPROTO_HCI); @@ -1126,8 +1126,6 @@ error: void hci_sock_cleanup(void) { bt_procfs_cleanup(&init_net, "hci"); - if (bt_sock_unregister(BTPROTO_HCI) < 0) - BT_ERR("HCI socket unregistration failed"); - + bt_sock_unregister(BTPROTO_HCI); proto_unregister(&hci_sk_proto); } diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 55cceee02a84..7ad6ecf36f20 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -2,6 +2,7 @@ #include <linux/debugfs.h> #include <linux/module.h> +#include <asm/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> @@ -47,10 +48,10 @@ static ssize_t show_link_features(struct device *dev, struct hci_conn *conn = to_hci_conn(dev); return sprintf(buf, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", - conn->features[0], conn->features[1], - conn->features[2], conn->features[3], - conn->features[4], conn->features[5], - conn->features[6], conn->features[7]); + conn->features[0][0], conn->features[0][1], + conn->features[0][2], conn->features[0][3], + conn->features[0][4], conn->features[0][5], + conn->features[0][6], conn->features[0][7]); } #define LINK_ATTR(_name, _mode, _show, _store) \ @@ -145,7 +146,6 @@ void hci_conn_del_sysfs(struct hci_conn *conn) } device_del(&conn->dev); - put_device(&conn->dev); hci_dev_put(hdev); } @@ -233,10 +233,10 @@ static ssize_t show_features(struct device *dev, struct hci_dev *hdev = to_hci_dev(dev); return sprintf(buf, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", - hdev->features[0], hdev->features[1], - hdev->features[2], hdev->features[3], - hdev->features[4], hdev->features[5], - hdev->features[6], hdev->features[7]); + hdev->features[0][0], hdev->features[0][1], + hdev->features[0][2], hdev->features[0][3], + hdev->features[0][4], hdev->features[0][5], + hdev->features[0][6], hdev->features[0][7]); } static ssize_t show_manufacturer(struct device *dev, @@ -461,19 +461,18 @@ static const struct file_operations blacklist_fops = { static void print_bt_uuid(struct seq_file *f, u8 *uuid) { - __be32 data0, data4; - __be16 data1, data2, data3, data5; + u32 data0, data5; + u16 data1, data2, data3, data4; - memcpy(&data0, &uuid[0], 4); - memcpy(&data1, &uuid[4], 2); - memcpy(&data2, &uuid[6], 2); - memcpy(&data3, &uuid[8], 2); - memcpy(&data4, &uuid[10], 4); - memcpy(&data5, &uuid[14], 2); + data5 = get_unaligned_le32(uuid); + data4 = get_unaligned_le16(uuid + 4); + data3 = get_unaligned_le16(uuid + 6); + data2 = get_unaligned_le16(uuid + 8); + data1 = get_unaligned_le16(uuid + 10); + data0 = get_unaligned_le32(uuid + 12); - seq_printf(f, 
"%.8x-%.4x-%.4x-%.4x-%.8x%.4x\n", - ntohl(data0), ntohs(data1), ntohs(data2), ntohs(data3), - ntohl(data4), ntohs(data5)); + seq_printf(f, "%.8x-%.4x-%.4x-%.4x-%.4x%.8x\n", + data0, data1, data2, data3, data4, data5); } static int uuids_show(struct seq_file *f, void *p) @@ -590,10 +589,8 @@ int __init bt_sysfs_init(void) bt_debugfs = debugfs_create_dir("bluetooth", NULL); bt_class = class_create(THIS_MODULE, "bluetooth"); - if (IS_ERR(bt_class)) - return PTR_ERR(bt_class); - return 0; + return PTR_RET(bt_class); } void bt_sysfs_cleanup(void) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index a7352ff3fd1e..940f5acb6694 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1,6 +1,7 @@ /* HIDP implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org> + Copyright (C) 2013 David Herrmann <dh.herrmann@gmail.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as @@ -20,6 +21,7 @@ SOFTWARE IS DISCLAIMED. */ +#include <linux/kref.h> #include <linux/module.h> #include <linux/file.h> #include <linux/kthread.h> @@ -59,39 +61,20 @@ static unsigned char hidp_keycode[256] = { static unsigned char hidp_mkeyspat[] = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }; -static struct hidp_session *__hidp_get_session(bdaddr_t *bdaddr) -{ - struct hidp_session *session; - - BT_DBG(""); +static int hidp_session_probe(struct l2cap_conn *conn, + struct l2cap_user *user); +static void hidp_session_remove(struct l2cap_conn *conn, + struct l2cap_user *user); +static int hidp_session_thread(void *arg); +static void hidp_session_terminate(struct hidp_session *s); - list_for_each_entry(session, &hidp_session_list, list) { - if (!bacmp(bdaddr, &session->bdaddr)) - return session; - } - - return NULL; -} - -static void __hidp_link_session(struct hidp_session *session) -{ - list_add(&session->list, &hidp_session_list); -} - -static void __hidp_unlink_session(struct hidp_session *session) -{ - hci_conn_put_device(session->conn); - - list_del(&session->list); -} - -static void __hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci) +static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci) { memset(ci, 0, sizeof(*ci)); bacpy(&ci->bdaddr, &session->bdaddr); ci->flags = session->flags; - ci->state = session->state; + ci->state = BT_CONNECTED; ci->vendor = 0x0000; ci->product = 0x0000; @@ -115,58 +98,80 @@ static void __hidp_copy_session(struct hidp_session *session, struct hidp_connin } } -static int hidp_queue_event(struct hidp_session *session, struct input_dev *dev, - unsigned int type, unsigned int code, int value) +/* assemble skb, queue message on @transmit and wake up the session thread */ +static int hidp_send_message(struct hidp_session *session, struct socket *sock, + struct sk_buff_head *transmit, unsigned char hdr, + const unsigned char *data, int size) { - unsigned char newleds; struct sk_buff *skb; + struct sock *sk = sock->sk; - BT_DBG("session %p type %d code %d value %d", session, type, code, value); - - if (type != EV_LED) - return -1; - - newleds = (!!test_bit(LED_KANA, dev->led) << 3) | - (!!test_bit(LED_COMPOSE, dev->led) << 3) | - (!!test_bit(LED_SCROLLL, dev->led) << 2) | - (!!test_bit(LED_CAPSL, dev->led) << 1) | - (!!test_bit(LED_NUML, dev->led)); - - if (session->leds == newleds) - return 0; + BT_DBG("session %p data %p size %d", session, data, size); - session->leds = newleds; + if 
(atomic_read(&session->terminate)) + return -EIO; - skb = alloc_skb(3, GFP_ATOMIC); + skb = alloc_skb(size + 1, GFP_ATOMIC); if (!skb) { BT_ERR("Can't allocate memory for new frame"); return -ENOMEM; } - *skb_put(skb, 1) = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; - *skb_put(skb, 1) = 0x01; - *skb_put(skb, 1) = newleds; - - skb_queue_tail(&session->intr_transmit, skb); + *skb_put(skb, 1) = hdr; + if (data && size > 0) + memcpy(skb_put(skb, size), data, size); - hidp_schedule(session); + skb_queue_tail(transmit, skb); + wake_up_interruptible(sk_sleep(sk)); return 0; } -static int hidp_hidinput_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) +static int hidp_send_ctrl_message(struct hidp_session *session, + unsigned char hdr, const unsigned char *data, + int size) { - struct hid_device *hid = input_get_drvdata(dev); - struct hidp_session *session = hid->driver_data; + return hidp_send_message(session, session->ctrl_sock, + &session->ctrl_transmit, hdr, data, size); +} - return hidp_queue_event(session, dev, type, code, value); +static int hidp_send_intr_message(struct hidp_session *session, + unsigned char hdr, const unsigned char *data, + int size) +{ + return hidp_send_message(session, session->intr_sock, + &session->intr_transmit, hdr, data, size); } -static int hidp_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) +static int hidp_input_event(struct input_dev *dev, unsigned int type, + unsigned int code, int value) { struct hidp_session *session = input_get_drvdata(dev); + unsigned char newleds; + unsigned char hdr, data[2]; + + BT_DBG("session %p type %d code %d value %d", + session, type, code, value); + + if (type != EV_LED) + return -1; + + newleds = (!!test_bit(LED_KANA, dev->led) << 3) | + (!!test_bit(LED_COMPOSE, dev->led) << 3) | + (!!test_bit(LED_SCROLLL, dev->led) << 2) | + (!!test_bit(LED_CAPSL, dev->led) << 1) | + (!!test_bit(LED_NUML, dev->led)); - return hidp_queue_event(session, dev, type, code, value); + if (session->leds == newleds) + return 0; + + session->leds = newleds; + + hdr = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; + data[0] = 0x01; + data[1] = newleds; + + return hidp_send_intr_message(session, hdr, data, 2); } static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) @@ -224,71 +229,9 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) input_sync(dev); } -static int __hidp_send_ctrl_message(struct hidp_session *session, - unsigned char hdr, unsigned char *data, - int size) -{ - struct sk_buff *skb; - - BT_DBG("session %p data %p size %d", session, data, size); - - if (atomic_read(&session->terminate)) - return -EIO; - - skb = alloc_skb(size + 1, GFP_ATOMIC); - if (!skb) { - BT_ERR("Can't allocate memory for new frame"); - return -ENOMEM; - } - - *skb_put(skb, 1) = hdr; - if (data && size > 0) - memcpy(skb_put(skb, size), data, size); - - skb_queue_tail(&session->ctrl_transmit, skb); - - return 0; -} - -static int hidp_send_ctrl_message(struct hidp_session *session, - unsigned char hdr, unsigned char *data, int size) -{ - int err; - - err = __hidp_send_ctrl_message(session, hdr, data, size); - - hidp_schedule(session); - - return err; -} - -static int hidp_queue_report(struct hidp_session *session, - unsigned char *data, int size) -{ - struct sk_buff *skb; - - BT_DBG("session %p hid %p data %p size %d", session, session->hid, data, size); - - skb = alloc_skb(size + 1, GFP_ATOMIC); - if (!skb) { - BT_ERR("Can't allocate memory for new frame"); - 
return -ENOMEM; - } - - *skb_put(skb, 1) = 0xa2; - if (size > 0) - memcpy(skb_put(skb, size), data, size); - - skb_queue_tail(&session->intr_transmit, skb); - - hidp_schedule(session); - - return 0; -} - static int hidp_send_report(struct hidp_session *session, struct hid_report *report) { - unsigned char buf[32]; + unsigned char buf[32], hdr; int rsize; rsize = ((report->size - 1) >> 3) + 1 + (report->id > 0); @@ -296,8 +239,9 @@ static int hidp_send_report(struct hidp_session *session, struct hid_report *rep return -EIO; hid_output_report(report, buf); + hdr = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; - return hidp_queue_report(session, buf, rsize); + return hidp_send_intr_message(session, hdr, buf, rsize); } static int hidp_get_raw_report(struct hid_device *hid, @@ -311,6 +255,9 @@ static int hidp_get_raw_report(struct hid_device *hid, int numbered_reports = hid->report_enum[report_type].numbered; int ret; + if (atomic_read(&session->terminate)) + return -EIO; + switch (report_type) { case HID_FEATURE_REPORT: report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_FEATURE; @@ -333,17 +280,19 @@ static int hidp_get_raw_report(struct hid_device *hid, session->waiting_report_number = numbered_reports ? report_number : -1; set_bit(HIDP_WAITING_FOR_RETURN, &session->flags); data[0] = report_number; - ret = hidp_send_ctrl_message(hid->driver_data, report_type, data, 1); + ret = hidp_send_ctrl_message(session, report_type, data, 1); if (ret) goto err; /* Wait for the return of the report. The returned report gets put in session->report_return. */ - while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) { + while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) && + !atomic_read(&session->terminate)) { int res; res = wait_event_interruptible_timeout(session->report_queue, - !test_bit(HIDP_WAITING_FOR_RETURN, &session->flags), + !test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) + || atomic_read(&session->terminate), 5*HZ); if (res == 0) { /* timeout */ @@ -386,14 +335,11 @@ static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, s struct hidp_session *session = hid->driver_data; int ret; - switch (report_type) { - case HID_FEATURE_REPORT: - report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; - break; - case HID_OUTPUT_REPORT: - report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_OUPUT; - break; - default: + if (report_type == HID_OUTPUT_REPORT) { + report_type = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; + return hidp_send_intr_message(session, report_type, + data, count); + } else if (report_type != HID_FEATURE_REPORT) { return -EINVAL; } @@ -402,17 +348,19 @@ static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, s /* Set up our wait, and send the report request to the device. */ set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); - ret = hidp_send_ctrl_message(hid->driver_data, report_type, data, - count); + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; + ret = hidp_send_ctrl_message(session, report_type, data, count); if (ret) goto err; /* Wait for the ACK from the device. 
*/ - while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) { + while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags) && + !atomic_read(&session->terminate)) { int res; res = wait_event_interruptible_timeout(session->report_queue, - !test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags), + !test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags) + || atomic_read(&session->terminate), 10*HZ); if (res == 0) { /* timeout */ @@ -443,8 +391,7 @@ static void hidp_idle_timeout(unsigned long arg) { struct hidp_session *session = (struct hidp_session *) arg; - atomic_inc(&session->terminate); - wake_up_process(session->task); + hidp_session_terminate(session); } static void hidp_set_timer(struct hidp_session *session) @@ -487,12 +434,12 @@ static void hidp_process_handshake(struct hidp_session *session, case HIDP_HSHK_ERR_FATAL: /* Device requests a reboot, as this is the only way this error * can be recovered. */ - __hidp_send_ctrl_message(session, + hidp_send_ctrl_message(session, HIDP_TRANS_HID_CONTROL | HIDP_CTRL_SOFT_RESET, NULL, 0); break; default: - __hidp_send_ctrl_message(session, + hidp_send_ctrl_message(session, HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); break; } @@ -512,8 +459,7 @@ static void hidp_process_hid_control(struct hidp_session *session, skb_queue_purge(&session->ctrl_transmit); skb_queue_purge(&session->intr_transmit); - atomic_inc(&session->terminate); - wake_up_process(current); + hidp_session_terminate(session); } } @@ -541,7 +487,7 @@ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, break; default: - __hidp_send_ctrl_message(session, + hidp_send_ctrl_message(session, HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); } @@ -588,7 +534,7 @@ static void hidp_recv_ctrl_frame(struct hidp_session *session, break; default: - __hidp_send_ctrl_message(session, + hidp_send_ctrl_message(session, HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_UNSUPPORTED_REQUEST, NULL, 0); break; } @@ -639,32 +585,24 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len) return kernel_sendmsg(sock, &msg, &iv, 1, len); } -static void hidp_process_intr_transmit(struct hidp_session *session) +/* dequeue message from @transmit and send via @sock */ +static void hidp_process_transmit(struct hidp_session *session, + struct sk_buff_head *transmit, + struct socket *sock) { struct sk_buff *skb; + int ret; BT_DBG("session %p", session); - while ((skb = skb_dequeue(&session->intr_transmit))) { - if (hidp_send_frame(session->intr_sock, skb->data, skb->len) < 0) { - skb_queue_head(&session->intr_transmit, skb); + while ((skb = skb_dequeue(transmit))) { + ret = hidp_send_frame(sock, skb->data, skb->len); + if (ret == -EAGAIN) { + skb_queue_head(transmit, skb); break; - } - - hidp_set_timer(session); - kfree_skb(skb); - } -} - -static void hidp_process_ctrl_transmit(struct hidp_session *session) -{ - struct sk_buff *skb; - - BT_DBG("session %p", session); - - while ((skb = skb_dequeue(&session->ctrl_transmit))) { - if (hidp_send_frame(session->ctrl_sock, skb->data, skb->len) < 0) { - skb_queue_head(&session->ctrl_transmit, skb); + } else if (ret < 0) { + hidp_session_terminate(session); + kfree_skb(skb); break; } @@ -673,121 +611,6 @@ static void hidp_process_ctrl_transmit(struct hidp_session *session) } } -static int hidp_session(void *arg) -{ - struct hidp_session *session = arg; - struct sock *ctrl_sk = session->ctrl_sock->sk; - struct sock *intr_sk = session->intr_sock->sk; - struct sk_buff *skb; - wait_queue_t 
ctrl_wait, intr_wait; - - BT_DBG("session %p", session); - - __module_get(THIS_MODULE); - set_user_nice(current, -15); - - init_waitqueue_entry(&ctrl_wait, current); - init_waitqueue_entry(&intr_wait, current); - add_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait); - add_wait_queue(sk_sleep(intr_sk), &intr_wait); - session->waiting_for_startup = 0; - wake_up_interruptible(&session->startup_queue); - set_current_state(TASK_INTERRUPTIBLE); - while (!atomic_read(&session->terminate)) { - if (ctrl_sk->sk_state != BT_CONNECTED || - intr_sk->sk_state != BT_CONNECTED) - break; - - while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) { - skb_orphan(skb); - if (!skb_linearize(skb)) - hidp_recv_intr_frame(session, skb); - else - kfree_skb(skb); - } - - hidp_process_intr_transmit(session); - - while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) { - skb_orphan(skb); - if (!skb_linearize(skb)) - hidp_recv_ctrl_frame(session, skb); - else - kfree_skb(skb); - } - - hidp_process_ctrl_transmit(session); - - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(sk_sleep(intr_sk), &intr_wait); - remove_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait); - - clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); - clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); - wake_up_interruptible(&session->report_queue); - - down_write(&hidp_session_sem); - - hidp_del_timer(session); - - if (session->input) { - input_unregister_device(session->input); - session->input = NULL; - } - - if (session->hid) { - hid_destroy_device(session->hid); - session->hid = NULL; - } - - /* Wakeup user-space polling for socket errors */ - session->intr_sock->sk->sk_err = EUNATCH; - session->ctrl_sock->sk->sk_err = EUNATCH; - - hidp_schedule(session); - - fput(session->intr_sock->file); - - wait_event_timeout(*(sk_sleep(ctrl_sk)), - (ctrl_sk->sk_state == BT_CLOSED), msecs_to_jiffies(500)); - - fput(session->ctrl_sock->file); - - __hidp_unlink_session(session); - - up_write(&hidp_session_sem); - - kfree(session->rd_data); - kfree(session); - module_put_and_exit(0); - return 0; -} - -static struct hci_conn *hidp_get_connection(struct hidp_session *session) -{ - bdaddr_t *src = &bt_sk(session->ctrl_sock->sk)->src; - bdaddr_t *dst = &bt_sk(session->ctrl_sock->sk)->dst; - struct hci_conn *conn; - struct hci_dev *hdev; - - hdev = hci_get_route(dst, src); - if (!hdev) - return NULL; - - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); - if (conn) - hci_conn_hold_device(conn); - hci_dev_unlock(hdev); - - hci_dev_put(hdev); - - return conn; -} - static int hidp_setup_input(struct hidp_session *session, struct hidp_connadd_req *req) { @@ -835,7 +658,7 @@ static int hidp_setup_input(struct hidp_session *session, input->relbit[0] |= BIT_MASK(REL_WHEEL); } - input->dev.parent = &session->conn->dev; + input->dev.parent = &session->conn->hcon->dev; input->event = hidp_input_event; @@ -894,7 +717,6 @@ static struct hid_ll_driver hidp_hid_driver = { .stop = hidp_stop, .open = hidp_open, .close = hidp_close, - .hidinput_input_event = hidp_hidinput_event, }; /* This function sets up the hid device. 
It does not add it @@ -939,7 +761,7 @@ static int hidp_setup_hid(struct hidp_session *session, snprintf(hid->uniq, sizeof(hid->uniq), "%pMR", &bt_sk(session->ctrl_sock->sk)->dst); - hid->dev.parent = &session->conn->dev; + hid->dev.parent = &session->conn->hcon->dev; hid->ll_driver = &hidp_hid_driver; hid->hid_get_raw_report = hidp_get_raw_report; @@ -961,80 +783,217 @@ fault: return err; } -int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) +/* initialize session devices */ +static int hidp_session_dev_init(struct hidp_session *session, + struct hidp_connadd_req *req) { - struct hidp_session *session, *s; - int vendor, product; - int err; + int ret; - BT_DBG(""); + if (req->rd_size > 0) { + ret = hidp_setup_hid(session, req); + if (ret && ret != -ENODEV) + return ret; + } - if (bacmp(&bt_sk(ctrl_sock->sk)->src, &bt_sk(intr_sock->sk)->src) || - bacmp(&bt_sk(ctrl_sock->sk)->dst, &bt_sk(intr_sock->sk)->dst)) - return -ENOTUNIQ; + if (!session->hid) { + ret = hidp_setup_input(session, req); + if (ret < 0) + return ret; + } - BT_DBG("rd_data %p rd_size %d", req->rd_data, req->rd_size); + return 0; +} - down_write(&hidp_session_sem); +/* destroy session devices */ +static void hidp_session_dev_destroy(struct hidp_session *session) +{ + if (session->hid) + put_device(&session->hid->dev); + else if (session->input) + input_put_device(session->input); - s = __hidp_get_session(&bt_sk(ctrl_sock->sk)->dst); - if (s && s->state == BT_CONNECTED) { - up_write(&hidp_session_sem); - return -EEXIST; - } + kfree(session->rd_data); + session->rd_data = NULL; +} - session = kzalloc(sizeof(struct hidp_session), GFP_KERNEL); - if (!session) { - up_write(&hidp_session_sem); - return -ENOMEM; - } +/* add HID/input devices to their underlying bus systems */ +static int hidp_session_dev_add(struct hidp_session *session) +{ + int ret; - bacpy(&session->bdaddr, &bt_sk(ctrl_sock->sk)->dst); + /* Both HID and input systems drop a ref-count when unregistering the + * device but they don't take a ref-count when registering them. Work + * around this by explicitly taking a refcount during registration + * which is dropped automatically by unregistering the devices. */ - session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl_sock->sk)->chan->omtu, - l2cap_pi(ctrl_sock->sk)->chan->imtu); - session->intr_mtu = min_t(uint, l2cap_pi(intr_sock->sk)->chan->omtu, - l2cap_pi(intr_sock->sk)->chan->imtu); + if (session->hid) { + ret = hid_add_device(session->hid); + if (ret) + return ret; + get_device(&session->hid->dev); + } else if (session->input) { + ret = input_register_device(session->input); + if (ret) + return ret; + input_get_device(session->input); + } - BT_DBG("ctrl mtu %d intr mtu %d", session->ctrl_mtu, session->intr_mtu); + return 0; +} - session->ctrl_sock = ctrl_sock; - session->intr_sock = intr_sock; - session->state = BT_CONNECTED; +/* remove HID/input devices from their bus systems */ +static void hidp_session_dev_del(struct hidp_session *session) +{ + if (session->hid) + hid_destroy_device(session->hid); + else if (session->input) + input_unregister_device(session->input); +} - session->conn = hidp_get_connection(session); - if (!session->conn) { - err = -ENOTCONN; - goto failed; - } +/* + * Create new session object + * Allocate session object, initialize static fields, copy input data into the + * object and take a reference to all sub-objects. + * This returns 0 on success and puts a pointer to the new session object in + * \out. 
Otherwise, an error code is returned. + * The new session object has an initial ref-count of 1. + */ +static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, + struct socket *ctrl_sock, + struct socket *intr_sock, + struct hidp_connadd_req *req, + struct l2cap_conn *conn) +{ + struct hidp_session *session; + int ret; + struct bt_sock *ctrl, *intr; + + ctrl = bt_sk(ctrl_sock->sk); + intr = bt_sk(intr_sock->sk); - setup_timer(&session->timer, hidp_idle_timeout, (unsigned long)session); + session = kzalloc(sizeof(*session), GFP_KERNEL); + if (!session) + return -ENOMEM; + /* object and runtime management */ + kref_init(&session->ref); + atomic_set(&session->state, HIDP_SESSION_IDLING); + init_waitqueue_head(&session->state_queue); + session->flags = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID); + + /* connection management */ + bacpy(&session->bdaddr, bdaddr); + session->conn = conn; + session->user.probe = hidp_session_probe; + session->user.remove = hidp_session_remove; + session->ctrl_sock = ctrl_sock; + session->intr_sock = intr_sock; skb_queue_head_init(&session->ctrl_transmit); skb_queue_head_init(&session->intr_transmit); + session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl)->chan->omtu, + l2cap_pi(ctrl)->chan->imtu); + session->intr_mtu = min_t(uint, l2cap_pi(intr)->chan->omtu, + l2cap_pi(intr)->chan->imtu); + session->idle_to = req->idle_to; + /* device management */ + setup_timer(&session->timer, hidp_idle_timeout, + (unsigned long)session); + + /* session data */ mutex_init(&session->report_mutex); init_waitqueue_head(&session->report_queue); - init_waitqueue_head(&session->startup_queue); - session->waiting_for_startup = 1; - session->flags = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID); - session->idle_to = req->idle_to; - __hidp_link_session(session); + ret = hidp_session_dev_init(session, req); + if (ret) + goto err_free; - if (req->rd_size > 0) { - err = hidp_setup_hid(session, req); - if (err && err != -ENODEV) - goto purge; - } + l2cap_conn_get(session->conn); + get_file(session->intr_sock->file); + get_file(session->ctrl_sock->file); + *out = session; + return 0; - if (!session->hid) { - err = hidp_setup_input(session, req); - if (err < 0) - goto purge; +err_free: + kfree(session); + return ret; +} + +/* increase ref-count of the given session by one */ +static void hidp_session_get(struct hidp_session *session) +{ + kref_get(&session->ref); +} + +/* release callback */ +static void session_free(struct kref *ref) +{ + struct hidp_session *session = container_of(ref, struct hidp_session, + ref); + + hidp_session_dev_destroy(session); + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); + fput(session->intr_sock->file); + fput(session->ctrl_sock->file); + l2cap_conn_put(session->conn); + kfree(session); +} + +/* decrease ref-count of the given session by one */ +static void hidp_session_put(struct hidp_session *session) +{ + kref_put(&session->ref, session_free); +} + +/* + * Search the list of active sessions for a session with target address + * \bdaddr. You must hold at least a read-lock on \hidp_session_sem. As long as + * you do not release this lock, the session objects cannot vanish and you can + * safely take a reference to the session yourself. 
+ */ +static struct hidp_session *__hidp_session_find(const bdaddr_t *bdaddr) +{ + struct hidp_session *session; + + list_for_each_entry(session, &hidp_session_list, list) { + if (!bacmp(bdaddr, &session->bdaddr)) + return session; } - hidp_set_timer(session); + return NULL; +} + +/* + * Same as __hidp_session_find() but no locks must be held. This also takes a + * reference of the returned session (if non-NULL) so you must drop this + * reference if you no longer use the object. + */ +static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr) +{ + struct hidp_session *session; + + down_read(&hidp_session_sem); + + session = __hidp_session_find(bdaddr); + if (session) + hidp_session_get(session); + + up_read(&hidp_session_sem); + + return session; +} + +/* + * Start session synchronously + * This starts a session thread and waits until initialization + * is done or returns an error if it couldn't be started. + * If this returns 0 the session thread is up and running. You must call + * hidp_session_stop_sync() before deleting any runtime resources. + */ +static int hidp_session_start_sync(struct hidp_session *session) +{ + unsigned int vendor, product; if (session->hid) { vendor = session->hid->vendor; @@ -1047,98 +1006,320 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, product = 0x0000; } - session->task = kthread_run(hidp_session, session, "khidpd_%04x%04x", - vendor, product); - if (IS_ERR(session->task)) { - err = PTR_ERR(session->task); - goto unlink; - } + session->task = kthread_run(hidp_session_thread, session, + "khidpd_%04x%04x", vendor, product); + if (IS_ERR(session->task)) + return PTR_ERR(session->task); - while (session->waiting_for_startup) { - wait_event_interruptible(session->startup_queue, - !session->waiting_for_startup); - } + while (atomic_read(&session->state) <= HIDP_SESSION_IDLING) + wait_event(session->state_queue, + atomic_read(&session->state) > HIDP_SESSION_IDLING); - if (session->hid) - err = hid_add_device(session->hid); - else - err = input_register_device(session->input); + return 0; +} - if (err < 0) { - atomic_inc(&session->terminate); - wake_up_process(session->task); - up_write(&hidp_session_sem); - return err; - } +/* + * Terminate session thread + * Wake up session thread and notify it to stop. This is asynchronous and + * returns immediately. Call this whenever a runtime error occurs and you want + * the session to stop. + * Note: wake_up_process() performs any necessary memory-barriers for us. + */ +static void hidp_session_terminate(struct hidp_session *session) +{ + atomic_inc(&session->terminate); + wake_up_process(session->task); +} - if (session->input) { - hidp_send_ctrl_message(session, - HIDP_TRANS_SET_PROTOCOL | HIDP_PROTO_BOOT, NULL, 0); - session->flags |= (1 << HIDP_BOOT_PROTOCOL_MODE); +/* + * Probe HIDP session + * This is called from the l2cap_conn core when our l2cap_user object is bound + * to the hci-connection. We get the session via the \user object and can now + * start the session thread, register the HID/input devices and link it into + * the global session list. + * The global session-list owns its own reference to the session object so you + * can drop your own reference after registering the l2cap_user object. 
+ */ +static int hidp_session_probe(struct l2cap_conn *conn, + struct l2cap_user *user) +{ + struct hidp_session *session = container_of(user, + struct hidp_session, + user); + struct hidp_session *s; + int ret; - session->leds = 0xff; - hidp_input_event(session->input, EV_LED, 0, 0); + down_write(&hidp_session_sem); + + /* check that no other session for this device exists */ + s = __hidp_session_find(&session->bdaddr); + if (s) { + ret = -EEXIST; + goto out_unlock; } + ret = hidp_session_start_sync(session); + if (ret) + goto out_unlock; + + ret = hidp_session_dev_add(session); + if (ret) + goto out_stop; + + hidp_session_get(session); + list_add(&session->list, &hidp_session_list); + ret = 0; + goto out_unlock; + +out_stop: + hidp_session_terminate(session); +out_unlock: up_write(&hidp_session_sem); - return 0; + return ret; +} -unlink: +/* + * Remove HIDP session + * Called from the l2cap_conn core when either we explicitly unregistered + * the l2cap_user object or if the underlying connection is shut down. + * We signal the hidp-session thread to shut down, unregister the HID/input + * devices and unlink the session from the global list. + * This drops the reference to the session that is owned by the global + * session-list. + * Note: We _must_ not synchronously wait for the session-thread to shut down. + * This is because the session-thread might be waiting for an HCI lock that is + * held while we are called. Therefore, we only unregister the devices and + * notify the session-thread to terminate. The thread itself owns a reference + * to the session object so it can safely shut down. + */ +static void hidp_session_remove(struct l2cap_conn *conn, + struct l2cap_user *user) +{ + struct hidp_session *session = container_of(user, + struct hidp_session, + user); + + down_write(&hidp_session_sem); + + hidp_session_terminate(session); + hidp_session_dev_del(session); + list_del(&session->list); + + up_write(&hidp_session_sem); + + hidp_session_put(session); +} + +/* + * Session Worker + * This performs the actual main-loop of the HIDP worker. We first check + * whether the underlying connection is still alive, then parse all pending + * messages and finally send all outstanding messages. + */ +static void hidp_session_run(struct hidp_session *session) +{ + struct sock *ctrl_sk = session->ctrl_sock->sk; + struct sock *intr_sk = session->intr_sock->sk; + struct sk_buff *skb; + + for (;;) { + /* + * This thread can be woken up two ways: + * - You call hidp_session_terminate() which sets the + * session->terminate flag and wakes this thread up. + * - Via modifying the socket state of ctrl/intr_sock. This + * thread is woken up by ->sk_state_changed(). + * + * Note: set_current_state() performs any necessary + * memory-barriers for us. 
+ */ + set_current_state(TASK_INTERRUPTIBLE); + + if (atomic_read(&session->terminate)) + break; + + if (ctrl_sk->sk_state != BT_CONNECTED || + intr_sk->sk_state != BT_CONNECTED) + break; + + /* parse incoming intr-skbs */ + while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) { + skb_orphan(skb); + if (!skb_linearize(skb)) + hidp_recv_intr_frame(session, skb); + else + kfree_skb(skb); + } + + /* send pending intr-skbs */ + hidp_process_transmit(session, &session->intr_transmit, + session->intr_sock); + + /* parse incoming ctrl-skbs */ + while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) { + skb_orphan(skb); + if (!skb_linearize(skb)) + hidp_recv_ctrl_frame(session, skb); + else + kfree_skb(skb); + } + + /* send pending ctrl-skbs */ + hidp_process_transmit(session, &session->ctrl_transmit, + session->ctrl_sock); + + schedule(); + } + + atomic_inc(&session->terminate); + set_current_state(TASK_RUNNING); +} + +/* + * HIDP session thread + * This thread runs the I/O for a single HIDP session. Startup is synchronous + * which allows us to take references to ourselves here instead of doing that + * in the caller. + * When we are ready to run we notify the caller and call hidp_session_run(). + */ +static int hidp_session_thread(void *arg) +{ + struct hidp_session *session = arg; + wait_queue_t ctrl_wait, intr_wait; + + BT_DBG("session %p", session); + + /* initialize runtime environment */ + hidp_session_get(session); + __module_get(THIS_MODULE); + set_user_nice(current, -15); + hidp_set_timer(session); + + init_waitqueue_entry(&ctrl_wait, current); + init_waitqueue_entry(&intr_wait, current); + add_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait); + add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); + /* This memory barrier is paired with wq_has_sleeper(). See + * sock_poll_wait() for more information why this is needed. */ + smp_mb(); + + /* notify synchronous startup that we're ready */ + atomic_inc(&session->state); + wake_up(&session->state_queue); + + /* run session */ + hidp_session_run(session); + + /* cleanup runtime environment */ + remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); + remove_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait); + wake_up_interruptible(&session->report_queue); hidp_del_timer(session); + /* + * If we stopped ourselves due to any internal signal, we should try to + * unregister our own session here to avoid having it linger until the + * parent l2cap_conn dies or user-space cleans it up. + * This does not deadlock as we don't do any synchronous shutdown. + * Instead, this call has the same semantics as if user-space tried to + * delete the session. 
+ */ + l2cap_unregister_user(session->conn, &session->user); + hidp_session_put(session); + + module_put_and_exit(0); + return 0; +} + +static int hidp_verify_sockets(struct socket *ctrl_sock, + struct socket *intr_sock) +{ + struct bt_sock *ctrl, *intr; + struct hidp_session *session; + + if (!l2cap_is_socket(ctrl_sock) || !l2cap_is_socket(intr_sock)) + return -EINVAL; + + ctrl = bt_sk(ctrl_sock->sk); + intr = bt_sk(intr_sock->sk); + + if (bacmp(&ctrl->src, &intr->src) || bacmp(&ctrl->dst, &intr->dst)) + return -ENOTUNIQ; + if (ctrl->sk.sk_state != BT_CONNECTED || + intr->sk.sk_state != BT_CONNECTED) + return -EBADFD; + + /* early session check, we check again during session registration */ + session = hidp_session_find(&ctrl->dst); + if (session) { + hidp_session_put(session); + return -EEXIST; } - if (session->hid) { - hid_destroy_device(session->hid); - session->hid = NULL; + return 0; +} + +int hidp_connection_add(struct hidp_connadd_req *req, + struct socket *ctrl_sock, + struct socket *intr_sock) +{ + struct hidp_session *session; + struct l2cap_conn *conn; + struct l2cap_chan *chan = l2cap_pi(ctrl_sock->sk)->chan; + int ret; + + ret = hidp_verify_sockets(ctrl_sock, intr_sock); + if (ret) + return ret; + + conn = NULL; + l2cap_chan_lock(chan); + if (chan->conn) { + l2cap_conn_get(chan->conn); + conn = chan->conn; } + l2cap_chan_unlock(chan); - kfree(session->rd_data); - session->rd_data = NULL; + if (!conn) + return -EBADFD; -purge: - __hidp_unlink_session(session); + ret = hidp_session_new(&session, &bt_sk(ctrl_sock->sk)->dst, ctrl_sock, + intr_sock, req, conn); + if (ret) + goto out_conn; - skb_queue_purge(&session->ctrl_transmit); - skb_queue_purge(&session->intr_transmit); + ret = l2cap_register_user(conn, &session->user); + if (ret) + goto out_session; -failed: - up_write(&hidp_session_sem); + ret = 0; - kfree(session); - return err; +out_session: + hidp_session_put(session); +out_conn: + l2cap_conn_put(conn); + return ret; } -int hidp_del_connection(struct hidp_conndel_req *req) +int hidp_connection_del(struct hidp_conndel_req *req) { struct hidp_session *session; - int err = 0; - BT_DBG(""); + session = hidp_session_find(&req->bdaddr); + if (!session) + return -ENOENT; - down_read(&hidp_session_sem); + if (req->flags & (1 << HIDP_VIRTUAL_CABLE_UNPLUG)) + hidp_send_ctrl_message(session, + HIDP_TRANS_HID_CONTROL | + HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, + NULL, 0); + else + l2cap_unregister_user(session->conn, &session->user); - session = __hidp_get_session(&req->bdaddr); - if (session) { - if (req->flags & (1 << HIDP_VIRTUAL_CABLE_UNPLUG)) { - hidp_send_ctrl_message(session, - HIDP_TRANS_HID_CONTROL | HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0); - } else { - /* Flush the transmit queues */ - skb_queue_purge(&session->ctrl_transmit); - skb_queue_purge(&session->intr_transmit); - - atomic_inc(&session->terminate); - wake_up_process(session->task); - } - } else - err = -ENOENT; + hidp_session_put(session); - up_read(&hidp_session_sem); - return err; + return 0; } int hidp_get_connlist(struct hidp_connlist_req *req) @@ -1153,7 +1334,7 @@ int hidp_get_connlist(struct hidp_connlist_req *req) list_for_each_entry(session, &hidp_session_list, list) { struct hidp_conninfo ci; - __hidp_copy_session(session, &ci); + hidp_copy_session(session, &ci); if (copy_to_user(req->ci, &ci, sizeof(ci))) { err = -EFAULT; @@ -1174,18 +1355,14 @@ int hidp_get_connlist(struct hidp_connlist_req *req) int hidp_get_conninfo(struct hidp_conninfo *ci) { struct hidp_session *session; - int err = 0; - - 
down_read(&hidp_session_sem); - session = __hidp_get_session(&ci->bdaddr); - if (session) - __hidp_copy_session(session, ci); - else - err = -ENOENT; + session = hidp_session_find(&ci->bdaddr); + if (session) { + hidp_copy_session(session, ci); + hidp_session_put(session); + } - up_read(&hidp_session_sem); - return err; + return session ? 0 : -ENOENT; } static int __init hidp_init(void) @@ -1204,6 +1381,7 @@ module_init(hidp_init); module_exit(hidp_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); +MODULE_AUTHOR("David Herrmann <dh.herrmann@gmail.com>"); MODULE_DESCRIPTION("Bluetooth HIDP ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h index af1bcc823f26..6162ce8606ac 100644 --- a/net/bluetooth/hidp/hidp.h +++ b/net/bluetooth/hidp/hidp.h @@ -24,7 +24,9 @@ #define __HIDP_H #include <linux/types.h> +#include <linux/kref.h> #include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/l2cap.h> /* HIDP header masks */ #define HIDP_HEADER_TRANS_MASK 0xf0 @@ -119,43 +121,52 @@ struct hidp_connlist_req { struct hidp_conninfo __user *ci; }; -int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); -int hidp_del_connection(struct hidp_conndel_req *req); +int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); +int hidp_connection_del(struct hidp_conndel_req *req); int hidp_get_connlist(struct hidp_connlist_req *req); int hidp_get_conninfo(struct hidp_conninfo *ci); +enum hidp_session_state { + HIDP_SESSION_IDLING, + HIDP_SESSION_RUNNING, +}; + /* HIDP session defines */ struct hidp_session { struct list_head list; + struct kref ref; - struct hci_conn *conn; + /* runtime management */ + atomic_t state; + wait_queue_head_t state_queue; + atomic_t terminate; + struct task_struct *task; + unsigned long flags; + /* connection management */ + bdaddr_t bdaddr; + struct l2cap_conn *conn; + struct l2cap_user user; struct socket *ctrl_sock; struct socket *intr_sock; - - bdaddr_t bdaddr; - - unsigned long state; - unsigned long flags; - unsigned long idle_to; - + struct sk_buff_head ctrl_transmit; + struct sk_buff_head intr_transmit; uint ctrl_mtu; uint intr_mtu; + unsigned long idle_to; - atomic_t terminate; - struct task_struct *task; - - unsigned char keys[8]; - unsigned char leds; - + /* device management */ struct input_dev *input; - struct hid_device *hid; - struct timer_list timer; - struct sk_buff_head ctrl_transmit; - struct sk_buff_head intr_transmit; + /* Report descriptor */ + __u8 *rd_data; + uint rd_size; + + /* session data */ + unsigned char keys[8]; + unsigned char leds; /* Used in hidp_get_raw_report() */ int waiting_report_type; /* HIDP_DATA_RTYPE_* */ @@ -166,24 +177,8 @@ struct hidp_session { /* Used in hidp_output_raw_report() */ int output_report_success; /* boolean */ - - /* Report descriptor */ - __u8 *rd_data; - uint rd_size; - - wait_queue_head_t startup_queue; - int waiting_for_startup; }; -static inline void hidp_schedule(struct hidp_session *session) -{ - struct sock *ctrl_sk = session->ctrl_sock->sk; - struct sock *intr_sk = session->intr_sock->sk; - - wake_up_interruptible(sk_sleep(ctrl_sk)); - wake_up_interruptible(sk_sleep(intr_sk)); -} - /* HIDP init defines */ extern int __init hidp_init_sockets(void); extern void __exit hidp_cleanup_sockets(void); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 82a829d90b0f..cb3fdde1968a 100644 --- 
a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -77,21 +77,12 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return err; } - if (csock->sk->sk_state != BT_CONNECTED || - isock->sk->sk_state != BT_CONNECTED) { - sockfd_put(csock); - sockfd_put(isock); - return -EBADFD; - } + err = hidp_connection_add(&ca, csock, isock); + if (!err && copy_to_user(argp, &ca, sizeof(ca))) + err = -EFAULT; - err = hidp_add_connection(&ca, csock, isock); - if (!err) { - if (copy_to_user(argp, &ca, sizeof(ca))) - err = -EFAULT; - } else { - sockfd_put(csock); - sockfd_put(isock); - } + sockfd_put(csock); + sockfd_put(isock); return err; @@ -102,7 +93,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; - return hidp_del_connection(&cd); + return hidp_connection_del(&cd); case HIDPGETCONNLIST: if (copy_from_user(&cl, argp, sizeof(cl))) @@ -284,7 +275,7 @@ int __init hidp_init_sockets(void) goto error; } - err = bt_procfs_init(THIS_MODULE, &init_net, "hidp", &hidp_sk_list, NULL); + err = bt_procfs_init(&init_net, "hidp", &hidp_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create HIDP proc file"); bt_sock_unregister(BTPROTO_HIDP); @@ -296,7 +287,6 @@ int __init hidp_init_sockets(void) return 0; error: - BT_ERR("Can't register HIDP socket"); proto_unregister(&hidp_proto); return err; } @@ -304,8 +294,6 @@ error: void __exit hidp_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "hidp"); - if (bt_sock_unregister(BTPROTO_HIDP) < 0) - BT_ERR("Can't unregister HIDP socket"); - + bt_sock_unregister(BTPROTO_HIDP); proto_unregister(&hidp_proto); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 22e658322845..a76d1ac0321b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -571,7 +571,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) chan->conn = NULL; if (chan->chan_type != L2CAP_CHAN_CONN_FIX_A2MP) - hci_conn_put(conn->hcon); + hci_conn_drop(conn->hcon); if (mgr && mgr->bredr_chan == chan) mgr->bredr_chan = NULL; @@ -1446,6 +1446,89 @@ static void l2cap_info_timeout(struct work_struct *work) l2cap_conn_start(conn); } +/* + * l2cap_user + * External modules can register l2cap_user objects on l2cap_conn. The ->probe + * callback is called during registration. The ->remove callback is called + * during unregistration. + * An l2cap_user object is either explicitly unregistered or removed + * automatically when the underlying l2cap_conn object is deleted. This + * guarantees that l2cap->hcon, l2cap->hchan, .. are valid as long as the + * remove callback hasn't been called. + * External modules must own a reference to the l2cap_conn object if they intend + * to call l2cap_unregister_user(). The l2cap_conn object might get destroyed at + * any time if they don't. + */ + +int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user) +{ + struct hci_dev *hdev = conn->hcon->hdev; + int ret; + + /* We need to check whether l2cap_conn is registered. If it is not, we + * must not register the l2cap_user. l2cap_conn_del() unregisters + * l2cap_conn objects, but doesn't provide its own locking. Instead, it + * relies on the parent hci_conn object to be locked. This itself relies + * on the hci_dev object to be locked. So we must lock the hci device + * here, too.
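+ * Holding hci_dev_lock() here also makes registration mutually + * exclusive with l2cap_conn_del(), so ->probe can never race with + * connection teardown.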
*/ + + hci_dev_lock(hdev); + + if (user->list.next || user->list.prev) { + ret = -EINVAL; + goto out_unlock; + } + + /* conn->hchan is NULL after l2cap_conn_del() was called */ + if (!conn->hchan) { + ret = -ENODEV; + goto out_unlock; + } + + ret = user->probe(conn, user); + if (ret) + goto out_unlock; + + list_add(&user->list, &conn->users); + ret = 0; + +out_unlock: + hci_dev_unlock(hdev); + return ret; +} +EXPORT_SYMBOL(l2cap_register_user); + +void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user) +{ + struct hci_dev *hdev = conn->hcon->hdev; + + hci_dev_lock(hdev); + + if (!user->list.next || !user->list.prev) + goto out_unlock; + + list_del(&user->list); + user->list.next = NULL; + user->list.prev = NULL; + user->remove(conn, user); + +out_unlock: + hci_dev_unlock(hdev); +} +EXPORT_SYMBOL(l2cap_unregister_user); + +static void l2cap_unregister_all_users(struct l2cap_conn *conn) +{ + struct l2cap_user *user; + + while (!list_empty(&conn->users)) { + user = list_first_entry(&conn->users, struct l2cap_user, list); + list_del(&user->list); + user->list.next = NULL; + user->list.prev = NULL; + user->remove(conn, user); + } +} + static void l2cap_conn_del(struct hci_conn *hcon, int err) { struct l2cap_conn *conn = hcon->l2cap_data; @@ -1458,6 +1541,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) kfree_skb(conn->rx_skb); + l2cap_unregister_all_users(conn); + mutex_lock(&conn->chan_lock); /* Kill channels */ @@ -1486,7 +1571,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) } hcon->l2cap_data = NULL; - kfree(conn); + conn->hchan = NULL; + l2cap_conn_put(conn); } static void security_timeout(struct work_struct *work) @@ -1502,12 +1588,12 @@ static void security_timeout(struct work_struct *work) } } -static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) +static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) { struct l2cap_conn *conn = hcon->l2cap_data; struct hci_chan *hchan; - if (conn || status) + if (conn) return conn; hchan = hci_chan_create(hcon); @@ -1520,24 +1606,21 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) return NULL; } + kref_init(&conn->ref); hcon->l2cap_data = conn; conn->hcon = hcon; + hci_conn_get(conn->hcon); conn->hchan = hchan; BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan); switch (hcon->type) { - case AMP_LINK: - conn->mtu = hcon->hdev->block_mtu; - break; - case LE_LINK: if (hcon->hdev->le_mtu) { conn->mtu = hcon->hdev->le_mtu; break; } /* fall through */ - default: conn->mtu = hcon->hdev->acl_mtu; break; @@ -1552,6 +1635,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) mutex_init(&conn->chan_lock); INIT_LIST_HEAD(&conn->chan_l); + INIT_LIST_HEAD(&conn->users); if (hcon->type == LE_LINK) INIT_DELAYED_WORK(&conn->security_timer, security_timeout); @@ -1563,6 +1647,26 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) return conn; } +static void l2cap_conn_free(struct kref *ref) +{ + struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); + + hci_conn_put(conn->hcon); + kfree(conn); +} + +void l2cap_conn_get(struct l2cap_conn *conn) +{ + kref_get(&conn->ref); +} +EXPORT_SYMBOL(l2cap_conn_get); + +void l2cap_conn_put(struct l2cap_conn *conn) +{ + kref_put(&conn->ref, l2cap_conn_free); +} +EXPORT_SYMBOL(l2cap_conn_put); + /* ---- Socket interface ---- */ /* Find socket with psm and source / destination bdaddr. 
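Taken together, l2cap_register_user() and l2cap_conn_get()/l2cap_conn_put() give external code a safe attach/detach protocol. A minimal sketch of a hypothetical user follows (my_probe, my_remove and my_attach are illustrative names, not part of this patch):

static int my_probe(struct l2cap_conn *conn, struct l2cap_user *user)
{
	/* conn->hcon and conn->hchan are guaranteed valid until ->remove */
	return 0;	/* returning non-zero aborts l2cap_register_user() */
}

static void my_remove(struct l2cap_conn *conn, struct l2cap_user *user)
{
	/* release whatever ->probe acquired; may run from conn teardown */
}

static struct l2cap_user my_user = {
	.probe	= my_probe,
	.remove	= my_remove,
};

static int my_attach(struct l2cap_conn *conn)
{
	int err;

	/* hold a conn reference for as long as we might unregister */
	l2cap_conn_get(conn);
	err = l2cap_register_user(conn, &my_user);
	if (err)
		l2cap_conn_put(conn);
	return err;
}

hidp_connection_add() and hidp_session_remove() above are the in-tree instance of this pattern.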
@@ -1700,9 +1804,9 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, goto done; } - conn = l2cap_conn_add(hcon, 0); + conn = l2cap_conn_add(hcon); if (!conn) { - hci_conn_put(hcon); + hci_conn_drop(hcon); err = -ENOMEM; goto done; } @@ -1712,7 +1816,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, if (!list_empty(&conn->chan_l)) { err = -EBUSY; - hci_conn_put(hcon); + hci_conn_drop(hcon); } if (err) @@ -6210,12 +6314,13 @@ drop: kfree_skb(skb); } -static void l2cap_att_channel(struct l2cap_conn *conn, u16 cid, +static void l2cap_att_channel(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan; - chan = l2cap_global_chan_by_scid(0, cid, conn->src, conn->dst); + chan = l2cap_global_chan_by_scid(0, L2CAP_CID_LE_DATA, + conn->src, conn->dst); if (!chan) goto drop; @@ -6264,7 +6369,7 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) break; case L2CAP_CID_LE_DATA: - l2cap_att_channel(conn, cid, skb); + l2cap_att_channel(conn, skb); break; case L2CAP_CID_SMP: @@ -6318,7 +6423,7 @@ void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); if (!status) { - conn = l2cap_conn_add(hcon, status); + conn = l2cap_conn_add(hcon); if (conn) l2cap_conn_ready(conn); } else { @@ -6487,7 +6592,7 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; if (!conn) - conn = l2cap_conn_add(hcon, 0); + conn = l2cap_conn_add(hcon); if (!conn) goto drop; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1bcfb8422fdc..36fed40c162c 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -43,6 +43,12 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio); +bool l2cap_is_socket(struct socket *sock) +{ + return sock && sock->ops == &l2cap_sock_ops; +} +EXPORT_SYMBOL(l2cap_is_socket); + static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; @@ -1292,7 +1298,7 @@ int __init l2cap_init_sockets(void) goto error; } - err = bt_procfs_init(THIS_MODULE, &init_net, "l2cap", &l2cap_sk_list, + err = bt_procfs_init(&init_net, "l2cap", &l2cap_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create L2CAP proc file"); @@ -1312,8 +1318,6 @@ error: void l2cap_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "l2cap"); - if (bt_sock_unregister(BTPROTO_L2CAP) < 0) - BT_ERR("L2CAP socket unregistration failed"); - + bt_sock_unregister(BTPROTO_L2CAP); proto_unregister(&l2cap_proto); } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f559b966279c..35fef22703e9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -35,7 +35,7 @@ bool enable_hs; #define MGMT_VERSION 1 -#define MGMT_REVISION 2 +#define MGMT_REVISION 3 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -106,11 +106,10 @@ static const u16 mgmt_events[] = { * These LE scan and inquiry parameters were chosen according to LE General * Discovery Procedure specification. 
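 * For scale: LE_SCAN_INT/LE_SCAN_WIN below are in 0.625 ms units, so * 0x12 is 18 * 0.625 ms = 11.25 ms, while the timeouts are plain * milliseconds converted with msecs_to_jiffies().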
*/ -#define LE_SCAN_TYPE 0x01 #define LE_SCAN_WIN 0x12 #define LE_SCAN_INT 0x12 -#define LE_SCAN_TIMEOUT_LE_ONLY 10240 /* TGAP(gen_disc_scan_min) */ -#define LE_SCAN_TIMEOUT_BREDR_LE 5120 /* TGAP(100)/2 */ +#define LE_SCAN_TIMEOUT_LE_ONLY msecs_to_jiffies(10240) +#define LE_SCAN_TIMEOUT_BREDR_LE msecs_to_jiffies(5120) #define INQUIRY_LEN_BREDR 0x08 /* TGAP(100) */ #define INQUIRY_LEN_BREDR_LE 0x04 /* TGAP(100)/2 */ @@ -384,7 +383,8 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_bredr_capable(hdev)) { settings |= MGMT_SETTING_CONNECTABLE; - settings |= MGMT_SETTING_FAST_CONNECTABLE; + if (hdev->hci_ver >= BLUETOOTH_VER_1_2) + settings |= MGMT_SETTING_FAST_CONNECTABLE; settings |= MGMT_SETTING_DISCOVERABLE; settings |= MGMT_SETTING_BREDR; settings |= MGMT_SETTING_LINK_SECURITY; @@ -409,6 +409,9 @@ static u32 get_current_settings(struct hci_dev *hdev) if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) settings |= MGMT_SETTING_CONNECTABLE; + if (test_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags)) + settings |= MGMT_SETTING_FAST_CONNECTABLE; + if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) settings |= MGMT_SETTING_DISCOVERABLE; @@ -435,35 +438,117 @@ static u32 get_current_settings(struct hci_dev *hdev) #define PNP_INFO_SVCLASS_ID 0x1200 -static u8 bluetooth_base_uuid[] = { - 0xFB, 0x34, 0x9B, 0x5F, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; +static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 4) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + u16 uuid16; -static u16 get_uuid16(u8 *uuid128) + if (uuid->size != 16) + continue; + + uuid16 = get_unaligned_le16(&uuid->uuid[12]); + if (uuid16 < 0x1100) + continue; + + if (uuid16 == PNP_INFO_SVCLASS_ID) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID16_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u16) > len) { + uuids_start[1] = EIR_UUID16_SOME; + break; + } + + *ptr++ = (uuid16 & 0x00ff); + *ptr++ = (uuid16 & 0xff00) >> 8; + uuids_start[0] += sizeof(uuid16); + } + + return ptr; +} + +static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) { - u32 val; - int i; + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; - for (i = 0; i < 12; i++) { - if (bluetooth_base_uuid[i] != uuid128[i]) - return 0; + if (len < 6) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 32) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID32_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u32) > len) { + uuids_start[1] = EIR_UUID32_SOME; + break; + } + + memcpy(ptr, &uuid->uuid[12], sizeof(u32)); + ptr += sizeof(u32); + uuids_start[0] += sizeof(u32); } - val = get_unaligned_le32(&uuid128[12]); - if (val > 0xffff) - return 0; + return ptr; +} + +static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 18) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 128) + continue; - return (u16) val; + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID128_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + 16 > 
len) { + uuids_start[1] = EIR_UUID128_SOME; + break; + } + + memcpy(ptr, uuid->uuid, 16); + ptr += 16; + uuids_start[0] += 16; + } + + return ptr; } static void create_eir(struct hci_dev *hdev, u8 *data) { u8 *ptr = data; - u16 eir_len = 0; - u16 uuid16_list[HCI_MAX_EIR_LENGTH / sizeof(u16)]; - int i, truncated = 0; - struct bt_uuid *uuid; size_t name_len; name_len = strlen(hdev->dev_name); @@ -481,7 +566,6 @@ static void create_eir(struct hci_dev *hdev, u8 *data) memcpy(ptr + 2, hdev->dev_name, name_len); - eir_len += (name_len + 2); ptr += (name_len + 2); } @@ -490,7 +574,6 @@ static void create_eir(struct hci_dev *hdev, u8 *data) ptr[1] = EIR_TX_POWER; ptr[2] = (u8) hdev->inq_tx_power; - eir_len += 3; ptr += 3; } @@ -503,88 +586,41 @@ static void create_eir(struct hci_dev *hdev, u8 *data) put_unaligned_le16(hdev->devid_product, ptr + 6); put_unaligned_le16(hdev->devid_version, ptr + 8); - eir_len += 10; ptr += 10; } - memset(uuid16_list, 0, sizeof(uuid16_list)); - - /* Group all UUID16 types */ - list_for_each_entry(uuid, &hdev->uuids, list) { - u16 uuid16; - - uuid16 = get_uuid16(uuid->uuid); - if (uuid16 == 0) - return; - - if (uuid16 < 0x1100) - continue; - - if (uuid16 == PNP_INFO_SVCLASS_ID) - continue; - - /* Stop if not enough space to put next UUID */ - if (eir_len + 2 + sizeof(u16) > HCI_MAX_EIR_LENGTH) { - truncated = 1; - break; - } - - /* Check for duplicates */ - for (i = 0; uuid16_list[i] != 0; i++) - if (uuid16_list[i] == uuid16) - break; - - if (uuid16_list[i] == 0) { - uuid16_list[i] = uuid16; - eir_len += sizeof(u16); - } - } - - if (uuid16_list[0] != 0) { - u8 *length = ptr; - - /* EIR Data type */ - ptr[1] = truncated ? EIR_UUID16_SOME : EIR_UUID16_ALL; - - ptr += 2; - eir_len += 2; - - for (i = 0; uuid16_list[i] != 0; i++) { - *ptr++ = (uuid16_list[i] & 0x00ff); - *ptr++ = (uuid16_list[i] & 0xff00) >> 8; - } - - /* EIR Data length */ - *length = (i * sizeof(u16)) + 1; - } + ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); } -static int update_eir(struct hci_dev *hdev) +static void update_eir(struct hci_request *req) { + struct hci_dev *hdev = req->hdev; struct hci_cp_write_eir cp; if (!hdev_is_powered(hdev)) - return 0; + return; if (!lmp_ext_inq_capable(hdev)) - return 0; + return; if (!test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) - return 0; + return; if (test_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) - return 0; + return; memset(&cp, 0, sizeof(cp)); create_eir(hdev, cp.data); if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) - return 0; + return; memcpy(hdev->eir, cp.data, sizeof(cp.data)); - return hci_send_cmd(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); } static u8 get_service_classes(struct hci_dev *hdev) @@ -598,47 +634,48 @@ static u8 get_service_classes(struct hci_dev *hdev) return val; } -static int update_class(struct hci_dev *hdev) +static void update_class(struct hci_request *req) { + struct hci_dev *hdev = req->hdev; u8 cod[3]; - int err; BT_DBG("%s", hdev->name); if (!hdev_is_powered(hdev)) - return 0; + return; if (test_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) - return 0; + return; cod[0] = hdev->minor_class; cod[1] = hdev->major_class; cod[2] = get_service_classes(hdev); if (memcmp(cod, hdev->dev_class, 3) == 0) - return 0; - - err = hci_send_cmd(hdev, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); - if (err == 0) - 
set_bit(HCI_PENDING_CLASS, &hdev->dev_flags); + return; - return err; + hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); } static void service_cache_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, service_cache.work); + struct hci_request req; if (!test_and_clear_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) return; + hci_req_init(&req, hdev); + hci_dev_lock(hdev); - update_eir(hdev); - update_class(hdev); + update_eir(&req); + update_class(&req); hci_dev_unlock(hdev); + + hci_req_run(&req, NULL); } static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) @@ -728,13 +765,9 @@ static void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, void *data), void *data) { - struct list_head *p, *n; - - list_for_each_safe(p, n, &hdev->mgmt_pending) { - struct pending_cmd *cmd; - - cmd = list_entry(p, struct pending_cmd, list); + struct pending_cmd *cmd, *tmp; + list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (opcode > 0 && cmd->opcode != opcode) continue; @@ -777,14 +810,19 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("request for %s", hdev->name); + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); if (test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) { cancel_delayed_work(&hdev->power_off); if (cp->val) { - err = send_settings_rsp(sk, MGMT_OP_SET_POWERED, hdev); - mgmt_powered(hdev, 1); + mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, + data, len); + err = mgmt_powered(hdev, 1); goto failed; } } @@ -807,9 +845,9 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, } if (cp->val) - schedule_work(&hdev->power_on); + queue_work(hdev->req_workqueue, &hdev->power_on); else - schedule_work(&hdev->power_off.work); + queue_work(hdev->req_workqueue, &hdev->power_off.work); err = 0; @@ -872,6 +910,10 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, return cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, MGMT_STATUS_NOT_SUPPORTED); + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + MGMT_STATUS_INVALID_PARAMS); + timeout = __le16_to_cpu(cp->timeout); if (!cp->val && timeout > 0) return cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, @@ -957,11 +999,64 @@ failed: return err; } +static void write_fast_connectable(struct hci_request *req, bool enable) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_write_page_scan_activity acp; + u8 type; + + if (hdev->hci_ver < BLUETOOTH_VER_1_2) + return; + + if (enable) { + type = PAGE_SCAN_TYPE_INTERLACED; + + /* 160 msec page scan interval */ + acp.interval = __constant_cpu_to_le16(0x0100); + } else { + type = PAGE_SCAN_TYPE_STANDARD; /* default */ + + /* default 1.28 sec page scan */ + acp.interval = __constant_cpu_to_le16(0x0800); + } + + acp.window = __constant_cpu_to_le16(0x0012); + + if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval || + __cpu_to_le16(hdev->page_scan_window) != acp.window) + hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, + sizeof(acp), &acp); + + if (hdev->page_scan_type != type) + hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type); +} + +static void set_connectable_complete(struct hci_dev *hdev, u8 status) +{ + struct pending_cmd *cmd; + + BT_DBG("status 0x%02x", status); + + hci_dev_lock(hdev); + + cmd = mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev); + if (!cmd) + goto unlock; + + 
send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev); + + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); +} + static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; struct pending_cmd *cmd; + struct hci_request req; u8 scan; int err; @@ -971,6 +1066,10 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, return cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, MGMT_STATUS_NOT_SUPPORTED); + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { @@ -1024,7 +1123,20 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, cancel_delayed_work(&hdev->discov_off); } - err = hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + hci_req_init(&req, hdev); + + hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + + /* If we're going from non-connectable to connectable or + * vice-versa when fast connectable is enabled ensure that fast + * connectable gets disabled. write_fast_connectable won't do + * anything if the page scan parameters are already what they + * should be. + */ + if (cp->val || test_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags)) + write_fast_connectable(&req, false); + + err = hci_req_run(&req, set_connectable_complete); if (err < 0) mgmt_pending_remove(cmd); @@ -1041,6 +1153,10 @@ static int set_pairable(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("request for %s", hdev->name); + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_PAIRABLE, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); if (cp->val) @@ -1073,6 +1189,10 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, return cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, MGMT_STATUS_NOT_SUPPORTED); + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { @@ -1133,13 +1253,15 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) BT_DBG("request for %s", hdev->name); - hci_dev_lock(hdev); + if (!lmp_ssp_capable(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, + MGMT_STATUS_NOT_SUPPORTED); - if (!lmp_ssp_capable(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, - MGMT_STATUS_NOT_SUPPORTED); - goto failed; - } + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); val = !!cp->val; @@ -1199,6 +1321,10 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) return cmd_status(sk, hdev->id, MGMT_OP_SET_HS, MGMT_STATUS_NOT_SUPPORTED); + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + MGMT_STATUS_INVALID_PARAMS); + if (cp->val) set_bit(HCI_HS_ENABLED, &hdev->dev_flags); else @@ -1217,13 +1343,20 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) BT_DBG("request for %s", hdev->name); - hci_dev_lock(hdev); + if (!lmp_le_capable(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_LE, + MGMT_STATUS_NOT_SUPPORTED); - if (!lmp_le_capable(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_LE, - MGMT_STATUS_NOT_SUPPORTED); - goto unlock; - } + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_LE, + 
MGMT_STATUS_INVALID_PARAMS); + + /* LE-only devices do not allow toggling LE on/off */ + if (!lmp_bredr_capable(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_LE, + MGMT_STATUS_REJECTED); + + hci_dev_lock(hdev); val = !!cp->val; enabled = lmp_host_le_capable(hdev); @@ -1275,10 +1408,79 @@ unlock: return err; } +/* This is a helper function to test for pending mgmt commands that can + * cause CoD or EIR HCI commands. We can only allow one such pending + * mgmt command at a time since otherwise we cannot easily track what + * the current values are, will be, and based on that calculate if a new + * HCI command needs to be sent and if yes with what value. + */ +static bool pending_eir_or_class(struct hci_dev *hdev) +{ + struct pending_cmd *cmd; + + list_for_each_entry(cmd, &hdev->mgmt_pending, list) { + switch (cmd->opcode) { + case MGMT_OP_ADD_UUID: + case MGMT_OP_REMOVE_UUID: + case MGMT_OP_SET_DEV_CLASS: + case MGMT_OP_SET_POWERED: + return true; + } + } + + return false; +} + +static const u8 bluetooth_base_uuid[] = { + 0xfb, 0x34, 0x9b, 0x5f, 0x80, 0x00, 0x00, 0x80, + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static u8 get_uuid_size(const u8 *uuid) +{ + u32 val; + + if (memcmp(uuid, bluetooth_base_uuid, 12)) + return 128; + + val = get_unaligned_le32(&uuid[12]); + if (val > 0xffff) + return 32; + + return 16; +} + +static void mgmt_class_complete(struct hci_dev *hdev, u16 mgmt_op, u8 status) +{ + struct pending_cmd *cmd; + + hci_dev_lock(hdev); + + cmd = mgmt_pending_find(mgmt_op, hdev); + if (!cmd) + goto unlock; + + cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(status), + hdev->dev_class, 3); + + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); +} + +static void add_uuid_complete(struct hci_dev *hdev, u8 status) +{ + BT_DBG("status 0x%02x", status); + + mgmt_class_complete(hdev, MGMT_OP_ADD_UUID, status); +} + static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_add_uuid *cp = data; struct pending_cmd *cmd; + struct hci_request req; struct bt_uuid *uuid; int err; @@ -1286,7 +1488,7 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_dev_lock(hdev); - if (test_bit(HCI_PENDING_CLASS, &hdev->dev_flags)) { + if (pending_eir_or_class(hdev)) { err = cmd_status(sk, hdev->id, MGMT_OP_ADD_UUID, MGMT_STATUS_BUSY); goto failed; @@ -1300,26 +1502,32 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) memcpy(uuid->uuid, cp->uuid, 16); uuid->svc_hint = cp->svc_hint; + uuid->size = get_uuid_size(cp->uuid); - list_add(&uuid->list, &hdev->uuids); + list_add_tail(&uuid->list, &hdev->uuids); - err = update_class(hdev); - if (err < 0) - goto failed; + hci_req_init(&req, hdev); - err = update_eir(hdev); - if (err < 0) - goto failed; + update_class(&req); + update_eir(&req); + + err = hci_req_run(&req, add_uuid_complete); + if (err < 0) { + if (err != -ENODATA) + goto failed; - if (!test_bit(HCI_PENDING_CLASS, &hdev->dev_flags)) { err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_UUID, 0, hdev->dev_class, 3); goto failed; } cmd = mgmt_pending_add(sk, MGMT_OP_ADD_UUID, hdev, data, len); - if (!cmd) + if (!cmd) { err = -ENOMEM; + goto failed; + } + + err = 0; failed: hci_dev_unlock(hdev); @@ -1332,27 +1540,36 @@ static bool enable_service_cache(struct hci_dev *hdev) return false; if (!test_and_set_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) { - schedule_delayed_work(&hdev->service_cache, CACHE_TIMEOUT); + queue_delayed_work(hdev->workqueue, &hdev->service_cache, + 
CACHE_TIMEOUT); return true; } return false; } +static void remove_uuid_complete(struct hci_dev *hdev, u8 status) +{ + BT_DBG("status 0x%02x", status); + + mgmt_class_complete(hdev, MGMT_OP_REMOVE_UUID, status); +} + static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_remove_uuid *cp = data; struct pending_cmd *cmd; - struct list_head *p, *n; + struct bt_uuid *match, *tmp; u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + struct hci_request req; int err, found; BT_DBG("request for %s", hdev->name); hci_dev_lock(hdev); - if (test_bit(HCI_PENDING_CLASS, &hdev->dev_flags)) { + if (pending_eir_or_class(hdev)) { err = cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID, MGMT_STATUS_BUSY); goto unlock; @@ -1372,9 +1589,7 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, found = 0; - list_for_each_safe(p, n, &hdev->uuids) { - struct bt_uuid *match = list_entry(p, struct bt_uuid, list); - + list_for_each_entry_safe(match, tmp, &hdev->uuids, list) { if (memcmp(match->uuid, cp->uuid, 16) != 0) continue; @@ -1390,46 +1605,69 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, } update_class: - err = update_class(hdev); - if (err < 0) - goto unlock; + hci_req_init(&req, hdev); - err = update_eir(hdev); - if (err < 0) - goto unlock; + update_class(&req); + update_eir(&req); + + err = hci_req_run(&req, remove_uuid_complete); + if (err < 0) { + if (err != -ENODATA) + goto unlock; - if (!test_bit(HCI_PENDING_CLASS, &hdev->dev_flags)) { err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_UUID, 0, hdev->dev_class, 3); goto unlock; } cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_UUID, hdev, data, len); - if (!cmd) + if (!cmd) { err = -ENOMEM; + goto unlock; + } + + err = 0; unlock: hci_dev_unlock(hdev); return err; } +static void set_class_complete(struct hci_dev *hdev, u8 status) +{ + BT_DBG("status 0x%02x", status); + + mgmt_class_complete(hdev, MGMT_OP_SET_DEV_CLASS, status); +} + static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_dev_class *cp = data; struct pending_cmd *cmd; + struct hci_request req; int err; BT_DBG("request for %s", hdev->name); + if (!lmp_bredr_capable(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, + MGMT_STATUS_NOT_SUPPORTED); + hci_dev_lock(hdev); - if (test_bit(HCI_PENDING_CLASS, &hdev->dev_flags)) { + if (pending_eir_or_class(hdev)) { err = cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, MGMT_STATUS_BUSY); goto unlock; } + if ((cp->minor & 0x03) != 0 || (cp->major & 0xe0) != 0) { + err = cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, + MGMT_STATUS_INVALID_PARAMS); + goto unlock; + } + hdev->major_class = cp->major; hdev->minor_class = cp->minor; @@ -1439,26 +1677,34 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } + hci_req_init(&req, hdev); + if (test_and_clear_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) { hci_dev_unlock(hdev); cancel_delayed_work_sync(&hdev->service_cache); hci_dev_lock(hdev); - update_eir(hdev); + update_eir(&req); } - err = update_class(hdev); - if (err < 0) - goto unlock; + update_class(&req); + + err = hci_req_run(&req, set_class_complete); + if (err < 0) { + if (err != -ENODATA) + goto unlock; - if (!test_bit(HCI_PENDING_CLASS, &hdev->dev_flags)) { err = cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0, hdev->dev_class, 3); goto unlock; } cmd = mgmt_pending_add(sk, MGMT_OP_SET_DEV_CLASS, hdev, data, len); - if (!cmd) + if (!cmd) { err 
= -ENOMEM; + goto unlock; + } + + err = 0; unlock: hci_dev_unlock(hdev); @@ -1483,9 +1729,21 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } + if (cp->debug_keys != 0x00 && cp->debug_keys != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, + MGMT_STATUS_INVALID_PARAMS); + BT_DBG("%s debug_keys %u key_count %u", hdev->name, cp->debug_keys, key_count); + for (i = 0; i < key_count; i++) { + struct mgmt_link_key_info *key = &cp->keys[i]; + + if (key->addr.type != BDADDR_BREDR) + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } + hci_dev_lock(hdev); hci_link_keys_clear(hdev); @@ -1533,12 +1791,22 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - hci_dev_lock(hdev); - memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); rp.addr.type = cp->addr.type; + if (!bdaddr_type_is_valid(cp->addr.type)) + return cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); + + if (cp->disconnect != 0x00 && cp->disconnect != 0x01) + return cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); + + hci_dev_lock(hdev); + if (!hdev_is_powered(hdev)) { err = cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); @@ -1596,6 +1864,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_disconnect *cp = data; + struct mgmt_rp_disconnect rp; struct hci_cp_disconnect dc; struct pending_cmd *cmd; struct hci_conn *conn; @@ -1603,17 +1872,26 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); + memset(&rp, 0, sizeof(rp)); + bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); + rp.addr.type = cp->addr.type; + + if (!bdaddr_type_is_valid(cp->addr.type)) + return cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); + hci_dev_lock(hdev); if (!test_bit(HCI_UP, &hdev->flags)) { - err = cmd_status(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_NOT_POWERED); + err = cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); goto failed; } if (mgmt_pending_find(MGMT_OP_DISCONNECT, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_BUSY); + err = cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_BUSY, &rp, sizeof(rp)); goto failed; } @@ -1624,8 +1902,8 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr); if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) { - err = cmd_status(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_NOT_CONNECTED); + err = cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_NOT_CONNECTED, &rp, sizeof(rp)); goto failed; } @@ -1857,7 +2135,7 @@ static void pairing_complete(struct pending_cmd *cmd, u8 status) conn->security_cfm_cb = NULL; conn->disconn_cfm_cb = NULL; - hci_conn_put(conn); + hci_conn_drop(conn); mgmt_pending_remove(cmd); } @@ -1903,11 +2181,20 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); + memset(&rp, 0, sizeof(rp)); + bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); + rp.addr.type = cp->addr.type; + + if (!bdaddr_type_is_valid(cp->addr.type)) + return cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); 
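Each of these address checks funnels through bdaddr_type_is_valid(); a minimal sketch of such a helper (the in-tree version lives in the bluetooth headers, so treat this shape as an assumption):

static inline bool bdaddr_type_is_valid(__u8 type)
{
	switch (type) {
	case BDADDR_BREDR:
	case BDADDR_LE_PUBLIC:
	case BDADDR_LE_RANDOM:
		return true;
	}

	/* anything else is rejected with MGMT_STATUS_INVALID_PARAMS */
	return false;
}

ltk_is_valid() below relies on the analogous LE-only variant, bdaddr_type_is_le(), which accepts just the two LE address types.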
+ hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_PAIR_DEVICE, - MGMT_STATUS_NOT_POWERED); + err = cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); goto unlock; } @@ -1924,10 +2211,6 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_connect(hdev, LE_LINK, &cp->addr.bdaddr, cp->addr.type, sec_level, auth_type); - memset(&rp, 0, sizeof(rp)); - bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); - rp.addr.type = cp->addr.type; - if (IS_ERR(conn)) { int status; @@ -1943,7 +2226,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, } if (conn->connect_cfm_cb) { - hci_conn_put(conn); + hci_conn_drop(conn); err = cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, MGMT_STATUS_BUSY, &rp, sizeof(rp)); goto unlock; @@ -1952,7 +2235,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, cmd = mgmt_pending_add(sk, MGMT_OP_PAIR_DEVICE, hdev, data, len); if (!cmd) { err = -ENOMEM; - hci_conn_put(conn); + hci_conn_drop(conn); goto unlock; } @@ -2021,7 +2304,7 @@ unlock: } static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, - bdaddr_t *bdaddr, u8 type, u16 mgmt_op, + struct mgmt_addr_info *addr, u16 mgmt_op, u16 hci_op, __le32 passkey) { struct pending_cmd *cmd; @@ -2031,37 +2314,41 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_status(sk, hdev->id, mgmt_op, - MGMT_STATUS_NOT_POWERED); + err = cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_NOT_POWERED, addr, + sizeof(*addr)); goto done; } - if (type == BDADDR_BREDR) - conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, bdaddr); + if (addr->type == BDADDR_BREDR) + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &addr->bdaddr); else - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &addr->bdaddr); if (!conn) { - err = cmd_status(sk, hdev->id, mgmt_op, - MGMT_STATUS_NOT_CONNECTED); + err = cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_NOT_CONNECTED, addr, + sizeof(*addr)); goto done; } - if (type == BDADDR_LE_PUBLIC || type == BDADDR_LE_RANDOM) { + if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) { /* Continue with pairing via SMP */ err = smp_user_confirm_reply(conn, mgmt_op, passkey); if (!err) - err = cmd_status(sk, hdev->id, mgmt_op, - MGMT_STATUS_SUCCESS); + err = cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_SUCCESS, addr, + sizeof(*addr)); else - err = cmd_status(sk, hdev->id, mgmt_op, - MGMT_STATUS_FAILED); + err = cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_FAILED, addr, + sizeof(*addr)); goto done; } - cmd = mgmt_pending_add(sk, mgmt_op, hdev, bdaddr, sizeof(*bdaddr)); + cmd = mgmt_pending_add(sk, mgmt_op, hdev, addr, sizeof(*addr)); if (!cmd) { err = -ENOMEM; goto done; @@ -2071,11 +2358,12 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, if (hci_op == HCI_OP_USER_PASSKEY_REPLY) { struct hci_cp_user_passkey_reply cp; - bacpy(&cp.bdaddr, bdaddr); + bacpy(&cp.bdaddr, &addr->bdaddr); cp.passkey = passkey; err = hci_send_cmd(hdev, hci_op, sizeof(cp), &cp); } else - err = hci_send_cmd(hdev, hci_op, sizeof(*bdaddr), bdaddr); + err = hci_send_cmd(hdev, hci_op, sizeof(addr->bdaddr), + &addr->bdaddr); if (err < 0) mgmt_pending_remove(cmd); @@ -2092,7 +2380,7 @@ static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev, BT_DBG(""); - return user_pairing_resp(sk, hdev, 
&cp->addr.bdaddr, cp->addr.type, + return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_PIN_CODE_NEG_REPLY, HCI_OP_PIN_CODE_NEG_REPLY, 0); } @@ -2108,7 +2396,7 @@ static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data, return cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY, MGMT_STATUS_INVALID_PARAMS); - return user_pairing_resp(sk, hdev, &cp->addr.bdaddr, cp->addr.type, + return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_CONFIRM_REPLY, HCI_OP_USER_CONFIRM_REPLY, 0); } @@ -2120,7 +2408,7 @@ static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev, BT_DBG(""); - return user_pairing_resp(sk, hdev, &cp->addr.bdaddr, cp->addr.type, + return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_CONFIRM_NEG_REPLY, HCI_OP_USER_CONFIRM_NEG_REPLY, 0); } @@ -2132,7 +2420,7 @@ static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); - return user_pairing_resp(sk, hdev, &cp->addr.bdaddr, cp->addr.type, + return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_REPLY, HCI_OP_USER_PASSKEY_REPLY, cp->passkey); } @@ -2144,18 +2432,47 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, BT_DBG(""); - return user_pairing_resp(sk, hdev, &cp->addr.bdaddr, cp->addr.type, + return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_NEG_REPLY, HCI_OP_USER_PASSKEY_NEG_REPLY, 0); } -static int update_name(struct hci_dev *hdev, const char *name) +static void update_name(struct hci_request *req) { + struct hci_dev *hdev = req->hdev; struct hci_cp_write_local_name cp; - memcpy(cp.name, name, sizeof(cp.name)); + memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); - return hci_send_cmd(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp); +} + +static void set_name_complete(struct hci_dev *hdev, u8 status) +{ + struct mgmt_cp_set_local_name *cp; + struct pending_cmd *cmd; + + BT_DBG("status 0x%02x", status); + + hci_dev_lock(hdev); + + cmd = mgmt_pending_find(MGMT_OP_SET_LOCAL_NAME, hdev); + if (!cmd) + goto unlock; + + cp = cmd->param; + + if (status) + cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, + mgmt_status(status)); + else + cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, + cp, sizeof(*cp)); + + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); } static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, @@ -2163,12 +2480,24 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_set_local_name *cp = data; struct pending_cmd *cmd; + struct hci_request req; int err; BT_DBG(""); hci_dev_lock(hdev); + /* If the old values are the same as the new ones just return a + * direct command complete event. 
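+ * E.g. a second Set Local Name request carrying the same name and + * short_name simply gets cmd_complete(..., 0, data, len) back without + * queueing any HCI commands.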
+ */ + if (!memcmp(hdev->dev_name, cp->name, sizeof(hdev->dev_name)) && + !memcmp(hdev->short_name, cp->short_name, + sizeof(hdev->short_name))) { + err = cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, + data, len); + goto failed; + } + memcpy(hdev->short_name, cp->short_name, sizeof(hdev->short_name)); if (!hdev_is_powered(hdev)) { @@ -2191,7 +2520,19 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - err = update_name(hdev, cp->name); + memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name)); + + hci_req_init(&req, hdev); + + if (lmp_bredr_capable(hdev)) { + update_name(&req); + update_eir(&req); + } + + if (lmp_le_capable(hdev)) + hci_update_ad(&req); + + err = hci_req_run(&req, set_name_complete); if (err < 0) mgmt_pending_remove(cmd); @@ -2254,24 +2595,16 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, - MGMT_STATUS_NOT_POWERED, &cp->addr, - sizeof(cp->addr)); - goto unlock; - } - err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr, cp->hash, cp->randomizer); if (err < 0) status = MGMT_STATUS_FAILED; else - status = 0; + status = MGMT_STATUS_SUCCESS; err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, status, &cp->addr, sizeof(cp->addr)); -unlock: hci_dev_unlock(hdev); return err; } @@ -2287,24 +2620,15 @@ static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_REMOVE_REMOTE_OOB_DATA, - MGMT_STATUS_NOT_POWERED, &cp->addr, - sizeof(cp->addr)); - goto unlock; - } - err = hci_remove_remote_oob_data(hdev, &cp->addr.bdaddr); if (err < 0) status = MGMT_STATUS_INVALID_PARAMS; else - status = 0; + status = MGMT_STATUS_SUCCESS; err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_REMOTE_OOB_DATA, status, &cp->addr, sizeof(cp->addr)); -unlock: hci_dev_unlock(hdev); return err; } @@ -2365,31 +2689,45 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, switch (hdev->discovery.type) { case DISCOV_TYPE_BREDR: - if (lmp_bredr_capable(hdev)) - err = hci_do_inquiry(hdev, INQUIRY_LEN_BREDR); - else - err = -ENOTSUPP; + if (!lmp_bredr_capable(hdev)) { + err = cmd_status(sk, hdev->id, MGMT_OP_START_DISCOVERY, + MGMT_STATUS_NOT_SUPPORTED); + mgmt_pending_remove(cmd); + goto failed; + } + + err = hci_do_inquiry(hdev, INQUIRY_LEN_BREDR); break; case DISCOV_TYPE_LE: - if (lmp_host_le_capable(hdev)) - err = hci_le_scan(hdev, LE_SCAN_TYPE, LE_SCAN_INT, - LE_SCAN_WIN, LE_SCAN_TIMEOUT_LE_ONLY); - else - err = -ENOTSUPP; + if (!lmp_host_le_capable(hdev)) { + err = cmd_status(sk, hdev->id, MGMT_OP_START_DISCOVERY, + MGMT_STATUS_NOT_SUPPORTED); + mgmt_pending_remove(cmd); + goto failed; + } + + err = hci_le_scan(hdev, LE_SCAN_ACTIVE, LE_SCAN_INT, + LE_SCAN_WIN, LE_SCAN_TIMEOUT_LE_ONLY); break; case DISCOV_TYPE_INTERLEAVED: - if (lmp_host_le_capable(hdev) && lmp_bredr_capable(hdev)) - err = hci_le_scan(hdev, LE_SCAN_TYPE, LE_SCAN_INT, - LE_SCAN_WIN, - LE_SCAN_TIMEOUT_BREDR_LE); - else - err = -ENOTSUPP; + if (!lmp_host_le_capable(hdev) || !lmp_bredr_capable(hdev)) { + err = cmd_status(sk, hdev->id, MGMT_OP_START_DISCOVERY, + MGMT_STATUS_NOT_SUPPORTED); + mgmt_pending_remove(cmd); + goto failed; + } + + err = hci_le_scan(hdev, LE_SCAN_ACTIVE, LE_SCAN_INT, + LE_SCAN_WIN, LE_SCAN_TIMEOUT_BREDR_LE); break; default: - err = -EINVAL; + err = cmd_status(sk, hdev->id, MGMT_OP_START_DISCOVERY, + 
MGMT_STATUS_INVALID_PARAMS); + mgmt_pending_remove(cmd); + goto failed; } if (err < 0) @@ -2510,7 +2848,8 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data, hci_inquiry_cache_update_resolve(hdev, e); } - err = 0; + err = cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, 0, &cp->addr, + sizeof(cp->addr)); failed: hci_dev_unlock(hdev); @@ -2526,13 +2865,18 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("%s", hdev->name); + if (!bdaddr_type_is_valid(cp->addr.type)) + return cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + hci_dev_lock(hdev); err = hci_blacklist_add(hdev, &cp->addr.bdaddr, cp->addr.type); if (err < 0) status = MGMT_STATUS_FAILED; else - status = 0; + status = MGMT_STATUS_SUCCESS; err = cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, status, &cp->addr, sizeof(cp->addr)); @@ -2551,13 +2895,18 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("%s", hdev->name); + if (!bdaddr_type_is_valid(cp->addr.type)) + return cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + hci_dev_lock(hdev); err = hci_blacklist_del(hdev, &cp->addr.bdaddr, cp->addr.type); if (err < 0) status = MGMT_STATUS_INVALID_PARAMS; else - status = 0; + status = MGMT_STATUS_SUCCESS; err = cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, status, &cp->addr, sizeof(cp->addr)); @@ -2571,6 +2920,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_device_id *cp = data; + struct hci_request req; int err; __u16 source; @@ -2591,27 +2941,66 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, err = cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, 0, NULL, 0); - update_eir(hdev); + hci_req_init(&req, hdev); + update_eir(&req); + hci_req_run(&req, NULL); hci_dev_unlock(hdev); return err; } +static void fast_connectable_complete(struct hci_dev *hdev, u8 status) +{ + struct pending_cmd *cmd; + + BT_DBG("status 0x%02x", status); + + hci_dev_lock(hdev); + + cmd = mgmt_pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev); + if (!cmd) + goto unlock; + + if (status) { + cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + mgmt_status(status)); + } else { + struct mgmt_mode *cp = cmd->param; + + if (cp->val) + set_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags); + else + clear_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags); + + send_settings_rsp(cmd->sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev); + new_settings(hdev, cmd->sk); + } + + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); +} + static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct hci_cp_write_page_scan_activity acp; - u8 type; + struct pending_cmd *cmd; + struct hci_request req; int err; BT_DBG("%s", hdev->name); - if (!lmp_bredr_capable(hdev)) + if (!lmp_bredr_capable(hdev) || hdev->hci_ver < BLUETOOTH_VER_1_2) return cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, MGMT_STATUS_NOT_SUPPORTED); + if (cp->val != 0x00 && cp->val != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + MGMT_STATUS_INVALID_PARAMS); + if (!hdev_is_powered(hdev)) return cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, MGMT_STATUS_NOT_POWERED); @@ -2622,49 +3011,59 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (cp->val) { - 
type = PAGE_SCAN_TYPE_INTERLACED; + if (mgmt_pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev)) { + err = cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + MGMT_STATUS_BUSY); + goto unlock; + } - /* 160 msec page scan interval */ - acp.interval = __constant_cpu_to_le16(0x0100); - } else { - type = PAGE_SCAN_TYPE_STANDARD; /* default */ + if (!!cp->val == test_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags)) { + err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, + hdev); + goto unlock; + } - /* default 1.28 sec page scan */ - acp.interval = __constant_cpu_to_le16(0x0800); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev, + data, len); + if (!cmd) { + err = -ENOMEM; + goto unlock; } - /* default 11.25 msec page scan window */ - acp.window = __constant_cpu_to_le16(0x0012); + hci_req_init(&req, hdev); - err = hci_send_cmd(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, sizeof(acp), - &acp); - if (err < 0) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - MGMT_STATUS_FAILED); - goto done; - } + write_fast_connectable(&req, cp->val); - err = hci_send_cmd(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type); + err = hci_req_run(&req, fast_connectable_complete); if (err < 0) { err = cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, MGMT_STATUS_FAILED); - goto done; + mgmt_pending_remove(cmd); } - err = cmd_complete(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, 0, - NULL, 0); -done: +unlock: hci_dev_unlock(hdev); + return err; } +static bool ltk_is_valid(struct mgmt_ltk_info *key) +{ + if (key->authenticated != 0x00 && key->authenticated != 0x01) + return false; + if (key->master != 0x00 && key->master != 0x01) + return false; + if (!bdaddr_type_is_le(key->addr.type)) + return false; + return true; +} + static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 len) { struct mgmt_cp_load_long_term_keys *cp = cp_data; u16 key_count, expected_len; - int i; + int i, err; key_count = __le16_to_cpu(cp->key_count); @@ -2674,11 +3073,20 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, BT_ERR("load_keys: expected %u bytes, got %u bytes", len, expected_len); return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, - EINVAL); + MGMT_STATUS_INVALID_PARAMS); } BT_DBG("%s key_count %u", hdev->name, key_count); + for (i = 0; i < key_count; i++) { + struct mgmt_ltk_info *key = &cp->keys[i]; + + if (!ltk_is_valid(key)) + return cmd_status(sk, hdev->id, + MGMT_OP_LOAD_LONG_TERM_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } + hci_dev_lock(hdev); hci_smp_ltks_clear(hdev); @@ -2698,9 +3106,12 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, key->enc_size, key->ediv, key->rand); } + err = cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0, + NULL, 0); + hci_dev_unlock(hdev); - return 0; + return err; } static const struct mgmt_handler { @@ -2889,66 +3300,116 @@ static void settings_rsp(struct pending_cmd *cmd, void *data) mgmt_pending_free(cmd); } -static int set_bredr_scan(struct hci_dev *hdev) +static void set_bredr_scan(struct hci_request *req) { + struct hci_dev *hdev = req->hdev; u8 scan = 0; + /* Ensure that fast connectable is disabled. This function will + * not do anything if the page scan parameters are already what + * they should be. 
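+ * (Fast connectable means an interlaced scan with a + * 0x0100 * 0.625 ms = 160 ms interval; the standard default is a + * 0x0800 * 0.625 ms = 1.28 s interval with an 11.25 ms window.)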
+ */ + write_fast_connectable(req, false); + if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) scan |= SCAN_PAGE; if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) scan |= SCAN_INQUIRY; - if (!scan) - return 0; + if (scan) + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); +} + +static void powered_complete(struct hci_dev *hdev, u8 status) +{ + struct cmd_lookup match = { NULL, hdev }; - return hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + BT_DBG("status 0x%02x", status); + + hci_dev_lock(hdev); + + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); + + new_settings(hdev, match.sk); + + hci_dev_unlock(hdev); + + if (match.sk) + sock_put(match.sk); +} + +static int powered_update_hci(struct hci_dev *hdev) +{ + struct hci_request req; + u8 link_sec; + + hci_req_init(&req, hdev); + + if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags) && + !lmp_host_ssp_capable(hdev)) { + u8 ssp = 1; + + hci_req_add(&req, HCI_OP_WRITE_SSP_MODE, 1, &ssp); + } + + if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags) && + lmp_bredr_capable(hdev)) { + struct hci_cp_write_le_host_supported cp; + + cp.le = 1; + cp.simul = lmp_le_br_capable(hdev); + + /* Check first if we already have the right + * host state (host features set) + */ + if (cp.le != lmp_host_le_capable(hdev) || + cp.simul != lmp_host_le_br_capable(hdev)) + hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, + sizeof(cp), &cp); + } + + link_sec = test_bit(HCI_LINK_SECURITY, &hdev->dev_flags); + if (link_sec != test_bit(HCI_AUTH, &hdev->flags)) + hci_req_add(&req, HCI_OP_WRITE_AUTH_ENABLE, + sizeof(link_sec), &link_sec); + + if (lmp_bredr_capable(hdev)) { + set_bredr_scan(&req); + update_class(&req); + update_name(&req); + update_eir(&req); + } + + return hci_req_run(&req, powered_complete); } int mgmt_powered(struct hci_dev *hdev, u8 powered) { struct cmd_lookup match = { NULL, hdev }; + u8 status_not_powered = MGMT_STATUS_NOT_POWERED; + u8 zero_cod[] = { 0, 0, 0 }; int err; if (!test_bit(HCI_MGMT, &hdev->dev_flags)) return 0; - mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); - if (powered) { - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags) && - !lmp_host_ssp_capable(hdev)) { - u8 ssp = 1; - - hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, 1, &ssp); - } + if (powered_update_hci(hdev) == 0) + return 0; - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { - struct hci_cp_write_le_host_supported cp; + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, + &match); + goto new_settings; + } - cp.le = 1; - cp.simul = lmp_le_br_capable(hdev); + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); + mgmt_pending_foreach(0, hdev, cmd_status_rsp, &status_not_powered); - /* Check first if we already have the right - * host state (host features set) - */ - if (cp.le != lmp_host_le_capable(hdev) || - cp.simul != lmp_host_le_br_capable(hdev)) - hci_send_cmd(hdev, - HCI_OP_WRITE_LE_HOST_SUPPORTED, - sizeof(cp), &cp); - } - - if (lmp_bredr_capable(hdev)) { - set_bredr_scan(hdev); - update_class(hdev); - update_name(hdev, hdev->dev_name); - update_eir(hdev); - } - } else { - u8 status = MGMT_STATUS_NOT_POWERED; - mgmt_pending_foreach(0, hdev, cmd_status_rsp, &status); - } + if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) + mgmt_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + zero_cod, sizeof(zero_cod), NULL); +new_settings: err = new_settings(hdev, match.sk); if (match.sk) @@ -2985,7 +3446,7 @@ int mgmt_discoverable(struct hci_dev *hdev, u8 discoverable) int mgmt_connectable(struct hci_dev 
*hdev, u8 connectable) { - struct cmd_lookup match = { NULL, hdev }; + struct pending_cmd *cmd; bool changed = false; int err = 0; @@ -2997,14 +3458,10 @@ int mgmt_connectable(struct hci_dev *hdev, u8 connectable) changed = true; } - mgmt_pending_foreach(MGMT_OP_SET_CONNECTABLE, hdev, settings_rsp, - &match); + cmd = mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev); if (changed) - err = new_settings(hdev, match.sk); - - if (match.sk) - sock_put(match.sk); + err = new_settings(hdev, cmd ? cmd->sk : NULL); return err; } @@ -3388,23 +3845,25 @@ int mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) return err; } -static int clear_eir(struct hci_dev *hdev) +static void clear_eir(struct hci_request *req) { + struct hci_dev *hdev = req->hdev; struct hci_cp_write_eir cp; if (!lmp_ext_inq_capable(hdev)) - return 0; + return; memset(hdev->eir, 0, sizeof(hdev->eir)); memset(&cp, 0, sizeof(cp)); - return hci_send_cmd(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); } int mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) { struct cmd_lookup match = { NULL, hdev }; + struct hci_request req; bool changed = false; int err = 0; @@ -3437,29 +3896,26 @@ int mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) if (match.sk) sock_put(match.sk); + hci_req_init(&req, hdev); + if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) - update_eir(hdev); + update_eir(&req); else - clear_eir(hdev); + clear_eir(&req); + + hci_req_run(&req, NULL); return err; } -static void class_rsp(struct pending_cmd *cmd, void *data) +static void sk_lookup(struct pending_cmd *cmd, void *data) { struct cmd_lookup *match = data; - cmd_complete(cmd->sk, cmd->index, cmd->opcode, match->mgmt_status, - match->hdev->dev_class, 3); - - list_del(&cmd->list); - if (match->sk == NULL) { match->sk = cmd->sk; sock_hold(match->sk); } - - mgmt_pending_free(cmd); } int mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, @@ -3468,11 +3924,9 @@ int mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, struct cmd_lookup match = { NULL, hdev, mgmt_status(status) }; int err = 0; - clear_bit(HCI_PENDING_CLASS, &hdev->dev_flags); - - mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, class_rsp, &match); - mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, class_rsp, &match); - mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, class_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, sk_lookup, &match); + mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); + mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); if (!status) err = mgmt_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, @@ -3486,55 +3940,29 @@ int mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, int mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) { - struct pending_cmd *cmd; struct mgmt_cp_set_local_name ev; - bool changed = false; - int err = 0; + struct pending_cmd *cmd; - if (memcmp(name, hdev->dev_name, sizeof(hdev->dev_name)) != 0) { - memcpy(hdev->dev_name, name, sizeof(hdev->dev_name)); - changed = true; - } + if (status) + return 0; memset(&ev, 0, sizeof(ev)); memcpy(ev.name, name, HCI_MAX_NAME_LENGTH); memcpy(ev.short_name, hdev->short_name, HCI_MAX_SHORT_NAME_LENGTH); cmd = mgmt_pending_find(MGMT_OP_SET_LOCAL_NAME, hdev); - if (!cmd) - goto send_event; - - /* Always assume that either the short or the complete name has - * changed if there was a pending mgmt command */ - changed = true; + if 
(!cmd) { + memcpy(hdev->dev_name, name, sizeof(hdev->dev_name)); - if (status) { - err = cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, - mgmt_status(status)); - goto failed; + /* If this is an HCI command related to powering on the + * HCI dev, don't send any mgmt signals. + */ + if (mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) + return 0; } - err = cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, &ev, - sizeof(ev)); - if (err < 0) - goto failed; - -send_event: - if (changed) - err = mgmt_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, - sizeof(ev), cmd ? cmd->sk : NULL); - - /* EIR is taken care of separately when powering on the - * adapter so only update them here if this is a name change - * unrelated to power on. - */ - if (!test_bit(HCI_INIT, &hdev->flags)) - update_eir(hdev); - -failed: - if (cmd) - mgmt_pending_remove(cmd); - return err; + return mgmt_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), + cmd ? cmd->sk : NULL); } int mgmt_read_local_oob_data_reply_complete(struct hci_dev *hdev, u8 *hash, diff --git a/net/bluetooth/rfcomm/Kconfig b/net/bluetooth/rfcomm/Kconfig index 22e718b554e4..18d352ea2bc7 100644 --- a/net/bluetooth/rfcomm/Kconfig +++ b/net/bluetooth/rfcomm/Kconfig @@ -12,6 +12,7 @@ config BT_RFCOMM config BT_RFCOMM_TTY bool "RFCOMM TTY support" depends on BT_RFCOMM + depends on TTY help This option enables TTY emulation support for RFCOMM channels. diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 201fdf737209..ca957d34b0c8 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -69,7 +69,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, u8 sec_level, int *err); static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst); -static void rfcomm_session_del(struct rfcomm_session *s); +static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s); /* ---- RFCOMM frame parsing macros ---- */ #define __get_dlci(b) ((b & 0xfc) >> 2) @@ -108,12 +108,6 @@ static void rfcomm_schedule(void) wake_up_process(rfcomm_thread); } -static void rfcomm_session_put(struct rfcomm_session *s) -{ - if (atomic_dec_and_test(&s->refcnt)) - rfcomm_session_del(s); -} - /* ---- RFCOMM FCS computation ---- */ /* reversed, 8-bit, poly=0x07 */ @@ -249,16 +243,14 @@ static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout) { BT_DBG("session %p state %ld timeout %ld", s, s->state, timeout); - if (!mod_timer(&s->timer, jiffies + timeout)) - rfcomm_session_hold(s); + mod_timer(&s->timer, jiffies + timeout); } static void rfcomm_session_clear_timer(struct rfcomm_session *s) { BT_DBG("session %p state %ld", s, s->state); - if (timer_pending(&s->timer) && del_timer(&s->timer)) - rfcomm_session_put(s); + del_timer_sync(&s->timer); } /* ---- RFCOMM DLCs ---- */ @@ -285,7 +277,7 @@ static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d) { BT_DBG("dlc %p state %ld", d, d->state); - if (timer_pending(&d->timer) && del_timer(&d->timer)) + if (del_timer(&d->timer)) rfcomm_dlc_put(d); } @@ -336,8 +328,6 @@ static void rfcomm_dlc_link(struct rfcomm_session *s, struct rfcomm_dlc *d) { BT_DBG("dlc %p session %p", d, s); - rfcomm_session_hold(s); - rfcomm_session_clear_timer(s); rfcomm_dlc_hold(d); list_add(&d->list, &s->dlcs); @@ -356,8 +346,6 @@ static void rfcomm_dlc_unlink(struct rfcomm_dlc *d) if (list_empty(&s->dlcs)) rfcomm_session_set_timer(s, RFCOMM_IDLE_TIMEOUT); - - rfcomm_session_put(s); } static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci) 
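The rfcomm/core.c hunks above and below drop the session hold/put refcounting (rfcomm_session_hold, rfcomm_session_put) in favour of an ownership convention: any function that can free a session returns the surviving pointer, or NULL once the session has been deleted, and callers must continue only with the returned value. This is why rfcomm_session_del, rfcomm_session_close, rfcomm_recv_frame and rfcomm_process_rx all change their return type to struct rfcomm_session * in the hunks that follow. A minimal userspace sketch of that convention; session_del and session_process are illustrative names, not the kernel API.

/* Sketch of the "return the surviving pointer" lifetime rule adopted
 * by the rfcomm session rework; illustrative userspace code only. */
#include <stdio.h>
#include <stdlib.h>

struct session {
	int state;			/* illustrative: 1 = connected */
};

/* Frees the session and returns NULL so the caller can overwrite its
 * now-stale copy in a single assignment. */
static struct session *session_del(struct session *s)
{
	free(s);
	return NULL;
}

/* May or may not free the session; only the returned pointer is still
 * valid afterwards. */
static struct session *session_process(struct session *s, int event)
{
	if (event < 0)			/* fatal event: tear it down */
		return session_del(s);
	s->state = event;
	return s;
}

int main(void)
{
	struct session *s = calloc(1, sizeof(*s));

	if (!s)
		return 1;
	s = session_process(s, 1);	/* session survives */
	s = session_process(s, -1);	/* session freed; s becomes NULL */
	if (!s)
		printf("session gone; caller holds no stale pointer\n");
	return 0;
}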
@@ -493,12 +481,34 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) int rfcomm_dlc_close(struct rfcomm_dlc *d, int err) { - int r; + int r = 0; + struct rfcomm_dlc *d_list; + struct rfcomm_session *s, *s_list; + + BT_DBG("dlc %p state %ld dlci %d err %d", d, d->state, d->dlci, err); rfcomm_lock(); - r = __rfcomm_dlc_close(d, err); + s = d->session; + if (!s) + goto no_session; + + /* after waiting on the mutex check the session still exists + * then check the dlc still exists + */ + list_for_each_entry(s_list, &session_list, list) { + if (s_list == s) { + list_for_each_entry(d_list, &s->dlcs, list) { + if (d_list == d) { + r = __rfcomm_dlc_close(d, err); + break; + } + } + break; + } + } +no_session: rfcomm_unlock(); return r; } @@ -609,7 +619,7 @@ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state) return s; } -static void rfcomm_session_del(struct rfcomm_session *s) +static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s) { int state = s->state; @@ -617,15 +627,14 @@ static void rfcomm_session_del(struct rfcomm_session *s) list_del(&s->list); - if (state == BT_CONNECTED) - rfcomm_send_disc(s, 0); - rfcomm_session_clear_timer(s); sock_release(s->sock); kfree(s); if (state != BT_LISTEN) module_put(THIS_MODULE); + + return NULL; } static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst) @@ -644,17 +653,16 @@ static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst) return NULL; } -static void rfcomm_session_close(struct rfcomm_session *s, int err) +static struct rfcomm_session *rfcomm_session_close(struct rfcomm_session *s, + int err) { struct rfcomm_dlc *d; struct list_head *p, *n; - BT_DBG("session %p state %ld err %d", s, s->state, err); - - rfcomm_session_hold(s); - s->state = BT_CLOSED; + BT_DBG("session %p state %ld err %d", s, s->state, err); + /* Close all dlcs */ list_for_each_safe(p, n, &s->dlcs) { d = list_entry(p, struct rfcomm_dlc, list); @@ -663,7 +671,7 @@ static void rfcomm_session_close(struct rfcomm_session *s, int err) } rfcomm_session_clear_timer(s); - rfcomm_session_put(s); + return rfcomm_session_del(s); } static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, @@ -715,8 +723,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, if (*err == 0 || *err == -EINPROGRESS) return s; - rfcomm_session_del(s); - return NULL; + return rfcomm_session_del(s); failed: sock_release(sock); @@ -1105,7 +1112,7 @@ static void rfcomm_make_uih(struct sk_buff *skb, u8 addr) } /* ---- RFCOMM frame reception ---- */ -static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) +static struct rfcomm_session *rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) { BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); @@ -1114,7 +1121,7 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); if (!d) { rfcomm_send_dm(s, dlci); - return 0; + return s; } switch (d->state) { @@ -1150,25 +1157,14 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) break; case BT_DISCONN: - /* rfcomm_session_put is called later so don't do - * anything here otherwise we will mess up the session - * reference counter: - * - * (a) when we are the initiator dlc_unlink will drive - * the reference counter to 0 (there is no initial put - * after session_add) - * - * (b) when we are not the initiator rfcomm_rx_process - * will explicitly call put to balance the initial hold - * done after session add. 
- */ + s = rfcomm_session_close(s, ECONNRESET); break; } } - return 0; + return s; } -static int rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci) +static struct rfcomm_session *rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci) { int err = 0; @@ -1192,13 +1188,13 @@ static int rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci) else err = ECONNRESET; - s->state = BT_CLOSED; - rfcomm_session_close(s, err); + s = rfcomm_session_close(s, err); } - return 0; + return s; } -static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci) +static struct rfcomm_session *rfcomm_recv_disc(struct rfcomm_session *s, + u8 dlci) { int err = 0; @@ -1227,11 +1223,9 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci) else err = ECONNRESET; - s->state = BT_CLOSED; - rfcomm_session_close(s, err); + s = rfcomm_session_close(s, err); } - - return 0; + return s; } void rfcomm_dlc_accept(struct rfcomm_dlc *d) @@ -1652,11 +1646,18 @@ drop: return 0; } -static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) +static struct rfcomm_session *rfcomm_recv_frame(struct rfcomm_session *s, + struct sk_buff *skb) { struct rfcomm_hdr *hdr = (void *) skb->data; u8 type, dlci, fcs; + if (!s) { + /* no session, so free socket data */ + kfree_skb(skb); + return s; + } + dlci = __get_dlci(hdr->addr); type = __get_type(hdr->ctrl); @@ -1667,7 +1668,7 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) if (__check_fcs(skb->data, type, fcs)) { BT_ERR("bad checksum in packet"); kfree_skb(skb); - return -EILSEQ; + return s; } if (__test_ea(hdr->len)) @@ -1683,22 +1684,23 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) case RFCOMM_DISC: if (__test_pf(hdr->ctrl)) - rfcomm_recv_disc(s, dlci); + s = rfcomm_recv_disc(s, dlci); break; case RFCOMM_UA: if (__test_pf(hdr->ctrl)) - rfcomm_recv_ua(s, dlci); + s = rfcomm_recv_ua(s, dlci); break; case RFCOMM_DM: - rfcomm_recv_dm(s, dlci); + s = rfcomm_recv_dm(s, dlci); break; case RFCOMM_UIH: - if (dlci) - return rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb); - + if (dlci) { + rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb); + return s; + } rfcomm_recv_mcc(s, skb); break; @@ -1707,7 +1709,7 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) break; } kfree_skb(skb); - return 0; + return s; } /* ---- Connection and data processing ---- */ @@ -1844,7 +1846,7 @@ static void rfcomm_process_dlcs(struct rfcomm_session *s) } } -static void rfcomm_process_rx(struct rfcomm_session *s) +static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s) { struct socket *sock = s->sock; struct sock *sk = sock->sk; @@ -1856,17 +1858,15 @@ static void rfcomm_process_rx(struct rfcomm_session *s) while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); if (!skb_linearize(skb)) - rfcomm_recv_frame(s, skb); + s = rfcomm_recv_frame(s, skb); else kfree_skb(skb); } - if (sk->sk_state == BT_CLOSED) { - if (!s->initiator) - rfcomm_session_put(s); + if (s && (sk->sk_state == BT_CLOSED)) + s = rfcomm_session_close(s, sk->sk_err); - rfcomm_session_close(s, sk->sk_err); - } + return s; } static void rfcomm_accept_connection(struct rfcomm_session *s) @@ -1891,8 +1891,6 @@ static void rfcomm_accept_connection(struct rfcomm_session *s) s = rfcomm_session_add(nsock, BT_OPEN); if (s) { - rfcomm_session_hold(s); - /* We should adjust MTU on incoming sessions. * L2CAP MTU minus UIH header and FCS. 
*/ s->mtu = min(l2cap_pi(nsock->sk)->chan->omtu, @@ -1903,7 +1901,7 @@ static void rfcomm_accept_connection(struct rfcomm_session *s) sock_release(nsock); } -static void rfcomm_check_connection(struct rfcomm_session *s) +static struct rfcomm_session *rfcomm_check_connection(struct rfcomm_session *s) { struct sock *sk = s->sock->sk; @@ -1921,10 +1919,10 @@ static void rfcomm_check_connection(struct rfcomm_session *s) break; case BT_CLOSED: - s->state = BT_CLOSED; - rfcomm_session_close(s, sk->sk_err); + s = rfcomm_session_close(s, sk->sk_err); break; } + return s; } static void rfcomm_process_sessions(void) @@ -1940,7 +1938,6 @@ static void rfcomm_process_sessions(void) if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) { s->state = BT_DISCONN; rfcomm_send_disc(s, 0); - rfcomm_session_put(s); continue; } @@ -1949,21 +1946,18 @@ static void rfcomm_process_sessions(void) continue; } - rfcomm_session_hold(s); - switch (s->state) { case BT_BOUND: - rfcomm_check_connection(s); + s = rfcomm_check_connection(s); break; default: - rfcomm_process_rx(s); + s = rfcomm_process_rx(s); break; } - rfcomm_process_dlcs(s); - - rfcomm_session_put(s); + if (s) + rfcomm_process_dlcs(s); } rfcomm_unlock(); @@ -2010,10 +2004,11 @@ static int rfcomm_add_listener(bdaddr_t *ba) /* Add listening session */ s = rfcomm_session_add(sock, BT_LISTEN); - if (!s) + if (!s) { + err = -ENOMEM; goto failed; + } - rfcomm_session_hold(s); return 0; failed: sock_release(sock); @@ -2071,8 +2066,6 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) if (!s) return; - rfcomm_session_hold(s); - list_for_each_safe(p, n, &s->dlcs) { d = list_entry(p, struct rfcomm_dlc, list); @@ -2104,8 +2097,6 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) set_bit(RFCOMM_AUTH_REJECT, &d->flags); } - rfcomm_session_put(s); - rfcomm_schedule(); } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index ce3f6658f4b2..30b3721dc6d7 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -107,15 +107,14 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) static struct sock *__rfcomm_get_sock_by_addr(u8 channel, bdaddr_t *src) { struct sock *sk = NULL; - struct hlist_node *node; - sk_for_each(sk, node, &rfcomm_sk_list.head) { + sk_for_each(sk, &rfcomm_sk_list.head) { if (rfcomm_pi(sk)->channel == channel && !bacmp(&bt_sk(sk)->src, src)) break; } - return node ? sk : NULL; + return sk ? sk : NULL; } /* Find socket with channel and source bdaddr. @@ -124,11 +123,10 @@ static struct sock *__rfcomm_get_sock_by_addr(u8 channel, bdaddr_t *src) static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src) { struct sock *sk = NULL, *sk1 = NULL; - struct hlist_node *node; read_lock(&rfcomm_sk_list.lock); - sk_for_each(sk, node, &rfcomm_sk_list.head) { + sk_for_each(sk, &rfcomm_sk_list.head) { if (state && sk->sk_state != state) continue; @@ -145,7 +143,7 @@ static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t * read_unlock(&rfcomm_sk_list.lock); - return node ? sk : sk1; + return sk ? 
sk : sk1; } static void rfcomm_sock_destruct(struct sock *sk) @@ -610,6 +608,7 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { rfcomm_dlc_accept(d); + msg->msg_namelen = 0; return 0; } @@ -970,11 +969,10 @@ done: static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; - struct hlist_node *node; read_lock(&rfcomm_sk_list.lock); - sk_for_each(sk, node, &rfcomm_sk_list.head) { + sk_for_each(sk, &rfcomm_sk_list.head) { seq_printf(f, "%pMR %pMR %d %d\n", &bt_sk(sk)->src, &bt_sk(sk)->dst, sk->sk_state, rfcomm_pi(sk)->channel); @@ -1039,7 +1037,7 @@ int __init rfcomm_init_sockets(void) goto error; } - err = bt_procfs_init(THIS_MODULE, &init_net, "rfcomm", &rfcomm_sk_list, NULL); + err = bt_procfs_init(&init_net, "rfcomm", &rfcomm_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create RFCOMM proc file"); bt_sock_unregister(BTPROTO_RFCOMM); @@ -1068,8 +1066,7 @@ void __exit rfcomm_cleanup_sockets(void) debugfs_remove(rfcomm_sock_debugfs); - if (bt_sock_unregister(BTPROTO_RFCOMM) < 0) - BT_ERR("RFCOMM socket layer unregistration failed"); + bt_sock_unregister(BTPROTO_RFCOMM); proto_unregister(&rfcomm_proto); } diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index bd6fd0f43d2b..b6e44ad6cca6 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -541,23 +541,21 @@ int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb) { struct rfcomm_dev *dev = dlc->owner; - struct tty_struct *tty; if (!dev) { kfree_skb(skb); return; } - tty = dev->port.tty; - if (!tty || !skb_queue_empty(&dev->pending)) { + if (!skb_queue_empty(&dev->pending)) { skb_queue_tail(&dev->pending, skb); return; } - BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len); + BT_DBG("dlc %p len %d", dlc, skb->len); - tty_insert_flip_string(tty, skb->data, skb->len); - tty_flip_buffer_push(tty); + tty_insert_flip_string(&dev->port, skb->data, skb->len); + tty_flip_buffer_push(&dev->port); kfree_skb(skb); } @@ -621,26 +619,23 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) /* ---- TTY functions ---- */ static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev) { - struct tty_struct *tty = dev->port.tty; struct sk_buff *skb; int inserted = 0; - if (!tty) - return; - - BT_DBG("dev %p tty %p", dev, tty); + BT_DBG("dev %p", dev); rfcomm_dlc_lock(dev->dlc); while ((skb = skb_dequeue(&dev->pending))) { - inserted += tty_insert_flip_string(tty, skb->data, skb->len); + inserted += tty_insert_flip_string(&dev->port, skb->data, + skb->len); kfree_skb(skb); } rfcomm_dlc_unlock(dev->dlc); if (inserted > 0) - tty_flip_buffer_push(tty); + tty_flip_buffer_push(&dev->port); } static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 57f250c20e39..e7bd4eea575c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -83,7 +83,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) if (conn) return conn; - conn = kzalloc(sizeof(struct sco_conn), GFP_ATOMIC); + conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL); if (!conn) return NULL; @@ -185,7 +185,7 @@ static int sco_connect(struct sock *sk) conn = sco_conn_add(hcon); if (!conn) { - hci_conn_put(hcon); + hci_conn_drop(hcon); err = -ENOMEM; goto done; } @@ -259,10 +259,9 @@ drop: /* -------- Socket interface ---------- */ static struct sock 
*__sco_get_sock_listen_by_addr(bdaddr_t *ba) { - struct hlist_node *node; struct sock *sk; - sk_for_each(sk, node, &sco_sk_list.head) { + sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; @@ -279,11 +278,10 @@ static struct sock *__sco_get_sock_listen_by_addr(bdaddr_t *ba) static struct sock *sco_get_sock_listen(bdaddr_t *src) { struct sock *sk = NULL, *sk1 = NULL; - struct hlist_node *node; read_lock(&sco_sk_list.lock); - sk_for_each(sk, node, &sco_sk_list.head) { + sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; @@ -298,7 +296,7 @@ static struct sock *sco_get_sock_listen(bdaddr_t *src) read_unlock(&sco_sk_list.lock); - return node ? sk : sk1; + return sk ? sk : sk1; } static void sco_sock_destruct(struct sock *sk) @@ -355,12 +353,13 @@ static void __sco_sock_close(struct sock *sk) if (sco_pi(sk)->conn->hcon) { sk->sk_state = BT_DISCONN; sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT); - hci_conn_put(sco_pi(sk)->conn->hcon); + hci_conn_drop(sco_pi(sk)->conn->hcon); sco_pi(sk)->conn->hcon = NULL; } else sco_chan_del(sk, ECONNRESET); break; + case BT_CONNECT2: case BT_CONNECT: case BT_DISCONN: sco_chan_del(sk, ECONNRESET); @@ -482,8 +481,7 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; - int err = 0; - + int err; BT_DBG("sk %p", sk); @@ -654,6 +652,42 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, return err; } +static void sco_conn_defer_accept(struct hci_conn *conn, int mask) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p", conn); + + conn->state = BT_CONFIG; + + if (!lmp_esco_capable(hdev)) { + struct hci_cp_accept_conn_req cp; + + bacpy(&cp.bdaddr, &conn->dst); + + if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER)) + cp.role = 0x00; /* Become master */ + else + cp.role = 0x01; /* Remain slave */ + + hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); + } else { + struct hci_cp_accept_sync_conn_req cp; + + bacpy(&cp.bdaddr, &conn->dst); + cp.pkt_type = cpu_to_le16(conn->pkt_type); + + cp.tx_bandwidth = __constant_cpu_to_le32(0x00001f40); + cp.rx_bandwidth = __constant_cpu_to_le32(0x00001f40); + cp.max_latency = __constant_cpu_to_le16(0xffff); + cp.content_format = cpu_to_le16(hdev->voice_setting); + cp.retrans_effort = 0xff; + + hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, + sizeof(cp), &cp); + } +} + static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags) { @@ -664,8 +698,9 @@ static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { - hci_conn_accept(pi->conn->hcon, 0); + sco_conn_defer_accept(pi->conn->hcon, 0); sk->sk_state = BT_CONFIG; + msg->msg_namelen = 0; release_sock(sk); return 0; @@ -883,7 +918,7 @@ static void sco_chan_del(struct sock *sk, int err) sco_conn_unlock(conn); if (conn->hcon) - hci_conn_put(conn->hcon); + hci_conn_drop(conn->hcon); } sk->sk_state = BT_CLOSED; @@ -900,8 +935,6 @@ static void sco_conn_ready(struct sco_conn *conn) BT_DBG("conn %p", conn); - sco_conn_lock(conn); - if (sk) { sco_sock_clear_timer(sk); bh_lock_sock(sk); @@ -909,9 +942,13 @@ static void sco_conn_ready(struct sco_conn *conn) sk->sk_state_change(sk); bh_unlock_sock(sk); } else { + sco_conn_lock(conn); + parent = sco_get_sock_listen(conn->src); - if (!parent) - goto done; + if (!parent) { + sco_conn_unlock(conn); 
+ return; + } bh_lock_sock(parent); @@ -919,7 +956,8 @@ BTPROTO_SCO, GFP_ATOMIC); if (!sk) { bh_unlock_sock(parent); - goto done; + sco_conn_unlock(conn); + return; } sco_sock_init(sk, parent); @@ -939,24 +977,22 @@ parent->sk_data_ready(parent, 1); bh_unlock_sock(parent); - } -done: - sco_conn_unlock(conn); + sco_conn_unlock(conn); + } } /* ----- SCO interface with lower layer (HCI) ----- */ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct sock *sk; - struct hlist_node *node; int lm = 0; BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr); /* Find listening sockets */ read_lock(&sco_sk_list.lock); - sk_for_each(sk, node, &sco_sk_list.head) { + sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; @@ -1016,11 +1052,10 @@ drop: static int sco_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; - struct hlist_node *node; read_lock(&sco_sk_list.lock); - sk_for_each(sk, node, &sco_sk_list.head) { + sk_for_each(sk, &sco_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &bt_sk(sk)->src, &bt_sk(sk)->dst, sk->sk_state); } @@ -1084,7 +1119,7 @@ int __init sco_init(void) goto error; } - err = bt_procfs_init(THIS_MODULE, &init_net, "sco", &sco_sk_list, NULL); + err = bt_procfs_init(&init_net, "sco", &sco_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create SCO proc file"); bt_sock_unregister(BTPROTO_SCO); @@ -1113,8 +1148,7 @@ void __exit sco_exit(void) debugfs_remove(sco_debugfs); - if (bt_sock_unregister(BTPROTO_SCO) < 0) - BT_ERR("SCO socket unregistration failed"); + bt_sock_unregister(BTPROTO_SCO); proto_unregister(&sco_proto); } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 5abefb12891d..b2296d3857a0 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -522,7 +522,7 @@ void smp_chan_destroy(struct l2cap_conn *conn) kfree(smp); conn->smp_chan = NULL; conn->hcon->smp_conn = NULL; - hci_conn_put(conn->hcon); + hci_conn_drop(conn->hcon); } int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index 6dee7bf648a9..aa0d3b2f1bb7 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -46,3 +46,17 @@ config BRIDGE_IGMP_SNOOPING Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config BRIDGE_VLAN_FILTERING + bool "VLAN filtering" + depends on BRIDGE + depends on VLAN_8021Q + default n + ---help--- + If you say Y here, then the Ethernet bridge will be able to selectively + receive and forward traffic based on VLAN information in the packet and + any VLAN information configured on the bridge port or bridge device. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. 
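The new BRIDGE_VLAN_FILTERING option gates the per-port VLAN bitmap that the br_fdb.c, br_forward.c and br_input.c hunks below consult through test_bit() and for_each_set_bit() on pv->vlan_bitmap. A compact userspace model of that check, assuming only what the diff shows (one bit per VID, VLAN_N_VID of 4096); the names here are illustrative, not the kernel's struct net_port_vlans API.

/* Userspace model (illustrative only) of a per-port VLAN bitmap:
 * one bit per possible VLAN id; a frame is admitted only if the
 * bit for its VID is set. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VLAN_N_VID	4096
#define BITS_PER_WORD	64

struct port_vlans {
	uint64_t bitmap[VLAN_N_VID / BITS_PER_WORD];
};

static void vlan_add(struct port_vlans *pv, uint16_t vid)
{
	pv->bitmap[vid / BITS_PER_WORD] |= 1ULL << (vid % BITS_PER_WORD);
}

/* Ingress/egress check: forward only if the VID is configured. */
static bool vlan_allowed(const struct port_vlans *pv, uint16_t vid)
{
	return (pv->bitmap[vid / BITS_PER_WORD] >> (vid % BITS_PER_WORD)) & 1;
}

int main(void)
{
	struct port_vlans pv = { { 0 } };

	vlan_add(&pv, 100);
	printf("vid 100: %s\n", vlan_allowed(&pv, 100) ? "forward" : "drop");
	printf("vid 200: %s\n", vlan_allowed(&pv, 200) ? "forward" : "drop");
	return 0;
}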
diff --git a/net/bridge/Makefile b/net/bridge/Makefile index e859098f5ee9..e85498b2f166 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -14,4 +14,6 @@ bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o +bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o + obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 7c78e2640190..967312803e41 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -30,6 +30,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + u16 vid = 0; rcu_read_lock(); #ifdef CONFIG_BRIDGE_NETFILTER @@ -45,6 +46,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) brstats->tx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); + if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid)) + goto out; + BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); @@ -62,12 +66,12 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) goto out; } - mdst = br_mdb_get(br, skb); + mdst = br_mdb_get(br, skb, vid); if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) br_multicast_deliver(mdst, skb); else br_flood_deliver(br, skb); - } else if ((dst = __br_fdb_get(br, dest)) != NULL) + } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) br_deliver(dst->dst, skb); else br_flood_deliver(br, skb); @@ -172,12 +176,10 @@ static int br_set_mac_address(struct net_device *dev, void *p) spin_lock_bh(&br->lock); if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) { - dev->addr_assign_type &= ~NET_ADDR_RANDOM; memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); br_fdb_change_mac_address(br, addr->sa_data); br_stp_change_bridge_id(br, addr->sa_data); } - br->flags |= BR_SET_MAC_ADDR; spin_unlock_bh(&br->lock); return 0; @@ -185,10 +187,10 @@ static int br_set_mac_address(struct net_device *dev, void *p) static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strcpy(info->driver, "bridge"); - strcpy(info->version, BR_VERSION); - strcpy(info->fw_version, "N/A"); - strcpy(info->bus_info, "N/A"); + strlcpy(info->driver, "bridge", sizeof(info->driver)); + strlcpy(info->version, BR_VERSION, sizeof(info->version)); + strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); + strlcpy(info->bus_info, "N/A", sizeof(info->bus_info)); } static netdev_features_t br_fix_features(struct net_device *dev, @@ -267,7 +269,7 @@ void br_netpoll_disable(struct net_bridge_port *p) p->np = NULL; - __netpoll_free_rcu(np); + __netpoll_free_async(np); } #endif @@ -315,6 +317,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_fdb_dump = br_fdb_dump, .ndo_bridge_getlink = br_getlink, .ndo_bridge_setlink = br_setlink, + .ndo_bridge_dellink = br_dellink, }; static void br_dev_free(struct net_device *dev) @@ -345,10 +348,10 @@ void br_dev_setup(struct net_device *dev) dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_LLTX | - NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_TX; + NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | - NETIF_F_HW_VLAN_TX; + NETIF_F_HW_VLAN_CTAG_TX; br->dev = dev; spin_lock_init(&br->lock); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 
d9576e6de2b8..ebfa4443c69b 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -23,11 +23,12 @@ #include <linux/slab.h> #include <linux/atomic.h> #include <asm/unaligned.h> +#include <linux/if_vlan.h> #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr); + const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *, int); @@ -67,11 +68,11 @@ static inline int has_expired(const struct net_bridge *br, time_before_eq(fdb->updated + hold_time(br), jiffies); } -static inline int br_mac_hash(const unsigned char *mac) +static inline int br_mac_hash(const unsigned char *mac, __u16 vid) { - /* use 1 byte of OUI cnd 3 bytes of NIC */ + /* use 1 byte of OUI and 3 bytes of NIC */ u32 key = get_unaligned((u32 *)(mac + 2)); - return jhash_1word(key, fdb_salt) & (BR_HASH_SIZE - 1); + return jhash_2words(key, vid, fdb_salt) & (BR_HASH_SIZE - 1); } static void fdb_rcu_free(struct rcu_head *head) @@ -91,6 +92,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge *br = p->br; + bool no_vlan = (nbp_get_vlan_info(p) == NULL) ? true : false; int i; spin_lock_bh(&br->hash_lock); @@ -105,10 +107,12 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) if (f->dst == p && f->is_local) { /* maybe another port has same hw addr? */ struct net_bridge_port *op; + u16 vid = f->vlan_id; list_for_each_entry(op, &br->port_list, list) { if (op != p && ether_addr_equal(op->dev->dev_addr, - f->addr.addr)) { + f->addr.addr) && + nbp_vlan_find(op, vid)) { f->dst = op; goto insert; } @@ -116,27 +120,53 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) /* delete old one */ fdb_delete(br, f); - goto insert; +insert: + /* insert new address, may fail if invalid + * address or dup. + */ + fdb_insert(br, p, newaddr, vid); + + /* if this port has no vlan information + * configured, we can safely be done at + * this point. + */ + if (no_vlan) + goto done; } } } - insert: - /* insert new address, may fail if invalid address or dup. */ - fdb_insert(br, p, newaddr); +done: spin_unlock_bh(&br->hash_lock); } void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) { struct net_bridge_fdb_entry *f; + struct net_port_vlans *pv; + u16 vid = 0; /* If old entry was unassociated with any port, then delete it. */ - f = __br_fdb_get(br, br->dev->dev_addr); + f = __br_fdb_get(br, br->dev->dev_addr, 0); if (f && f->is_local && !f->dst) fdb_delete(br, f); - fdb_insert(br, NULL, newaddr); + fdb_insert(br, NULL, newaddr, 0); + + /* Now remove and add entries for every VLAN configured on the + * bridge. This function runs under RTNL so the bitmap will not + * change from under us. 
+ */ + pv = br_get_vlan_info(br); + if (!pv) + return; + + for_each_set_bit_from(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + f = __br_fdb_get(br, br->dev->dev_addr, vid); + if (f && f->is_local && !f->dst) + fdb_delete(br, f); + fdb_insert(br, NULL, newaddr, vid); + } } void br_fdb_cleanup(unsigned long _data) @@ -149,9 +179,9 @@ void br_fdb_cleanup(unsigned long _data) spin_lock(&br->hash_lock); for (i = 0; i < BR_HASH_SIZE; i++) { struct net_bridge_fdb_entry *f; - struct hlist_node *h, *n; + struct hlist_node *n; - hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) { + hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { unsigned long this_timer; if (f->is_static) continue; @@ -175,8 +205,8 @@ void br_fdb_flush(struct net_bridge *br) spin_lock_bh(&br->hash_lock); for (i = 0; i < BR_HASH_SIZE; i++) { struct net_bridge_fdb_entry *f; - struct hlist_node *h, *n; - hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) { + struct hlist_node *n; + hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { if (!f->is_static) fdb_delete(br, f); } @@ -231,13 +261,15 @@ void br_fdb_delete_by_port(struct net_bridge *br, /* No locking or refcounting, assumes caller has rcu_read_lock */ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, - const unsigned char *addr) + const unsigned char *addr, + __u16 vid) { - struct hlist_node *h; struct net_bridge_fdb_entry *fdb; - hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) { - if (ether_addr_equal(fdb->addr.addr, addr)) { + hlist_for_each_entry_rcu(fdb, + &br->hash[br_mac_hash(addr, vid)], hlist) { + if (ether_addr_equal(fdb->addr.addr, addr) && + fdb->vlan_id == vid) { if (unlikely(has_expired(br, fdb))) break; return fdb; @@ -261,7 +293,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) if (!port) ret = 0; else { - fdb = __br_fdb_get(port->br, addr); + fdb = __br_fdb_get(port->br, addr, 0); ret = fdb && fdb->dst && fdb->dst->dev != dev && fdb->dst->state == BR_STATE_FORWARDING; } @@ -280,14 +312,13 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, { struct __fdb_entry *fe = buf; int i, num = 0; - struct hlist_node *h; struct net_bridge_fdb_entry *f; memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); rcu_read_lock(); for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) { + hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { if (num >= maxnum) goto out; @@ -325,26 +356,28 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, } static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, - const unsigned char *addr) + const unsigned char *addr, + __u16 vid) { - struct hlist_node *h; struct net_bridge_fdb_entry *fdb; - hlist_for_each_entry(fdb, h, head, hlist) { - if (ether_addr_equal(fdb->addr.addr, addr)) + hlist_for_each_entry(fdb, head, hlist) { + if (ether_addr_equal(fdb->addr.addr, addr) && + fdb->vlan_id == vid) return fdb; } return NULL; } static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, - const unsigned char *addr) + const unsigned char *addr, + __u16 vid) { - struct hlist_node *h; struct net_bridge_fdb_entry *fdb; - hlist_for_each_entry_rcu(fdb, h, head, hlist) { - if (ether_addr_equal(fdb->addr.addr, addr)) + hlist_for_each_entry_rcu(fdb, head, hlist) { + if (ether_addr_equal(fdb->addr.addr, addr) && + fdb->vlan_id == vid) return fdb; } return NULL; @@ -352,7 +385,8 @@ static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, 
struct net_bridge_port *source, - const unsigned char *addr) + const unsigned char *addr, + __u16 vid) { struct net_bridge_fdb_entry *fdb; @@ -360,6 +394,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, if (fdb) { memcpy(fdb->addr.addr, addr, ETH_ALEN); fdb->dst = source; + fdb->vlan_id = vid; fdb->is_local = 0; fdb->is_static = 0; fdb->updated = fdb->used = jiffies; @@ -369,15 +404,15 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, } static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr) + const unsigned char *addr, u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; if (!is_valid_ether_addr(addr)) return -EINVAL; - fdb = fdb_find(head, addr); + fdb = fdb_find(head, addr, vid); if (fdb) { /* it is okay to have multiple ports with same * address, just use the first one. @@ -386,11 +421,11 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return 0; br_warn(br, "adding interface %s with same address " "as a received packet\n", - source->dev->name); + source ? source->dev->name : br->dev->name); fdb_delete(br, fdb); } - fdb = fdb_create(head, source, addr); + fdb = fdb_create(head, source, addr, vid); if (!fdb) return -ENOMEM; @@ -401,20 +436,20 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, /* Add entry for local address of interface */ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr) + const unsigned char *addr, u16 vid) { int ret; spin_lock_bh(&br->hash_lock); - ret = fdb_insert(br, source, addr); + ret = fdb_insert(br, source, addr, vid); spin_unlock_bh(&br->hash_lock); return ret; } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr) + const unsigned char *addr, u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; /* some users want to always flood. 
*/ @@ -426,7 +461,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->state == BR_STATE_FORWARDING)) return; - fdb = fdb_find_rcu(head, addr); + fdb = fdb_find_rcu(head, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ if (unlikely(fdb->is_local)) { @@ -441,8 +476,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } else { spin_lock(&br->hash_lock); - if (likely(!fdb_find(head, addr))) { - fdb = fdb_create(head, source, addr); + if (likely(!fdb_find(head, addr, vid))) { + fdb = fdb_create(head, source, addr, vid); if (fdb) fdb_notify(br, fdb, RTM_NEWNEIGH); } @@ -495,6 +530,10 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; + + if (nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id)) + goto nla_put_failure; + return nlmsg_end(skb, nlh); nla_put_failure: @@ -506,6 +545,7 @@ static inline size_t fdb_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)); } @@ -547,10 +587,9 @@ int br_fdb_dump(struct sk_buff *skb, goto out; for (i = 0; i < BR_HASH_SIZE; i++) { - struct hlist_node *h; struct net_bridge_fdb_entry *f; - hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) { + hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { if (idx < cb->args[0]) goto skip; @@ -571,24 +610,31 @@ out: /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, - __u16 state, __u16 flags) + __u16 state, __u16 flags, __u16 vid) { struct net_bridge *br = source->br; - struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; + bool modified = false; - fdb = fdb_find(head, addr); + fdb = fdb_find(head, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(head, source, addr); + fdb = fdb_create(head, source, addr, vid); if (!fdb) return -ENOMEM; - fdb_notify(br, fdb, RTM_NEWNEIGH); + + modified = true; } else { if (flags & NLM_F_EXCL) return -EEXIST; + + if (fdb->dst != source) { + fdb->dst = source; + modified = true; + } } if (fdb_to_nud(fdb) != state) { @@ -600,13 +646,37 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, } else fdb->is_local = fdb->is_static = 0; - fdb->updated = fdb->used = jiffies; + modified = true; + } + + fdb->used = jiffies; + if (modified) { + fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH); } return 0; } +static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p, + const unsigned char *addr, u16 nlh_flags, u16 vid) +{ + int err = 0; + + if (ndm->ndm_flags & NTF_USE) { + rcu_read_lock(); + br_fdb_update(p->br, p, addr, vid); + rcu_read_unlock(); + } else { + spin_lock_bh(&p->br->hash_lock); + err = fdb_add_entry(p, addr, ndm->ndm_state, + nlh_flags, vid); + spin_unlock_bh(&p->br->hash_lock); + } + + return err; +} + /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, @@ -614,12 +684,29 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], { struct net_bridge_port *p; int err = 0; + struct net_port_vlans *pv; + unsigned short vid = VLAN_N_VID; if (!(ndm->ndm_state & 
(NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } + if (tb[NDA_VLAN]) { + if (nla_len(tb[NDA_VLAN]) != sizeof(unsigned short)) { + pr_info("bridge: RTM_NEWNEIGH with invalid vlan\n"); + return -EINVAL; + } + + vid = nla_get_u16(tb[NDA_VLAN]); + + if (vid >= VLAN_N_VID) { + pr_info("bridge: RTM_NEWNEIGH with invalid vlan id %d\n", + vid); + return -EINVAL; + } + } + p = br_port_get_rtnl(dev); if (p == NULL) { pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", @@ -627,40 +714,87 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (ndm->ndm_flags & NTF_USE) { - rcu_read_lock(); - br_fdb_update(p->br, p, addr); - rcu_read_unlock(); + pv = nbp_get_vlan_info(p); + if (vid != VLAN_N_VID) { + if (!pv || !test_bit(vid, pv->vlan_bitmap)) { + pr_info("bridge: RTM_NEWNEIGH with unconfigured " + "vlan %d on port %s\n", vid, dev->name); + return -EINVAL; + } + + /* VID was specified, so use it. */ + err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); } else { - spin_lock_bh(&p->br->hash_lock); - err = fdb_add_entry(p, addr, ndm->ndm_state, nlh_flags); - spin_unlock_bh(&p->br->hash_lock); + if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); + goto out; + } + + /* We have vlans configured on this port and user didn't + * specify a VLAN. To be nice, add/update entry for every + * vlan on this port. + */ + for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + if (err) + goto out; + } } +out: return err; } -static int fdb_delete_by_addr(struct net_bridge_port *p, const u8 *addr) +int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, + u16 vlan) { - struct net_bridge *br = p->br; - struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)]; struct net_bridge_fdb_entry *fdb; - fdb = fdb_find(head, addr); + fdb = fdb_find(head, addr, vlan); if (!fdb) return -ENOENT; - fdb_delete(p->br, fdb); + fdb_delete(br, fdb); return 0; } +static int __br_fdb_delete(struct net_bridge_port *p, + const unsigned char *addr, u16 vid) +{ + int err; + + spin_lock_bh(&p->br->hash_lock); + err = fdb_delete_by_addr(p->br, addr, vid); + spin_unlock_bh(&p->br->hash_lock); + + return err; +} + /* Remove neighbor entry with RTM_DELNEIGH */ -int br_fdb_delete(struct ndmsg *ndm, struct net_device *dev, +int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr) { struct net_bridge_port *p; int err; + struct net_port_vlans *pv; + unsigned short vid = VLAN_N_VID; + + if (tb[NDA_VLAN]) { + if (nla_len(tb[NDA_VLAN]) != sizeof(unsigned short)) { + pr_info("bridge: RTM_DELNEIGH with invalid vlan\n"); + return -EINVAL; + } + + vid = nla_get_u16(tb[NDA_VLAN]); + if (vid >= VLAN_N_VID) { + pr_info("bridge: RTM_DELNEIGH with invalid vlan id %d\n", + vid); + return -EINVAL; + } + } p = br_port_get_rtnl(dev); if (p == NULL) { pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", @@ -668,9 +802,30 @@ int br_fdb_delete(struct ndmsg *ndm, struct net_device *dev, return 
-EINVAL; + } + + err = __br_fdb_delete(p, addr, vid); + } else { + if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + err = __br_fdb_delete(p, addr, 0); + goto out; + } + /* We have vlans configured on this port and user didn't + * specify a VLAN. To be nice, delete the entry for every + * vlan on this port. + */ + err = -ENOENT; + for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + err &= __br_fdb_delete(p, addr, vid); + } + } +out: return err; } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 02015a505d2a..092b20e4ee4c 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -31,6 +31,7 @@ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { return (((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && + br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) && p->state == BR_STATE_FORWARDING); } @@ -63,6 +64,10 @@ int br_forward_finish(struct sk_buff *skb) static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { + skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb); + if (!skb) + return; + skb->dev = to->dev; if (unlikely(netpoll_tx_running(to->br->dev))) { @@ -88,6 +93,10 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) return; } + skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb); + if (!skb) + return; + indev = skb->dev; skb->dev = to->dev; skb_forward_csum(skb); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 37fe693471a8..4cdba60926ff 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -23,6 +23,7 @@ #include <linux/if_ether.h> #include <linux/slab.h> #include <net/sock.h> +#include <linux/if_vlan.h> #include "br_private.h" @@ -66,14 +67,15 @@ void br_port_carrier_check(struct net_bridge_port *p) struct net_device *dev = p->dev; struct net_bridge *br = p->br; - if (netif_running(dev) && netif_carrier_ok(dev)) + if (!(p->flags & BR_ADMIN_COST) && + netif_running(dev) && netif_oper_up(dev)) p->path_cost = port_cost(dev); if (!netif_running(br->dev)) return; spin_lock_bh(&br->lock); - if (netif_running(dev) && netif_carrier_ok(dev)) { + if (netif_running(dev) && netif_oper_up(dev)) { if (p->state == BR_STATE_DISABLED) br_stp_enable_port(p); } else { @@ -139,6 +141,7 @@ static void del_nbp(struct net_bridge_port *p) br_ifinfo_notify(RTM_DELLINK, p); + nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 1); list_del_rcu(&p->list); @@ -146,9 +149,8 @@ static void del_nbp(struct net_bridge_port *p) dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); - synchronize_net(); - netdev_set_master(dev, NULL); + netdev_upper_dev_unlink(dev, br->dev); br_multicast_del_port(p); @@ -364,7 +366,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL)))) goto err3; - err = netdev_set_master(dev, br->dev); + err = netdev_master_upper_dev_link(dev, br->dev); if (err) goto err4; @@ -383,7 +385,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); - if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && + if (netif_running(dev) && netif_oper_up(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); spin_unlock_bh(&br->lock); @@ -395,7 +397,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev_set_mtu(br->dev, br_min_mtu(br)); - if (br_fdb_insert(br, p, dev->dev_addr)) + if (br_fdb_insert(br, p, dev->dev_addr, 0)) 
netdev_err(dev, "failed insert local address bridge forwarding table\n"); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -403,7 +405,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; err5: - netdev_set_master(dev, NULL); + netdev_upper_dev_unlink(dev, br->dev); err4: br_netpoll_disable(p); err3: diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 4b34207419b1..828e2bcc1f52 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -17,6 +17,7 @@ #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> #include <linux/export.h> +#include <linux/rculist.h> #include "br_private.h" /* Hook for brouter */ @@ -34,6 +35,20 @@ static int br_pass_frame_up(struct sk_buff *skb) brstats->rx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); + /* Bridge is just like any other port. Make sure the + * packet is allowed except in promisc mode when someone + * may be running packet capture. + */ + if (!(brdev->flags & IFF_PROMISC) && + !br_allowed_egress(br, br_get_vlan_info(br), skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + skb = br_handle_vlan(br, br_get_vlan_info(br), skb); + if (!skb) + return NET_RX_DROP; + indev = skb->dev; skb->dev = brdev; @@ -50,13 +65,17 @@ int br_handle_frame_finish(struct sk_buff *skb) struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct sk_buff *skb2; + u16 vid = 0; if (!p || p->state == BR_STATE_DISABLED) goto drop; + if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid)) + goto drop; + /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; - br_fdb_update(br, p, eth_hdr(skb)->h_source); + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && br_multicast_rcv(br, p, skb)) @@ -78,7 +97,7 @@ int br_handle_frame_finish(struct sk_buff *skb) if (is_broadcast_ether_addr(dest)) skb2 = skb; else if (is_multicast_ether_addr(dest)) { - mdst = br_mdb_get(br, skb); + mdst = br_mdb_get(br, skb, vid); if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) { if ((mdst && mdst->mglist) || br_multicast_is_router(br)) @@ -91,7 +110,8 @@ int br_handle_frame_finish(struct sk_buff *skb) skb2 = skb; br->dev->stats.multicast++; - } else if ((dst = __br_fdb_get(br, dest, vid)) && + dst->is_local) { skb2 = skb; /* Do not forward the packet since it's local. 
*/ skb = NULL; @@ -119,8 +139,10 @@ drop: static int br_handle_local_finish(struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); + u16 vid = 0; - br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + br_vlan_get_tag(skb, &vid); + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid); return 0; /* process further */ } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index acc9f4cc18f7..19942e38fd2d 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -18,7 +18,6 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; - struct hlist_node *n; struct nlattr *nest; if (!br->multicast_router || hlist_empty(&br->router_list)) @@ -28,7 +27,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, if (nest == NULL) return -EMSGSIZE; - hlist_for_each_entry_rcu(p, n, &br->router_list, rlist) { + hlist_for_each_entry_rcu(p, &br->router_list, rlist) { if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex)) goto fail; } @@ -61,12 +60,11 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, return -EMSGSIZE; for (i = 0; i < mdb->max; i++) { - struct hlist_node *h; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p, **pp; struct net_bridge_port *port; - hlist_for_each_entry_rcu(mp, h, &mdb->mhash[i], hlist[mdb->ver]) { + hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) { if (idx < s_idx) goto skip; @@ -82,6 +80,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, port = p->port; if (port) { struct br_mdb_entry e; + memset(&e, 0, sizeof(e)); e.ifindex = port->dev->ifindex; e.state = p->state; if (p->addr.proto == htons(ETH_P_IP)) @@ -138,6 +137,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) break; bpm = nlmsg_data(nlh); + memset(bpm, 0, sizeof(*bpm)); bpm->ifindex = dev->ifindex; if (br_mdb_fill_info(skb, cb, dev) < 0) goto out; @@ -173,6 +173,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb, return -EMSGSIZE; bpm = nlmsg_data(nlh); + memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start(skb, MDBA_MDB); @@ -230,6 +231,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, { struct br_mdb_entry entry; + memset(&entry, 0, sizeof(entry)); entry.ifindex = port->dev->ifindex; entry.addr.proto = group->proto; entry.addr.u.ip4 = group->u.ip4; @@ -272,9 +274,6 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; int err; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY, NULL); if (err < 0) return err; @@ -383,7 +382,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, return ret; } -static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct br_mdb_entry *entry; @@ -459,7 +458,7 @@ unlock: return err; } -static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net_device *dev; struct br_mdb_entry *entry; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 6d6f26531de2..81f2389f78eb 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -39,6 +39,8 @@ static inline int 
br_ip_equal(const struct br_ip *a, const struct br_ip *b) { if (a->proto != b->proto) return 0; + if (a->vid != b->vid) + return 0; switch (a->proto) { case htons(ETH_P_IP): return a->u.ip4 == b->u.ip4; @@ -50,16 +52,19 @@ static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) return 0; } -static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip) +static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip, + __u16 vid) { - return jhash_1word(mdb->secret, (__force u32)ip) & (mdb->max - 1); + return jhash_2words((__force u32)ip, vid, mdb->secret) & (mdb->max - 1); } #if IS_ENABLED(CONFIG_IPV6) static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb, - const struct in6_addr *ip) + const struct in6_addr *ip, + __u16 vid) { - return jhash2((__force u32 *)ip->s6_addr32, 4, mdb->secret) & (mdb->max - 1); + return jhash_2words(ipv6_addr_hash(ip), vid, + mdb->secret) & (mdb->max - 1); } #endif @@ -68,10 +73,10 @@ static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb, { switch (ip->proto) { case htons(ETH_P_IP): - return __br_ip4_hash(mdb, ip->u.ip4); + return __br_ip4_hash(mdb, ip->u.ip4, ip->vid); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return __br_ip6_hash(mdb, &ip->u.ip6); + return __br_ip6_hash(mdb, &ip->u.ip6, ip->vid); #endif } return 0; @@ -81,9 +86,8 @@ static struct net_bridge_mdb_entry *__br_mdb_ip_get( struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash) { struct net_bridge_mdb_entry *mp; - struct hlist_node *p; - hlist_for_each_entry_rcu(mp, p, &mdb->mhash[hash], hlist[mdb->ver]) { + hlist_for_each_entry_rcu(mp, &mdb->mhash[hash], hlist[mdb->ver]) { if (br_ip_equal(&mp->addr, dst)) return mp; } @@ -101,31 +105,34 @@ struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, } static struct net_bridge_mdb_entry *br_mdb_ip4_get( - struct net_bridge_mdb_htable *mdb, __be32 dst) + struct net_bridge_mdb_htable *mdb, __be32 dst, __u16 vid) { struct br_ip br_dst; br_dst.u.ip4 = dst; br_dst.proto = htons(ETH_P_IP); + br_dst.vid = vid; return br_mdb_ip_get(mdb, &br_dst); } #if IS_ENABLED(CONFIG_IPV6) static struct net_bridge_mdb_entry *br_mdb_ip6_get( - struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst) + struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst, + __u16 vid) { struct br_ip br_dst; br_dst.u.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); + br_dst.vid = vid; return br_mdb_ip_get(mdb, &br_dst); } #endif struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, - struct sk_buff *skb) + struct sk_buff *skb, u16 vid) { struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb); struct br_ip ip; @@ -137,6 +144,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return NULL; ip.proto = skb->protocol; + ip.vid = vid; switch (skb->protocol) { case htons(ETH_P_IP): @@ -170,13 +178,12 @@ static int br_mdb_copy(struct net_bridge_mdb_htable *new, int elasticity) { struct net_bridge_mdb_entry *mp; - struct hlist_node *p; int maxlen; int len; int i; for (i = 0; i < old->max; i++) - hlist_for_each_entry(mp, p, &old->mhash[i], hlist[old->ver]) + hlist_for_each_entry(mp, &old->mhash[i], hlist[old->ver]) hlist_add_head(&mp->hlist[new->ver], &new->mhash[br_ip_hash(new, &mp->addr)]); @@ -186,7 +193,7 @@ static int br_mdb_copy(struct net_bridge_mdb_htable *new, maxlen = 0; for (i = 0; i < new->max; i++) { len = 0; - hlist_for_each_entry(mp, p, &new->mhash[i], hlist[new->ver]) + hlist_for_each_entry(mp, &new->mhash[i], hlist[new->ver]) len++; if 
(len > maxlen) maxlen = len; @@ -502,14 +509,13 @@ static struct net_bridge_mdb_entry *br_multicast_get_group( { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - struct hlist_node *p; unsigned int count = 0; unsigned int max; int elasticity; int err; mdb = rcu_dereference_protected(br->mdb, 1); - hlist_for_each_entry(mp, p, &mdb->mhash[hash], hlist[mdb->ver]) { + hlist_for_each_entry(mp, &mdb->mhash[hash], hlist[mdb->ver]) { count++; if (unlikely(br_ip_equal(group, &mp->addr))) return mp; @@ -694,7 +700,8 @@ err: static int br_ip4_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, - __be32 group) + __be32 group, + __u16 vid) { struct br_ip br_group; @@ -703,6 +710,7 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, br_group.u.ip4 = group; br_group.proto = htons(ETH_P_IP); + br_group.vid = vid; return br_multicast_add_group(br, port, &br_group); } @@ -710,7 +718,8 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, - const struct in6_addr *group) + const struct in6_addr *group, + __u16 vid) { struct br_ip br_group; @@ -719,6 +728,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); + br_group.vid = vid; return br_multicast_add_group(br, port, &br_group); } @@ -870,10 +880,10 @@ void br_multicast_disable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; struct net_bridge_port_group *pg; - struct hlist_node *p, *n; + struct hlist_node *n; spin_lock(&br->multicast_lock); - hlist_for_each_entry_safe(pg, p, n, &port->mglist, mglist) + hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) br_multicast_del_pg(br, pg); if (!hlist_unhashed(&port->rlist)) @@ -895,10 +905,12 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int type; int err = 0; __be32 group; + u16 vid = 0; if (!pskb_may_pull(skb, sizeof(*ih))) return -EINVAL; + br_vlan_get_tag(skb, &vid); ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -930,7 +942,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, continue; } - err = br_ip4_multicast_add_group(br, port, group); + err = br_ip4_multicast_add_group(br, port, group, vid); if (err) break; } @@ -949,10 +961,12 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, int len; int num; int err = 0; + u16 vid = 0; if (!pskb_may_pull(skb, sizeof(*icmp6h))) return -EINVAL; + br_vlan_get_tag(skb, &vid); icmp6h = icmp6_hdr(skb); num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); len = sizeof(*icmp6h); @@ -990,7 +1004,8 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, continue; } - err = br_ip6_multicast_add_group(br, port, &grec->grec_mca); + err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, + vid); if (!err) break; } @@ -1008,12 +1023,12 @@ static void br_multicast_add_router(struct net_bridge *br, struct net_bridge_port *port) { struct net_bridge_port *p; - struct hlist_node *n, *slot = NULL; + struct hlist_node *slot = NULL; - hlist_for_each_entry(p, n, &br->router_list, rlist) { + hlist_for_each_entry(p, &br->router_list, rlist) { if ((unsigned long) port >= (unsigned long) p) break; - slot = n; + slot = &p->rlist; } if (slot) @@ -1074,6 +1089,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, unsigned long now = jiffies; __be32 group; int err = 0; + u16 vid = 0; spin_lock(&br->multicast_lock); if 
(!netif_running(br->dev) || @@ -1108,7 +1124,8 @@ static int br_ip4_multicast_query(struct net_bridge *br, if (!group) goto out; - mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group); + br_vlan_get_tag(skb, &vid); + mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid); if (!mp) goto out; @@ -1149,6 +1166,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, unsigned long now = jiffies; const struct in6_addr *group = NULL; int err = 0; + u16 vid = 0; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || @@ -1180,7 +1198,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (!group) goto out; - mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group); + br_vlan_get_tag(skb, &vid); + mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid); if (!mp) goto out; @@ -1286,7 +1305,8 @@ out: static void br_ip4_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, - __be32 group) + __be32 group, + __u16 vid) { struct br_ip br_group; @@ -1295,6 +1315,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, br_group.u.ip4 = group; br_group.proto = htons(ETH_P_IP); + br_group.vid = vid; br_multicast_leave_group(br, port, &br_group); } @@ -1302,7 +1323,8 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, - const struct in6_addr *group) + const struct in6_addr *group, + __u16 vid) { struct br_ip br_group; @@ -1311,6 +1333,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); + br_group.vid = vid; br_multicast_leave_group(br, port, &br_group); } @@ -1326,6 +1349,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, unsigned int len; unsigned int offset; int err; + u16 vid = 0; /* We treat OOM as packet loss for now. 
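* (A non-zero return from this function propagates to br_handle_frame_finish(), which then drops the frame.)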
*/ if (!pskb_may_pull(skb, sizeof(*iph))) @@ -1345,7 +1369,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, return -EINVAL; if (iph->protocol != IPPROTO_IGMP) { - if ((iph->daddr & IGMP_LOCAL_GROUP_MASK) != IGMP_LOCAL_GROUP) + if (!ipv4_is_local_multicast(iph->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; } @@ -1386,6 +1410,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = 0; + br_vlan_get_tag(skb2, &vid); BR_INPUT_SKB_CB(skb)->igmp = 1; ih = igmp_hdr(skb2); @@ -1393,7 +1418,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip4_multicast_add_group(br, port, ih->group); + err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: err = br_ip4_multicast_igmp3_report(br, port, skb2); @@ -1402,7 +1427,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_query(br, port, skb2); break; case IGMP_HOST_LEAVE_MESSAGE: - br_ip4_multicast_leave_group(br, port, ih->group); + br_ip4_multicast_leave_group(br, port, ih->group, vid); break; } @@ -1427,6 +1452,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, unsigned int len; int offset; int err; + u16 vid = 0; if (!pskb_may_pull(skb, sizeof(*ip6h))) return -EINVAL; @@ -1510,6 +1536,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, err = 0; + br_vlan_get_tag(skb, &vid); BR_INPUT_SKB_CB(skb)->igmp = 1; switch (icmp6_type) { @@ -1522,7 +1549,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, } mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip6_multicast_add_group(br, port, &mld->mld_mca); + err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; } case ICMPV6_MLD2_REPORT: @@ -1539,7 +1566,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, goto out; } mld = (struct mld_msg *)skb_transport_header(skb2); - br_ip6_multicast_leave_group(br, port, &mld->mld_mca); + br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); } } @@ -1624,7 +1651,7 @@ void br_multicast_stop(struct net_bridge *br) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - struct hlist_node *p, *n; + struct hlist_node *n; u32 ver; int i; @@ -1641,7 +1668,7 @@ void br_multicast_stop(struct net_bridge *br) ver = mdb->ver; for (i = 0; i < mdb->max; i++) { - hlist_for_each_entry_safe(mp, p, n, &mdb->mhash[i], + hlist_for_each_entry_safe(mp, n, &mdb->mhash[i], hlist[ver]) { del_timer(&mp->timer); call_rcu_bh(&mp->rcu, br_multicast_free_group); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index fe43bc7b063f..1ed75bfd8d1d 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -535,7 +535,8 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb)) return br; - vlan = __vlan_find_dev_deep(br, vlan_tx_tag_get(skb) & VLAN_VID_MASK); + vlan = __vlan_find_dev_deep(br, skb->vlan_proto, + vlan_tx_tag_get(skb) & VLAN_VID_MASK); return vlan ? 
vlan : br; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 5dc66abcc9e2..8e3abf564798 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -16,6 +16,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> +#include <uapi/linux/if_bridge.h> #include "br_private.h" #include "br_private_stp.h" @@ -28,6 +29,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_MODE */ + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + 0; } @@ -64,15 +66,21 @@ static int br_port_fill_attrs(struct sk_buff *skb, * Create one netlink message for one interface * Contains port and master info as well as carrier and bridge state. */ -static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, - u32 pid, u32 seq, int event, unsigned int flags) +static int br_fill_ifinfo(struct sk_buff *skb, + const struct net_bridge_port *port, + u32 pid, u32 seq, int event, unsigned int flags, + u32 filter_mask, const struct net_device *dev) { - const struct net_bridge *br = port->br; - const struct net_device *dev = port->dev; + const struct net_bridge *br; struct ifinfomsg *hdr; struct nlmsghdr *nlh; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + if (port) + br = port->br; + else + br = netdev_priv(dev); + br_debug(br, "br_fill_info event %d port %s master %s\n", event, dev->name, br->dev->name); @@ -98,7 +106,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *por nla_put_u32(skb, IFLA_LINK, dev->iflink))) goto nla_put_failure; - if (event == RTM_NEWLINK) { + if (event == RTM_NEWLINK && port) { struct nlattr *nest = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); @@ -107,6 +115,45 @@ static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *por nla_nest_end(skb, nest); } + /* Check if the VID information is requested */ + if (filter_mask & RTEXT_FILTER_BRVLAN) { + struct nlattr *af; + const struct net_port_vlans *pv; + struct bridge_vlan_info vinfo; + u16 vid; + u16 pvid; + + if (port) + pv = nbp_get_vlan_info(port); + else + pv = br_get_vlan_info(br); + + if (!pv || bitmap_empty(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN)) + goto done; + + af = nla_nest_start(skb, IFLA_AF_SPEC); + if (!af) + goto nla_put_failure; + + pvid = br_get_pvid(pv); + for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) { + vinfo.vid = vid; + vinfo.flags = 0; + if (vid == pvid) + vinfo.flags |= BRIDGE_VLAN_INFO_PVID; + + if (test_bit(vid, pv->untagged_bitmap)) + vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED; + + if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + goto nla_put_failure; + } + + nla_nest_end(skb, af); + } + +done: return nlmsg_end(skb, nlh); nla_put_failure: @@ -119,10 +166,14 @@ nla_put_failure: */ void br_ifinfo_notify(int event, struct net_bridge_port *port) { - struct net *net = dev_net(port->dev); + struct net *net; struct sk_buff *skb; int err = -ENOBUFS; + if (!port) + return; + + net = dev_net(port->dev); br_debug(port->br, "port %u(%s) event %d\n", (unsigned int)port->port_no, port->dev->name, event); @@ -130,7 +181,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) if (skb == NULL) goto errout; - err = br_fill_ifinfo(skb, port, 0, 0, event, 0); + err = br_fill_ifinfo(skb, port, 0, 0, event, 0, 0, port->dev); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -144,24 +195,85 @@ 
errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); } + /* * Dump information about all ports, in response to GETLINK */ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev) + struct net_device *dev, u32 filter_mask) { int err = 0; struct net_bridge_port *port = br_port_get_rcu(dev); - /* not a bridge port */ - if (!port) + /* not a bridge port and no vlan info was requested */ + if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN)) goto out; - err = br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI); + err = br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI, + filter_mask, dev); out: return err; } +static const struct nla_policy ifla_br_policy[IFLA_MAX+1] = { + [IFLA_BRIDGE_FLAGS] = { .type = NLA_U16 }, + [IFLA_BRIDGE_MODE] = { .type = NLA_U16 }, + [IFLA_BRIDGE_VLAN_INFO] = { .type = NLA_BINARY, + .len = sizeof(struct bridge_vlan_info), }, +}; + +static int br_afspec(struct net_bridge *br, + struct net_bridge_port *p, + struct nlattr *af_spec, + int cmd) +{ + struct nlattr *tb[IFLA_BRIDGE_MAX+1]; + int err = 0; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MAX, af_spec, ifla_br_policy); + if (err) + return err; + + if (tb[IFLA_BRIDGE_VLAN_INFO]) { + struct bridge_vlan_info *vinfo; + + vinfo = nla_data(tb[IFLA_BRIDGE_VLAN_INFO]); + + if (vinfo->vid >= VLAN_N_VID) + return -EINVAL; + + switch (cmd) { + case RTM_SETLINK: + if (p) { + err = nbp_vlan_add(p, vinfo->vid, vinfo->flags); + if (err) + break; + + if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER) + err = br_vlan_add(p->br, vinfo->vid, + vinfo->flags); + } else + err = br_vlan_add(br, vinfo->vid, vinfo->flags); + + if (err) + break; + + break; + + case RTM_DELLINK: + if (p) { + nbp_vlan_delete(p, vinfo->vid); + if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER) + br_vlan_delete(p->br, vinfo->vid); + } else + br_vlan_delete(br, vinfo->vid); + break; + } + } + + return err; +} + static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { .type = NLA_U32 }, @@ -181,8 +293,11 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) if (p->br->stp_enabled == BR_KERNEL_STP) return -EBUSY; + /* if device is not up, change is not allowed + * if link is not present, only allowable state is disabled + */ if (!netif_running(p->dev) || - (!netif_carrier_ok(p->dev) && state != BR_STATE_DISABLED)) + (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED)) return -ENETDOWN; p->state = state; @@ -212,6 +327,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); + br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); @@ -236,47 +352,80 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) /* Change state and parameters on port. 
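* IFLA_PROTINFO carries per-port attributes handled by br_setport(), or by br_set_port_state() for the old binary RSTP interface; IFLA_AF_SPEC carries bridge_vlan_info entries handled by br_afspec() above.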
*/ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh) { - struct ifinfomsg *ifm; struct nlattr *protinfo; + struct nlattr *afspec; struct net_bridge_port *p; struct nlattr *tb[IFLA_BRPORT_MAX + 1]; - int err; - - ifm = nlmsg_data(nlh); + int err = 0; - protinfo = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_PROTINFO); - if (!protinfo) + protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!protinfo && !afspec) return 0; p = br_port_get_rtnl(dev); + /* We want to accept dev as bridge itself if the AF_SPEC + * is set to see if someone is setting vlan info on the bridge + */ + if (!p && !afspec) return -EINVAL; - if (protinfo->nla_type & NLA_F_NESTED) { - err = nla_parse_nested(tb, IFLA_BRPORT_MAX, - protinfo, ifla_brport_policy); + if (p && protinfo) { + if (protinfo->nla_type & NLA_F_NESTED) { + err = nla_parse_nested(tb, IFLA_BRPORT_MAX, + protinfo, ifla_brport_policy); + if (err) + return err; + + spin_lock_bh(&p->br->lock); + err = br_setport(p, tb); + spin_unlock_bh(&p->br->lock); + } else { + /* Binary compatibility with old RSTP */ + if (nla_len(protinfo) < sizeof(u8)) + return -EINVAL; + + spin_lock_bh(&p->br->lock); + err = br_set_port_state(p, nla_get_u8(protinfo)); + spin_unlock_bh(&p->br->lock); + } if (err) - return err; - - spin_lock_bh(&p->br->lock); - err = br_setport(p, tb); - spin_unlock_bh(&p->br->lock); - } else { - /* Binary compatibility with old RSTP */ - if (nla_len(protinfo) < sizeof(u8)) - return -EINVAL; + goto out; + } - spin_lock_bh(&p->br->lock); - err = br_set_port_state(p, nla_get_u8(protinfo)); - spin_unlock_bh(&p->br->lock); + if (afspec) { + err = br_afspec((struct net_bridge *)netdev_priv(dev), p, + afspec, RTM_SETLINK); } if (err == 0) br_ifinfo_notify(RTM_NEWLINK, p); +out: return err; } +/* Delete port information */ +int br_dellink(struct net_device *dev, struct nlmsghdr *nlh) +{ + struct nlattr *afspec; + struct net_bridge_port *p; + int err; + + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!afspec) + return 0; + + p = br_port_get_rtnl(dev); + /* We want to accept dev as bridge itself as well */ + if (!p && !(dev->priv_flags & IFF_EBRIDGE)) + return -EINVAL; + + err = br_afspec((struct net_bridge *)netdev_priv(dev), p, + afspec, RTM_DELLINK); + + return err; +} static int br_validate(struct nlattr *tb[], struct nlattr *data[]) { if (tb[IFLA_ADDRESS]) { @@ -289,6 +438,29 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } +static size_t br_get_link_af_size(const struct net_device *dev) +{ + struct net_port_vlans *pv; + + if (br_port_exists(dev)) + pv = nbp_get_vlan_info(br_port_get_rcu(dev)); + else if (dev->priv_flags & IFF_EBRIDGE) + pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev)); + else + return 0; + + if (!pv) + return 0; + + /* Each VLAN is returned in bridge_vlan_info along with flags */ + return pv->num_vlans * nla_total_size(sizeof(struct bridge_vlan_info)); +} + +static struct rtnl_af_ops br_af_ops = { + .family = AF_BRIDGE, + .get_link_af_size = br_get_link_af_size, +}; + struct rtnl_link_ops br_link_ops __read_mostly = { .kind = "bridge", .priv_size = sizeof(struct net_bridge), @@ -302,11 +474,18 @@ int __init br_netlink_init(void) int err; br_mdb_init(); - err = rtnl_link_register(&br_link_ops); + err = rtnl_af_register(&br_af_ops); if (err) goto out; + err = rtnl_link_register(&br_link_ops); + if (err) + goto out_af; + return 0; + +out_af: + 
rtnl_af_unregister(&br_af_ops); out: br_mdb_uninit(); return err; @@ -315,5 +494,6 @@ out: void __exit br_netlink_fini(void) { br_mdb_uninit(); + rtnl_af_unregister(&br_af_ops); rtnl_link_unregister(&br_link_ops); } diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index a76b62135558..1644b3e1f947 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -82,7 +82,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; case NETDEV_UP: - if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) { + if (netif_running(br->dev) && netif_oper_up(dev)) { spin_lock_bh(&br->lock); br_stp_enable_port(p); spin_unlock_bh(&br->lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 711094aed41a..d2c043a857b6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -18,6 +18,7 @@ #include <linux/netpoll.h> #include <linux/u64_stats_sync.h> #include <net/route.h> +#include <linux/if_vlan.h> #define BR_HASH_BITS 8 #define BR_HASH_SIZE (1 << BR_HASH_BITS) @@ -26,6 +27,7 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) +#define BR_VLAN_BITMAP_LEN BITS_TO_LONGS(VLAN_N_VID) #define BR_VERSION "2.3" @@ -61,6 +63,20 @@ struct br_ip #endif } u; __be16 proto; + __u16 vid; +}; + +struct net_port_vlans { + u16 port_idx; + u16 pvid; + union { + struct net_bridge_port *port; + struct net_bridge *br; + } parent; + struct rcu_head rcu; + unsigned long vlan_bitmap[BR_VLAN_BITMAP_LEN]; + unsigned long untagged_bitmap[BR_VLAN_BITMAP_LEN]; + u16 num_vlans; }; struct net_bridge_fdb_entry @@ -74,6 +90,7 @@ struct net_bridge_fdb_entry mac_addr addr; unsigned char is_local; unsigned char is_static; + __u16 vlan_id; }; struct net_bridge_port_group { @@ -139,6 +156,7 @@ struct net_bridge_port #define BR_BPDU_GUARD 0x00000002 #define BR_ROOT_BLOCK 0x00000004 #define BR_MULTICAST_FAST_LEAVE 0x00000008 +#define BR_ADMIN_COST 0x00000010 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u32 multicast_startup_queries_sent; @@ -156,6 +174,9 @@ struct net_bridge_port #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + struct net_port_vlans __rcu *vlan_info; +#endif }; #define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT) @@ -197,9 +218,6 @@ struct net_bridge bool nf_call_ip6tables; bool nf_call_arptables; #endif - unsigned long flags; -#define BR_SET_MAC_ADDR 0x00000001 - u16 group_fwd_mask; /* STP */ @@ -260,6 +278,10 @@ struct net_bridge struct timer_list topology_change_timer; struct timer_list gc_timer; struct kobject *ifobj; +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + u8 vlan_enabled; + struct net_port_vlans __rcu *vlan_info; +#endif }; struct br_input_skb_cb { @@ -355,18 +377,22 @@ extern void br_fdb_cleanup(unsigned long arg); extern void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, int do_all); extern struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, - const unsigned char *addr); + const unsigned char *addr, + __u16 vid); extern int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); extern int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, unsigned long off); extern int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr); + const unsigned char *addr, + u16 vid); extern void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr); + const unsigned char *addr, + u16 vid); +extern int fdb_delete_by_addr(struct net_bridge 
*br, const u8 *addr, u16 vid); -extern int br_fdb_delete(struct ndmsg *ndm, +extern int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr); extern int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], @@ -417,7 +443,7 @@ extern int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb); extern struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, - struct sk_buff *skb); + struct sk_buff *skb, u16 vid); extern void br_multicast_add_port(struct net_bridge_port *port); extern void br_multicast_del_port(struct net_bridge_port *port); extern void br_multicast_enable_port(struct net_bridge_port *port); @@ -479,7 +505,7 @@ static inline int br_multicast_rcv(struct net_bridge *br, } static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, - struct sk_buff *skb) + struct sk_buff *skb, u16 vid) { return NULL; } @@ -534,6 +560,142 @@ static inline void br_mdb_uninit(void) } #endif +/* br_vlan.c */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +extern bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, + struct sk_buff *skb, u16 *vid); +extern bool br_allowed_egress(struct net_bridge *br, + const struct net_port_vlans *v, + const struct sk_buff *skb); +extern struct sk_buff *br_handle_vlan(struct net_bridge *br, + const struct net_port_vlans *v, + struct sk_buff *skb); +extern int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); +extern int br_vlan_delete(struct net_bridge *br, u16 vid); +extern void br_vlan_flush(struct net_bridge *br); +extern int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); +extern int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); +extern int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); +extern void nbp_vlan_flush(struct net_bridge_port *port); +extern bool nbp_vlan_find(struct net_bridge_port *port, u16 vid); + +static inline struct net_port_vlans *br_get_vlan_info( + const struct net_bridge *br) +{ + return rcu_dereference_rtnl(br->vlan_info); +} + +static inline struct net_port_vlans *nbp_get_vlan_info( + const struct net_bridge_port *p) +{ + return rcu_dereference_rtnl(p->vlan_info); +} + +/* Since the bridge now depends on the 8021Q module, by the time the bridge + * sees the skb the vlan tag will always be present if the frame was tagged. + */ +static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) +{ + int err = 0; + + if (vlan_tx_tag_present(skb)) + *vid = vlan_tx_tag_get(skb) & VLAN_VID_MASK; + else { + *vid = 0; + err = -EINVAL; + } + + return err; +} + +static inline u16 br_get_pvid(const struct net_port_vlans *v) +{ + /* Return just the VID if it is set, or VLAN_N_VID (invalid vid) if + * vid wasn't set + */ + smp_rmb(); + return (v->pvid & VLAN_TAG_PRESENT) ? 
+ (v->pvid & ~VLAN_TAG_PRESENT) : + VLAN_N_VID; +} + +#else +static inline bool br_allowed_ingress(struct net_bridge *br, + struct net_port_vlans *v, + struct sk_buff *skb, + u16 *vid) +{ + return true; +} + +static inline bool br_allowed_egress(struct net_bridge *br, + const struct net_port_vlans *v, + const struct sk_buff *skb) +{ + return true; +} + +static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, + const struct net_port_vlans *v, + struct sk_buff *skb) +{ + return skb; +} + +static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) +{ + return -EOPNOTSUPP; +} + +static inline int br_vlan_delete(struct net_bridge *br, u16 vid) +{ + return -EOPNOTSUPP; +} + +static inline void br_vlan_flush(struct net_bridge *br) +{ +} + +static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) +{ + return -EOPNOTSUPP; +} + +static inline int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) +{ + return -EOPNOTSUPP; +} + +static inline void nbp_vlan_flush(struct net_bridge_port *port) +{ +} + +static inline struct net_port_vlans *br_get_vlan_info( + const struct net_bridge *br) +{ + return NULL; +} +static inline struct net_port_vlans *nbp_get_vlan_info( + const struct net_bridge_port *p) +{ + return NULL; +} + +static inline bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) +{ + return false; +} + +static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) +{ + return 0; +} +static inline u16 br_get_pvid(const struct net_port_vlans *v) +{ + return VLAN_N_VID; /* Returns invalid vid */ +} +#endif + /* br_netfilter.c */ #ifdef CONFIG_BRIDGE_NETFILTER extern int br_netfilter_init(void); @@ -594,8 +756,9 @@ extern int br_netlink_init(void); extern void br_netlink_fini(void); extern void br_ifinfo_notify(int event, struct net_bridge_port *port); extern int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg); +extern int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg); extern int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev); + struct net_device *dev, u32 filter_mask); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index b01849a74310..1c0a50f13229 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -225,7 +225,14 @@ static void br_record_config_timeout_values(struct net_bridge *br, /* called under bridge lock */ void br_transmit_tcn(struct net_bridge *br) { - br_send_tcn_bpdu(br_get_port(br, br->root_port)); + struct net_bridge_port *p; + + p = br_get_port(br, br->root_port); + if (p) + br_send_tcn_bpdu(p); + else + br_notice(br, "root port %u not found for topology notice\n", + br->root_port); } /* called under bridge lock */ diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 9d5a414a3943..d45e760141bb 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -54,7 +54,7 @@ void br_stp_enable_bridge(struct net_bridge *br) br_config_bpdu_generation(br); list_for_each_entry(p, &br->port_list, list) { - if ((p->dev->flags & IFF_UP) && netif_carrier_ok(p->dev)) + if (netif_running(p->dev) && netif_oper_up(p->dev)) br_stp_enable_port(p); } @@ -216,7 +216,7 @@ bool br_stp_recalculate_bridge_id(struct net_bridge *br) struct net_bridge_port *p; /* user has chosen a value so keep it */ - if (br->flags & BR_SET_MAC_ADDR) + if (br->dev->addr_assign_type == NET_ADDR_SET) return false; list_for_each_entry(p, &br->port_list, list) { @@ -288,6 +288,7 @@ int br_stp_set_path_cost(struct net_bridge_port *p, 
unsigned long path_cost) path_cost > BR_MAX_PATH_COST) return -ERANGE; + p->flags |= BR_ADMIN_COST; + p->path_cost = path_cost; br_configuration_update(p->br); br_port_state_selection(p->br); diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index c3530a81a33b..950663d4d330 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -107,7 +107,7 @@ static void br_tcn_timer_expired(unsigned long arg) br_debug(br, "tcn timer expired\n"); spin_lock(&br->lock); - if (br->dev->flags & IFF_UP) { + if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) { br_transmit_tcn(br); mod_timer(&br->tcn_timer,jiffies + br->bridge_hello_time); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 5913a3a0047b..8baa9c08e1a4 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -691,6 +691,24 @@ static ssize_t store_nf_call_arptables( static DEVICE_ATTR(nf_call_arptables, S_IRUGO | S_IWUSR, show_nf_call_arptables, store_nf_call_arptables); #endif +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +static ssize_t show_vlan_filtering(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br->vlan_enabled); +} + +static ssize_t store_vlan_filtering(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_filter_toggle); +} +static DEVICE_ATTR(vlan_filtering, S_IRUGO | S_IWUSR, + show_vlan_filtering, store_vlan_filtering); +#endif static struct attribute *bridge_attrs[] = { &dev_attr_forward_delay.attr, @@ -732,6 +750,9 @@ static struct attribute *bridge_attrs[] = { &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, #endif +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + &dev_attr_vlan_filtering.attr, +#endif NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c new file mode 100644 index 000000000000..bd58b45f5f90 --- /dev/null +++ b/net/bridge/br_vlan.c @@ -0,0 +1,419 @@ +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> + +#include "br_private.h" + +static void __vlan_add_pvid(struct net_port_vlans *v, u16 vid) +{ + if (v->pvid == vid) + return; + + smp_wmb(); + v->pvid = vid; +} + +static void __vlan_delete_pvid(struct net_port_vlans *v, u16 vid) +{ + if (v->pvid != vid) + return; + + smp_wmb(); + v->pvid = 0; +} + +static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags) +{ + if (flags & BRIDGE_VLAN_INFO_PVID) + __vlan_add_pvid(v, vid); + + if (flags & BRIDGE_VLAN_INFO_UNTAGGED) + set_bit(vid, v->untagged_bitmap); +} + +static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) +{ + const struct net_device_ops *ops; + struct net_bridge_port *p = NULL; + struct net_bridge *br; + struct net_device *dev; + int err; + + if (test_bit(vid, v->vlan_bitmap)) { + __vlan_add_flags(v, vid, flags); + return 0; + } + + if (vid) { + if (v->port_idx) { + p = v->parent.port; + br = p->br; + dev = p->dev; + } else { + br = v->parent.br; + dev = br->dev; + } + ops = dev->netdev_ops; + + if (p && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { + /* Add VLAN to the device filter if it is supported. + * Strictly speaking, this is not necessary now, since + * devices are made promiscuous by the bridge, but if + * that ever changes this code will allow tagged + * traffic to enter the bridge. 
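+ * For example, adding vid 10 on a port whose device advertises + * NETIF_F_HW_VLAN_CTAG_FILTER triggers the ndo_vlan_rx_add_vid() call + * below, so the NIC's hardware filter also accepts 802.1Q frames + * tagged with vlan 10.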
+ */ + err = ops->ndo_vlan_rx_add_vid(dev, htons(ETH_P_8021Q), + vid); + if (err) + return err; + } + + err = br_fdb_insert(br, p, dev->dev_addr, vid); + if (err) { + br_err(br, "failed to insert local address into bridge " + "forwarding table\n"); + goto out_filt; + } + + } + + set_bit(vid, v->vlan_bitmap); + v->num_vlans++; + __vlan_add_flags(v, vid, flags); + + return 0; + +out_filt: + if (p && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + ops->ndo_vlan_rx_kill_vid(dev, htons(ETH_P_8021Q), vid); + return err; +} + +static int __vlan_del(struct net_port_vlans *v, u16 vid) +{ + if (!test_bit(vid, v->vlan_bitmap)) + return -EINVAL; + + __vlan_delete_pvid(v, vid); + clear_bit(vid, v->untagged_bitmap); + + if (v->port_idx && vid) { + struct net_device *dev = v->parent.port->dev; + const struct net_device_ops *ops = dev->netdev_ops; + + if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) + ops->ndo_vlan_rx_kill_vid(dev, htons(ETH_P_8021Q), vid); + } + + clear_bit(vid, v->vlan_bitmap); + v->num_vlans--; + if (bitmap_empty(v->vlan_bitmap, BR_VLAN_BITMAP_LEN)) { + if (v->port_idx) + rcu_assign_pointer(v->parent.port->vlan_info, NULL); + else + rcu_assign_pointer(v->parent.br->vlan_info, NULL); + kfree_rcu(v, rcu); + } + return 0; +} + +static void __vlan_flush(struct net_port_vlans *v) +{ + smp_wmb(); + v->pvid = 0; + bitmap_zero(v->vlan_bitmap, BR_VLAN_BITMAP_LEN); + if (v->port_idx) + rcu_assign_pointer(v->parent.port->vlan_info, NULL); + else + rcu_assign_pointer(v->parent.br->vlan_info, NULL); + kfree_rcu(v, rcu); +} + +/* Strip the tag from the packet. Will return skb with tci set to 0. */ +static struct sk_buff *br_vlan_untag(struct sk_buff *skb) +{ + if (skb->protocol != htons(ETH_P_8021Q)) { + skb->vlan_tci = 0; + return skb; + } + + skb->vlan_tci = 0; + skb = vlan_untag(skb); + if (skb) + skb->vlan_tci = 0; + + return skb; +} + +struct sk_buff *br_handle_vlan(struct net_bridge *br, + const struct net_port_vlans *pv, + struct sk_buff *skb) +{ + u16 vid; + + if (!br->vlan_enabled) + goto out; + + /* At this point, we know that the frame was filtered and contains + * a valid vlan id. If the vlan id is set in the untagged bitmap, + * send untagged; otherwise, send tagged. + */ + br_vlan_get_tag(skb, &vid); + if (test_bit(vid, pv->untagged_bitmap)) + skb = br_vlan_untag(skb); + else { + /* Egress policy says "send tagged". If output device + * is the bridge, we need to add the VLAN header + * ourselves since we'll be going through the RX path. + * Sending to ports puts the frame on the TX path and + * we let dev_hard_start_xmit() add the header. + */ + if (skb->protocol != htons(ETH_P_8021Q) && + pv->port_idx == 0) { + /* vlan_put_tag expects skb->data to point to + * mac header. + */ + skb_push(skb, ETH_HLEN); + skb = __vlan_put_tag(skb, skb->vlan_proto, skb->vlan_tci); + if (!skb) + goto out; + /* put skb->data back to where it was */ + skb_pull(skb, ETH_HLEN); + skb->vlan_tci = 0; + } + } + +out: + return skb; +} + +/* Called under RCU */ +bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, + struct sk_buff *skb, u16 *vid) +{ + /* If VLAN filtering is disabled on the bridge, all packets are + * permitted. + */ + if (!br->vlan_enabled) + return true; + + /* If there are no vlans in the permitted list, all packets are + * rejected. + */ + if (!v) + return false; + + if (br_vlan_get_tag(skb, vid)) { + u16 pvid = br_get_pvid(v); + + /* Frame did not have a tag. See if pvid is set + * on this port. That tells us which vlan untagged + * traffic belongs to. 
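+ * For example, if this port's pvid is 100, an untagged frame is + * hwaccel-tagged with vid 100 below and accepted as vlan 100 traffic.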
+ */ + if (pvid == VLAN_N_VID) + return false; + + /* PVID is set on this port. Any untagged ingress + * frame is considered to belong to this vlan. + */ + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), pvid); + return true; + } + + /* Frame had a valid vlan tag. See if vlan is allowed */ + if (test_bit(*vid, v->vlan_bitmap)) + return true; + + return false; +} + +/* Called under RCU. */ +bool br_allowed_egress(struct net_bridge *br, + const struct net_port_vlans *v, + const struct sk_buff *skb) +{ + u16 vid; + + if (!br->vlan_enabled) + return true; + + if (!v) + return false; + + br_vlan_get_tag(skb, &vid); + if (test_bit(vid, v->vlan_bitmap)) + return true; + + return false; +} + +/* Must be protected by RTNL */ +int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) +{ + struct net_port_vlans *pv = NULL; + int err; + + ASSERT_RTNL(); + + pv = rtnl_dereference(br->vlan_info); + if (pv) + return __vlan_add(pv, vid, flags); + + /* Create port vlan information + */ + pv = kzalloc(sizeof(*pv), GFP_KERNEL); + if (!pv) + return -ENOMEM; + + pv->parent.br = br; + err = __vlan_add(pv, vid, flags); + if (err) + goto out; + + rcu_assign_pointer(br->vlan_info, pv); + return 0; +out: + kfree(pv); + return err; +} + +/* Must be protected by RTNL */ +int br_vlan_delete(struct net_bridge *br, u16 vid) +{ + struct net_port_vlans *pv; + + ASSERT_RTNL(); + + pv = rtnl_dereference(br->vlan_info); + if (!pv) + return -EINVAL; + + if (vid) { + /* If the VID !=0 remove fdb for this vid. VID 0 is special + * in that it's the default and is always there in the fdb. + */ + spin_lock_bh(&br->hash_lock); + fdb_delete_by_addr(br, br->dev->dev_addr, vid); + spin_unlock_bh(&br->hash_lock); + } + + __vlan_del(pv, vid); + return 0; +} + +void br_vlan_flush(struct net_bridge *br) +{ + struct net_port_vlans *pv; + + ASSERT_RTNL(); + pv = rtnl_dereference(br->vlan_info); + if (!pv) + return; + + __vlan_flush(pv); +} + +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +{ + if (!rtnl_trylock()) + return restart_syscall(); + + if (br->vlan_enabled == val) + goto unlock; + + br->vlan_enabled = val; + +unlock: + rtnl_unlock(); + return 0; +} + +/* Must be protected by RTNL */ +int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) +{ + struct net_port_vlans *pv = NULL; + int err; + + ASSERT_RTNL(); + + pv = rtnl_dereference(port->vlan_info); + if (pv) + return __vlan_add(pv, vid, flags); + + /* Create port vlan information + */ + pv = kzalloc(sizeof(*pv), GFP_KERNEL); + if (!pv) { + err = -ENOMEM; + goto clean_up; + } + + pv->port_idx = port->port_no; + pv->parent.port = port; + err = __vlan_add(pv, vid, flags); + if (err) + goto clean_up; + + rcu_assign_pointer(port->vlan_info, pv); + return 0; + +clean_up: + kfree(pv); + return err; +} + +/* Must be protected by RTNL */ +int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) +{ + struct net_port_vlans *pv; + + ASSERT_RTNL(); + + pv = rtnl_dereference(port->vlan_info); + if (!pv) + return -EINVAL; + + if (vid) { + /* If the VID !=0 remove fdb for this vid. VID 0 is special + * in that it's the default and is always there in the fdb. 
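+ * (e.g. nbp_vlan_delete(port, 10) also removes the fdb entry that + * __vlan_add() inserted for the port's mac address with vid 10.)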
+ */ + spin_lock_bh(&port->br->hash_lock); + fdb_delete_by_addr(port->br, port->dev->dev_addr, vid); + spin_unlock_bh(&port->br->hash_lock); + } + + return __vlan_del(pv, vid); +} + +void nbp_vlan_flush(struct net_bridge_port *port) +{ + struct net_port_vlans *pv; + + ASSERT_RTNL(); + + pv = rtnl_dereference(port->vlan_info); + if (!pv) + return; + + __vlan_flush(pv); +} + +bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) +{ + struct net_port_vlans *pv; + bool found = false; + + rcu_read_lock(); + pv = rcu_dereference(port->vlan_info); + + if (!pv) + goto out; + + if (test_bit(vid, pv->vlan_bitmap)) + found = true; + +out: + rcu_read_unlock(); + return found; +} diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 92de5e5f9db2..9878eb8204c5 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -78,6 +78,11 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum, const char *prefix) { unsigned int bitmask; + struct net *net = dev_net(in ? in : out); + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; spin_lock_bh(&ebt_log_lock); printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x", @@ -176,17 +181,18 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_log_info *info = par->targinfo; struct nf_loginfo li; + struct net *net = dev_net(par->in ? par->in : par->out); li.type = NF_LOG_TYPE_LOG; li.u.log.level = info->loglevel; li.u.log.logflags = info->bitmask; if (info->bitmask & EBT_LOG_NFLOG) - nf_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, - par->out, &li, "%s", info->prefix); + nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, + par->in, par->out, &li, "%s", info->prefix); else ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, - par->out, &li, info->prefix); + par->out, &li, info->prefix); return EBT_CONTINUE; } @@ -206,19 +212,47 @@ static struct nf_logger ebt_log_logger __read_mostly = { .me = THIS_MODULE, }; +static int __net_init ebt_log_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_BRIDGE, &ebt_log_logger); + return 0; +} + +static void __net_exit ebt_log_net_fini(struct net *net) +{ + nf_log_unset(net, &ebt_log_logger); +} + +static struct pernet_operations ebt_log_net_ops = { + .init = ebt_log_net_init, + .exit = ebt_log_net_fini, +}; + static int __init ebt_log_init(void) { int ret; + ret = register_pernet_subsys(&ebt_log_net_ops); + if (ret < 0) + goto err_pernet; + ret = xt_register_target(&ebt_log_tg_reg); if (ret < 0) - return ret; + goto err_target; + nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger); - return 0; + + return ret; + +err_target: + unregister_pernet_subsys(&ebt_log_net_ops); +err_pernet: + return ret; } static void __exit ebt_log_fini(void) { + unregister_pernet_subsys(&ebt_log_net_ops); nf_log_unregister(&ebt_log_logger); xt_unregister_target(&ebt_log_tg_reg); } diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c index 5be68bbcc341..59ac7952010d 100644 --- a/net/bridge/netfilter/ebt_nflog.c +++ b/net/bridge/netfilter/ebt_nflog.c @@ -24,14 +24,15 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nflog_info *info = par->targinfo; struct nf_loginfo li; + struct net *net = dev_net(par->in ? 
par->in : par->out); li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nf_log_packet(PF_BRIDGE, par->hooknum, skb, par->in, par->out, - &li, "%s", info->prefix); + nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in, + par->out, &li, "%s", info->prefix); return EBT_CONTINUE; } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 3476ec469740..fc1905c51417 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -35,12 +35,13 @@ #include <linux/skbuff.h> #include <linux/kernel.h> #include <linux/timer.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netdevice.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ulog.h> #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h> #include <net/sock.h> #include "../br_private.h" @@ -62,16 +63,24 @@ typedef struct { spinlock_t lock; /* the per-queue lock */ } ebt_ulog_buff_t; -static ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS]; -static struct sock *ebtulognl; +static int ebt_ulog_net_id __read_mostly; +struct ebt_ulog_net { + unsigned int nlgroup[EBT_ULOG_MAXNLGROUPS]; + ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS]; + struct sock *ebtulognl; +}; + +static struct ebt_ulog_net *ebt_ulog_pernet(struct net *net) +{ + return net_generic(net, ebt_ulog_net_id); +} /* send one ulog_buff_t to userspace */ -static void ulog_send(unsigned int nlgroup) +static void ulog_send(struct ebt_ulog_net *ebt, unsigned int nlgroup) { - ebt_ulog_buff_t *ub = &ulog_buffers[nlgroup]; + ebt_ulog_buff_t *ub = &ebt->ulog_buffers[nlgroup]; - if (timer_pending(&ub->timer)) - del_timer(&ub->timer); + del_timer(&ub->timer); if (!ub->skb) return; @@ -81,7 +90,7 @@ static void ulog_send(unsigned int nlgroup) ub->lastnlh->nlmsg_type = NLMSG_DONE; NETLINK_CB(ub->skb).dst_group = nlgroup + 1; - netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); + netlink_broadcast(ebt->ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; @@ -90,10 +99,15 @@ static void ulog_send(unsigned int nlgroup) /* timer function to flush queue in flushtimeout time */ static void ulog_timer(unsigned long data) { - spin_lock_bh(&ulog_buffers[data].lock); - if (ulog_buffers[data].skb) - ulog_send(data); - spin_unlock_bh(&ulog_buffers[data].lock); + struct ebt_ulog_net *ebt = container_of((void *)data, + struct ebt_ulog_net, + nlgroup[*(unsigned int *)data]); + + ebt_ulog_buff_t *ub = &ebt->ulog_buffers[*(unsigned int *)data]; + spin_lock_bh(&ub->lock); + if (ub->skb) + ulog_send(ebt, *(unsigned int *)data); + spin_unlock_bh(&ub->lock); } static struct sk_buff *ulog_alloc_skb(unsigned int size) @@ -124,8 +138,10 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, ebt_ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; + struct net *net = dev_net(in ? 
in : out); + struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); unsigned int group = uloginfo->nlgroup; - ebt_ulog_buff_t *ub = &ulog_buffers[group]; + ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group]; spinlock_t *lock = &ub->lock; ktime_t kt; @@ -135,7 +151,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, else copy_len = uloginfo->cprange; - size = NLMSG_SPACE(sizeof(*pm) + copy_len); + size = nlmsg_total_size(sizeof(*pm) + copy_len); if (size > nlbufsiz) { pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz); return; } @@ -147,7 +163,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, if (!(ub->skb = ulog_alloc_skb(size))) goto unlock; } else if (size > skb_tailroom(ub->skb)) { - ulog_send(group); + ulog_send(ebt, group); if (!(ub->skb = ulog_alloc_skb(size))) goto unlock; @@ -206,7 +222,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, ub->lastnlh = nlh; if (ub->qlen >= uloginfo->qthreshold) - ulog_send(group); + ulog_send(ebt, group); else if (!timer_pending(&ub->timer)) { ub->timer.expires = jiffies + flushtimeout * HZ / 100; add_timer(&ub->timer); @@ -278,57 +294,89 @@ static struct nf_logger ebt_ulog_logger __read_mostly = { .me = THIS_MODULE, }; -static int __init ebt_ulog_init(void) +static int __net_init ebt_ulog_net_init(struct net *net) { - int ret; int i; + struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); + struct netlink_kernel_cfg cfg = { .groups = EBT_ULOG_MAXNLGROUPS, }; - if (nlbufsiz >= 128*1024) { - pr_warning("Netlink buffer has to be <= 128kB," - " please try a smaller nlbufsiz parameter.\n"); - return -EINVAL; - } - /* initialize ulog_buffers */ for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - spin_lock_init(&ulog_buffers[i].lock); + ebt->nlgroup[i] = i; + setup_timer(&ebt->ulog_buffers[i].timer, ulog_timer, + (unsigned long)&ebt->nlgroup[i]); + spin_lock_init(&ebt->ulog_buffers[i].lock); } - ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg); - if (!ebtulognl) - ret = -ENOMEM; - else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0) - netlink_kernel_release(ebtulognl); - - if (ret == 0) - nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); + ebt->ebtulognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); + if (!ebt->ebtulognl) + return -ENOMEM; - return ret; + nf_log_set(net, NFPROTO_BRIDGE, &ebt_ulog_logger); + return 0; } -static void __exit ebt_ulog_fini(void) +static void __net_exit ebt_ulog_net_fini(struct net *net) { - ebt_ulog_buff_t *ub; int i; + struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - nf_log_unregister(&ebt_ulog_logger); - xt_unregister_target(&ebt_ulog_tg_reg); + nf_log_unset(net, &ebt_ulog_logger); for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - ub = &ulog_buffers[i]; - if (timer_pending(&ub->timer)) - del_timer(&ub->timer); - spin_lock_bh(&ub->lock); + ebt_ulog_buff_t *ub = &ebt->ulog_buffers[i]; + del_timer(&ub->timer); + if (ub->skb) { kfree_skb(ub->skb); ub->skb = NULL; } - spin_unlock_bh(&ub->lock); } - netlink_kernel_release(ebtulognl); + netlink_kernel_release(ebt->ebtulognl); +} + +static struct pernet_operations ebt_ulog_net_ops = { + .init = ebt_ulog_net_init, + .exit = ebt_ulog_net_fini, + .id = &ebt_ulog_net_id, + .size = sizeof(struct ebt_ulog_net), +}; + +static int __init ebt_ulog_init(void) +{ + int ret; + + if (nlbufsiz >= 128*1024) { + pr_warn("Netlink buffer has to be <= 128kB, " + "please try a smaller nlbufsiz parameter.\n"); + return -EINVAL; + } + + ret = 
register_pernet_subsys(&ebt_ulog_net_ops); + if (ret) + goto out_pernet; + + ret = xt_register_target(&ebt_ulog_tg_reg); + if (ret) + goto out_target; + + nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); + + return 0; + +out_target: + unregister_pernet_subsys(&ebt_ulog_net_ops); +out_pernet: + return ret; +} + +static void __exit ebt_ulog_fini(void) +{ + nf_log_unregister(&ebt_ulog_logger); + xt_unregister_target(&ebt_ulog_tg_reg); + unregister_pernet_subsys(&ebt_ulog_net_ops); } module_init(ebt_ulog_init); diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 40d8258bf74f..70f656ce0f4a 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -64,9 +64,7 @@ static int ebt_broute(struct sk_buff *skb) static int __net_init broute_net_init(struct net *net) { net->xt.broute_table = ebt_register_table(net, &broute_table); - if (IS_ERR(net->xt.broute_table)) - return PTR_ERR(net->xt.broute_table); - return 0; + return PTR_RET(net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5fe2ff3b01ef..3d110c4fc787 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -138,7 +138,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { - if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO)) + if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO)) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && FWINV2(e->ethproto != ethproto, EBT_IPROTO)) @@ -1472,16 +1472,17 @@ static int do_ebt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; + struct net *net = sock_net(sk); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; switch(cmd) { case EBT_SO_SET_ENTRIES: - ret = do_replace(sock_net(sk), user, len); + ret = do_replace(net, user, len); break; case EBT_SO_SET_COUNTERS: - ret = update_counters(sock_net(sk), user, len); + ret = update_counters(net, user, len); break; default: ret = -EINVAL; @@ -1494,14 +1495,15 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) int ret; struct ebt_replace tmp; struct ebt_table *t; + struct net *net = sock_net(sk); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; - t = find_table_lock(sock_net(sk), tmp.name, &ret, &ebt_mutex); + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; @@ -2279,16 +2281,17 @@ static int compat_do_ebt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; + struct net *net = sock_net(sk); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case EBT_SO_SET_ENTRIES: - ret = compat_do_replace(sock_net(sk), user, len); + ret = compat_do_replace(net, user, len); break; case EBT_SO_SET_COUNTERS: - ret = compat_update_counters(sock_net(sk), user, len); + ret = compat_update_counters(net, user, len); break; default: ret = -EINVAL; @@ -2302,8 +2305,9 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, int ret; struct compat_ebt_replace tmp; struct ebt_table *t; + struct net *net = sock_net(sk); - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; /* try real handler in case userland supplied needed 
padding */ @@ -2314,7 +2318,7 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; - t = find_table_lock(sock_net(sk), tmp.name, &ret, &ebt_mutex); + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); if (!t) return ret; diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 1ae1d9cb278d..1f9ece1a9c34 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -1,7 +1,7 @@ /* * CAIF Interface registration. * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 * * Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont @@ -118,7 +118,7 @@ static struct caif_device_entry *caif_get(struct net_device *dev) return NULL; } -void caif_flow_cb(struct sk_buff *skb) +static void caif_flow_cb(struct sk_buff *skb) { struct caif_device_entry *caifd; void (*dtor)(struct sk_buff *skb) = NULL; @@ -301,10 +301,11 @@ static void dev_flowctrl(struct net_device *dev, int on) } void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, - struct cflayer *link_support, int head_room, - struct cflayer **layer, int (**rcv_func)( - struct sk_buff *, struct net_device *, - struct packet_type *, struct net_device *)) + struct cflayer *link_support, int head_room, + struct cflayer **layer, + int (**rcv_func)(struct sk_buff *, struct net_device *, + struct packet_type *, + struct net_device *)) { struct caif_device_entry *caifd; enum cfcnfg_phy_preference pref; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 095259f83902..05a41c7ec304 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -197,8 +197,8 @@ static void cfsk_put(struct cflayer *layr) /* Packet Control Callback function called from CAIF */ static void caif_ctrl_cb(struct cflayer *layr, - enum caif_ctrlcmd flow, - int phyid) + enum caif_ctrlcmd flow, + int phyid) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); switch (flow) { @@ -274,7 +274,7 @@ static void caif_check_flow_release(struct sock *sk) * changed locking, address handling and added MSG_TRUNC. */ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t len, int flags) + struct msghdr *m, size_t len, int flags) { struct sock *sk = sock->sk; @@ -286,6 +286,8 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, if (m->msg_flags&MSG_OOB) goto read_error; + m->msg_namelen = 0; + skb = skb_recv_datagram(sk, flags, 0 , &ret); if (!skb) goto read_error; @@ -346,8 +348,8 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) * changed locking calls, changed address handling. */ static int caif_stream_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) + struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; int copied = 0; @@ -462,7 +464,7 @@ out: * CAIF flow-on and sock_writable. 
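* (The function uses the DEFINE_WAIT() below to sleep until these conditions hold, or until the timeout expires or a signal arrives.)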
*/ static long caif_wait_for_flow_on(struct caifsock *cf_sk, - int wait_writeable, long timeo, int *err) + int wait_writeable, long timeo, int *err) { struct sock *sk = &cf_sk->sk; DEFINE_WAIT(wait); @@ -516,7 +518,7 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, /* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */ static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -591,7 +593,7 @@ err: * and other minor adaptations. */ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -670,7 +672,7 @@ out_err: } static int setsockopt(struct socket *sock, - int lvl, int opt, char __user *ov, unsigned int ol) + int lvl, int opt, char __user *ov, unsigned int ol) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -932,7 +934,7 @@ static int caif_release(struct socket *sock) /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ static unsigned int caif_poll(struct file *file, - struct socket *sock, poll_table *wait) + struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned int mask; @@ -1022,7 +1024,7 @@ static void caif_sock_destructor(struct sock *sk) } static int caif_create(struct net *net, struct socket *sock, int protocol, - int kern) + int kern) { struct sock *sk = NULL; struct caifsock *cf_sk = NULL; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 3ebc8cbc91ff..942e00a425fd 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -1,7 +1,7 @@ /* * CAIF USB handler * Copyright (C) ST-Ericsson AB 2011 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 * */ @@ -75,14 +75,14 @@ static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt) } static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid) + int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); } -struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], - u8 braddr[ETH_ALEN]) +static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], + u8 braddr[ETH_ALEN]) { struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC); @@ -121,7 +121,7 @@ static struct packet_type caif_usb_type __read_mostly = { }; static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, - void *arg) + void *arg) { struct net_device *dev = arg; struct caif_dev_common common; diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index ba9cfd47778a..fa39fc298708 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -61,11 +61,11 @@ struct cfcnfg { }; static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, - enum cfctrl_srv serv, u8 phyid, - struct cflayer *adapt_layer); + enum cfctrl_srv serv, u8 phyid, + struct cflayer *adapt_layer); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id); static void cfcnfg_reject_rsp(struct cflayer 
*layer, u8 channel_id, - struct cflayer *adapt_layer); + struct cflayer *adapt_layer); static void cfctrl_resp_func(void); static void cfctrl_enum_resp(void); @@ -131,7 +131,7 @@ static void cfctrl_resp_func(void) } static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, - u8 phyid) + u8 phyid) { struct cfcnfg_phyinfo *phy; @@ -216,8 +216,8 @@ static const int protohead[CFCTRL_SRV_MASK] = { static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, - struct caif_connect_request *s, - struct cfctrl_link_param *l) + struct caif_connect_request *s, + struct cfctrl_link_param *l) { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; @@ -301,8 +301,7 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, struct cflayer *adap_layer, int *ifindex, - int *proto_head, - int *proto_tail) + int *proto_head, int *proto_tail) { struct cflayer *frml; struct cfcnfg_phyinfo *phy; @@ -364,7 +363,7 @@ unlock: EXPORT_SYMBOL(caif_connect_client); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, - struct cflayer *adapt_layer) + struct cflayer *adapt_layer) { if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, @@ -402,7 +401,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { - pr_err("ERROR: Link Layer Device dissapeared" + pr_err("ERROR: Link Layer Device disappeared" "while connecting\n"); goto unlock; } @@ -526,7 +525,7 @@ out_err: EXPORT_SYMBOL(cfcnfg_add_phy_layer); int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, - bool up) + bool up) { struct cfcnfg_phyinfo *phyinfo; diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index a376ec1ac0a7..2bd4b58f4372 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -20,12 +20,12 @@ #ifdef CAIF_NO_LOOP static int handle_loop(struct cfctrl *ctrl, - int cmd, struct cfpkt *pkt){ + int cmd, struct cfpkt *pkt){ return -1; } #else static int handle_loop(struct cfctrl *ctrl, - int cmd, struct cfpkt *pkt); + int cmd, struct cfpkt *pkt); #endif static int cfctrl_recv(struct cflayer *layr, struct cfpkt *pkt); static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, @@ -72,7 +72,7 @@ void cfctrl_remove(struct cflayer *layer) } static bool param_eq(const struct cfctrl_link_param *p1, - const struct cfctrl_link_param *p2) + const struct cfctrl_link_param *p2) { bool eq = p1->linktype == p2->linktype && @@ -197,8 +197,8 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid) } int cfctrl_linkup_request(struct cflayer *layer, - struct cfctrl_link_param *param, - struct cflayer *user_layer) + struct cfctrl_link_param *param, + struct cflayer *user_layer) { struct cfctrl *cfctrl = container_obj(layer); u32 tmp32; @@ -301,7 +301,7 @@ int cfctrl_linkup_request(struct cflayer *layer, } int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid, - struct cflayer *client) + struct cflayer *client) { int ret; struct cfpkt *pkt; @@ -555,7 +555,7 @@ error: } static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid) + int phyid) { struct cfctrl *this = container_obj(layr); switch (ctrl) { diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c 
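Nearly everything else in the caif hunks through this stretch is whitespace-only: continuation lines of multi-line prototypes are re-indented so they line up under the first argument, matching common kernel style. Illustrative only:

/* before: continuation indented by an arbitrary amount */
static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
		struct cflayer *adapt_layer);

/* after: continuation aligned with the opening parenthesis */
static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
			      struct cflayer *adapt_layer);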
index 2914659eb9b2..7aae0b56829e 100644 --- a/net/caif/cfdbgl.c +++ b/net/caif/cfdbgl.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c index a63f4a5f5aff..3bdddb32d55a 100644 --- a/net/caif/cfdgml.c +++ b/net/caif/cfdgml.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c index 0a7df7ef062d..8bc7caa28e64 100644 --- a/net/caif/cffrml.c +++ b/net/caif/cffrml.c @@ -2,7 +2,7 @@ * CAIF Framing Layer. * * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -28,7 +28,7 @@ struct cffrml { static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt); static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid); + int phyid); static u32 cffrml_rcv_error; static u32 cffrml_rcv_checsum_error; @@ -167,7 +167,7 @@ static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) } static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid) + int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index 94b08612a4d8..8c5d6386319f 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -42,7 +42,7 @@ struct cfmuxl { static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid); + int phyid); static struct cflayer *get_up(struct cfmuxl *muxl, u16 id); struct cflayer *cfmuxl_create(void) @@ -244,7 +244,7 @@ static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt) } static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid) + int phyid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index 863dedd91bb6..6493351f39c6 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -266,8 +266,8 @@ inline u16 cfpkt_getlen(struct cfpkt *pkt) } inline u16 cfpkt_iterate(struct cfpkt *pkt, - u16 (*iter_func)(u16, void *, u16), - u16 data) + u16 (*iter_func)(u16, void *, u16), + u16 data) { /* * Don't care about the performance hit of linearizing, @@ -307,8 +307,8 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len) } struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, - struct cfpkt *addpkt, - u16 expectlen) + struct cfpkt *addpkt, + u16 expectlen) { struct sk_buff *dst = pkt_to_skb(dstpkt); struct sk_buff *add = pkt_to_skb(addpkt); diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c 
index 2b563ad04597..61d7617d9249 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -43,7 +43,7 @@ static void cfrfml_release(struct cflayer *layer) } struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info, - int mtu_size) + int mtu_size) { int tmp; struct cfrfml *this = kzalloc(sizeof(struct cfrfml), GFP_ATOMIC); @@ -69,7 +69,7 @@ struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info, } static struct cfpkt *rfm_append(struct cfrfml *rfml, char *seghead, - struct cfpkt *pkt, int *err) + struct cfpkt *pkt, int *err) { struct cfpkt *tmppkt; *err = -EPROTO; diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c index 8e68b97f13ee..ce60f06d76de 100644 --- a/net/caif/cfserl.c +++ b/net/caif/cfserl.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -29,7 +29,7 @@ struct cfserl { static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid); + int phyid); struct cflayer *cfserl_create(int instance, bool use_stx) { @@ -182,7 +182,7 @@ static int cfserl_transmit(struct cflayer *layer, struct cfpkt *newpkt) } static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid) + int phyid) { layr->up->ctrlcmd(layr->up, ctrl, phyid); } diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index ba217e90765e..353f793d1b3b 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ @@ -25,7 +25,7 @@ #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid) + int phyid) { struct cfsrvl *service = container_obj(layr); @@ -158,10 +158,9 @@ static void cfsrvl_release(struct cflayer *layer) } void cfsrvl_init(struct cfsrvl *service, - u8 channel_id, - struct dev_info *dev_info, - bool supports_flowctrl - ) + u8 channel_id, + struct dev_info *dev_info, + bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; @@ -207,8 +206,8 @@ void caif_free_client(struct cflayer *adap_layer) EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, - void (*hold)(struct cflayer *lyr), - void (*put)(struct cflayer *lyr)) + void (*hold)(struct cflayer *lyr), + void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c index 86d2dadb4b73..1728fa4471cf 100644 --- a/net/caif/cfutill.c +++ b/net/caif/cfutill.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c index 910ab0661f66..262224581efa 100644 --- a/net/caif/cfveil.c +++ b/net/caif/cfveil.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur 
Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c index a8e2a2d758a5..b3b110e8a350 100644 --- a/net/caif/cfvidl.c +++ b/net/caif/cfvidl.c @@ -1,6 +1,6 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index e597733affb8..7344a8fa1bb0 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -1,7 +1,7 @@ /* * Copyright (C) ST-Ericsson AB 2010 - * Authors: Sjur Brendeland/sjur.brandeland@stericsson.com - * Daniel Martensson / Daniel.Martensson@stericsson.com + * Authors: Sjur Brendeland + * Daniel Martensson * License terms: GNU General Public License (GPL) version 2 */ @@ -167,7 +167,7 @@ static void chnl_put(struct cflayer *lyr) } static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, - int phyid) + int phyid) { struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); pr_debug("NET flowctrl func called flow: %s\n", @@ -443,7 +443,7 @@ nla_put_failure: } static void caif_netlink_parms(struct nlattr *data[], - struct caif_connect_request *conn_req) + struct caif_connect_request *conn_req) { if (!data) { pr_warn("no params data found\n"); @@ -488,7 +488,7 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev, } static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) + struct nlattr *data[]) { struct chnl_net *caifdev; ASSERT_RTNL(); diff --git a/net/can/Kconfig b/net/can/Kconfig index 03200699d274..a15c0e0d1fc7 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -16,10 +16,11 @@ menuconfig CAN If you want CAN support you should say Y here and also to the specific driver for your controller(s) below. +if CAN + config CAN_RAW tristate "Raw CAN Protocol (raw access with CAN-ID filtering)" - depends on CAN - default N + default y ---help--- The raw CAN protocol option offers access to the CAN bus via the BSD socket API. You probably want to use the raw socket in @@ -29,8 +30,7 @@ config CAN_RAW config CAN_BCM tristate "Broadcast Manager CAN Protocol (with content filtering)" - depends on CAN - default N + default y ---help--- The Broadcast Manager offers content filtering, timeout monitoring, sending of RTR frames, and cyclic CAN messages without permanent user @@ -42,8 +42,7 @@ config CAN_BCM config CAN_GW tristate "CAN Gateway/Router (with netlink configuration)" - depends on CAN - default N + default y ---help--- The CAN Gateway/Router is used to route (and modify) CAN frames. It is based on the PF_CAN core infrastructure for msg filtering and @@ -53,3 +52,5 @@ config CAN_GW by the netlink configuration interface known e.g. from iptables. 
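The net/can/Kconfig hunk above wraps all CAN protocol options in a single if CAN ... endif block, which is why every per-option "depends on CAN" line can be dropped, and switches the defaults to y. CAN_RAW is the option the help text describes as raw access via the BSD socket API; as a concrete illustration, a minimal user-space sender might look like this (assumes a configured interface named can0; error paths shortened):

#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/can.h>
#include <linux/can/raw.h>

int main(void)
{
	struct sockaddr_can addr = { 0 };
	struct can_frame frame = { 0 };
	struct ifreq ifr;
	int s = socket(PF_CAN, SOCK_RAW, CAN_RAW);

	if (s < 0)
		return 1;

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, "can0");		/* assumed interface name */
	if (ioctl(s, SIOCGIFINDEX, &ifr) < 0)
		return 1;

	addr.can_family = AF_CAN;
	addr.can_ifindex = ifr.ifr_ifindex;
	if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	frame.can_id  = 0x123;			/* standard 11-bit id */
	frame.can_dlc = 2;
	frame.data[0] = 0xde;
	frame.data[1] = 0xad;
	write(s, &frame, sizeof(frame));	/* one frame on the wire */

	close(s);
	return 0;
}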
source "drivers/net/can/Kconfig" + +endif diff --git a/net/can/af_can.c b/net/can/af_can.c index ddac1ee2ed20..c4e50852c9f4 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -516,7 +516,6 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, { struct receiver *r = NULL; struct hlist_head *rl; - struct hlist_node *next; struct dev_rcv_lists *d; if (dev && dev->type != ARPHRD_CAN) @@ -526,7 +525,7 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, d = find_dev_rcv_lists(dev); if (!d) { - printk(KERN_ERR "BUG: receive list not found for " + pr_err("BUG: receive list not found for " "dev %s, id %03X, mask %03X\n", DNAME(dev), can_id, mask); goto out; @@ -540,23 +539,20 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, * been registered before. */ - hlist_for_each_entry_rcu(r, next, rl, list) { + hlist_for_each_entry_rcu(r, rl, list) { if (r->can_id == can_id && r->mask == mask && r->func == func && r->data == data) break; } /* - * Check for bugs in CAN protocol implementations: - * If no matching list item was found, the list cursor variable next - * will be NULL, while r will point to the last item of the list. + * Check for bugs in CAN protocol implementations using af_can.c: + * 'r' will be NULL if no matching list item was found for removal. */ - if (!next) { - printk(KERN_ERR "BUG: receive list entry not found for " - "dev %s, id %03X, mask %03X\n", - DNAME(dev), can_id, mask); - r = NULL; + if (!r) { + WARN(1, "BUG: receive list entry not found for dev %s, " + "id %03X, mask %03X\n", DNAME(dev), can_id, mask); goto out; } @@ -590,7 +586,6 @@ static inline void deliver(struct sk_buff *skb, struct receiver *r) static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) { struct receiver *r; - struct hlist_node *n; int matches = 0; struct can_frame *cf = (struct can_frame *)skb->data; canid_t can_id = cf->can_id; @@ -600,7 +595,7 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) if (can_id & CAN_ERR_FLAG) { /* check for error message frame entries only */ - hlist_for_each_entry_rcu(r, n, &d->rx[RX_ERR], list) { + hlist_for_each_entry_rcu(r, &d->rx[RX_ERR], list) { if (can_id & r->mask) { deliver(skb, r); matches++; @@ -610,13 +605,13 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) } /* check for unfiltered entries */ - hlist_for_each_entry_rcu(r, n, &d->rx[RX_ALL], list) { + hlist_for_each_entry_rcu(r, &d->rx[RX_ALL], list) { deliver(skb, r); matches++; } /* check for can_id/mask entries */ - hlist_for_each_entry_rcu(r, n, &d->rx[RX_FIL], list) { + hlist_for_each_entry_rcu(r, &d->rx[RX_FIL], list) { if ((can_id & r->mask) == r->can_id) { deliver(skb, r); matches++; @@ -624,7 +619,7 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) } /* check for inverted can_id/mask entries */ - hlist_for_each_entry_rcu(r, n, &d->rx[RX_INV], list) { + hlist_for_each_entry_rcu(r, &d->rx[RX_INV], list) { if ((can_id & r->mask) != r->can_id) { deliver(skb, r); matches++; @@ -636,7 +631,7 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) return matches; if (can_id & CAN_EFF_FLAG) { - hlist_for_each_entry_rcu(r, n, &d->rx[RX_EFF], list) { + hlist_for_each_entry_rcu(r, &d->rx[RX_EFF], list) { if (r->can_id == can_id) { deliver(skb, r); matches++; @@ -644,7 +639,7 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) } } else { can_id &= CAN_SFF_MASK; - hlist_for_each_entry_rcu(r, n, 
&d->rx_sff[can_id], list) { + hlist_for_each_entry_rcu(r, &d->rx_sff[can_id], list) { deliver(skb, r); matches++; } @@ -751,8 +746,7 @@ int can_proto_register(const struct can_proto *cp) int err = 0; if (proto < 0 || proto >= CAN_NPROTO) { - printk(KERN_ERR "can: protocol number %d out of range\n", - proto); + pr_err("can: protocol number %d out of range\n", proto); return -EINVAL; } @@ -763,8 +757,7 @@ int can_proto_register(const struct can_proto *cp) mutex_lock(&proto_tab_lock); if (proto_tab[proto]) { - printk(KERN_ERR "can: protocol %d already registered\n", - proto); + pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; } else RCU_INIT_POINTER(proto_tab[proto], cp); @@ -818,11 +811,8 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, /* create new dev_rcv_lists for this device */ d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) { - printk(KERN_ERR - "can: allocation of receive list failed\n"); + if (!d) return NOTIFY_DONE; - } BUG_ON(dev->ml_priv); dev->ml_priv = d; @@ -840,8 +830,8 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, dev->ml_priv = NULL; } } else - printk(KERN_ERR "can: notifier: receive list not " - "found for dev %s\n", dev->name); + pr_err("can: notifier: receive list not found for dev " + "%s\n", dev->name); spin_unlock(&can_rcvlists_lock); @@ -929,7 +919,7 @@ static __exit void can_exit(void) /* remove created dev_rcv_lists from still registered CAN devices */ rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv){ + if (dev->type == ARPHRD_CAN && dev->ml_priv) { struct dev_rcv_lists *d = dev->ml_priv; diff --git a/net/can/bcm.c b/net/can/bcm.c index 969b7cdff59d..8f113e6ff327 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -54,6 +54,7 @@ #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> +#include <linux/can/skb.h> #include <linux/can/bcm.h> #include <linux/slab.h> #include <net/sock.h> @@ -225,7 +226,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) static int bcm_proc_open(struct inode *inode, struct file *file) { - return single_open(file, bcm_proc_show, PDE(inode)->data); + return single_open(file, bcm_proc_show, PDE_DATA(inode)); } static const struct file_operations bcm_proc_fops = { @@ -256,10 +257,13 @@ static void bcm_can_tx(struct bcm_op *op) return; } - skb = alloc_skb(CFSIZ, gfp_any()); + skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + memcpy(skb_put(skb, CFSIZ), cf, CFSIZ); /* send with loopback */ @@ -1199,11 +1203,12 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) if (!ifindex) return -ENODEV; - skb = alloc_skb(CFSIZ, GFP_KERNEL); - + skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; + can_skb_reserve(skb); + err = memcpy_fromiovec(skb_put(skb, CFSIZ), msg->msg_iov, CFSIZ); if (err < 0) { kfree_skb(skb); @@ -1216,6 +1221,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) return -ENODEV; } + can_skb_prv(skb)->ifindex = dev->ifindex; skb->dev = dev; skb->sk = sk; err = can_send(skb, 1); /* send with loopback */ @@ -1627,7 +1633,7 @@ static void __exit bcm_module_exit(void) can_proto_unregister(&bcm_can_proto); if (proc_dir) - proc_net_remove(&init_net, "can-bcm"); + remove_proc_entry("can-bcm", init_net.proc_net); } module_init(bcm_module_init); diff --git a/net/can/gw.c b/net/can/gw.c index 
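Most of the af_can.c churn above is fallout from the tree-wide hlist iterator rework: the iterators now derive their cursor from the entry pointer itself, so the separate struct hlist_node parameter disappears. The shape of the conversion, using the receiver list as in can_rcv_filter():

struct receiver *r;

/* old style: a struct hlist_node cursor threaded through by hand
 *
 *	struct hlist_node *n;
 *	hlist_for_each_entry_rcu(r, n, rx_list, list)
 *		deliver(skb, r);
 */

/* new style: the entry pointer is the only loop variable */
hlist_for_each_entry_rcu(r, rx_list, list)
	deliver(skb, r);

A useful side effect, which can_rx_unregister() now relies on: when the new-style loop runs off the end without a break, the entry pointer is NULL instead of dangling at the last element, so a plain if (!r) detects the not-found case.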
574dda78eb0f..3ee690e8c7d3 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -42,6 +42,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> +#include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> @@ -52,19 +53,31 @@ #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> +#include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> -#define CAN_GW_VERSION "20101209" -static __initconst const char banner[] = - KERN_INFO "can: netlink gateway (rev " CAN_GW_VERSION ")\n"; +#define CAN_GW_VERSION "20130117" +#define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); -MODULE_ALIAS("can-gw"); +MODULE_ALIAS(CAN_GW_NAME); + +#define CGW_MIN_HOPS 1 +#define CGW_MAX_HOPS 6 +#define CGW_DEFAULT_HOPS 1 + +static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; +module_param(max_hops, uint, S_IRUGO); +MODULE_PARM_DESC(max_hops, + "maximum " CAN_GW_NAME " routing hops for CAN frames " + "(valid values: " __stringify(CGW_MIN_HOPS) "-" + __stringify(CGW_MAX_HOPS) " hops, " + "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static HLIST_HEAD(cgw_list); static struct notifier_block notifier; @@ -118,6 +131,7 @@ struct cgw_job { struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; + u32 deleted_frames; struct cf_mod mod; union { /* CAN frame data source */ @@ -338,15 +352,38 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) struct sk_buff *nskb; int modidx = 0; - /* do not handle already routed frames - see comment below */ - if (skb_mac_header_was_set(skb)) + /* + * Do not handle CAN frames routed more than 'max_hops' times. + * In general we should never catch this delimiter which is intended + * to cover a misconfiguration protection (e.g. circular CAN routes). + * + * The Controller Area Network controllers only accept CAN frames with + * correct CRCs - which are not visible in the controller registers. + * According to skbuff.h documentation the csum_start element for IP + * checksums is undefined/unsued when ip_summed == CHECKSUM_UNNECESSARY. + * Only CAN skbs can be processed here which already have this property. + */ + +#define cgw_hops(skb) ((skb)->csum_start) + + BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); + + if (cgw_hops(skb) >= max_hops) { + /* indicate deleted frames due to misconfiguration */ + gwj->deleted_frames++; return; + } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } + /* is sending the skb back to the incoming interface not allowed? */ + if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && + can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) + return; + /* * clone the given skb, which has not been done in can_rcv() * @@ -363,15 +400,8 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) return; } - /* - * Mark routed frames by setting some mac header length which is - * not relevant for the CAN frames located in the skb->data section. - * - * As dev->header_ops is not set in CAN netdevices no one is ever - * accessing the various header offsets in the CAN skbuffs anyway. - * E.g. using the packet socket to read CAN frames is still working. 
- */ - skb_set_mac_header(nskb, 8); + /* put the incremented hop counter in the cloned skb */ + cgw_hops(nskb) = cgw_hops(skb) + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ @@ -427,16 +457,16 @@ static int cgw_notifier(struct notifier_block *nb, if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; - struct hlist_node *n, *nx; + struct hlist_node *nx; ASSERT_RTNL(); - hlist_for_each_entry_safe(gwj, n, nx, &cgw_list, list) { + hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(gwj); - kfree(gwj); + kmem_cache_free(cgw_cache, gwj); } } } @@ -472,6 +502,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } + if (gwj->deleted_frames) { + if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) + goto cancel; + } + /* check non default settings of attributes */ if (gwj->mod.modtype.and) { @@ -540,12 +575,11 @@ cancel: static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct cgw_job *gwj = NULL; - struct hlist_node *n; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); - hlist_for_each_entry_rcu(gwj, n, &cgw_list, list) { + hlist_for_each_entry_rcu(gwj, &cgw_list, list) { if (idx < s_idx) goto cont; @@ -744,8 +778,7 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, return 0; } -static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, - void *arg) +static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct rtcanmsg *r; struct cgw_job *gwj; @@ -771,6 +804,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, gwj->handled_frames = 0; gwj->dropped_frames = 0; + gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; @@ -822,21 +856,21 @@ out: static void cgw_remove_all_jobs(void) { struct cgw_job *gwj = NULL; - struct hlist_node *n, *nx; + struct hlist_node *nx; ASSERT_RTNL(); - hlist_for_each_entry_safe(gwj, n, nx, &cgw_list, list) { + hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(gwj); - kfree(gwj); + kmem_cache_free(cgw_cache, gwj); } } -static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct cgw_job *gwj = NULL; - struct hlist_node *n, *nx; + struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; @@ -871,7 +905,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ASSERT_RTNL(); /* remove only the first matching entry */ - hlist_for_each_entry_safe(gwj, n, nx, &cgw_list, list) { + hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) { if (gwj->flags != r->flags) continue; @@ -885,7 +919,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) hlist_del(&gwj->list); cgw_unregister_filter(gwj); - kfree(gwj); + kmem_cache_free(cgw_cache, gwj); err = 0; break; } @@ -895,7 +929,11 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) static __init int cgw_module_init(void) { - printk(banner); + /* sanitize given module parameter */ + max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); + + pr_info("can: netlink gateway (rev " CAN_GW_VERSION ") max_hops=%d\n", + max_hops); cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); diff --git a/net/can/proc.c b/net/can/proc.c index ae566902d2bf..b543470c8f8b 
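Worth noting in cgw_module_init() above: max_hops is exposed read-only (S_IRUGO, no write bit), so sanitizing it once at load time with clamp_t() is sufficient; out-of-range values are silently pulled into [CGW_MIN_HOPS, CGW_MAX_HOPS] rather than failing the module load. The idiom, reduced to its essentials:

#include <linux/kernel.h>	/* clamp_t() */
#include <linux/module.h>

static unsigned int max_hops = 1;
module_param(max_hops, uint, S_IRUGO);	/* readable, not writable, via sysfs */

static int __init example_init(void)	/* hypothetical module init */
{
	/* accept the load, but force the value into a sane range */
	max_hops = clamp_t(unsigned int, max_hops, 1, 6);
	return 0;
}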
100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -195,9 +195,8 @@ static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list, struct net_device *dev) { struct receiver *r; - struct hlist_node *n; - hlist_for_each_entry_rcu(r, n, rx_list, list) { + hlist_for_each_entry_rcu(r, rx_list, list) { char *fmt = (r->can_id & CAN_EFF_FLAG)? " %-5s %08x %08x %pK %pK %8ld %s\n" : " %-5s %03x %08x %pK %pK %8ld %s\n"; @@ -379,7 +378,7 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) static int can_rcvlist_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_rcvlist_proc_show, PDE(inode)->data); + return single_open(file, can_rcvlist_proc_show, PDE_DATA(inode)); } static const struct file_operations can_rcvlist_proc_fops = { @@ -531,5 +530,5 @@ void can_remove_proc(void) can_remove_proc_readentry(CAN_PROC_RCVLIST_SFF); if (can_dir) - proc_net_remove(&init_net, "can"); + remove_proc_entry("can", init_net.proc_net); } diff --git a/net/can/raw.c b/net/can/raw.c index 5b0e3e330d97..1085e65f848e 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -50,6 +50,7 @@ #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> +#include <linux/can/skb.h> #include <linux/can/raw.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -699,17 +700,19 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, if (!dev) return -ENXIO; - skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, - &err); + skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), + msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto put_dev; + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err < 0) goto free_skb; - err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (err < 0) - goto free_skb; + + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); skb->dev = dev; skb->sk = sk; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index cc04dd667a10..e50cc69ae8ca 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -1,6 +1,6 @@ config CEPH_LIB - tristate "Ceph core library (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "Ceph core library" + depends on INET select LIBCRC32C select CRYPTO_AES select CRYPTO diff --git a/net/ceph/Makefile b/net/ceph/Makefile index e87ef435e11b..958d9856912c 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o + pagevec.o snapshot.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index b4bf4ac090f1..6b923bcaa2a4 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp if (!ac) goto out; + mutex_init(&ac->mutex); ac->negotiating = true; if (name) ac->name = name; @@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac) */ void ceph_auth_reset(struct ceph_auth_client *ac) { + mutex_lock(&ac->mutex); dout("auth_reset %p\n", ac); if (ac->ops && !ac->negotiating) ac->ops->reset(ac); ac->negotiating = true; + mutex_unlock(&ac->mutex); } int ceph_entity_name_encode(const char *name, void **p, void *end) @@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) int i, num; int ret; + mutex_lock(&ac->mutex); dout("auth_build_hello\n"); monhdr->have_version = 0; monhdr->session_mon = 
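The bcm.c and raw.c hunks above, together with the gw.c incoming-interface check earlier, all build on the new struct can_skb_priv that rides in the skb headroom ahead of the CAN frame. Every producer of CAN skbs must now over-allocate, reserve the private area, and stamp the originating ifindex. A sketch of that pattern (can_frame_to_skb() is a hypothetical helper):

#include <linux/can.h>
#include <linux/can/skb.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static struct sk_buff *can_frame_to_skb(const struct can_frame *cf,
					struct net_device *dev)
{
	struct sk_buff *skb;

	/* room for the private area plus the frame itself */
	skb = alloc_skb(sizeof(*cf) + sizeof(struct can_skb_priv),
			GFP_KERNEL);
	if (!skb)
		return NULL;

	can_skb_reserve(skb);			  /* skb_reserve() the priv area */
	can_skb_prv(skb)->ifindex = dev->ifindex; /* record the injecting device */

	memcpy(skb_put(skb, sizeof(*cf)), cf, sizeof(*cf));
	return skb;
}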
cpu_to_le16(-1); @@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) ret = ceph_entity_name_encode(ac->name, &p, end); if (ret < 0) - return ret; + goto out; ceph_decode_need(&p, end, sizeof(u64), bad); ceph_encode_64(&p, ac->global_id); ceph_encode_32(&lenp, p - lenp - sizeof(u32)); - return p - buf; + ret = p - buf; +out: + mutex_unlock(&ac->mutex); + return ret; bad: - return -ERANGE; + ret = -ERANGE; + goto out; } static int ceph_build_auth_request(struct ceph_auth_client *ac, @@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac, if (ret < 0) { pr_err("error %d building auth method %s request\n", ret, ac->ops->name); - return ret; + goto out; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - return p + ret - msg_buf; + ret = p + ret - msg_buf; +out: + return ret; } /* @@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, int result_msg_len; int ret = -EINVAL; + mutex_lock(&ac->mutex); dout("handle_auth_reply %p %p\n", p, end); ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); protocol = ceph_decode_32(&p); @@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ret = ac->ops->handle_reply(ac, result, payload, payload_end); if (ret == -EAGAIN) { - return ceph_build_auth_request(ac, reply_buf, reply_len); + ret = ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - return ret; } - return 0; -bad: - pr_err("failed to decode auth msg\n"); out: + mutex_unlock(&ac->mutex); return ret; + +bad: + pr_err("failed to decode auth msg\n"); + ret = -EINVAL; + goto out; } int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len) { + int ret = 0; + + mutex_lock(&ac->mutex); if (!ac->protocol) - return ceph_auth_build_hello(ac, msg_buf, msg_len); - BUG_ON(!ac->ops); - if (ac->ops->should_authenticate(ac)) - return ceph_build_auth_request(ac, msg_buf, msg_len); - return 0; + ret = ceph_auth_build_hello(ac, msg_buf, msg_len); + else if (ac->ops->should_authenticate(ac)) + ret = ceph_build_auth_request(ac, msg_buf, msg_len); + mutex_unlock(&ac->mutex); + return ret; } int ceph_auth_is_authenticated(struct ceph_auth_client *ac) { - if (!ac->ops) - return 0; - return ac->ops->is_authenticated(ac); + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops) + ret = ac->ops->is_authenticated(ac); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_is_authenticated); + +int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->create_authorizer) + ret = ac->ops->create_authorizer(ac, peer_type, auth); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_create_authorizer); + +void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, a); + mutex_unlock(&ac->mutex); +} +EXPORT_SYMBOL(ceph_auth_destroy_authorizer); + +int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, a); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_update_authorizer); + +int 
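The net effect of the auth.c rework above is that every entry point into the pluggable auth ops now runs under ac->mutex, with early returns rewritten as goto out so the unlock can never be skipped. The exported wrappers that follow all share one shape; stripped to its essentials (some_op is a hypothetical ops member):

static int auth_call_locked(struct ceph_auth_client *ac)
{
	int ret = 0;

	mutex_lock(&ac->mutex);
	if (ac->ops && ac->ops->some_op)	/* ops may not exist pre-negotiation */
		ret = ac->ops->some_op(ac);
	mutex_unlock(&ac->mutex);
	return ret;
}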
ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->verify_authorizer_reply) + ret = ac->ops->verify_authorizer_reply(ac, a, len); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply); + +void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + mutex_unlock(&ac->mutex); } +EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a16bf14eb027..96238ba95f2b 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, return -ENOMEM; } au->service = th->service; + au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; msg_a->struct_v = 1; @@ -555,6 +556,26 @@ static int ceph_x_create_authorizer( return 0; } +static int ceph_x_update_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth) +{ + struct ceph_x_authorizer *au; + struct ceph_x_ticket_handler *th; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = (struct ceph_x_authorizer *)auth->authorizer; + if (au->secret_id < th->secret_id) { + dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", + au->service, au->secret_id, th->secret_id); + return ceph_x_build_authorizer(ac, th, au); + } + return 0; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len) { @@ -630,7 +651,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - remove_ticket_handler(ac, th); + memset(&th->validity, 0, sizeof(th->validity)); } @@ -641,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, .create_authorizer = ceph_x_create_authorizer, + .update_authorizer = ceph_x_update_authorizer, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, .destroy_authorizer = ceph_x_destroy_authorizer, .invalidate_authorizer = ceph_x_invalidate_authorizer, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index f459e93b774f..c5a058da7ac8 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -29,6 +29,7 @@ struct ceph_x_authorizer { struct ceph_buffer *buf; unsigned int service; u64 nonce; + u64 secret_id; char reply_buf[128]; /* big enough for encrypted blob */ }; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ee71ea26777a..34b11ee8124e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -15,6 +15,8 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> #include <linux/ceph/ceph_features.h> @@ -26,6 +28,22 @@ #include "crypto.h" +/* + * Module compatibility interface. For now it doesn't do anything, + * but its existence signals a certain level of functionality. + * + * The data buffer is used to pass information both to and from + * libceph. The return value indicates whether libceph determines + * it is compatible with the caller (from another kernel module), + * given the provided data. + * + * The data pointer can be null. 
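The auth_x.c changes above give authorizers a lifecycle: each one records the secret_id of the ticket it was built from, ceph_x_update_authorizer() rebuilds it only when the ticket handler has rotated to a newer secret, and invalidation now just zeroes the ticket's validity window (forcing a refetch) instead of deleting the handler. A hedged sketch of how a consumer is expected to drive the newly exported entry points (error handling omitted):

struct ceph_auth_handshake auth;	/* caller-owned handshake state */
int ret;

/* first connection to, say, an OSD */
ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, &auth);

/* on reconnect: rebuild only if the ticket rotated underneath us */
ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, &auth);

/* peer rejected the authorizer: invalidate so the next try refetches */
ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);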
+ */ +bool libceph_compatible(void *data) +{ + return true; +} +EXPORT_SYMBOL(libceph_compatible); /* * find filename portion of a path (/foo/bar/baz -> baz) @@ -292,6 +310,9 @@ ceph_parse_options(char *options, const char *dev_name, int err = -ENOMEM; substring_t argstr[MAX_OPT_ARGS]; + if (current->nsproxy->net_ns != &init_net) + return ERR_PTR(-EINVAL); + opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) return ERR_PTR(-ENOMEM); @@ -585,13 +606,17 @@ static int __init init_ceph_lib(void) if (ret < 0) goto out_crypto; - pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", - CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, - CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, - CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); + ret = ceph_osdc_setup(); + if (ret < 0) + goto out_msgr; + + pr_info("loaded (mon/osd proto %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); return 0; +out_msgr: + ceph_msgr_exit(); out_crypto: ceph_crypto_shutdown(); out_debugfs: @@ -603,6 +628,7 @@ out: static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); ceph_debugfs_cleanup(); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 3fbda04de29c..1348df96fe15 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op) switch (op) { case CEPH_OSD_OP_READ: return "read"; case CEPH_OSD_OP_STAT: return "stat"; + case CEPH_OSD_OP_MAPEXT: return "mapext"; + case CEPH_OSD_OP_SPARSE_READ: return "sparse-read"; + case CEPH_OSD_OP_NOTIFY: return "notify"; + case CEPH_OSD_OP_NOTIFY_ACK: return "notify-ack"; + case CEPH_OSD_OP_ASSERT_VER: return "assert-version"; case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; + case CEPH_OSD_OP_CREATE: return "create"; case CEPH_OSD_OP_WRITE: return "write"; case CEPH_OSD_OP_DELETE: return "delete"; case CEPH_OSD_OP_TRUNCATE: return "truncate"; @@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_TMAPUP: return "tmapup"; case CEPH_OSD_OP_TMAPGET: return "tmapget"; case CEPH_OSD_OP_TMAPPUT: return "tmapput"; + case CEPH_OSD_OP_WATCH: return "watch"; + + case CEPH_OSD_OP_CLONERANGE: return "clonerange"; + case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version"; + case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr"; case CEPH_OSD_OP_GETXATTR: return "getxattr"; case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; @@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; case CEPH_OSD_OP_SCRUB: return "scrub"; + case CEPH_OSD_OP_SCRUB_RESERVE: return "scrub-reserve"; + case CEPH_OSD_OP_SCRUB_UNRESERVE: return "scrub-unreserve"; + case CEPH_OSD_OP_SCRUB_STOP: return "scrub-stop"; + case CEPH_OSD_OP_SCRUB_MAP: return "scrub-map"; case CEPH_OSD_OP_WRLOCK: return "wrlock"; case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; @@ -64,10 +79,34 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_CALL: return "call"; case CEPH_OSD_OP_PGLS: return "pgls"; + case CEPH_OSD_OP_PGLS_FILTER: return "pgls-filter"; + case CEPH_OSD_OP_OMAPGETKEYS: return "omap-get-keys"; + case CEPH_OSD_OP_OMAPGETVALS: return "omap-get-vals"; + case CEPH_OSD_OP_OMAPGETHEADER: return "omap-get-header"; + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: return "omap-get-vals-by-keys"; + case CEPH_OSD_OP_OMAPSETVALS: return "omap-set-vals"; + case CEPH_OSD_OP_OMAPSETHEADER: return "omap-set-header"; + case CEPH_OSD_OP_OMAPCLEAR: return "omap-clear"; + 
case CEPH_OSD_OP_OMAPRMKEYS: return "omap-rm-keys"; } return "???"; } +const char *ceph_osd_state_name(int s) +{ + switch (s) { + case CEPH_OSD_EXISTS: + return "exists"; + case CEPH_OSD_UP: + return "up"; + case CEPH_OSD_AUTOOUT: + return "autoout"; + case CEPH_OSD_NEW: + return "new"; + default: + return "???"; + } +} const char *ceph_pool_op_name(int op) { diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 35fce755ce10..cbd06a91941c 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in * @outpos: our position in that vector * @firstn: true if choosing "first n" items, false if choosing "indep" * @recurse_to_leaf: true if we want one device under each item of given type + * @descend_once: true if we should only try one descent before giving up * @out2: second output vector for leaf items (if @recurse_to_leaf) */ static int crush_choose(const struct crush_map *map, @@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map, int x, int numrep, int type, int *out, int outpos, int firstn, int recurse_to_leaf, - int *out2) + int descend_once, int *out2) { int rep; unsigned int ftotal, flocal; @@ -391,7 +392,7 @@ static int crush_choose(const struct crush_map *map, } reject = 0; - if (recurse_to_leaf) { + if (!collide && recurse_to_leaf) { if (item < 0) { if (crush_choose(map, map->buckets[-1-item], @@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map, x, outpos+1, 0, out2, outpos, firstn, 0, + map->chooseleaf_descend_once, NULL) <= outpos) /* didn't get leaf */ reject = 1; @@ -422,7 +424,10 @@ reject: ftotal++; flocal++; - if (collide && flocal <= map->choose_local_tries) + if (reject && descend_once) + /* let outer call try again */ + skip_rep = 1; + else if (collide && flocal <= map->choose_local_tries) /* retry locally a few times */ retry_bucket = 1; else if (map->choose_local_fallback_tries > 0 && @@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map, int i, j; int numrep; int firstn; + const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map, curstep->arg2, o+osize, j, firstn, - recurse_to_leaf, c+osize); + recurse_to_leaf, + descend_once, c+osize); } if (recurse_to_leaf) diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index af14cb425164..6e7a236525b6 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -423,7 +423,8 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, } } -int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) +static int ceph_key_instantiate(struct key *key, + struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; size_t datalen = prep->datalen; @@ -458,12 +459,12 @@ err: return ret; } -int ceph_key_match(const struct key *key, const void *description) +static int ceph_key_match(const struct key *key, const void *description) { return strcmp(key->description, description) == 0; } -void ceph_key_destroy(struct key *key) { +static void ceph_key_destroy(struct key *key) { struct ceph_crypto_key *ckey = key->payload.data; ceph_crypto_key_destroy(ckey); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 38b5dc1823d4..83661cdc0766 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p) for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = 
rb_next(n)) { struct ceph_pg_pool_info *pool = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", - pool->id, pool->v.pg_num, pool->pg_num_mask, - pool->v.lpg_num, pool->lpg_num_mask); + seq_printf(s, "pg_pool %llu pg_num %d / %d\n", + (unsigned long long)pool->id, pool->pg_num, + pool->pg_num_mask); } for (i = 0; i < client->osdc.osdmap->max_osd; i++) { struct ceph_entity_addr *addr = @@ -123,26 +123,16 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - int num_ops; - int opcode, olen; - int i; + unsigned int i; + int opcode; req = rb_entry(p, struct ceph_osd_request, r_node); - seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, + seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, req->r_osd ? req->r_osd->o_osd : -1, - le32_to_cpu(req->r_pgid.pool), - le16_to_cpu(req->r_pgid.ps)); + req->r_pgid.pool, req->r_pgid.seed); - head = req->r_request->front.iov_base; - op = (void *)(head + 1); - - num_ops = le16_to_cpu(head->num_ops); - olen = le32_to_cpu(head->object_len); - seq_printf(s, "%.*s", olen, - (const char *)(head->ops + num_ops)); + seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", @@ -151,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp) else seq_printf(s, "\t"); - for (i = 0; i < num_ops; i++) { - opcode = le16_to_cpu(op->op); + for (i = 0; i < req->r_num_ops; i++) { + opcode = req->r_ops[i].op; seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); - op++; } seq_printf(s, "\n"); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5ccf87ed8d68..eb0a46a49bd4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -9,8 +9,9 @@ #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> +#ifdef CONFIG_BLOCK #include <linux/bio.h> -#include <linux/blkdev.h> +#endif /* CONFIG_BLOCK */ #include <linux/dns_resolver.h> #include <net/tcp.h> @@ -20,6 +21,9 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> +#define list_entry_next(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. 
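messenger.c now defines a local list_entry_next() helper, used by the data-cursor code later in the file: given an entry embedded in a list, it names the containing struct of the following node. A minimal illustration with a throwaway type:

#include <linux/list.h>

struct item {
	int value;
	struct list_head link;
};

#define list_entry_next(pos, member) \
	list_entry(pos->member.next, typeof(*pos), member)

/* given a 'struct item *cur' that is not the last entry:
 *
 *	struct item *nxt = list_entry_next(cur, link);
 *
 * is equivalent to list_entry(cur->link.next, struct item, link).
 * Like the raw form, it does not check for the list head, so the
 * caller must know another entry exists. */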
The messenger provides ordered and reliable @@ -97,6 +101,62 @@ #define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ #define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ +static bool con_flag_valid(unsigned long con_flag) +{ + switch (con_flag) { + case CON_FLAG_LOSSYTX: + case CON_FLAG_KEEPALIVE_PENDING: + case CON_FLAG_WRITE_PENDING: + case CON_FLAG_SOCK_CLOSED: + case CON_FLAG_BACKOFF: + return true; + default: + return false; + } +} + +static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + clear_bit(con_flag, &con->flags); +} + +static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + set_bit(con_flag, &con->flags); +} + +static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + return test_bit(con_flag, &con->flags); +} + +static bool con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + return test_and_clear_bit(con_flag, &con->flags); +} + +static bool con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + return test_and_set_bit(con_flag, &con->flags); +} + +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache *ceph_msg_cache; +static struct kmem_cache *ceph_msg_data_cache; + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -114,7 +174,7 @@ static struct lock_class_key socket_class; static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); -static void ceph_fault(struct ceph_connection *con); +static void con_fault(struct ceph_connection *con); /* * Nicely render a sockaddr as a string. An array of formatted @@ -171,13 +231,50 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ static struct workqueue_struct *ceph_msgr_wq; -void _ceph_msgr_exit(void) +static int ceph_msgr_slab_init(void) +{ + BUG_ON(ceph_msg_cache); + ceph_msg_cache = kmem_cache_create("ceph_msg", + sizeof (struct ceph_msg), + __alignof__(struct ceph_msg), 0, NULL); + + if (!ceph_msg_cache) + return -ENOMEM; + + BUG_ON(ceph_msg_data_cache); + ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", + sizeof (struct ceph_msg_data), + __alignof__(struct ceph_msg_data), + 0, NULL); + if (ceph_msg_data_cache) + return 0; + + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; + + return -ENOMEM; +} + +static void ceph_msgr_slab_exit(void) +{ + BUG_ON(!ceph_msg_data_cache); + kmem_cache_destroy(ceph_msg_data_cache); + ceph_msg_data_cache = NULL; + + BUG_ON(!ceph_msg_cache); + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; +} + +static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { destroy_workqueue(ceph_msgr_wq); ceph_msgr_wq = NULL; } + ceph_msgr_slab_exit(); + BUG_ON(zero_page == NULL); kunmap(zero_page); page_cache_release(zero_page); @@ -190,6 +287,9 @@ int ceph_msgr_init(void) zero_page = ZERO_PAGE(0); page_cache_get(zero_page); + if (ceph_msgr_slab_init()) + return -ENOMEM; + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (ceph_msgr_wq) return 0; @@ -308,7 +408,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). 
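ceph_msgr_slab_init()/ceph_msgr_slab_exit() above add dedicated kmem caches for the messenger's two hottest allocations, created front to back and torn down back to front; on partial failure the already-created cache is destroyed before returning -ENOMEM. For a single cache the pattern reduces to this sketch (struct foo is hypothetical):

static struct kmem_cache *foo_cache;

static int foo_slab_init(void)
{
	foo_cache = kmem_cache_create("foo", sizeof(struct foo),
				      __alignof__(struct foo), 0, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_slab_exit(void)
{
	if (foo_cache) {
		kmem_cache_destroy(foo_cache);
		foo_cache = NULL;
	}
}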
*/ - if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { + if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -333,7 +433,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + con_flag_set(con, CON_FLAG_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -419,6 +519,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) return r; } +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + void *kaddr; + int ret; + + BUG_ON(page_offset + length > PAGE_SIZE); + + kaddr = kmap(page); + BUG_ON(!kaddr); + ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); + kunmap(page); + + return ret; +} + /* * write something. @more is true if caller will be sending more data * shortly. @@ -441,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, } static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) + int offset, size_t size, bool more) { int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); int ret; @@ -474,7 +590,7 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. */ - clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + con_flag_clear(con, CON_FLAG_SOCK_CLOSED); con_sock_state_closed(con); return rc; @@ -538,11 +654,10 @@ void ceph_con_close(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr)); con->state = CON_STATE_CLOSED; - clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ - clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); - clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); - clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); - clear_bit(CON_FLAG_BACKOFF, &con->flags); + con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ + con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); + con_flag_clear(con, CON_FLAG_WRITE_PENDING); + con_flag_clear(con, CON_FLAG_BACKOFF); reset_connection(con); con->peer_global_seq = 0; @@ -646,50 +761,397 @@ static void con_out_kvec_add(struct ceph_connection *con, } #ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) + +/* + * For a bio data item, a piece is whatever remains of the next + * entry in the current bio iovec, or the first entry in the next + * bio in the list. 
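The bio cursor above is the first implementation of a small abstraction that recurs through the rest of the messenger rework: a data cursor with three operations. init binds the cursor to a data item and clamps it to the requested length; next reports the current contiguous piece (page, offset, length); advance consumes bytes and reports whether a new piece was started. A self-contained toy over a flat buffer with fixed-size pieces makes the contract concrete (illustrative only, not kernel code):

#include <stdbool.h>
#include <stddef.h>

struct cursor {
	const char *buf;
	size_t resid;	/* bytes not yet consumed */
	size_t pos;	/* current offset into buf */
	size_t piece;	/* fixed piece size for the toy */
};

static void cursor_init(struct cursor *c, const char *buf,
			size_t len, size_t piece)
{
	c->buf = buf;
	c->resid = len;
	c->pos = 0;
	c->piece = piece;
}

/* current piece: its start plus its length (short final piece allowed) */
static const char *cursor_next(const struct cursor *c, size_t *len)
{
	size_t in_piece = c->piece - (c->pos % c->piece);

	*len = c->resid < in_piece ? c->resid : in_piece;
	return c->buf + c->pos;
}

/* consume bytes already returned by cursor_next(); true when the
 * cursor has moved on to a fresh piece and data remains */
static bool cursor_advance(struct cursor *c, size_t bytes)
{
	c->resid -= bytes;
	c->pos += bytes;
	return c->resid && (c->pos % c->piece) == 0;
}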
+ */ +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) { - if (!bio) { - *iter = NULL; - *seg = 0; - return; + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = data->bio; + BUG_ON(!bio); + BUG_ON(!bio->bi_vcnt); + + cursor->resid = min(length, data->bio_length); + cursor->bio = bio; + cursor->vector_index = 0; + cursor->vector_offset = 0; + cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; +} + +static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, + size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + + bio_vec = &bio->bi_io_vec[index]; + BUG_ON(cursor->vector_offset >= bio_vec->bv_len); + *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + BUG_ON(*page_offset >= PAGE_SIZE); + if (cursor->last_piece) /* pagelist offset is always 0 */ + *length = cursor->resid; + else + *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + BUG_ON(*length > cursor->resid); + BUG_ON(*page_offset + *length > PAGE_SIZE); + + return bio_vec->bv_page; +} + +static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = &bio->bi_io_vec[index]; + + /* Advance the cursor offset */ + + BUG_ON(cursor->resid < bytes); + cursor->resid -= bytes; + cursor->vector_offset += bytes; + if (cursor->vector_offset < bio_vec->bv_len) + return false; /* more bytes to process in this segment */ + BUG_ON(cursor->vector_offset != bio_vec->bv_len); + + /* Move on to the next segment, and possibly the next bio */ + + if (++index == (unsigned int) bio->bi_vcnt) { + bio = bio->bi_next; + index = 0; + } + cursor->bio = bio; + cursor->vector_index = index; + cursor->vector_offset = 0; + + if (!cursor->last_piece) { + BUG_ON(!cursor->resid); + BUG_ON(!bio); + /* A short read is OK, so use <= rather than == */ + if (cursor->resid <= bio->bi_io_vec[index].bv_len) + cursor->last_piece = true; } - *iter = bio; - *seg = bio->bi_idx; + + return true; } +#endif /* CONFIG_BLOCK */ -static void iter_bio_next(struct bio **bio_iter, int *seg) +/* + * For a page array, a piece comes from the first page in the array + * that has not already been fully consumed. 
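The bio cursor above carries three pieces of state: the current bio, the index into its bi_io_vec array, and an offset within that vector entry; ceph_msg_data_bio_advance() steps them in that order. Stripped of the kernel types, the walk looks like this (struct seg and struct seglist are simplified stand-ins for struct bio_vec and a chained struct bio, not kernel structures):

#include <stdbool.h>
#include <stddef.h>

struct seg { size_t len; };		/* stands in for struct bio_vec */

struct seglist {			/* stands in for struct bio */
	struct seg *segs;
	unsigned int nsegs;
	struct seglist *next;		/* bi_next */
};

struct bio_cursor {
	struct seglist *list;
	unsigned int index;		/* vector_index */
	size_t offset;			/* vector_offset */
};

/* Consume bytes; returns true when a new segment becomes current. */
static bool bio_cursor_advance(struct bio_cursor *cur, size_t bytes)
{
	cur->offset += bytes;
	if (cur->offset < cur->list->segs[cur->index].len)
		return false;		/* still inside the same segment */

	/* step to the next segment, and possibly the next list element */
	if (++cur->index == cur->list->nsegs) {
		cur->list = cur->list->next;
		cur->index = 0;
	}
	cur->offset = 0;
	return true;
}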
+ */ +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) { - if (*bio_iter == NULL) - return; + struct ceph_msg_data *data = cursor->data; + int page_count; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + BUG_ON(!data->pages); + BUG_ON(!data->length); - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); + cursor->resid = min(length, data->length); + page_count = calc_pages_for(data->alignment, (u64)data->length); + cursor->page_offset = data->alignment & ~PAGE_MASK; + cursor->page_index = 0; + BUG_ON(page_count > (int)USHRT_MAX); + cursor->page_count = (unsigned short)page_count; + BUG_ON(length > SIZE_MAX - cursor->page_offset); + cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; } -#endif -static void prepare_write_message_data(struct ceph_connection *con) +static struct page * +ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) { - struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data *data = cursor->data; - BUG_ON(!msg); - BUG_ON(!msg->hdr.data_len); + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_index >= cursor->page_count); + BUG_ON(cursor->page_offset >= PAGE_SIZE); - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (msg->pages) - con->out_msg_pos.page_pos = msg->page_alignment; + *page_offset = cursor->page_offset; + if (cursor->last_piece) + *length = cursor->resid; else - con->out_msg_pos.page_pos = 0; + *length = PAGE_SIZE - *page_offset; + + return data->pages[cursor->page_index]; +} + +static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); + + /* Advance the cursor page offset */ + + cursor->resid -= bytes; + cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; + if (!bytes || cursor->page_offset) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page; offset is already at 0 */ + + BUG_ON(cursor->page_index >= cursor->page_count); + cursor->page_index++; + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * For a pagelist, a piece is whatever remains to be consumed in the + * first page in the list, or the front of the next page. 
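ceph_msg_data_pages_cursor_init() derives its bookkeeping from two small calculations: the offset into the first page is alignment & ~PAGE_MASK (alignment mod PAGE_SIZE), and the page count comes from calc_pages_for(). A standalone restatement of that arithmetic with a worked value; pages_for() here is spelled out for illustration, the real helper lives in the ceph headers:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Pages spanned by len bytes that start at in-page offset off. */
static unsigned long pages_for(unsigned long off, unsigned long len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (off >> PAGE_SHIFT);
}

int main(void)
{
	/*
	 * 10 KB starting 512 bytes into a page covers three pages:
	 * [512,4096) + [4096,8192) + [8192,10752).
	 */
	printf("%lu\n", pages_for(512, 10 * 1024));	/* prints 3 */
	return 0;
}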
+ */ +static void +ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + struct page *page; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + if (!length) + return; /* pagelist can be assigned but empty */ + + BUG_ON(list_empty(&pagelist->head)); + page = list_first_entry(&pagelist->head, struct page, lru); + + cursor->resid = min(length, pagelist->length); + cursor->page = page; + cursor->offset = 0; + cursor->last_piece = cursor->resid <= PAGE_SIZE; +} + +static struct page * +ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(!cursor->page); + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + + /* offset of first page in pagelist is always 0 */ + *page_offset = cursor->offset & ~PAGE_MASK; + if (cursor->last_piece) + *length = cursor->resid; + else + *length = PAGE_SIZE - *page_offset; + + return cursor->page; +} + +static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); + + /* Advance the cursor offset */ + + cursor->resid -= bytes; + cursor->offset += bytes; + /* offset of first page in pagelist is always 0 */ + if (!bytes || cursor->offset & ~PAGE_MASK) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page */ + + BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); + cursor->page = list_entry_next(cursor->page, lru); + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * Message data is handled (sent or received) in pieces, where each + * piece resides on a single page. The network layer might not + * consume an entire piece at once. A data item's cursor keeps + * track of which piece is next to process and how much remains to + * be processed in that piece. It also tracks whether the current + * piece is the last one in the data item. 
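The pagelist cursor keeps a single running byte offset and re-derives the in-page offset with offset & ~PAGE_MASK, which for a power-of-two PAGE_SIZE is exactly offset % PAGE_SIZE. A tiny self-checking demonstration of that identity (constants hard-coded for the common 4 KB page):

#include <assert.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	for (unsigned long off = 0; off < 3 * PAGE_SIZE; off += 977)
		assert((off & ~PAGE_MASK) == off % PAGE_SIZE);
	return 0;
}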
+ */ +static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) +{ + size_t length = cursor->total_resid; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + ceph_msg_data_pagelist_cursor_init(cursor, length); + break; + case CEPH_MSG_DATA_PAGES: + ceph_msg_data_pages_cursor_init(cursor, length); + break; #ifdef CONFIG_BLOCK - if (msg->bio) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ + case CEPH_MSG_DATA_BIO: + ceph_msg_data_bio_cursor_init(cursor, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + /* BUG(); */ + break; + } + cursor->need_crc = true; +} + +static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +{ + struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data *data; + + BUG_ON(!length); + BUG_ON(length > msg->data_length); + BUG_ON(list_empty(&msg->data)); + + cursor->data_head = &msg->data; + cursor->total_resid = length; + data = list_first_entry(&msg->data, struct ceph_msg_data, links); + cursor->data = data; + + __ceph_msg_data_cursor_init(cursor); +} + +/* + * Return the page containing the next piece to process for a given + * data item, and supply the page offset and length of that piece. + * Indicate whether this is the last piece in this data item. + */ +static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) +{ + struct page *page; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + page = ceph_msg_data_pagelist_next(cursor, page_offset, length); + break; + case CEPH_MSG_DATA_PAGES: + page = ceph_msg_data_pages_next(cursor, page_offset, length); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + page = ceph_msg_data_bio_next(cursor, page_offset, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + page = NULL; + break; + } + BUG_ON(!page); + BUG_ON(*page_offset + *length > PAGE_SIZE); + BUG_ON(!*length); + if (last_piece) + *last_piece = cursor->last_piece; + + return page; +} + +/* + * Returns true if the result moves the cursor on to the next piece + * of the data item. 
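ceph_msg_data_next() above and ceph_msg_data_advance() just below form a peek/consume iterator: look at the current piece, hand it to the socket, then advance by however many bytes were actually taken. The shape of that protocol, reduced to a runnable toy over a byte buffer; a sink that takes at most three bytes per call models short writes:

#include <stdio.h>

struct cursor { const char *buf; size_t pos, resid; };

/* Peek: the next contiguous piece, at most 4 bytes in this toy. */
static const char *cursor_next(const struct cursor *c, size_t *len)
{
	*len = c->resid < 4 ? c->resid : 4;
	return c->buf + c->pos;
}

/* Consume: account only for what the sink actually accepted. */
static void cursor_advance(struct cursor *c, size_t bytes)
{
	c->pos += bytes;
	c->resid -= bytes;
}

int main(void)
{
	struct cursor c = { "hello, messenger", 0, 16 };

	while (c.resid) {
		size_t len, taken;
		const char *p = cursor_next(&c, &len);

		taken = len < 3 ? len : 3;	/* model a short write */
		fwrite(p, 1, taken, stdout);
		cursor_advance(&c, taken);
	}
	putchar('\n');
	return 0;
}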
+ */ +static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + bool new_piece; + + BUG_ON(bytes > cursor->resid); + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); + break; + case CEPH_MSG_DATA_PAGES: + new_piece = ceph_msg_data_pages_advance(cursor, bytes); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + new_piece = ceph_msg_data_bio_advance(cursor, bytes); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + BUG(); + break; + } + cursor->total_resid -= bytes; + + if (!cursor->resid && cursor->total_resid) { + WARN_ON(!cursor->last_piece); + BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); + cursor->data = list_entry_next(cursor->data, links); + __ceph_msg_data_cursor_init(cursor); + new_piece = true; + } + cursor->need_crc = new_piece; + + return new_piece; +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + BUG_ON(!msg); + BUG_ON(!data_len); + + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(msg, (size_t)data_len); } /* @@ -752,16 +1214,12 @@ static void prepare_write_message(struct ceph_connection *con) m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; } -#ifdef CONFIG_BLOCK - else - m->bio_iter = NULL; -#endif + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), - m->nr_pages); + m->data_length); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -792,13 +1250,15 @@ static void prepare_write_message(struct ceph_connection *con) /* is there a data payload? */ con->out_msg->footer.data_crc = 0; - if (m->hdr.data_len) - prepare_write_message_data(con); - else + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->out_more = 1; /* data + footer will follow */ + } else { /* no, queue up footer too and be done */ prepare_write_message_footer(con); + } - set_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con_flag_set(con, CON_FLAG_WRITE_PENDING); } /* @@ -819,7 +1279,25 @@ static void prepare_write_ack(struct ceph_connection *con) &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. 
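ceph_msg_data_advance() adds one wrinkle beyond the per-item cursors: when the current data item runs dry (resid == 0) but the message as a whole still has bytes left (total_resid != 0), the cursor hops to the next item on the list and re-initializes itself. The hop in miniature, with toy types and the list reduced to a next pointer:

#include <stdbool.h>
#include <stddef.h>

struct item { size_t len; struct item *next; };

struct mcursor {
	struct item *item;
	size_t resid;		/* left in the current item */
	size_t total_resid;	/* left in the whole message */
};

/* Returns true when the consumer should restart on a fresh piece. */
static bool mcursor_advance(struct mcursor *c, size_t bytes)
{
	bool new_piece = false;

	c->resid -= bytes;
	c->total_resid -= bytes;

	if (!c->resid && c->total_resid) {
		c->item = c->item->next;	/* hop to the next data item */
		c->resid = c->item->len;	/* re-init for the new item */
		new_piece = true;
	}
	return new_piece;
}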
*/ - set_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con_flag_set(con, CON_FLAG_WRITE_PENDING); } /* @@ -830,7 +1308,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); - set_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con_flag_set(con, CON_FLAG_WRITE_PENDING); } /* @@ -873,7 +1351,7 @@ static void prepare_write_banner(struct ceph_connection *con) &con->msgr->my_enc_addr); con->out_more = 0; - set_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con_flag_set(con, CON_FLAG_WRITE_PENDING); } static int prepare_write_connect(struct ceph_connection *con) @@ -923,7 +1401,7 @@ static int prepare_write_connect(struct ceph_connection *con) auth->authorizer_buf); con->out_more = 0; - set_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con_flag_set(con, CON_FLAG_WRITE_PENDING); return 0; } @@ -971,35 +1449,19 @@ out: return ret; /* done! */ } -static void out_msg_pos_next(struct ceph_connection *con, struct page *page, - size_t len, size_t sent, bool in_trail) +static u32 ceph_crc32c_page(u32 crc, struct page *page, + unsigned int page_offset, + unsigned int length) { - struct ceph_msg *msg = con->out_msg; - - BUG_ON(!msg); - BUG_ON(!sent); + char *kaddr; - con->out_msg_pos.data_pos += sent; - con->out_msg_pos.page_pos += sent; - if (sent < len) - return; + kaddr = kmap(page); + BUG_ON(kaddr == NULL); + crc = crc32c(crc, kaddr + page_offset, length); + kunmap(page); - BUG_ON(sent != len); - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif + return crc; } - /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -1007,21 +1469,17 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_msg_pages(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; - unsigned int data_len = le32_to_cpu(msg->hdr.data_len); - size_t len; + struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !con->msgr->nocrc; - int ret; - int total_max_write; - bool in_trail = false; - const size_t trail_len = (msg->trail ? msg->trail->length : 0); - const size_t trail_off = data_len - trail_len; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); - dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, msg, con->out_msg_pos.page, msg->nr_pages, - con->out_msg_pos.page_pos); + if (list_empty(&msg->data)) + return -EINVAL; /* * Iterate through each page that contains data to be @@ -1031,72 +1489,41 @@ static int write_partial_msg_pages(struct ceph_connection *con) * need to map the page. 
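ceph_crc32c_page() lets the data checksum accumulate piece by piece: the CRC computed so far is passed back in as the seed for the next chunk, so the result over split input equals the CRC over the whole. A bit-at-a-time CRC32C (Castagnoli polynomial, reflected form 0x82f63b78) is enough to demonstrate the chaining property; production code would use a table-driven or hardware implementation:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Bitwise CRC32C with the usual pre/post inversion. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82f63b78U & -(crc & 1U));
	}
	return ~crc;
}

int main(void)
{
	const char msg[] = "0123456789";
	uint32_t whole = crc32c(0, msg, 10);
	/* the same bytes in two pieces, chaining the intermediate crc */
	uint32_t split = crc32c(crc32c(0, msg, 4), msg + 4, 6);

	assert(whole == split);
	return 0;
}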
If we have no pages, they have * been revoked, so use the zero page. */ - while (data_len > con->out_msg_pos.data_pos) { - struct page *page = NULL; - int max_write = PAGE_SIZE; - int bio_offset = 0; - - in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; - if (!in_trail) - total_max_write = trail_off - con->out_msg_pos.data_pos; - - if (in_trail) { - total_max_write = data_len - con->out_msg_pos.data_pos; - - page = list_first_entry(&msg->trail->head, - struct page, lru); - } else if (msg->pages) { - page = msg->pages[con->out_msg_pos.page]; - } else if (msg->pagelist) { - page = list_first_entry(&msg->pagelist->head, - struct page, lru); -#ifdef CONFIG_BLOCK - } else if (msg->bio) { - struct bio_vec *bv; + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->resid) { + struct page *page; + size_t page_offset; + size_t length; + bool last_piece; + bool need_crc; + int ret; + + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + &last_piece); + ret = ceph_tcp_sendpage(con->sock, page, page_offset, + length, last_piece); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); - bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); - page = bv->bv_page; - bio_offset = bv->bv_offset; - max_write = bv->bv_len; -#endif - } else { - page = zero_page; - } - len = min_t(int, max_write - con->out_msg_pos.page_pos, - total_max_write); - - if (do_datacrc && !con->out_msg_pos.did_page_crc) { - void *base; - u32 crc = le32_to_cpu(msg->footer.data_crc); - char *kaddr; - - kaddr = kmap(page); - BUG_ON(kaddr == NULL); - base = kaddr + con->out_msg_pos.page_pos + bio_offset; - crc = crc32c(crc, base, len); - kunmap(page); - msg->footer.data_crc = cpu_to_le32(crc); - con->out_msg_pos.did_page_crc = true; + return ret; } - ret = ceph_tcp_sendpage(con->sock, page, - con->out_msg_pos.page_pos + bio_offset, - len, 1); - if (ret <= 0) - goto out; - - out_msg_pos_next(con, page, len, (size_t) ret, in_trail); + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret); } - dout("write_partial_msg_pages %p msg %p done\n", con, msg); + dout("%s %p msg %p done\n", __func__, con, msg); /* prepare and queue up footer, too */ - if (!do_datacrc) + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); prepare_write_message_footer(con); - ret = 1; -out: - return ret; + + return 1; /* must return > 0 to indicate success */ } /* @@ -1109,7 +1536,7 @@ static int write_partial_skip(struct ceph_connection *con) while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); if (ret <= 0) goto out; con->out_skip -= ret; @@ -1140,6 +1567,13 @@ static void prepare_read_ack(struct ceph_connection *con) con->in_base_pos = 0; } +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_SEQ; +} + static void prepare_read_tag(struct ceph_connection *con) { dout("prepare_read_tag %p\n", con); @@ -1546,7 +1980,6 @@ static int process_connect(struct ceph_connection *con) con->error_msg = "connect authorization failure"; return -1; } - con->auth_retry = 1; con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -1617,6 +2050,7 @@ static int 
process_connect(struct ceph_connection *con) prepare_read_connect(con); break; + case CEPH_MSGR_TAG_SEQ: case CEPH_MSGR_TAG_READY: if (req_feat & ~server_feat) { pr_err("%s%lld %s protocol feature mismatch," @@ -1631,7 +2065,7 @@ static int process_connect(struct ceph_connection *con) WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_OPEN; - + con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; @@ -1643,11 +2077,16 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - set_bit(CON_FLAG_LOSSYTX, &con->flags); + con_flag_set(con, CON_FLAG_LOSSYTX); con->delay = 0; /* reset backoff memory */ - prepare_read_tag(con); + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } break; case CEPH_MSGR_TAG_WAIT: @@ -1681,7 +2120,6 @@ static int read_partial_ack(struct ceph_connection *con) return read_partial(con, end, size, &con->in_temp_ack); } - /* * We can finally discard anything that's been acked. */ @@ -1706,8 +2144,6 @@ static void process_ack(struct ceph_connection *con) } - - static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) @@ -1731,77 +2167,49 @@ static int read_partial_message_section(struct ceph_connection *con, return 1; } -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message_pages(struct ceph_connection *con, - struct page **pages, - unsigned int data_len, bool do_datacrc) +static int read_partial_msg_data(struct ceph_connection *con) { - void *p; + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + const bool do_datacrc = !con->msgr->nocrc; + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; int ret; - int left; - - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); - /* (page) data */ - BUG_ON(pages == NULL); - p = kmap(pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(pages[con->in_msg_pos.page]); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; - } - - return ret; -} -#ifdef CONFIG_BLOCK -static int read_partial_message_bio(struct ceph_connection *con, - struct bio **bio_iter, int *bio_seg, - unsigned int data_len, bool do_datacrc) -{ - struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); - void *p; - int ret, left; + BUG_ON(!msg); + if (list_empty(&msg->data)) + return -EIO; - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(bv->bv_len - con->in_msg_pos.page_pos)); + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->resid) { + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; - p = kmap(bv->bv_page) + bv->bv_offset; + return ret; + } - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && 
do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(bv->bv_page); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == bv->bv_len) { - con->in_msg_pos.page_pos = 0; - iter_bio_next(bio_iter, bio_seg); + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret); } + if (do_datacrc) + con->in_data_crc = crc; - return ret; + return 1; /* must return > 0 to indicate success */ } -#endif /* * read (part of) a message. */ +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); + static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; @@ -1834,7 +2242,7 @@ static int read_partial_message(struct ceph_connection *con) if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_DATA_LEN) + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) return -EIO; data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) @@ -1863,14 +2271,22 @@ static int read_partial_message(struct ceph_connection *con) int skip = 0; dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - con->in_hdr.front_len, con->in_hdr.data_len); + front_len, data_len); ret = ceph_con_in_msg_alloc(con, &skip); if (ret < 0) return ret; + + BUG_ON(!con->in_msg ^ skip); + if (con->in_msg && data_len > con->in_msg->data_length) { + pr_warning("%s skipping long message (%u > %zd)\n", + __func__, data_len, con->in_msg->data_length); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + skip = 1; + } if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); - BUG_ON(con->in_msg); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1885,17 +2301,10 @@ static int read_partial_message(struct ceph_connection *con) if (m->middle) m->middle->vec.iov_len = 0; - con->in_msg_pos.page = 0; - if (m->pages) - con->in_msg_pos.page_pos = m->page_alignment; - else - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.data_pos = 0; + /* prepare for data payload, if any */ -#ifdef CONFIG_BLOCK - if (m->bio) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif + if (data_len) + prepare_message_data(con->in_msg, data_len); } /* front */ @@ -1914,24 +2323,10 @@ static int read_partial_message(struct ceph_connection *con) } /* (page) data */ - while (con->in_msg_pos.data_pos < data_len) { - if (m->pages) { - ret = read_partial_message_pages(con, m->pages, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#ifdef CONFIG_BLOCK - } else if (m->bio) { - BUG_ON(!m->bio_iter); - ret = read_partial_message_bio(con, - &m->bio_iter, &m->bio_seg, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#endif - } else { - BUG_ON(1); - } + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; } /* footer */ @@ -2057,13 +2452,13 @@ more_kvec: goto do_next; } - ret = write_partial_msg_pages(con); + ret = write_partial_message_data(con); if (ret == 1) goto more_kvec; /* we need to send the footer, too! 
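One small but real fix above: middle_len was being range-checked against CEPH_MSG_MAX_DATA_LEN instead of CEPH_MSG_MAX_MIDDLE_LEN. The pattern worth noting is that all three header lengths are validated before anything is allocated from them. A compact sketch with invented limits (the real constants live in the ceph headers):

#include <errno.h>
#include <stdint.h>

/* Illustrative limits only. */
#define MAX_FRONT_LEN	(16U << 20)
#define MAX_MIDDLE_LEN	(16U << 20)
#define MAX_DATA_LEN	(16U << 20)

struct wire_hdr { uint32_t front_len, middle_len, data_len; };

/* Reject untrusted lengths before they are used to size buffers. */
static int validate_lengths(const struct wire_hdr *hdr)
{
	if (hdr->front_len > MAX_FRONT_LEN)
		return -EIO;
	if (hdr->middle_len > MAX_MIDDLE_LEN)	/* each field vs its own cap */
		return -EIO;
	if (hdr->data_len > MAX_DATA_LEN)
		return -EIO;
	return 0;
}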
*/ if (ret == 0) goto out; if (ret < 0) { - dout("try_write write_partial_msg_pages err %d\n", + dout("try_write write_partial_message_data err %d\n", ret); goto out; } @@ -2080,15 +2475,14 @@ do_next: prepare_write_ack(con); goto more; } - if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, - &con->flags)) { + if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; } } /* Nothing to do! */ - clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con_flag_clear(con, CON_FLAG_WRITE_PENDING); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -2216,7 +2610,12 @@ more: prepare_read_tag(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_ACK) { + if (con->in_tag == CEPH_MSGR_TAG_ACK || + con->in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ ret = read_partial_ack(con); if (ret <= 0) goto out; @@ -2268,7 +2667,7 @@ static void queue_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) + if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) return false; #define CASE(x) \ @@ -2295,6 +2694,41 @@ static bool con_sock_closed(struct ceph_connection *con) return true; } +static bool con_backoff(struct ceph_connection *con) +{ + int ret; + + if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) + return false; + + ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + if (ret) { + dout("%s: con %p FAILED to back off %lu\n", __func__, + con, con->delay); + BUG_ON(ret == -ENOENT); + con_flag_set(con, CON_FLAG_BACKOFF); + } + + return true; +} + +/* Finish fault handling; con->mutex must *not* be held here */ + +static void con_fault_finish(struct ceph_connection *con) +{ + /* + * in case we faulted due to authentication, invalidate our + * current tickets so that we can get new ones. + */ + if (con->auth_retry && con->ops->invalidate_authorizer) { + dout("calling invalidate_authorizer()\n"); + con->ops->invalidate_authorizer(con); + } + + if (con->ops->fault) + con->ops->fault(con); +} + /* * Do some work on a connection. Drop a connection ref when we're done. 
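con_backoff() above re-queues the connection's work item after con->delay jiffies, and con_fault() (below) grows that delay on each failure. The classic doubling schedule, shown standalone; BASE_DELAY and MAX_DELAY are arbitrary stand-ins for BASE_DELAY_INTERVAL and MAX_DELAY_INTERVAL:

#include <stdio.h>

#define BASE_DELAY	1UL	/* stand-in for BASE_DELAY_INTERVAL */
#define MAX_DELAY	64UL	/* stand-in for MAX_DELAY_INTERVAL */

/* Next retry delay: start at BASE, double per fault, stop at MAX. */
static unsigned long next_delay(unsigned long delay)
{
	if (delay == 0)
		return BASE_DELAY;
	if (delay < MAX_DELAY)
		delay *= 2;
	return delay;
}

int main(void)
{
	unsigned long d = 0;

	for (int fault = 0; fault < 9; fault++) {
		d = next_delay(d);
		printf("fault %d -> delay %lu\n", fault, d);
	}
	/* delays: 1 2 4 8 16 32 64 64 64 */
	return 0;
}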
*/ @@ -2302,73 +2736,68 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); - int ret; + bool fault; mutex_lock(&con->mutex); -restart: - if (con_sock_closed(con)) - goto fault; + while (true) { + int ret; - if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { - dout("con_work %p backing off\n", con); - ret = queue_con_delay(con, round_jiffies_relative(con->delay)); - if (ret) { - dout("con_work %p FAILED to back off %lu\n", con, - con->delay); - BUG_ON(ret == -ENOENT); - set_bit(CON_FLAG_BACKOFF, &con->flags); + if ((fault = con_sock_closed(con))) { + dout("%s: con %p SOCK_CLOSED\n", __func__, con); + break; + } + if (con_backoff(con)) { + dout("%s: con %p BACKOFF\n", __func__, con); + break; + } + if (con->state == CON_STATE_STANDBY) { + dout("%s: con %p STANDBY\n", __func__, con); + break; + } + if (con->state == CON_STATE_CLOSED) { + dout("%s: con %p CLOSED\n", __func__, con); + BUG_ON(con->sock); + break; + } + if (con->state == CON_STATE_PREOPEN) { + dout("%s: con %p PREOPEN\n", __func__, con); + BUG_ON(con->sock); } - goto done; - } - if (con->state == CON_STATE_STANDBY) { - dout("con_work %p STANDBY\n", con); - goto done; - } - if (con->state == CON_STATE_CLOSED) { - dout("con_work %p CLOSED\n", con); - BUG_ON(con->sock); - goto done; - } - if (con->state == CON_STATE_PREOPEN) { - dout("con_work OPENING\n"); - BUG_ON(con->sock); - } + ret = try_read(con); + if (ret < 0) { + if (ret == -EAGAIN) + continue; + con->error_msg = "socket error on read"; + fault = true; + break; + } - ret = try_read(con); - if (ret == -EAGAIN) - goto restart; - if (ret < 0) { - con->error_msg = "socket error on read"; - goto fault; - } + ret = try_write(con); + if (ret < 0) { + if (ret == -EAGAIN) + continue; + con->error_msg = "socket error on write"; + fault = true; + } - ret = try_write(con); - if (ret == -EAGAIN) - goto restart; - if (ret < 0) { - con->error_msg = "socket error on write"; - goto fault; + break; /* If we make it to here, we're done */ } - -done: + if (fault) + con_fault(con); mutex_unlock(&con->mutex); -done_unlocked: - con->ops->put(con); - return; -fault: - ceph_fault(con); /* error/fault path */ - goto done_unlocked; -} + if (fault) + con_fault_finish(con); + con->ops->put(con); +} /* * Generic error/fault handler. A retry mechanism is used with * exponential backoff */ -static void ceph_fault(struct ceph_connection *con) - __releases(con->mutex) +static void con_fault(struct ceph_connection *con) { pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); @@ -2381,10 +2810,10 @@ static void ceph_fault(struct ceph_connection *con) con_close_socket(con); - if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { + if (con_flag_test(con, CON_FLAG_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CON_STATE_CLOSED; - goto out_unlock; + return; } if (con->in_msg) { @@ -2401,9 +2830,9 @@ static void ceph_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { + !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con_flag_clear(con, CON_FLAG_WRITE_PENDING); con->state = CON_STATE_STANDBY; } else { /* retry after a delay. 
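The con_work() rewrite above replaces the goto restart/goto fault/goto done tangle with one while (true) loop that breaks carrying a fault flag; the mutex is then dropped exactly once, and the part of fault handling that must run unlocked happens after. The control shape in miniature, runnable, with a pthread mutex standing in for con->mutex and a toy try_step() standing in for try_read()/try_write():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Toy stand-in for try_read()/try_write(): one -EAGAIN, then success. */
static int try_step(void)
{
	static int calls;

	return calls++ == 0 ? -11 /* EAGAIN */ : 0;
}

static void fault_locked(void)   { puts("fault handling (locked)"); }
static void fault_unlocked(void) { puts("fault handling (unlocked)"); }

static void work(void)
{
	bool fault = false;

	pthread_mutex_lock(&lock);
	while (true) {
		int ret = try_step();

		if (ret == -11)		/* old code: goto restart */
			continue;
		if (ret < 0)		/* old code: goto fault */
			fault = true;
		break;			/* done, one way or the other */
	}
	if (fault)
		fault_locked();		/* con_fault(): needs the lock */
	pthread_mutex_unlock(&lock);
	if (fault)
		fault_unlocked();	/* con_fault_finish(): must be unlocked */
}

int main(void)
{
	work();
	return 0;
}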
*/ @@ -2412,23 +2841,9 @@ static void ceph_fault(struct ceph_connection *con) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; - set_bit(CON_FLAG_BACKOFF, &con->flags); + con_flag_set(con, CON_FLAG_BACKOFF); queue_con(con); } - -out_unlock: - mutex_unlock(&con->mutex); - /* - * in case we faulted due to authentication, invalidate our - * current tickets so that we can get new ones. - */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); - } - - if (con->ops->fault) - con->ops->fault(con); } @@ -2469,8 +2884,8 @@ static void clear_standby(struct ceph_connection *con) dout("clear_standby %p and ++connect_seq\n", con); con->state = CON_STATE_PREOPEN; con->connect_seq++; - WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); - WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); + WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); + WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); } } @@ -2511,7 +2926,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) + if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -2600,12 +3015,94 @@ void ceph_con_keepalive(struct ceph_connection *con) mutex_lock(&con->mutex); clear_standby(con); mutex_unlock(&con->mutex); - if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && - test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) + if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 && + con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); +static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) +{ + struct ceph_msg_data *data; + + if (WARN_ON(!ceph_msg_data_type_valid(type))) + return NULL; + + data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); + if (data) + data->type = type; + INIT_LIST_HEAD(&data->links); + + return data; +} + +static void ceph_msg_data_destroy(struct ceph_msg_data *data) +{ + if (!data) + return; + + WARN_ON(!list_empty(&data->links)); + if (data->type == CEPH_MSG_DATA_PAGELIST) { + ceph_pagelist_release(data->pagelist); + kfree(data->pagelist); + } + kmem_cache_free(ceph_msg_data_cache, data); +} + +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment) +{ + struct ceph_msg_data *data; + + BUG_ON(!pages); + BUG_ON(!length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); + BUG_ON(!data); + data->pages = pages; + data->length = length; + data->alignment = alignment & ~PAGE_MASK; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pages); + +void ceph_msg_data_add_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist) +{ + struct ceph_msg_data *data; + + BUG_ON(!pagelist); + BUG_ON(!pagelist->length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); + BUG_ON(!data); + data->pagelist = pagelist; + + list_add_tail(&data->links, &msg->data); + msg->data_length += pagelist->length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pagelist); + +#ifdef CONFIG_BLOCK +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, + size_t length) +{ + struct ceph_msg_data *data; + + BUG_ON(!bio); + + data = 
ceph_msg_data_create(CEPH_MSG_DATA_BIO); + BUG_ON(!data); + data->bio = bio; + data->bio_length = length; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_bio); +#endif /* CONFIG_BLOCK */ /* * construct a new message with given type, size @@ -2616,47 +3113,20 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, { struct ceph_msg *m; - m = kmalloc(sizeof(*m), flags); + m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; - kref_init(&m->kref); - m->con = NULL; - INIT_LIST_HEAD(&m->list_head); - - m->hdr.tid = 0; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); - m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); - m->hdr.middle_len = 0; - m->hdr.data_len = 0; - m->hdr.data_off = 0; - m->hdr.reserved = 0; - m->footer.front_crc = 0; - m->footer.middle_crc = 0; - m->footer.data_crc = 0; - m->footer.flags = 0; - m->front_max = front_len; - m->front_is_vmalloc = false; - m->more_to_follow = false; - m->ack_stamp = 0; - m->pool = NULL; - /* middle */ - m->middle = NULL; - - /* data */ - m->nr_pages = 0; - m->page_alignment = 0; - m->pages = NULL; - m->pagelist = NULL; - m->bio = NULL; - m->bio_iter = NULL; - m->bio_seg = 0; - m->trail = NULL; + INIT_LIST_HEAD(&m->list_head); + kref_init(&m->kref); + INIT_LIST_HEAD(&m->data); /* front */ + m->front_max = front_len; if (front_len) { if (front_len > PAGE_CACHE_SIZE) { m->front.iov_base = __vmalloc(front_len, flags, @@ -2734,49 +3204,37 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { struct ceph_msg_header *hdr = &con->in_hdr; - int type = le16_to_cpu(hdr->type); - int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); + struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); + BUG_ON(!con->ops->alloc_msg); - if (con->ops->alloc_msg) { - struct ceph_msg *msg; - - mutex_unlock(&con->mutex); - msg = con->ops->alloc_msg(con, hdr, skip); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { - if (msg) - ceph_msg_put(msg); - return -EAGAIN; - } - con->in_msg = msg; - if (con->in_msg) { - con->in_msg->con = con->ops->get(con); - BUG_ON(con->in_msg->con == NULL); - } - if (*skip) { - con->in_msg = NULL; - return 0; - } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } + mutex_unlock(&con->mutex); + msg = con->ops->alloc_msg(con, hdr, skip); + mutex_lock(&con->mutex); + if (con->state != CON_STATE_OPEN) { + if (msg) + ceph_msg_put(msg); + return -EAGAIN; } - if (!con->in_msg) { - con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - if (!con->in_msg) { - pr_err("unable to allocate msg type %d len %d\n", - type, front_len); - return -ENOMEM; - } + if (msg) { + BUG_ON(*skip); + con->in_msg = msg; con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); - con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); + } else { + /* + * Null message pointer means either we should skip + * this message or we couldn't allocate memory. The + * former is not an error. 
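The ceph_msg_new() change just below swaps kmalloc() plus a dozen explicit zero-assignments for kmem_cache_zalloc(), so only fields with nonzero initial values need touching by hand. The same simplification in userspace terms, with calloc() playing the cache's zeroing role (struct msg is a toy, not struct ceph_msg):

#include <stdint.h>
#include <stdlib.h>

struct msg {
	uint16_t type;
	uint32_t front_len;
	/* ...many more fields, all of which must start out zero... */
	void *front;
};

static struct msg *msg_new(uint16_t type, uint32_t front_len)
{
	/* zeroing allocation: no per-field "= 0" boilerplate to forget */
	struct msg *m = calloc(1, sizeof(*m));

	if (!m)
		return NULL;
	m->type = type;			/* only nonzero state is set by hand */
	m->front_len = front_len;
	return m;
}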
+ */ + if (*skip) + return 0; + con->error_msg = "error allocating memory for incoming message"; + + return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); @@ -2802,7 +3260,7 @@ void ceph_msg_kfree(struct ceph_msg *m) vfree(m->front.iov_base); else kfree(m->front.iov_base); - kfree(m); + kmem_cache_free(ceph_msg_cache, m); } /* @@ -2811,6 +3269,9 @@ void ceph_msg_kfree(struct ceph_msg *m) void ceph_msg_last_put(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); + LIST_HEAD(data); + struct list_head *links; + struct list_head *next; dout("ceph_msg_put last one on %p\n", m); WARN_ON(!list_empty(&m->list_head)); @@ -2820,16 +3281,16 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - m->nr_pages = 0; - m->pages = NULL; - if (m->pagelist) { - ceph_pagelist_release(m->pagelist); - kfree(m->pagelist); - m->pagelist = NULL; - } + list_splice_init(&m->data, &data); + list_for_each_safe(links, next, &data) { + struct ceph_msg_data *data; - m->trail = NULL; + data = list_entry(links, struct ceph_msg_data, links); + list_del_init(links); + ceph_msg_data_destroy(data); + } + m->data_length = 0; if (m->pool) ceph_msgpool_put(m->pool, m); @@ -2840,8 +3301,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, - msg->front_max, msg->nr_pages); + pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, + msg->front_max, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 812eb3b46c1f..1fe25cd29d0e 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -697,7 +697,7 @@ int ceph_monc_delete_snapid(struct ceph_mon_client *monc, u32 pool, u64 snapid) { return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, - pool, snapid, 0, 0); + pool, snapid, NULL, 0); } @@ -737,7 +737,7 @@ static void delayed_work(struct work_struct *work) __validate_auth(monc); - if (monc->auth->ops->is_authenticated(monc->auth)) + if (ceph_auth_is_authenticated(monc->auth)) __send_subscribe(monc); } __schedule_delayed(monc); @@ -892,8 +892,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); had_debugfs_info = have_debugfs_info(monc); - if (monc->auth->ops) - was_auth = monc->auth->ops->is_authenticated(monc->auth); + was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, @@ -904,7 +903,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); - } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { + } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index eb9a44478764..a3395fdfbd4f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,3 +1,4 @@ + #include <linux/ceph/ceph_debug.h> #include <linux/module.h> @@ -21,9 +22,11 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 +static struct kmem_cache *ceph_osd_request_cache; + static const struct ceph_connection_operations osd_con_ops; -static void send_queued(struct ceph_osd_client *osdc); 
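ceph_con_in_msg_alloc() above drops con->mutex around the ops->alloc_msg() callback and then re-checks con->state after retaking it, because the connection may have changed underneath while the lock was released. That revalidation is the essential part of the pattern; in miniature, pthread flavor, with invented names:

#include <pthread.h>
#include <stdlib.h>

enum state { ST_OPEN, ST_CLOSED };

struct conn {
	pthread_mutex_t mutex;
	enum state state;
};

/* Stand-in for ops->alloc_msg(): may block, so it must run unlocked. */
static void *alloc_callback(struct conn *c)
{
	(void)c;
	return malloc(64);
}

static void *alloc_in_msg(struct conn *c, int *again)
{
	void *msg;

	*again = 0;
	pthread_mutex_unlock(&c->mutex);	/* drop lock for the callback */
	msg = alloc_callback(c);
	pthread_mutex_lock(&c->mutex);

	if (c->state != ST_OPEN) {		/* revalidate: state may have
						   changed while unlocked */
		free(msg);
		*again = 1;			/* caller retries or gives up */
		return NULL;
	}
	return msg;
}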
+static void __send_queued(struct ceph_osd_client *osdc); static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); static void __register_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); @@ -32,64 +35,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); -static int op_needs_trail(int op) -{ - switch (op) { - case CEPH_OSD_OP_GETXATTR: - case CEPH_OSD_OP_SETXATTR: - case CEPH_OSD_OP_CMPXATTR: - case CEPH_OSD_OP_CALL: - case CEPH_OSD_OP_NOTIFY: - return 1; - default: - return 0; - } -} - -static int op_has_extent(int op) -{ - return (op == CEPH_OSD_OP_READ || - op == CEPH_OSD_OP_WRITE); -} - -int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, - u64 snapid, - u64 off, u64 *plen, u64 *bno, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) -{ - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; - u64 orig_len = *plen; - u64 objoff, objlen; /* extent in object */ - int r; - - reqhead->snapid = cpu_to_le64(snapid); - - /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, plen, bno, - &objoff, &objlen); - if (r < 0) - return r; - if (*plen < orig_len) - dout(" skipping last %llu, final file extent %llu~%llu\n", - orig_len - *plen, off, *plen); - - if (op_has_extent(op->op)) { - op->extent.offset = objoff; - op->extent.length = objlen; - } - req->r_num_pages = calc_pages_for(off, *plen); - req->r_page_alignment = off & ~PAGE_MASK; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = *plen; - - dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", - *bno, objoff, objlen, req->r_num_pages); - return 0; -} -EXPORT_SYMBOL(ceph_calc_raw_layout); - /* * Implement client access to distributed object storage cluster. * @@ -115,25 +60,238 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. */ -static int calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) -{ - u64 bno; +static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, + u64 *objnum, u64 *objoff, u64 *objlen) +{ + u64 orig_len = *plen; int r; - r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, - plen, &bno, req, op); + /* object extent? 
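The reworked calc_layout() below maps a file extent to (object number, offset within the object, length within the object) and clips the request at the object boundary. For the simple non-striped layout (stripe unit equal to object size, stripe count 1) the mapping reduces to division and remainder; a worked sketch with an invented object size:

#include <stdint.h>
#include <stdio.h>

/* Simple layout: one stripe per object, object_size bytes each. */
static void map_extent(uint64_t object_size, uint64_t off, uint64_t len,
		       uint64_t *objnum, uint64_t *objoff, uint64_t *objlen)
{
	uint64_t room;

	*objnum = off / object_size;
	*objoff = off % object_size;
	room = object_size - *objoff;		/* bytes left in this object */
	*objlen = len < room ? len : room;	/* clip at object boundary */
}

int main(void)
{
	uint64_t n, o, l;

	/* 4 MB objects: a 1 MB read at file offset 7 MB... */
	map_extent(4 << 20, 7 << 20, 1 << 20, &n, &o, &l);
	/* ...lands in object 1, offset 3 MB, and fits without clipping */
	printf("obj %llu off %llu len %llu\n",
	       (unsigned long long)n, (unsigned long long)o,
	       (unsigned long long)l);
	return 0;
}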
*/ + r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, + objoff, objlen); if (r < 0) return r; + if (*objlen < orig_len) { + *plen = *objlen; + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + } - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); - req->r_oid_len = strlen(req->r_oid); + dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); - return r; + return 0; +} + +static void ceph_osd_data_init(struct ceph_osd_data *osd_data) +{ + memset(osd_data, 0, sizeof (*osd_data)); + osd_data->type = CEPH_OSD_DATA_TYPE_NONE; +} + +static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->length = length; + osd_data->alignment = alignment; + osd_data->pages_from_pool = pages_from_pool; + osd_data->own_pages = own_pages; +} + +static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; + osd_data->pagelist = pagelist; +} + +#ifdef CONFIG_BLOCK +static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = bio; + osd_data->bio_length = bio_length; +} +#endif /* CONFIG_BLOCK */ + +#define osd_req_op_data(oreq, whch, typ, fld) \ + ({ \ + BUG_ON(whch >= (oreq)->r_num_ops); \ + &(oreq)->r_ops[whch].typ.fld; \ + }) + +static struct ceph_osd_data * +osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + return &osd_req->r_ops[which].raw_data_in; +} + +struct ceph_osd_data * +osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, extent, osd_data); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + +struct ceph_osd_data * +osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, cls, response_data); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? 
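struct ceph_osd_data acts as a tagged union: a type field selects which of the pages/pagelist/bio members is meaningful, the *_init() helpers above set tag and payload together, and readers such as ceph_osd_data_length() (below) dispatch on the tag before touching anything. The skeleton of that idiom, with toy member types rather than the kernel's:

#include <stddef.h>
#include <stdint.h>

enum data_type { DATA_NONE, DATA_PAGES, DATA_PAGELIST };

struct data_item {
	enum data_type type;
	union {
		struct { void **pages; uint64_t length; } pages;
		struct { size_t length; } pagelist;
	};
};

/* Tag and payload are always set together... */
static void data_pages_init(struct data_item *d, void **pages, uint64_t len)
{
	d->type = DATA_PAGES;
	d->pages.pages = pages;
	d->pages.length = len;
}

/* ...and every reader dispatches on the tag first. */
static uint64_t data_length(const struct data_item *d)
{
	switch (d->type) {
	case DATA_NONE:		return 0;
	case DATA_PAGES:	return d->pages.length;
	case DATA_PAGELIST:	return (uint64_t)d->pagelist.length;
	default:		return 0;	/* unrecognized tag */
	}
}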
*/ + +void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_raw_data_in(osd_req, which); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_raw_data_in_pages); + +void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); + +void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); + +#ifdef CONFIG_BLOCK +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, struct bio *bio, size_t bio_length) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_bio_init(osd_data, bio, bio_length); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); +#endif /* CONFIG_BLOCK */ + +static void osd_req_op_cls_request_info_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_info); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} + +void osd_req_op_cls_request_data_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); + +void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); + +void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, response_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); + +static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) +{ + switch (osd_data->type) { + case CEPH_OSD_DATA_TYPE_NONE: + return 0; + case CEPH_OSD_DATA_TYPE_PAGES: + return osd_data->length; + case CEPH_OSD_DATA_TYPE_PAGELIST: + return (u64)osd_data->pagelist->length; +#ifdef CONFIG_BLOCK + case CEPH_OSD_DATA_TYPE_BIO: + return (u64)osd_data->bio_length; +#endif /* CONFIG_BLOCK */ + default: + WARN(true, "unrecognized data 
type %d\n", (int)osd_data->type); + return 0; + } +} + +static void ceph_osd_data_release(struct ceph_osd_data *osd_data) +{ + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { + int num_pages; + + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + ceph_release_page_vector(osd_data->pages, num_pages); + } + ceph_osd_data_init(osd_data); +} + +static void osd_req_op_data_release(struct ceph_osd_request *osd_req, + unsigned int which) +{ + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + + switch (op->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + ceph_osd_data_release(&op->extent.osd_data); + break; + case CEPH_OSD_OP_CALL: + ceph_osd_data_release(&op->cls.request_info); + ceph_osd_data_release(&op->cls.request_data); + ceph_osd_data_release(&op->cls.response_data); + break; + default: + break; + } } /* @@ -141,82 +299,64 @@ static int calc_layout(struct ceph_osd_client *osdc, */ void ceph_osdc_release_request(struct kref *kref) { - struct ceph_osd_request *req = container_of(kref, - struct ceph_osd_request, - r_kref); + struct ceph_osd_request *req; + unsigned int which; + req = container_of(kref, struct ceph_osd_request, r_kref); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_con_filling_msg) { - dout("%s revoking pages %p from con %p\n", __func__, - req->r_pages, req->r_con_filling_msg); + if (req->r_reply) { ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - } - if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_own_pages) - ceph_release_page_vector(req->r_pages, - req->r_num_pages); -#ifdef CONFIG_BLOCK - if (req->r_bio) - bio_put(req->r_bio); -#endif - ceph_put_snap_context(req->r_snapc); - if (req->r_trail) { - ceph_pagelist_release(req->r_trail); - kfree(req->r_trail); } + + for (which = 0; which < req->r_num_ops; which++) + osd_req_op_data_release(req, which); + + ceph_put_snap_context(req->r_snapc); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else - kfree(req); -} -EXPORT_SYMBOL(ceph_osdc_release_request); + kmem_cache_free(ceph_osd_request_cache, req); -static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) -{ - int i = 0; - - if (needs_trail) - *needs_trail = 0; - while (ops[i].op) { - if (needs_trail && op_needs_trail(ops[i].op)) - *needs_trail = 1; - i++; - } - - return i; } +EXPORT_SYMBOL(ceph_osdc_release_request); struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio) + gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; - int needs_trail; - int num_op = get_num_ops(ops, &needs_trail); - size_t msg_size = sizeof(struct ceph_osd_request_head); + size_t msg_size; + + BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); + BUG_ON(num_ops > CEPH_OSD_MAX_OP); - msg_size += num_op*sizeof(struct ceph_osd_op); + msg_size = 4 + 4 + 8 + 8 + 4+8; + msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += 1 + 8 + 4 + 4; /* pg_t */ + msg_size += 4 + MAX_OBJ_NAME_SIZE; + msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); + msg_size += 8; /* snapid */ + msg_size += 8; /* snap_seq */ + msg_size += 8 * (snapc ? 
snapc->num_snaps : 0); /* snaps */ + msg_size += 4; if (use_mempool) { req = mempool_alloc(osdc->req_mempool, gfp_flags); memset(req, 0, sizeof(*req)); } else { - req = kzalloc(sizeof(*req), gfp_flags); + req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); } if (req == NULL) return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; + req->r_num_ops = num_ops; kref_init(&req->r_kref); init_completion(&req->r_completion); @@ -228,10 +368,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - req->r_flags = flags; - - WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); - /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -244,20 +380,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - /* allocate space for the trailing data */ - if (needs_trail) { - req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); - if (!req->r_trail) { - ceph_osdc_put_request(req); - return NULL; - } - ceph_pagelist_init(req->r_trail); - } - /* create request message; allow space for oid */ - msg_size += MAX_OBJ_NAME_SIZE; - if (snapc) - msg_size += sizeof(u64) * snapc->num_snaps; if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else @@ -270,82 +393,280 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; - req->r_pages = pages; -#ifdef CONFIG_BLOCK - if (bio) { - req->r_bio = bio; - bio_get(req->r_bio); - } -#endif return req; } EXPORT_SYMBOL(ceph_osdc_alloc_request); -static void osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, - struct ceph_osd_req_op *src) +static bool osd_req_opcode_valid(u16 opcode) { - dst->op = cpu_to_le16(src->op); + switch (opcode) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_STAT: + case CEPH_OSD_OP_MAPEXT: + case CEPH_OSD_OP_MASKTRUNC: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_NOTIFY: + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_ASSERT_VER: + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_DELETE: + case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_STARTSYNC: + case CEPH_OSD_OP_SETTRUNC: + case CEPH_OSD_OP_TRIMTRUNC: + case CEPH_OSD_OP_TMAPUP: + case CEPH_OSD_OP_TMAPPUT: + case CEPH_OSD_OP_TMAPGET: + case CEPH_OSD_OP_CREATE: + case CEPH_OSD_OP_ROLLBACK: + case CEPH_OSD_OP_WATCH: + case CEPH_OSD_OP_OMAPGETKEYS: + case CEPH_OSD_OP_OMAPGETVALS: + case CEPH_OSD_OP_OMAPGETHEADER: + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + case CEPH_OSD_OP_OMAPSETVALS: + case CEPH_OSD_OP_OMAPSETHEADER: + case CEPH_OSD_OP_OMAPCLEAR: + case CEPH_OSD_OP_OMAPRMKEYS: + case CEPH_OSD_OP_OMAP_CMP: + case CEPH_OSD_OP_CLONERANGE: + case CEPH_OSD_OP_ASSERT_SRC_VERSION: + case CEPH_OSD_OP_SRC_CMPXATTR: + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_GETXATTRS: + case CEPH_OSD_OP_CMPXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_SETXATTRS: + case CEPH_OSD_OP_RESETXATTRS: + case CEPH_OSD_OP_RMXATTR: + case CEPH_OSD_OP_PULL: + case CEPH_OSD_OP_PUSH: + case CEPH_OSD_OP_BALANCEREADS: + case CEPH_OSD_OP_UNBALANCEREADS: + case CEPH_OSD_OP_SCRUB: + case CEPH_OSD_OP_SCRUB_RESERVE: + case CEPH_OSD_OP_SCRUB_UNRESERVE: + case CEPH_OSD_OP_SCRUB_STOP: + case CEPH_OSD_OP_SCRUB_MAP: + case CEPH_OSD_OP_WRLOCK: + case CEPH_OSD_OP_WRUNLOCK: + case CEPH_OSD_OP_RDLOCK: + 
case CEPH_OSD_OP_RDUNLOCK: + case CEPH_OSD_OP_UPLOCK: + case CEPH_OSD_OP_DNLOCK: + case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_PGLS: + case CEPH_OSD_OP_PGLS_FILTER: + return true; + default: + return false; + } +} + +/* + * This is an osd op init function for opcodes that have no data or + * other information associated with them. It also serves as a + * common init routine for all the other init functions, below. + */ +static struct ceph_osd_req_op * +_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode) +{ + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); + BUG_ON(!osd_req_opcode_valid(opcode)); + + op = &osd_req->r_ops[which]; + memset(op, 0, sizeof (*op)); + op->op = opcode; + + return op; +} + +void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode) +{ + (void)_osd_req_op_init(osd_req, which, opcode); +} +EXPORT_SYMBOL(osd_req_op_init); + +void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + size_t payload_len = 0; + + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); + + op->extent.offset = offset; + op->extent.length = length; + op->extent.truncate_size = truncate_size; + op->extent.truncate_seq = truncate_seq; + if (opcode == CEPH_OSD_OP_WRITE) + payload_len += length; + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_extent_init); + +void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length) +{ + struct ceph_osd_req_op *op; + u64 previous; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + previous = op->extent.length; + + if (length == previous) + return; /* Nothing to do */ + BUG_ON(length > previous); + + op->extent.length = length; + op->payload_len -= previous - length; +} +EXPORT_SYMBOL(osd_req_op_extent_update); + +void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *class, const char *method) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_pagelist *pagelist; + size_t payload_len = 0; + size_t size; + + BUG_ON(opcode != CEPH_OSD_OP_CALL); + + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + BUG_ON(!pagelist); + ceph_pagelist_init(pagelist); + + op->cls.class_name = class; + size = strlen(class); + BUG_ON(size > (size_t) U8_MAX); + op->cls.class_len = size; + ceph_pagelist_append(pagelist, class, size); + payload_len += size; + + op->cls.method_name = method; + size = strlen(method); + BUG_ON(size > (size_t) U8_MAX); + op->cls.method_len = size; + ceph_pagelist_append(pagelist, method, size); + payload_len += size; + + osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); + + op->cls.argc = 0; /* currently unused */ + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_cls_init); + +void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 cookie, u64 version, int flag) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + + op->watch.cookie = cookie; + op->watch.ver = version; + if (opcode == CEPH_OSD_OP_WATCH && flag) + op->watch.flag = (u8)1; +} +EXPORT_SYMBOL(osd_req_op_watch_init); + +static void ceph_osdc_msg_data_add(struct ceph_msg *msg, + struct 
ceph_osd_data *osd_data) +{ + u64 length = ceph_osd_data_length(osd_data); + + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + BUG_ON(length > (u64) SIZE_MAX); + if (length) + ceph_msg_data_add_pages(msg, osd_data->pages, + length, osd_data->alignment); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + BUG_ON(!length); + ceph_msg_data_add_pagelist(msg, osd_data->pagelist); +#ifdef CONFIG_BLOCK + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + ceph_msg_data_add_bio(msg, osd_data->bio, length); +#endif + } else { + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); + } +} + +static u64 osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, unsigned int which) +{ + struct ceph_osd_req_op *src; + struct ceph_osd_data *osd_data; + u64 request_data_len = 0; + u64 data_length; + + BUG_ON(which >= req->r_num_ops); + src = &req->r_ops[which]; + if (WARN_ON(!osd_req_opcode_valid(src->op))) { + pr_err("unrecognized osd opcode %d\n", src->op); + + return 0; + } switch (src->op) { + case CEPH_OSD_OP_STAT: + osd_data = &src->raw_data_in; + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: - dst->extent.offset = - cpu_to_le64(src->extent.offset); - dst->extent.length = - cpu_to_le64(src->extent.length); + if (src->op == CEPH_OSD_OP_WRITE) + request_data_len = src->extent.length; + dst->extent.offset = cpu_to_le64(src->extent.offset); + dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); - break; - - case CEPH_OSD_OP_GETXATTR: - case CEPH_OSD_OP_SETXATTR: - case CEPH_OSD_OP_CMPXATTR: - BUG_ON(!req->r_trail); - - dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); - dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); - dst->xattr.cmp_op = src->xattr.cmp_op; - dst->xattr.cmp_mode = src->xattr.cmp_mode; - ceph_pagelist_append(req->r_trail, src->xattr.name, - src->xattr.name_len); - ceph_pagelist_append(req->r_trail, src->xattr.val, - src->xattr.value_len); + osd_data = &src->extent.osd_data; + if (src->op == CEPH_OSD_OP_WRITE) + ceph_osdc_msg_data_add(req->r_request, osd_data); + else + ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_CALL: - BUG_ON(!req->r_trail); - dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); - - ceph_pagelist_append(req->r_trail, src->cls.class_name, - src->cls.class_len); - ceph_pagelist_append(req->r_trail, src->cls.method_name, - src->cls.method_len); - ceph_pagelist_append(req->r_trail, src->cls.indata, - src->cls.indata_len); - break; - case CEPH_OSD_OP_ROLLBACK: - dst->snap.snapid = cpu_to_le64(src->snap.snapid); + osd_data = &src->cls.request_info; + ceph_osdc_msg_data_add(req->r_request, osd_data); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); + request_data_len = osd_data->pagelist->length; + + osd_data = &src->cls.request_data; + data_length = ceph_osd_data_length(osd_data); + if (data_length) { + BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); + dst->cls.indata_len = cpu_to_le32(data_length); + ceph_osdc_msg_data_add(req->r_request, osd_data); + src->payload_len += data_length; + request_data_len += data_length; + } + osd_data = &src->cls.response_data; + ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_STARTSYNC: break; - case CEPH_OSD_OP_NOTIFY: - { - __le32 prot_ver = 
cpu_to_le32(src->watch.prot_ver); - __le32 timeout = cpu_to_le32(src->watch.timeout); - - BUG_ON(!req->r_trail); - - ceph_pagelist_append(req->r_trail, - &prot_ver, sizeof(prot_ver)); - ceph_pagelist_append(req->r_trail, - &timeout, sizeof(timeout)); - } case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); @@ -353,90 +674,17 @@ static void osd_req_encode_op(struct ceph_osd_request *req, dst->watch.flag = src->watch.flag; break; default: - pr_err("unrecognized osd opcode %d\n", dst->op); + pr_err("unsupported osd opcode %s\n", + ceph_osd_op_name(src->op)); WARN_ON(1); - break; - } - dst->payload_len = cpu_to_le32(src->payload_len); -} - -/* - * build new request AND message - * - */ -void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, - struct ceph_osd_req_op *src_ops, - struct ceph_snap_context *snapc, - struct timespec *mtime, - const char *oid, - int oid_len) -{ - struct ceph_msg *msg = req->r_request; - struct ceph_osd_request_head *head; - struct ceph_osd_req_op *src_op; - struct ceph_osd_op *op; - void *p; - int num_op = get_num_ops(src_ops, NULL); - size_t msg_size = sizeof(*head) + num_op*sizeof(*op); - int flags = req->r_flags; - u64 data_len = 0; - int i; - - head = msg->front.iov_base; - op = (void *)(head + 1); - p = (void *)(op + num_op); - req->r_snapc = ceph_get_snap_context(snapc); - - head->client_inc = cpu_to_le32(1); /* always, for now. */ - head->flags = cpu_to_le32(flags); - if (flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(&head->mtime, mtime); - head->num_ops = cpu_to_le16(num_op); - - - /* fill in oid */ - head->object_len = cpu_to_le32(oid_len); - memcpy(p, oid, oid_len); - p += oid_len; - - src_op = src_ops; - while (src_op->op) { - osd_req_encode_op(req, op, src_op); - src_op++; - op++; - } - - if (req->r_trail) - data_len += req->r_trail->length; - - if (snapc) { - head->snap_seq = cpu_to_le64(snapc->seq); - head->num_snaps = cpu_to_le32(snapc->num_snaps); - for (i = 0; i < snapc->num_snaps; i++) { - put_unaligned_le64(snapc->snaps[i], p); - p += sizeof(u64); - } - } - - if (flags & CEPH_OSD_FLAG_WRITE) { - req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); - } else if (data_len) { - req->r_request->hdr.data_off = 0; - req->r_request->hdr.data_len = cpu_to_le32(data_len); + return 0; } + dst->op = cpu_to_le16(src->op); + dst->payload_len = cpu_to_le32(src->payload_len); - req->r_request->page_alignment = req->r_page_alignment; - - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); - return; + return request_data_len; } -EXPORT_SYMBOL(ceph_osdc_build_request); /* * build new request AND message, calculate layout, and adjust file @@ -452,54 +700,63 @@ EXPORT_SYMBOL(ceph_osdc_build_request); struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 off, u64 *plen, + u64 off, u64 *plen, int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, - bool use_mempool, int num_reply, - int page_align) + bool use_mempool) { - struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; + u64 objnum = 0; + u64 objoff = 0; + u64 objlen = 0; + u32 object_size; + u64 object_base; int r; - ops[0].op = opcode; - ops[0].extent.truncate_seq = 
truncate_seq; - ops[0].extent.truncate_size = truncate_size; - ops[0].payload_len = 0; - - if (do_sync) { - ops[1].op = CEPH_OSD_OP_STARTSYNC; - ops[1].payload_len = 0; - ops[2].op = 0; - } else - ops[1].op = 0; - - req = ceph_osdc_alloc_request(osdc, flags, - snapc, ops, - use_mempool, - GFP_NOFS, NULL, NULL); + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); + + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, + GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; + /* calculate max write size */ - r = calc_layout(osdc, vino, layout, off, plen, req, ops); - if (r < 0) + r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); + if (r < 0) { + ceph_osdc_put_request(req); return ERR_PTR(r); - req->r_file_layout = *layout; /* keep a copy */ + } + + object_size = le32_to_cpu(layout->fl_object_size); + object_base = off - objoff; + if (truncate_size <= object_base) { + truncate_size = 0; + } else { + truncate_size -= object_base; + if (truncate_size > object_size) + truncate_size = object_size; + } - /* in case it differs from natural (file) alignment that - calc_layout filled in for us */ - req->r_num_pages = calc_pages_for(page_align, *plen); - req->r_page_alignment = page_align; + osd_req_op_extent_init(req, 0, opcode, objoff, objlen, + truncate_size, truncate_seq); - ceph_osdc_build_request(req, off, plen, ops, - snapc, - mtime, - req->r_oid, req->r_oid_len); + /* + * A second op in the ops array means the caller wants to + * also include a 'startsync' command so that the + * osd will flush data quickly. + */ + if (num_ops > 1) + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + + req->r_file_layout = *layout; /* keep a copy */ + + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", + vino.ino, objnum); + req->r_oid_len = strlen(req->r_oid); return req; } @@ -577,21 +834,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd) { struct ceph_osd_request *req, *nreq; + LIST_HEAD(resend); int err; dout("__kick_osd_requests osd%d\n", osd->o_osd); err = __reset_osd(osdc, osd); if (err) return; - + /* + * Build up a list of requests to resend by traversing the + * osd's list of requests. Requests for a given object are + * sent in tid order, and that is also the order they're + * kept on this list. Therefore all requests that are in + * flight will be found first, followed by all requests that + * have not yet been sent. And to resend requests while + * preserving this order we will want to put any sent + * requests back on the front of the osd client's unsent + * list. + * + * So we build a separate ordered list of already-sent + * requests for the affected osd and splice it onto the + * front of the osd client's unsent list. Once we've seen a + * request that has not yet been sent we're done. Those + * requests are already sitting right where they belong. + */ list_for_each_entry(req, &osd->o_requests, r_osd_item) { - list_move(&req->r_req_lru_item, &osdc->req_unsent); - dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + if (!req->r_sent) + break; + list_move_tail(&req->r_req_lru_item, &resend); + dout("requeueing %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); if (!req->r_linger) req->r_flags |= CEPH_OSD_FLAG_RETRY; } + list_splice(&resend, &osdc->req_unsent); + /* + * Linger requests are re-registered before sending, which + * sets up a new tid for each. We add them to the unsent + * list at the end to keep things in tid order.
+ */ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, r_linger_osd) { /* @@ -600,8 +882,8 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, */ BUG_ON(!list_empty(&req->r_req_lru_item)); __register_request(osdc, req); - list_add(&req->r_req_lru_item, &osdc->req_unsent); - list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_add_tail(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); __unregister_linger_request(osdc, req); dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); @@ -623,8 +905,8 @@ static void osd_reset(struct ceph_connection *con) down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); __kick_osd_requests(osdc, osd); + __send_queued(osdc); mutex_unlock(&osdc->request_mutex); - send_queued(osdc); up_read(&osdc->map_sem); } @@ -673,8 +955,7 @@ static void put_osd(struct ceph_osd *osd) if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -739,31 +1020,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc) */ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - struct ceph_osd_request *req; - int ret = 0; + struct ceph_entity_addr *peer_addr; dout("__reset_osd %p osd%d\n", osd, osd->o_osd); if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { __remove_osd(osdc, osd); - ret = -ENODEV; - } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], - &osd->o_con.peer_addr, - sizeof(osd->o_con.peer_addr)) == 0 && - !ceph_con_opened(&osd->o_con)) { + + return -ENODEV; + } + + peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; + if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && + !ceph_con_opened(&osd->o_con)) { + struct ceph_osd_request *req; + dout(" osd addr hasn't changed and connection never opened," " letting msgr retry"); /* touch each r_stamp for handle_timeout()'s benefit */ list_for_each_entry(req, &osd->o_requests, r_osd_item) req->r_stamp = jiffies; - ret = -EAGAIN; - } else { - ceph_con_close(&osd->o_con); - ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, - &osdc->osdmap->osd_addr[osd->o_osd]); - osd->o_incarnation++; + + return -EAGAIN; } - return ret; + + ceph_con_close(&osd->o_con); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr); + osd->o_incarnation++; + + return 0; } static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) @@ -835,14 +1120,6 @@ static void __register_request(struct ceph_osd_client *osdc, } } -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - mutex_unlock(&osdc->request_mutex); -} - /* * called under osdc->request_mutex */ @@ -961,20 +1238,18 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger); static int __map_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, int force_resend) { - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; int o = -1, num = 0; int err; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, - &req->r_file_layout, osdc->osdmap); + err = ceph_calc_ceph_pg(&pgid, req->r_oid,
osdc->osdmap, + ceph_file_layout_pg_pool(req->r_file_layout)); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; } - pgid = reqhead->layout.ol_pgid; req->r_pgid = pgid; err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); @@ -991,8 +1266,8 @@ static int __map_request(struct ceph_osd_client *osdc, (req->r_osd == NULL && o == -1)) return 0; /* no change */ - dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", - req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, + dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", + req->r_tid, pgid.pool, pgid.seed, o, req->r_osd ? req->r_osd->o_osd : -1); /* record full pg acting set */ @@ -1024,10 +1299,10 @@ static int __map_request(struct ceph_osd_client *osdc, if (req->r_osd) { __remove_osd_from_lru(req->r_osd); - list_add(&req->r_osd_item, &req->r_osd->o_requests); - list_move(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); + list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); } else { - list_move(&req->r_req_lru_item, &osdc->req_notarget); + list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); } err = 1; /* osd or pg changed */ @@ -1041,37 +1316,47 @@ out: static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - struct ceph_osd_request_head *reqhead; - - dout("send_request %p tid %llu to osd%d flags %d\n", - req, req->r_tid, req->r_osd->o_osd, req->r_flags); + void *p; - reqhead = req->r_request->front.iov_base; - reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); - reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ - reqhead->reassert_version = req->r_reassert_version; + dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", + req, req->r_tid, req->r_osd->o_osd, req->r_flags, + (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); + + /* fill in message content that changes each time we send it */ + put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); + put_unaligned_le32(req->r_flags, req->r_request_flags); + put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); + p = req->r_request_pgid; + ceph_encode_64(&p, req->r_pgid.pool); + ceph_encode_32(&p, req->r_pgid.seed); + put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ + memcpy(req->r_request_reassert_version, &req->r_reassert_version, + sizeof(req->r_reassert_version)); req->r_stamp = jiffies; list_move_tail(&req->r_req_lru_item, &osdc->req_lru); ceph_msg_get(req->r_request); /* send consumes a ref */ - ceph_con_send(&req->r_osd->o_con, req->r_request); + + /* Mark the request unsafe if this is the first time it's being sent. */ + + if (!req->r_sent && req->r_unsafe_callback) + req->r_unsafe_callback(req, true); req->r_sent = req->r_osd->o_incarnation; + + ceph_con_send(&req->r_osd->o_con, req->r_request); } /* * Send any requests in the queue (req_unsent).
*/ -static void send_queued(struct ceph_osd_client *osdc) +static void __send_queued(struct ceph_osd_client *osdc) { struct ceph_osd_request *req, *tmp; - dout("send_queued\n"); - mutex_lock(&osdc->request_mutex); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + dout("__send_queued\n"); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) __send_request(osdc, req); - } - mutex_unlock(&osdc->request_mutex); } /* @@ -1123,8 +1408,8 @@ static void handle_timeout(struct work_struct *work) } __schedule_osd_timeout(osdc); + __send_queued(osdc); mutex_unlock(&osdc->request_mutex); - send_queued(osdc); up_read(&osdc->map_sem); } @@ -1147,8 +1432,8 @@ static void handle_osds_timeout(struct work_struct *work) static void complete_request(struct ceph_osd_request *req) { - if (req->r_safe_callback) - req->r_safe_callback(req, NULL); + if (req->r_unsafe_callback) + req->r_unsafe_callback(req, false); complete_all(&req->r_safe_completion); /* fsync waiter */ } @@ -1159,55 +1444,98 @@ static void complete_request(struct ceph_osd_request *req) static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, struct ceph_connection *con) { - struct ceph_osd_reply_head *rhead = msg->front.iov_base; + void *p, *end; struct ceph_osd_request *req; u64 tid; - int numops, object_len, flags; + int object_len; + unsigned int numops; + int payload_len, flags; s32 result; + s32 retry_attempt; + struct ceph_pg pg; + int err; + u32 reassert_epoch; + u64 reassert_version; + u32 osdmap_epoch; + int already_completed; + u32 bytes; + unsigned int i; tid = le64_to_cpu(msg->hdr.tid); - if (msg->front.iov_len < sizeof(*rhead)) - goto bad; - numops = le32_to_cpu(rhead->num_ops); - object_len = le32_to_cpu(rhead->object_len); - result = le32_to_cpu(rhead->result); - if (msg->front.iov_len != sizeof(*rhead) + object_len + - numops * sizeof(struct ceph_osd_op)) + dout("handle_reply %p tid %llu\n", msg, tid); + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 4, bad); + object_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, object_len, bad); + p += object_len; + + err = ceph_decode_pgid(&p, end, &pg); + if (err) goto bad; - dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); + + ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); + flags = ceph_decode_64(&p); + result = ceph_decode_32(&p); + reassert_epoch = ceph_decode_32(&p); + reassert_version = ceph_decode_64(&p); + osdmap_epoch = ceph_decode_32(&p); + /* lookup */ mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (req == NULL) { dout("handle_reply tid %llu dne\n", tid); - mutex_unlock(&osdc->request_mutex); - return; + goto bad_mutex; } ceph_osdc_get_request(req); - flags = le32_to_cpu(rhead->flags); - /* - * if this connection filled our message, drop our reference now, to - * avoid a (safe but slower) revoke later. 
- */ - if (req->r_con_filling_msg == con && req->r_reply == msg) { - dout(" dropping con_filling_msg ref %p\n", con); - req->r_con_filling_msg = NULL; - con->ops->put(con); - } + dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, + req, result); + + ceph_decode_need(&p, end, 4, bad); + numops = ceph_decode_32(&p); + if (numops > CEPH_OSD_MAX_OP) + goto bad_put; + if (numops != req->r_num_ops) + goto bad_put; + payload_len = 0; + ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad); + for (i = 0; i < numops; i++) { + struct ceph_osd_op *op = p; + int len; + + len = le32_to_cpu(op->payload_len); + req->r_reply_op_len[i] = len; + dout(" op %d has %d bytes\n", i, len); + payload_len += len; + p += sizeof(*op); + } + bytes = le32_to_cpu(msg->hdr.data_len); + if (payload_len != bytes) { + pr_warning("sum of op payload lens %d != data_len %d", + payload_len, bytes); + goto bad_put; + } + + ceph_decode_need(&p, end, 4 + numops * 4, bad); + retry_attempt = ceph_decode_32(&p); + for (i = 0; i < numops; i++) + req->r_reply_op_result[i] = ceph_decode_32(&p); if (!req->r_got_reply) { - unsigned int bytes; - req->r_result = le32_to_cpu(rhead->result); - bytes = le32_to_cpu(msg->hdr.data_len); + req->r_result = result; dout("handle_reply result %d bytes %d\n", req->r_result, bytes); if (req->r_result == 0) req->r_result = bytes; /* in case this is a write and we need to replay, */ - req->r_reassert_version = rhead->reassert_version; + req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); + req->r_reassert_version.version = cpu_to_le64(reassert_version); req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { @@ -1227,7 +1555,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ((flags & CEPH_OSD_FLAG_WRITE) == 0)) __unregister_request(osdc, req); + already_completed = req->r_completed; + req->r_completed = 1; mutex_unlock(&osdc->request_mutex); + if (already_completed) + goto done; if (req->r_callback) req->r_callback(req, msg); @@ -1242,10 +1574,13 @@ done: ceph_osdc_put_request(req); return; +bad_put: + ceph_osdc_put_request(req); +bad_mutex: + mutex_unlock(&osdc->request_mutex); bad: - pr_err("corrupt osd_op_reply got %d %d expected %d\n", - (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), - (int)sizeof(*rhead)); + pr_err("corrupt osd_op_reply got %d %d\n", + (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); ceph_msg_dump(msg); } @@ -1462,7 +1797,9 @@ done: if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) ceph_monc_request_next_osdmap(&osdc->client->monc); - send_queued(osdc); + mutex_lock(&osdc->request_mutex); + __send_queued(osdc); + mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); return; @@ -1556,8 +1893,7 @@ static void __remove_event(struct ceph_osd_event *event) int ceph_osdc_create_event(struct ceph_osd_client *osdc, void (*event_cb)(u64, u64, u8, void *), - int one_shot, void *data, - struct ceph_osd_event **pevent) + void *data, struct ceph_osd_event **pevent) { struct ceph_osd_event *event; @@ -1567,14 +1903,13 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc, dout("create_event %p\n", event); event->cb = event_cb; - event->one_shot = one_shot; + event->one_shot = 0; event->data = data; event->osdc = osdc; INIT_LIST_HEAD(&event->osd_node); RB_CLEAR_NODE(&event->node); kref_init(&event->kref); /* one ref for us */ kref_get(&event->kref); /* one ref for the caller */ - init_completion(&event->completion); 
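/*
 * A minimal sketch of assumed caller usage, not part of the patch: with
 * the one_shot flag and the completion gone from ceph_osdc_create_event(),
 * a persistent watch event would be registered roughly like this, where
 * my_watch_cb and the error handling are illustrative names only:
 *
 *	static void my_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 *	{
 *		... runs from do_event_work() for each notification ...
 *	}
 *
 *	struct ceph_osd_event *event;
 *	int ret = ceph_osdc_create_event(osdc, my_watch_cb, NULL, &event);
 *
 *	if (ret < 0)
 *		return ret;
 *	osd_req_op_watch_init(req, 0, CEPH_OSD_OP_WATCH,
 *			      event->cookie, 0, 1);
 *
 * after which the watch request is built and started as usual.
 */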
spin_lock(&osdc->event_lock); event->cookie = ++osdc->event_count; @@ -1610,7 +1945,6 @@ static void do_event_work(struct work_struct *work) dout("do_event_work completing %p\n", event); event->cb(ver, notify_id, opcode, event->data); - complete(&event->completion); dout("do_event_work completed %p\n", event); ceph_osdc_put_event(event); kfree(event_work); @@ -1620,7 +1954,8 @@ static void do_event_work(struct work_struct *work) /* * Process osd watch notifications */ -void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static void handle_watch_notify(struct ceph_osd_client *osdc, + struct ceph_msg *msg) { void *p, *end; u8 proto_ver; @@ -1641,9 +1976,8 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) spin_lock(&osdc->event_lock); event = __find_event(osdc, cookie); if (event) { + BUG_ON(event->one_shot); get_event(event); - if (event->one_shot) - __remove_event(event); } spin_unlock(&osdc->event_lock); dout("handle_watch_notify cookie %lld ver %lld event %p\n", @@ -1668,7 +2002,6 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) return; done_err: - complete(&event->completion); ceph_osdc_put_event(event); return; @@ -1677,20 +2010,103 @@ bad: return; } -int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) +/* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, + struct ceph_snap_context *snapc, u64 snap_id, + struct timespec *mtime) { - int err; + struct ceph_msg *msg = req->r_request; + void *p; + size_t msg_size; + int flags = req->r_flags; + u64 data_len; + unsigned int i; - dout("wait_event %p\n", event); - err = wait_for_completion_interruptible_timeout(&event->completion, - timeout * HZ); - ceph_osdc_put_event(event); - if (err > 0) - err = 0; - dout("wait_event %p returns %d\n", event, err); - return err; + req->r_snapid = snap_id; + req->r_snapc = ceph_get_snap_context(snapc); + + /* encode request */ + msg->hdr.version = cpu_to_le16(4); + + p = msg->front.iov_base; + ceph_encode_32(&p, 1); /* client_inc is always 1 */ + req->r_request_osdmap_epoch = p; + p += 4; + req->r_request_flags = p; + p += 4; + if (req->r_flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(p, mtime); + p += sizeof(struct ceph_timespec); + req->r_request_reassert_version = p; + p += sizeof(struct ceph_eversion); /* will get filled in */ + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + req->r_request_pool = p; + p += 8; + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + ceph_encode_8(&p, 1); + req->r_request_pgid = p; + p += 8 + 4; + ceph_encode_32(&p, -1); /* preferred */ + + /* oid */ + ceph_encode_32(&p, req->r_oid_len); + memcpy(p, req->r_oid, req->r_oid_len); + dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); + p += req->r_oid_len; + + /* ops--can imply data */ + ceph_encode_16(&p, (u16)req->r_num_ops); + data_len = 0; + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(req, p, i); + p += sizeof(struct ceph_osd_op); + } + + /* snaps */ + ceph_encode_64(&p, req->r_snapid); + ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); + ceph_encode_32(&p, req->r_snapc ? 
req->r_snapc->num_snaps : 0); + if (req->r_snapc) { + for (i = 0; i < snapc->num_snaps; i++) { + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } + } + + req->r_request_attempts = p; + p += 4; + + /* data */ + if (flags & CEPH_OSD_FLAG_WRITE) { + u16 data_off; + + /* + * The header "data_off" is a hint to the receiver + * allowing it to align received data into its + * buffers such that there's no need to re-copy + * it before writing it to disk (direct I/O). + */ + data_off = (u16) (off & 0xffff); + req->r_request->hdr.data_off = cpu_to_le16(data_off); + } + req->r_request->hdr.data_len = cpu_to_le32(data_len); + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + + dout("build_request msg_size was %d\n", (int)msg_size); } -EXPORT_SYMBOL(ceph_osdc_wait_event); +EXPORT_SYMBOL(ceph_osdc_build_request); /* * Register request, send initial attempt. @@ -1701,41 +2117,26 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, { int rc = 0; - req->r_request->pages = req->r_pages; - req->r_request->nr_pages = req->r_num_pages; -#ifdef CONFIG_BLOCK - req->r_request->bio = req->r_bio; -#endif - req->r_request->trail = req->r_trail; - - register_request(osdc, req); - down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - /* - * a racing kick_requests() may have sent the message for us - * while we dropped request_mutex above, so only send now if - * the request still han't been touched yet. - */ - if (req->r_sent == 0) { - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_request(osdc, req); + __register_request(osdc, req); + WARN_ON(req->r_sent); + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; } - rc = 0; + goto out_unlock; } - + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + rc = 0; out_unlock: mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); @@ -1865,7 +2266,6 @@ out_mempool: out: return err; } -EXPORT_SYMBOL(ceph_osdc_init); void ceph_osdc_stop(struct ceph_osd_client *osdc) { @@ -1882,7 +2282,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); } -EXPORT_SYMBOL(ceph_osdc_stop); /* * Read some contiguous pages. 
If we cross a stripe boundary, shorten @@ -1899,18 +2298,22 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, truncate_seq, truncate_size, NULL, - false, 1, page_align); + NULL, truncate_seq, truncate_size, + false); if (IS_ERR(req)) return PTR_ERR(req); /* it may be a short read due to an object boundary */ - req->r_pages = pages; - dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, req->r_num_pages, page_align); + osd_req_op_extent_osd_data_pages(req, 0, + pages, *plen, page_align, false, false); + + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, *plen, page_align); + + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1931,30 +2334,29 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - struct page **pages, int num_pages, - int flags, int do_sync, bool nofail) + struct page **pages, int num_pages) { struct ceph_osd_request *req; int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, + BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, CEPH_OSD_OP_WRITE, - flags | CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE, - snapc, do_sync, - truncate_seq, truncate_size, mtime, - nofail, 1, page_align); + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + snapc, truncate_seq, truncate_size, + true); if (IS_ERR(req)) return PTR_ERR(req); /* it may be a short write due to an object boundary */ - req->r_pages = pages; - dout("writepages %llu~%llu (%d pages)\n", off, len, - req->r_num_pages); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - rc = ceph_osdc_start_request(osdc, req, nofail); + ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); + + rc = ceph_osdc_start_request(osdc, req, true); if (!rc) rc = ceph_osdc_wait_request(osdc, req); @@ -1966,6 +2368,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); +int ceph_osdc_setup(void) +{ + BUG_ON(ceph_osd_request_cache); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", + sizeof (struct ceph_osd_request), + __alignof__(struct ceph_osd_request), + 0, NULL); + + return ceph_osd_request_cache ? 
0 : -ENOMEM; +} +EXPORT_SYMBOL(ceph_osdc_setup); + +void ceph_osdc_cleanup(void) +{ + BUG_ON(!ceph_osd_request_cache); + kmem_cache_destroy(ceph_osd_request_cache); + ceph_osd_request_cache = NULL; +} +EXPORT_SYMBOL(ceph_osdc_cleanup); + /* * handle incoming message */ @@ -2025,13 +2447,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, goto out; } - if (req->r_con_filling_msg) { + if (req->r_reply->con) dout("%s revoking msg %p from old con %p\n", __func__, - req->r_reply, req->r_con_filling_msg); - ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } + req->r_reply, req->r_reply->con); + ceph_msg_revoke_incoming(req->r_reply); if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", @@ -2045,26 +2464,29 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - int want = calc_pages_for(req->r_page_alignment, data_len); - - if (unlikely(req->r_num_pages < want)) { - pr_warning("tid %lld reply has %d bytes %d pages, we" - " had only %d pages ready\n", tid, data_len, - want, req->r_num_pages); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; + struct ceph_osd_data *osd_data; + + /* + * XXX This is assuming there is only one op containing + * XXX page data. Probably OK for reads, but this + * XXX ought to be done more generally. + */ + osd_data = osd_req_op_extent_osd_data(req, 0); + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + if (osd_data->pages && + unlikely(osd_data->length < data_len)) { + + pr_warning("tid %lld reply has %d bytes " + "we had only %llu bytes ready\n", + tid, data_len, osd_data->length); + *skip = 1; + ceph_msg_put(m); + m = NULL; + goto out; + } } - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - m->page_alignment = req->r_page_alignment; -#ifdef CONFIG_BLOCK - m->bio = req->r_bio; -#endif } *skip = 0; - req->r_con_filling_msg = con->ops->get(con); dout("get_reply tid %lld %p\n", tid, m); out: @@ -2129,13 +2551,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &o->o_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); if (ret) return ERR_PTR(ret); } @@ -2151,11 +2577,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - /* - * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, - * XXX which do we do: succeed or fail? 
- */ - return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -2164,9 +2586,7 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - if (ac->ops && ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); - + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index de73214b5d26..603ddd92db19 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -13,26 +13,18 @@ char *ceph_osdmap_state_str(char *str, int len, int state) { - int flag = 0; - if (!len) - goto done; - - *str = '\0'; - if (state) { - if (state & CEPH_OSD_EXISTS) { - snprintf(str, len, "exists"); - flag = 1; - } - if (state & CEPH_OSD_UP) { - snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), - "up"); - flag = 1; - } - } else { + return str; + + if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) + snprintf(str, len, "exists, up"); + else if (state & CEPH_OSD_EXISTS) + snprintf(str, len, "exists"); + else if (state & CEPH_OSD_UP) + snprintf(str, len, "up"); + else snprintf(str, len, "doesn't exist"); - } -done: + return str; } @@ -53,13 +45,8 @@ static int calc_bits_of(unsigned int t) */ static void calc_pg_masks(struct ceph_pg_pool_info *pi) { - pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; - pi->pgp_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; - pi->lpg_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; - pi->lpgp_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; + pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; + pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; } /* @@ -170,6 +157,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = 2; c->choose_local_fallback_tries = 5; c->choose_total_tries = 19; + c->chooseleaf_descend_once = 0; ceph_decode_need(p, end, 4*sizeof(u32), bad); magic = ceph_decode_32(p); @@ -336,6 +324,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable choose_total_tries = %d", c->choose_total_tries); + ceph_decode_need(p, end, sizeof(u32), done); + c->chooseleaf_descend_once = ceph_decode_32(p); + dout("crush decode tunable chooseleaf_descend_once = %d", + c->chooseleaf_descend_once); + done: dout("crush_decode success\n"); return c; @@ -354,12 +347,13 @@ bad: */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { - u64 a = *(u64 *)&l; - u64 b = *(u64 *)&r; - - if (a < b) + if (l.pool < r.pool) return -1; - if (a > b) + if (l.pool > r.pool) + return 1; + if (l.seed < r.seed) + return -1; + if (l.seed > r.seed) return 1; return 0; } @@ -405,8 +399,8 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, } else if (c > 0) { n = n->rb_right; } else { - dout("__lookup_pg_mapping %llx got %p\n", - *(u64 *)&pgid, pg); + dout("__lookup_pg_mapping %lld.%x got %p\n", + pgid.pool, pgid.seed, pg); return pg; } } @@ -418,12 +412,13 @@ static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); if (pg) { - dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg); + dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, 
pgid.seed, + pg); rb_erase(&pg->node, root); kfree(pg); return 0; } - dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid); + dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); return -ENOENT; } @@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) return 0; } -static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) { struct ceph_pg_pool_info *pi; struct rb_node *n = root->rb_node; @@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { - unsigned int n, m; + u8 ev, cv; + unsigned len, num; + void *pool_end; + + ceph_decode_need(p, end, 2 + 4, bad); + ev = ceph_decode_8(p); /* encoding version */ + cv = ceph_decode_8(p); /* compat version */ + if (ev < 5) { + pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); + return -EINVAL; + } + if (cv > 7) { + pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); + return -EINVAL; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, bad); + pool_end = *p + len; - ceph_decode_copy(p, &pi->v, sizeof(pi->v)); - calc_pg_masks(pi); + pi->type = ceph_decode_8(p); + pi->size = ceph_decode_8(p); + pi->crush_ruleset = ceph_decode_8(p); + pi->object_hash = ceph_decode_8(p); + + pi->pg_num = ceph_decode_32(p); + pi->pgp_num = ceph_decode_32(p); + + *p += 4 + 4; /* skip lpg* */ + *p += 4; /* skip last_change */ + *p += 8 + 4; /* skip snap_seq, snap_epoch */ - /* num_snaps * snap_info_t */ - n = le32_to_cpu(pi->v.num_snaps); - while (n--) { - ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + - sizeof(struct ceph_timespec), bad); - *p += sizeof(u64) + /* key */ - 1 + sizeof(u64) + /* u8, snapid */ - sizeof(struct ceph_timespec); - m = ceph_decode_32(p); /* snap name */ - *p += m; + /* skip snaps */ + num = ceph_decode_32(p); + while (num--) { + *p += 8; /* snapid key */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; } - *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; + /* skip removed snaps */ + num = ceph_decode_32(p); + *p += num * (8 + 8); + + *p += 8; /* skip auid */ + pi->flags = ceph_decode_64(p); + + /* ignore the rest */ + + *p = pool_end; + calc_pg_masks(pi); return 0; bad: @@ -535,14 +563,15 @@ bad: static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; - u32 num, len, pool; + u32 num, len; + u64 pool; ceph_decode_32_safe(p, end, num, bad); dout(" %d pool names\n", num); while (num--) { - ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, len, bad); - dout(" pool %d len %d\n", pool, len); + dout(" pool %llu len %d\n", pool, len); ceph_decode_need(p, end, len, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) { @@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_osdmap *map; u16 version; u32 len, max, i; - u8 ev; int err = -EINVAL; void *start = *p; struct ceph_pg_pool_info *pi; @@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) map->pg_temp = RB_ROOT; ceph_decode_16_safe(p, end, version, bad); - if (version > CEPH_OSDMAP_VERSION) { - pr_warning("got unknown v %d > %d of osdmap\n", version, - CEPH_OSDMAP_VERSION); + if (version > 6) { + pr_warning("got unknown v %d > 6 of osdmap\n", version); + goto bad; + } + if 
(version < 6) { + pr_warning("got old v %d < 6 of osdmap\n", version); goto bad; } @@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, max, bad); while (max--) { - ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + ceph_decode_need(p, end, 8 + 2, bad); err = -ENOMEM; pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) goto bad; - pi->id = ceph_decode_32(p); - err = -EINVAL; - ev = ceph_decode_8(p); /* encoding version */ - if (ev > CEPH_PG_POOL_VERSION) { - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", - ev, CEPH_PG_POOL_VERSION); - kfree(pi); - goto bad; - } + pi->id = ceph_decode_64(p); err = __decode_pool(p, end, pi); if (err < 0) { kfree(pi); @@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) __insert_pg_pool(&map->pg_pools, pi); } - if (version >= 5) { - err = __decode_pool_names(p, end, map); - if (err < 0) { - dout("fail to decode pool names"); - goto bad; - } + err = __decode_pool_names(p, end, map); + if (err < 0) { + dout("fail to decode pool names"); + goto bad; } ceph_decode_32_safe(p, end, map->pool_max, bad); @@ -726,8 +747,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_pg pgid; struct ceph_pg_mapping *pg; - ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); - ceph_decode_copy(p, &pgid, sizeof(pgid)); + err = ceph_decode_pgid(p, end, &pgid); + if (err) + goto bad; + ceph_decode_need(p, end, sizeof(u32), bad); n = ceph_decode_32(p); err = -EINVAL; if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) @@ -745,7 +768,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) err = __insert_pg_mapping(pg, &map->pg_temp); if (err) goto bad; - dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); + dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, + len); } /* crush */ @@ -784,16 +808,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_fsid fsid; u32 epoch = 0; struct ceph_timespec modified; - u32 len, pool; - __s32 new_pool_max, new_flags, max; + s32 len; + u64 pool; + __s64 new_pool_max; + __s32 new_flags, max; void *start = *p; int err = -EINVAL; u16 version; ceph_decode_16_safe(p, end, version, bad); - if (version > CEPH_OSDMAP_INC_VERSION) { - pr_warning("got unknown v %d > %d of inc osdmap\n", version, - CEPH_OSDMAP_INC_VERSION); + if (version != 6) { + pr_warning("got unknown v %d != 6 of inc osdmap\n", version); goto bad; } @@ -803,7 +828,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); - new_pool_max = ceph_decode_32(p); + new_pool_max = ceph_decode_64(p); new_flags = ceph_decode_32(p); /* full map? 
*/ @@ -853,18 +878,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new_pool */ ceph_decode_32_safe(p, end, len, bad); while (len--) { - __u8 ev; struct ceph_pg_pool_info *pi; - ceph_decode_32_safe(p, end, pool, bad); - ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); - ev = ceph_decode_8(p); /* encoding version */ - if (ev > CEPH_PG_POOL_VERSION) { - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", - ev, CEPH_PG_POOL_VERSION); - err = -EINVAL; - goto bad; - } + ceph_decode_64_safe(p, end, pool, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (!pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); @@ -890,7 +906,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); @@ -948,10 +964,12 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, int j; struct ceph_pg pgid; u32 pglen; - ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); - ceph_decode_copy(p, &pgid, sizeof(pgid)); - pglen = ceph_decode_32(p); + err = ceph_decode_pgid(p, end, &pgid); + if (err) + goto bad; + ceph_decode_need(p, end, sizeof(u32), bad); + pglen = ceph_decode_32(p); if (pglen) { ceph_decode_need(p, end, pglen*sizeof(u32), bad); @@ -975,8 +993,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, kfree(pg); goto bad; } - dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, - pglen); + dout(" added pg_temp %lld.%x len %d\n", pgid.pool, + pgid.seed, pglen); } else { /* remove */ __remove_pg_mapping(&map->pg_temp, pgid); @@ -1010,7 +1028,7 @@ bad: * pass a stride back to the caller. */ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, + u64 off, u64 len, u64 *ono, u64 *oxoff, u64 *oxlen) { @@ -1021,7 +1039,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u32 su_per_object; u64 t, su_offset; - dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, + dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, osize, su); if (su == 0 || sc == 0) goto invalid; @@ -1054,11 +1072,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, /* * Calculate the length of the extent being written to the selected - * object. This is the minimum of the full length requested (plen) or + * object. This is the minimum of the full length requested (len) or * the remainder of the current stripe being written to. */ - *oxlen = min_t(u64, *plen, su - su_offset); - *plen = *oxlen; + *oxlen = min_t(u64, len, su - su_offset); dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); return 0; @@ -1076,36 +1093,22 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); * calculate an object layout (i.e. 
pgid) from an oid, * file_layout, and osdmap */ -int ceph_calc_object_layout(struct ceph_object_layout *ol, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap) +int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, + struct ceph_osdmap *osdmap, uint64_t pool) { - unsigned int num, num_mask; - struct ceph_pg pgid; - int poolid = le32_to_cpu(fl->fl_pg_pool); - struct ceph_pg_pool_info *pool; - unsigned int ps; + struct ceph_pg_pool_info *pool_info; BUG_ON(!osdmap); - - pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); - if (!pool) + pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); + if (!pool_info) return -EIO; - ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); - num = le32_to_cpu(pool->v.pg_num); - num_mask = pool->pg_num_mask; - - pgid.ps = cpu_to_le16(ps); - pgid.preferred = cpu_to_le16(-1); - pgid.pool = fl->fl_pg_pool; - dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); + pg->pool = pool; + pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - ol->ol_pgid = pgid; - ol->ol_stripe_unit = fl->fl_object_stripe_unit; + dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_object_layout); +EXPORT_SYMBOL(ceph_calc_ceph_pg); /* * Calculate raw osd vector for the given pgid. Return pointer to osd @@ -1117,19 +1120,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, struct ceph_pg_mapping *pg; struct ceph_pg_pool_info *pool; int ruleno; - unsigned int poolid, ps, pps, t, r; + int r; + u32 pps; - poolid = le32_to_cpu(pgid.pool); - ps = le16_to_cpu(pgid.ps); - - pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); if (!pool) return NULL; /* pg_temp? */ - t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), - pool->pgp_num_mask); - pgid.ps = cpu_to_le16(t); + pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, + pool->pgp_num_mask); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { *num = pg->len; @@ -1137,26 +1137,39 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, - pool->v.type, pool->v.size); + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, + pool->type, pool->size); if (ruleno < 0) { - pr_err("no crush rule pool %d ruleset %d type %d size %d\n", - poolid, pool->v.crush_ruleset, pool->v.type, - pool->v.size); + pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", + pgid.pool, pool->crush_ruleset, pool->type, + pool->size); return NULL; } - pps = ceph_stable_mod(ps, - le32_to_cpu(pool->v.pgp_num), - pool->pgp_num_mask); - pps += poolid; + if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask), + pgid.pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ...
+ */ + pps = ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask) + + (unsigned)pgid.pool; + } r = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->v.size, *num), + min_t(int, pool->size, *num), osdmap->osd_weight); if (r < 0) { - pr_err("error %d from crush rule: pool %d ruleset %d type %d" - " size %d\n", r, poolid, pool->v.crush_ruleset, - pool->v.type, pool->v.size); + pr_err("error %d from crush rule: pool %lld ruleset %d type %d" + " size %d\n", r, pgid.pool, pool->crush_ruleset, + pool->type, pool->size); return NULL; } *num = r; diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index cd9c21df87d1..815a2249cfa9 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -12,7 +12,7 @@ /* * build a vector of user pages */ -struct page **ceph_get_direct_page_vector(const char __user *data, +struct page **ceph_get_direct_page_vector(const void __user *data, int num_pages, bool write_page) { struct page **pages; @@ -93,7 +93,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector); * copy user data into a page vector */ int ceph_copy_user_to_page_vector(struct page **pages, - const char __user *data, + const void __user *data, loff_t off, size_t len) { int i = 0; @@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages, } EXPORT_SYMBOL(ceph_copy_user_to_page_vector); -int ceph_copy_to_page_vector(struct page **pages, - const char *data, +void ceph_copy_to_page_vector(struct page **pages, + const void *data, loff_t off, size_t len) { int i = 0; size_t po = off & ~PAGE_CACHE_MASK; size_t left = len; - size_t l; while (left > 0) { - l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(page_address(pages[i]) + po, data, l); data += l; left -= l; @@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages, i++; } } - return len; } EXPORT_SYMBOL(ceph_copy_to_page_vector); -int ceph_copy_from_page_vector(struct page **pages, - char *data, +void ceph_copy_from_page_vector(struct page **pages, + void *data, loff_t off, size_t len) { int i = 0; size_t po = off & ~PAGE_CACHE_MASK; size_t left = len; - size_t l; while (left > 0) { - l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(data, page_address(pages[i]) + po, l); data += l; left -= l; @@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages, i++; } } - return len; } EXPORT_SYMBOL(ceph_copy_from_page_vector); @@ -170,7 +168,7 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector); * copy user data from a page vector into a user pointer */ int ceph_copy_page_vector_to_user(struct page **pages, - char __user *data, + void __user *data, loff_t off, size_t len) { int i = 0; diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c new file mode 100644 index 000000000000..154683f5f14c --- /dev/null +++ b/net/ceph/snapshot.c @@ -0,0 +1,78 @@ +/* + * snapshot.c Ceph snapshot context utility routines (part of libceph) + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <stddef.h> + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/ceph/libceph.h> + +/* + * Ceph snapshot contexts are reference counted objects, and the + * returned structure holds a single reference. Acquire additional + * references with ceph_get_snap_context(), and release them with + * ceph_put_snap_context(). When the reference count reaches zero + * the entire structure is freed. + */ + +/* + * Create a new ceph snapshot context large enough to hold the + * indicated number of snapshot ids (which can be 0). Caller has + * to fill in snapc->seq and snapc->snaps[0..snap_count-1]. + * + * Returns a null pointer if an error occurs. + */ +struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags) +{ + struct ceph_snap_context *snapc; + size_t size; + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (snapc->snaps[0]); + snapc = kzalloc(size, gfp_flags); + if (!snapc) + return NULL; + + atomic_set(&snapc->nref, 1); + snapc->num_snaps = snap_count; + + return snapc; +} +EXPORT_SYMBOL(ceph_create_snap_context); + +struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) +{ + if (sc) + atomic_inc(&sc->nref); + return sc; +} +EXPORT_SYMBOL(ceph_get_snap_context); + +void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} +EXPORT_SYMBOL(ceph_put_snap_context); diff --git a/net/core/Makefile b/net/core/Makefile index 674641b13aea..b33b996f5dd6 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,10 +9,11 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o + sock_diag.o dev_ioctl.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o +obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NET_DMA) += user_dma.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 368f9c3f9dc6..b71423db7785 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -78,9 +78,10 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn return autoremove_wake_function(wait, mode, sync, key); } /* - * Wait for a packet.. + * Wait for the last received packet to be different from skb */ -static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -92,7 +93,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) if (error) goto out_err; - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (sk->sk_receive_queue.prev != skb) goto out; /* Socket shut down? */ @@ -131,9 +132,9 @@ out_noerr: * __skb_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags + * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. 
Returns an offset * within an skb where data actually starts - * @peeked: returns non-zero if this packet has been seen before * @err: error code returned * * Get a datagram skbuff, understands the peeking, nonblocking wakeups @@ -161,7 +162,7 @@ out_noerr: struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, int *peeked, int *off, int *err) { - struct sk_buff *skb; + struct sk_buff *skb, *last; long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() @@ -182,13 +183,17 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, */ unsigned long cpu_flags; struct sk_buff_head *queue = &sk->sk_receive_queue; + int _off = *off; + last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { + last = skb; *peeked = skb->peeked; if (flags & MSG_PEEK) { - if (*off >= skb->len && skb->len) { - *off -= skb->len; + if (_off >= skb->len && (skb->len || _off || + skb->peeked)) { + _off -= skb->len; continue; } skb->peeked = 1; @@ -197,6 +202,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, __skb_unlink(skb, queue); spin_unlock_irqrestore(&queue->lock, cpu_flags); + *off = _off; return skb; } spin_unlock_irqrestore(&queue->lock, cpu_flags); @@ -206,7 +212,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, if (!timeo) goto no_packet; - } while (!wait_for_packet(sk, err, &timeo)); + } while (!wait_for_more_packets(sk, err, &timeo, last)); return NULL; @@ -749,7 +755,9 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= POLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0); + if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) diff --git a/net/core/dev.c b/net/core/dev.c index f64e439b4a00..fc1e289397f5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -97,8 +97,6 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <linux/stat.h> #include <net/dst.h> #include <net/pkt_sched.h> @@ -106,12 +104,10 @@ #include <net/xfrm.h> #include <linux/highmem.h> #include <linux/init.h> -#include <linux/kmod.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> -#include <net/wext.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> @@ -132,9 +128,7 @@ #include <linux/pci.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> -#include <linux/net_tstamp.h> #include <linux/static_key.h> -#include <net/flow_keys.h> #include "net-sysfs.h" @@ -144,41 +138,10 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) -/* - * The list of packet types we will receive (as opposed to discard) - * and the routines to invoke. - * - * Why 16. Because with 16 the only overlap we get on a hash of the - * low nibble of the protocol value is RARP/SNAP/X.25. - * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversely affected. 
- * --BLG - * - * 0800 IP - * 8100 802.1Q VLAN - * 0001 802.3 - * 0002 AX.25 - * 0004 802.2 - * 8035 RARP - * 0005 SNAP - * 0805 X.25 - * 0806 ARP - * 8137 IPX - * 0009 Localtalk - * 86DD IPv6 - */ - -#define PTYPE_HASH_SIZE (16) -#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) - static DEFINE_SPINLOCK(ptype_lock); static DEFINE_SPINLOCK(offload_lock); -static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -static struct list_head ptype_all __read_mostly; /* Taps */ +struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; /* @@ -237,7 +200,7 @@ static inline void rps_unlock(struct softnet_data *sd) } /* Device list insertion */ -static int list_netdevice(struct net_device *dev) +static void list_netdevice(struct net_device *dev) { struct net *net = dev_net(dev); @@ -251,8 +214,6 @@ static int list_netdevice(struct net_device *dev) write_unlock_bh(&dev_base_lock); dev_base_seq_inc(net); - - return 0; } /* Device list removal @@ -695,11 +656,10 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each_entry(dev, p, head, name_hlist) + hlist_for_each_entry(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; @@ -721,11 +681,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each_entry_rcu(dev, p, head, name_hlist) + hlist_for_each_entry_rcu(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; @@ -772,11 +731,10 @@ EXPORT_SYMBOL(dev_get_by_name); struct net_device *__dev_get_by_index(struct net *net, int ifindex) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each_entry(dev, p, head, index_hlist) + hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; @@ -797,11 +755,10 @@ EXPORT_SYMBOL(__dev_get_by_index); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { - struct hlist_node *p; struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each_entry_rcu(dev, p, head, index_hlist) + hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; @@ -1227,36 +1184,6 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -/** - * dev_load - load a network module - * @net: the applicable net namespace - * @name: name of interface - * - * If a network interface is not present and the process has suitable - * privileges this function loads the module. If module loading is not - * available in this kernel then it becomes a nop. - */ - -void dev_load(struct net *net, const char *name) -{ - struct net_device *dev; - int no_module; - - rcu_read_lock(); - dev = dev_get_by_name_rcu(net, name); - rcu_read_unlock(); - - no_module = !dev; - if (no_module && capable(CAP_NET_ADMIN)) - no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). 
Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } -} -EXPORT_SYMBOL(dev_load); - static int __dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1267,6 +1194,14 @@ static int __dev_open(struct net_device *dev) if (!netif_device_present(dev)) return -ENODEV; + /* Block netpoll from trying to do any rx path servicing. + * If we don't do this there is a chance ndo_poll_controller + * or ndo_poll may be running while we open the device + */ + ret = netpoll_rx_disable(dev); + if (ret) + return ret; + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); if (ret) @@ -1280,6 +1215,8 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); + netpoll_rx_enable(dev); + if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { @@ -1371,9 +1308,16 @@ static int __dev_close(struct net_device *dev) int retval; LIST_HEAD(single); + /* Temporarily disable netpoll until the interface is down */ + retval = netpoll_rx_disable(dev); + if (retval) + return retval; + list_add(&dev->unreg_list, &single); retval = __dev_close_many(&single); list_del(&single); + + netpoll_rx_enable(dev); return retval; } @@ -1409,14 +1353,22 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { + int ret = 0; if (dev->flags & IFF_UP) { LIST_HEAD(single); + /* Block netpoll rx while the interface is going down */ + ret = netpoll_rx_disable(dev); + if (ret) + return ret; + list_add(&dev->unreg_list, &single); dev_close_many(&single); list_del(&single); + + netpoll_rx_enable(dev); } - return 0; + return ret; } EXPORT_SYMBOL(dev_close); @@ -1591,7 +1543,6 @@ void net_enable_timestamp(void) return; } #endif - WARN_ON(in_interrupt()); static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1621,57 +1572,6 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -static int net_hwtstamp_validate(struct ifreq *ifr) -{ - struct hwtstamp_config cfg; - enum hwtstamp_tx_types tx_type; - enum hwtstamp_rx_filters rx_filter; - int tx_type_valid = 0; - int rx_filter_valid = 0; - - if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) - return -EFAULT; - - if (cfg.flags) /* reserved for future extensions */ - return -EINVAL; - - tx_type = cfg.tx_type; - rx_filter = cfg.rx_filter; - - switch (tx_type) { - case HWTSTAMP_TX_OFF: - case HWTSTAMP_TX_ON: - case HWTSTAMP_TX_ONESTEP_SYNC: - tx_type_valid = 1; - break; - } - - switch (rx_filter) { - case HWTSTAMP_FILTER_NONE: - case HWTSTAMP_FILTER_ALL: - case HWTSTAMP_FILTER_SOME: - case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: - rx_filter_valid = 1; - break; - } - - if (!tx_type_valid || !rx_filter_valid) - return -ERANGE; - - return 0; -} - static inline bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { @@ -1722,7 +1622,6 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } skb_orphan(skb); - nf_reset(skb); if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); @@ -1738,6 +1637,7 @@ int 
dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb->mark = 0; secpath_reset(skb); nf_reset(skb); + nf_reset_trace(skb); return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1857,6 +1757,230 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) } } +#ifdef CONFIG_XPS +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, + int cpu, u16 index) +{ + struct xps_map *map = NULL; + int pos; + + if (dev_maps) + map = xmap_dereference(dev_maps->cpu_map[cpu]); + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] == index) { + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; + } else { + RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); + kfree_rcu(map, rcu); + map = NULL; + } + break; + } + } + + return map; +} + +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + struct xps_dev_maps *dev_maps; + int cpu, i; + bool active = false; + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (!dev_maps) + goto out_no_maps; + + for_each_possible_cpu(cpu) { + for (i = index; i < dev->num_tx_queues; i++) { + if (!remove_xps_queue(dev_maps, cpu, i)) + break; + } + if (i == dev->num_tx_queues) + active = true; + } + + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + + for (i = index; i < dev->num_tx_queues; i++) + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + +out_no_maps: + mutex_unlock(&xps_map_mutex); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, + int cpu, u16 index) +{ + struct xps_map *new_map; + int alloc_len = XPS_MIN_MAP_ALLOC; + int i, pos; + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] != index) + continue; + return map; + } + + /* Need to add queue to this CPU's existing map */ + if (map) { + if (pos < map->alloc_len) + return map; + + alloc_len = map->alloc_len * 2; + } + + /* Need to allocate new map to store queue on this CPU's map */ + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + return NULL; + + for (i = 0; i < pos; i++) + new_map->queues[i] = map->queues[i]; + new_map->alloc_len = alloc_len; + new_map->len = pos; + + return new_map; +} + +int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) +{ + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_map *map, *new_map; + int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); + int cpu, numa_node_id = -2; + bool active = false; + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + /* allocate memory for queue storage */ + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + + if (!new_dev_maps) + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + + map = expand_xps_map(map, cpu, index); + if (!map) + goto error; + + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + if (!new_dev_maps) + goto out_no_new_maps; + + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { + /* add queue to CPU maps */ + int pos = 0; + + map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + while ((pos < map->len) && (map->queues[pos] != index)) + pos++; + + if (pos == map->len) + map->queues[map->len++] = index; +#ifdef CONFIG_NUMA + if (numa_node_id == -2) + numa_node_id = cpu_to_node(cpu); + else if (numa_node_id != cpu_to_node(cpu)) + numa_node_id = -1; +#endif + } else if (dev_maps) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[cpu]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + } + + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + + /* Cleanup old maps */ + if (dev_maps) { + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->cpu_map[cpu]); + if (map && map != new_map) + kfree_rcu(map, rcu); + } + + kfree_rcu(dev_maps, rcu); + } + + dev_maps = new_dev_maps; + active = true; + +out_no_new_maps: + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? numa_node_id : + NUMA_NO_NODE); + + if (!dev_maps) + goto out_no_maps; + + /* removes queue from unused CPUs */ + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) + continue; + + if (remove_xps_queue(dev_maps, cpu, index)) + active = true; + } + + /* free map if not active */ + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + +out_no_maps: + mutex_unlock(&xps_map_mutex); + + return 0; +error: + /* remove any maps that we added */ + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } + + mutex_unlock(&xps_map_mutex); + + kfree(new_dev_maps); + return -ENOMEM; +} +EXPORT_SYMBOL(netif_set_xps_queue); + +#endif /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -1880,8 +2004,12 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); - if (txq < dev->real_num_tx_queues) + if (txq < dev->real_num_tx_queues) { qdisc_reset_all_tx_gt(dev, txq); +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, txq); +#endif + } } dev->real_num_tx_queues = txq; @@ -2018,6 +2146,9 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) struct net_device *dev = skb->dev; const char *driver = ""; + if (!net_ratelimit()) + return; + if (dev && dev->dev.parent) driver = dev_driver_string(dev->dev.parent); @@ -2046,6 +2177,15 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. 
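+ * (for instance a zero-copy page queued via sendpage() that user
+ * space can still write to; the __skb_linearize() call below copies
+ * the data into private memory before the checksum is computed)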
+ */ + if (skb_has_shared_frag(skb)) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -2068,52 +2208,59 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -/** - * skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, - netdev_features_t features) +__be16 skb_network_protocol(struct sk_buff *skb) { - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; __be16 type = skb->protocol; int vlan_depth = ETH_HLEN; - int err; - while (type == htons(ETH_P_8021Q)) { + /* Tunnel gso handlers can set protocol to ethernet. */ + if (type == htons(ETH_P_TEB)) { + struct ethhdr *eth; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + return 0; + + eth = (struct ethhdr *)skb_mac_header(skb); + type = eth->h_proto; + } + + while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { struct vlan_hdr *vh; if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return ERR_PTR(-EINVAL); + return 0; vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } - skb_reset_mac_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - __skb_pull(skb, skb->mac_len); + return type; +} - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - skb_warn_bad_offload(skb); +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + __be16 type = skb_network_protocol(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) - return ERR_PTR(err); - } + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, skb->mac_len); rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + int err; + err = ptype->callbacks.gso_send_check(skb); segs = ERR_PTR(err); if (err || skb_gso_ok(skb, features)) @@ -2131,7 +2278,50 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, return segs; } -EXPORT_SYMBOL(skb_gso_segment); +EXPORT_SYMBOL(skb_mac_gso_segment); + + +/* openvswitch calls this on rx path, so we need a different check. + */ +static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL; + else + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. 
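+ *
+ * A minimal calling sketch (xmit_one() is a stand-in for the driver
+ * transmit step, not a real helper; a NULL return means the skb can
+ * be sent out unsegmented):
+ *
+ *	segs = __skb_gso_segment(skb, netif_skb_features(skb), true);
+ *	if (IS_ERR(segs))
+ *		goto drop;
+ *	while (segs) {
+ *		struct sk_buff *next = segs->next;
+ *
+ *		segs->next = NULL;
+ *		xmit_one(segs);
+ *		segs = next;
+ *	}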
+ */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + skb_warn_bad_offload(skb); + + if (skb_header_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return skb_mac_gso_segment(skb, features); +} +EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG @@ -2229,24 +2419,12 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } -static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) -{ - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); -} - static netdev_features_t harmonize_features(struct sk_buff *skb, __be16 protocol, netdev_features_t features) { if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; - features &= ~NETIF_F_SG; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2262,20 +2440,22 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q)) { + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { return harmonize_features(skb, protocol, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX); + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); - if (protocol != htons(ETH_P_8021Q)) { + if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) { return harmonize_features(skb, protocol, features); } else { features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX; + NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; return harmonize_features(skb, protocol, features); } } @@ -2287,7 +2467,7 @@ EXPORT_SYMBOL(netif_skb_features); * 2. skb is fragmented and the device does not support SG. 
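 * (in both cases dev_hard_start_xmit() falls back to
 * __skb_linearize() before handing the skb to the driver)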
*/ static inline int skb_needs_linearize(struct sk_buff *skb, - int features) + netdev_features_t features) { return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && @@ -2316,8 +2496,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, features = netif_skb_features(skb); if (vlan_tx_tag_present(skb) && - !(features & NETIF_F_HW_VLAN_TX)) { - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) goto out; @@ -2376,13 +2557,6 @@ gso: skb->next = nskb->next; nskb->next = NULL; - /* - * If device doesn't need nskb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(nskb); - if (!list_empty(&ptype_all)) dev_queue_xmit_nit(nskb, dev); @@ -2402,134 +2576,45 @@ gso: } while (skb->next); out_kfree_gso_skb: - if (likely(skb->next == NULL)) + if (likely(skb->next == NULL)) { skb->destructor = DEV_GSO_CB(skb)->destructor; + consume_skb(skb); + return rc; + } out_kfree_skb: kfree_skb(skb); out: return rc; } -static u32 hashrnd __read_mostly; - -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. - */ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, - unsigned int num_tx_queues) -{ - u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; - - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } - - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = jhash_1word(hash, hashrnd); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; -} -EXPORT_SYMBOL(__skb_tx_hash); - -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +static void qdisc_pkt_len_init(struct sk_buff *skb) { -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; + const struct skb_shared_info *shinfo = skb_shinfo(skb); - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[raw_smp_processor_id()]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->rxhash; - hash = jhash_1word(hash, hashrnd); - queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); - - return queue_index; -#else - return -1; -#endif -} - -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb) -{ - int queue_index; - const struct net_device_ops *ops = dev->netdev_ops; + qdisc_skb_cb(skb)->pkt_len = skb->len; - if (dev->real_num_tx_queues == 1) - queue_index = 0; - else if 
(ops->ndo_select_queue) { - queue_index = ops->ndo_select_queue(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); - } else { - struct sock *sk = skb->sk; - queue_index = sk_tx_queue_get(sk); + /* To get more precise estimation of bytes sent on wire, + * we add to pkt_len the headers size of all segments + */ + if (shinfo->gso_size) { + unsigned int hdr_len; + u16 gso_segs = shinfo->gso_segs; - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int old_index = queue_index; + /* mac layer + network layer */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - queue_index = get_xps_queue(dev, skb); - if (queue_index < 0) - queue_index = skb_tx_hash(dev, skb); + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + hdr_len += tcp_hdrlen(skb); + else + hdr_len += sizeof(struct udphdr); - if (queue_index != old_index && sk) { - struct dst_entry *dst = - rcu_dereference_check(sk->sk_dst_cache, 1); + if (shinfo->gso_type & SKB_GSO_DODGY) + gso_segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); - if (dst && skb_dst(skb) == dst) - sk_tx_queue_set(sk, queue_index); - } - } + qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, @@ -2540,7 +2625,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, bool contended; int rc; - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a @@ -2663,6 +2748,8 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + skb_reset_mac_header(skb); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ @@ -2757,41 +2844,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, __raise_softirq_irqoff(NET_RX_SOFTIRQ); } -/* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Sets rxhash in skb to non-zero hash value - * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb - * if hash is a canonical 4-tuple hash over transport ports. - */ -void __skb_get_rxhash(struct sk_buff *skb) -{ - struct flow_keys keys; - u32 hash; - - if (!skb_flow_dissect(skb, &keys)) - return; - - if (keys.ports) - skb->l4_rxhash = 1; - - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); - } - - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, hashrnd); - if (!hash) - hash = 1; - - skb->rxhash = hash; -} -EXPORT_SYMBOL(__skb_get_rxhash); - #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. 
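 * (this is rps_sock_flow_table: recvmsg-time code records the CPU of
 * the consuming task in it via sock_rps_record_flow(), so RFS can
 * steer later packets of the flow toward that CPU)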
*/ @@ -3277,6 +3329,7 @@ int netdev_rx_handler_register(struct net_device *dev, if (dev->rx_handler) return -EBUSY; + /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); @@ -3288,7 +3341,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register); * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * - * Unregister a receive hander from a device. + * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ @@ -3297,6 +3350,11 @@ void netdev_rx_handler_unregister(struct net_device *dev) ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); + /* a reader seeing a non NULL rx_handler in a rcu_read_lock() + * section has a guarantee to see a non NULL rx_handler_data + * as well. + */ + synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); @@ -3312,13 +3370,14 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) case __constant_htons(ETH_P_IP): case __constant_htons(ETH_P_IPV6): case __constant_htons(ETH_P_8021Q): + case __constant_htons(ETH_P_8021AD): return true; default: return false; } } -static int __netif_receive_skb(struct sk_buff *skb) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; @@ -3327,24 +3386,11 @@ static int __netif_receive_skb(struct sk_buff *skb) bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; - unsigned long pflags = current->flags; net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); - /* - * PFMEMALLOC skbs are special, they should - * - be delivered to SOCK_MEMALLOC sockets only - * - stay away from userspace - * - have bounded memory usage - * - * Use PF_MEMALLOC as this saves us from propagating the allocation - * context down to all allocation sites. 
- */ - if (sk_memalloc_socks() && skb_pfmemalloc(skb)) - current->flags |= PF_MEMALLOC; - /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) goto out; @@ -3352,7 +3398,8 @@ static int __netif_receive_skb(struct sk_buff *skb) orig_dev = skb->dev; skb_reset_network_header(skb); - skb_reset_transport_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); skb_reset_mac_len(skb); pt_prev = NULL; @@ -3364,7 +3411,8 @@ another_round: __this_cpu_inc(softnet_data.processed); - if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || + skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = vlan_untag(skb); if (unlikely(!skb)) goto unlock; @@ -3377,7 +3425,7 @@ another_round: } #endif - if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -3396,8 +3444,7 @@ skip_taps: ncls: #endif - if (sk_memalloc_socks() && skb_pfmemalloc(skb) - && !skb_pfmemalloc_protocol(skb)) + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (vlan_tx_tag_present(skb)) { @@ -3419,6 +3466,7 @@ ncls: } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: + ret = NET_RX_SUCCESS; goto unlock; case RX_HANDLER_ANOTHER: goto another_round; @@ -3467,7 +3515,31 @@ drop: unlock: rcu_read_unlock(); out: - tsk_restore_flags(current, pflags, PF_MEMALLOC); + return ret; +} + +static int __netif_receive_skb(struct sk_buff *skb) +{ + int ret; + + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { + unsigned long pflags = current->flags; + + /* + * PFMEMALLOC skbs are special, they should + * - be delivered to SOCK_MEMALLOC sockets only + * - stay away from userspace + * - have bounded memory usage + * + * Use PF_MEMALLOC as this saves us from propagating the allocation + * context down to all allocation sites. + */ + current->flags |= PF_MEMALLOC; + ret = __netif_receive_skb_core(skb, true); + tsk_restore_flags(current, pflags, PF_MEMALLOC); + } else + ret = __netif_receive_skb_core(skb, false); + return ret; } @@ -3634,7 +3706,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff __be16 type = skb->protocol; struct list_head *head = &offload_base; int same_flow; - int mac_len; enum gro_result ret; if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) @@ -3651,8 +3722,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff continue; skb_set_network_header(skb, skb_gro_offset(skb)); - mac_len = skb->network_header - skb->mac_header; - skb->mac_len = mac_len; + skb_reset_mac_len(skb); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; @@ -4010,6 +4080,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->gro_list = NULL; napi->skb = NULL; napi->poll = poll; + if (weight > NAPI_POLL_WEIGHT) + pr_err_once("netif_napi_add() called with weight %d on device %s\n", + weight, dev->name); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; @@ -4056,7 +4129,7 @@ static void net_rx_action(struct softirq_action *h) * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. 
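 * (note the switch from time_after() to time_after_eq() below: the
 * two-jiffy cap is now inclusive, so the softirq yields as soon as
 * jiffies reaches time_limit instead of one tick later)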
*/ - if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) + if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; local_irq_enable(); @@ -4134,530 +4207,231 @@ softnet_break: goto out; } -static gifconf_func_t *gifconf_list[NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} -EXPORT_SYMBOL(register_gifconf); - - -/* - * Map an interface index to its name (SIOCGIFNAME) - */ - -/* - * We need this ioctl for efficient implementation of the - * if_indextoname() function required by the IPv6 API. Without - * it, we would have to search all the interfaces to find a - * match. --pb - */ - -static int dev_ifname(struct net *net, struct ifreq __user *arg) -{ +struct netdev_upper { struct net_device *dev; - struct ifreq ifr; - unsigned seq; - - /* - * Fetch the caller's info block. - */ + bool master; + struct list_head list; + struct rcu_head rcu; + struct list_head search_list; +}; - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; +static void __append_search_uppers(struct list_head *search_list, + struct net_device *dev) +{ + struct netdev_upper *upper; -retry: - seq = read_seqcount_begin(&devnet_rename_seq); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); - if (!dev) { - rcu_read_unlock(); - return -ENODEV; + list_for_each_entry(upper, &dev->upper_dev_list, list) { + /* check if this upper is not already in search list */ + if (list_empty(&upper->search_list)) + list_add_tail(&upper->search_list, search_list); } - - strcpy(ifr.ifr_name, dev->name); - rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) - goto retry; - - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; } -/* - * Perform a SIOCGIFCONF call. This structure will change - * size eventually, and there is nothing I can do about it. - * Thus we will need a 'compatibility mode'. - */ - -static int dev_ifconf(struct net *net, char __user *arg) +static bool __netdev_search_upper_dev(struct net_device *dev, + struct net_device *upper_dev) { - struct ifconf ifc; - struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + LIST_HEAD(search_list); + struct netdev_upper *upper; + struct netdev_upper *tmp; + bool ret = false; - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; - - pos = ifc.ifc_buf; - len = ifc.ifc_len; - - /* - * Loop over the interfaces, and write an info block for each. - */ - - total = 0; - for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0); - else - done = gifconf_list[i](dev, pos + total, - len - total); - if (done < 0) - return -EFAULT; - total += done; - } + __append_search_uppers(&search_list, dev); + list_for_each_entry(upper, &search_list, search_list) { + if (upper->dev == upper_dev) { + ret = true; + break; } + __append_search_uppers(&search_list, upper->dev); } - - /* - * All done. Write the updated control block back to the caller. 
- */ - ifc.ifc_len = total; - - /* - * Both BSD and Solaris return 0 here, so we do too. - */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; + list_for_each_entry_safe(upper, tmp, &search_list, search_list) + INIT_LIST_HEAD(&upper->search_list); + return ret; } -#ifdef CONFIG_PROC_FS - -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) - -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) - -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +static struct netdev_upper *__netdev_find_upper(struct net_device *dev, + struct net_device *upper_dev) { - struct net *net = seq_file_net(seq); - struct net_device *dev; - struct hlist_node *p; - struct hlist_head *h; - unsigned int count = 0, offset = get_offset(*pos); + struct netdev_upper *upper; - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, p, h, name_hlist) { - if (++count == offset) - return dev; + list_for_each_entry(upper, &dev->upper_dev_list, list) { + if (upper->dev == upper_dev) + return upper; } - return NULL; } -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net_device *dev; - unsigned int bucket; - - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; - - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); - - return NULL; -} - -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. +/** + * netdev_has_upper_dev - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks only immediate upper device, + * not through a complete stack of devices. The caller must hold the RTNL lock. */ -void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - rcu_read_lock(); - if (!*pos) - return SEQ_START_TOKEN; - - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) - return NULL; - - return dev_from_bucket(seq, pos); -} - -void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +bool netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) { - ++*pos; - return dev_from_bucket(seq, pos); -} - -void dev_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) -{ - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + ASSERT_RTNL(); - seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " - "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); + return __netdev_find_upper(dev, upper_dev); } +EXPORT_SYMBOL(netdev_has_upper_dev); -/* - * Called from the PROCfs module. 
This now uses the new arbitrary sized - * /proc/net interface to create /proc/net/dev +/** + * netdev_has_any_upper_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to an upper device and return true in case + * it is. The caller must hold the RTNL lock. */ -static int dev_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); - else - dev_seq_printf_stats(seq, v); - return 0; -} - -static struct softnet_data *softnet_get_online(loff_t *pos) -{ - struct softnet_data *sd = NULL; - - while (*pos < nr_cpu_ids) - if (cpu_online(*pos)) { - sd = &per_cpu(softnet_data, *pos); - break; - } else - ++*pos; - return sd; -} - -static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) -{ - return softnet_get_online(pos); -} - -static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return softnet_get_online(pos); -} - -static void softnet_seq_stop(struct seq_file *seq, void *v) -{ -} - -static int softnet_seq_show(struct seq_file *seq, void *v) +bool netdev_has_any_upper_dev(struct net_device *dev) { - struct softnet_data *sd = v; + ASSERT_RTNL(); - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, - 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps); - return 0; + return !list_empty(&dev->upper_dev_list); } +EXPORT_SYMBOL(netdev_has_any_upper_dev); -static const struct seq_operations dev_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_seq_show, -}; - -static int dev_seq_open(struct inode *inode, struct file *file) +/** + * netdev_master_upper_dev_get - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RTNL lock. + */ +struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); -} + struct netdev_upper *upper; -static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, - .open = dev_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; + ASSERT_RTNL(); -static const struct seq_operations softnet_seq_ops = { - .start = softnet_seq_start, - .next = softnet_seq_next, - .stop = softnet_seq_stop, - .show = softnet_seq_show, -}; + if (list_empty(&dev->upper_dev_list)) + return NULL; -static int softnet_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &softnet_seq_ops); + upper = list_first_entry(&dev->upper_dev_list, + struct netdev_upper, list); + if (likely(upper->master)) + return upper->dev; + return NULL; } +EXPORT_SYMBOL(netdev_master_upper_dev_get); -static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, - .open = softnet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *ptype_get_idx(loff_t pos) +/** + * netdev_master_upper_dev_get_rcu - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RCU read lock. 
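+ *
+ * Typical use (sketch):
+ *
+ *	rcu_read_lock();
+ *	master = netdev_master_upper_dev_get_rcu(dev);
+ *	if (master)
+ *		do_something(master);
+ *	rcu_read_unlock();
+ *
+ * where do_something() is whatever per-master work the caller needs.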
+ */ +struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct packet_type *pt = NULL; - loff_t i = 0; - int t; - - list_for_each_entry_rcu(pt, &ptype_all, list) { - if (i == pos) - return pt; - ++i; - } + struct netdev_upper *upper; - for (t = 0; t < PTYPE_HASH_SIZE; t++) { - list_for_each_entry_rcu(pt, &ptype_base[t], list) { - if (i == pos) - return pt; - ++i; - } - } + upper = list_first_or_null_rcu(&dev->upper_dev_list, + struct netdev_upper, list); + if (upper && likely(upper->master)) + return upper->dev; return NULL; } +EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) +static int __netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, bool master) { - rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; -} + struct netdev_upper *upper; -static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct packet_type *pt; - struct list_head *nxt; - int hash; + ASSERT_RTNL(); - ++*pos; - if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); + if (dev == upper_dev) + return -EBUSY; - pt = v; - nxt = pt->list.next; - if (pt->type == htons(ETH_P_ALL)) { - if (nxt != &ptype_all) - goto found; - hash = 0; - nxt = ptype_base[0].next; - } else - hash = ntohs(pt->type) & PTYPE_HASH_MASK; + /* To prevent loops, check if dev is not upper device to upper_dev. */ + if (__netdev_search_upper_dev(upper_dev, dev)) + return -EBUSY; - while (nxt == &ptype_base[hash]) { - if (++hash >= PTYPE_HASH_SIZE) - return NULL; - nxt = ptype_base[hash].next; - } -found: - return list_entry(nxt, struct packet_type, list); -} + if (__netdev_find_upper(dev, upper_dev)) + return -EEXIST; -static void ptype_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} + if (master && netdev_master_upper_dev_get(dev)) + return -EBUSY; -static int ptype_seq_show(struct seq_file *seq, void *v) -{ - struct packet_type *pt = v; + upper = kmalloc(sizeof(*upper), GFP_KERNEL); + if (!upper) + return -ENOMEM; - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { - if (pt->type == htons(ETH_P_ALL)) - seq_puts(seq, "ALL "); - else - seq_printf(seq, "%04x", ntohs(pt->type)); + upper->dev = upper_dev; + upper->master = master; + INIT_LIST_HEAD(&upper->search_list); - seq_printf(seq, " %-8s %pF\n", - pt->dev ? pt->dev->name : "", pt->func); - } + /* Ensure that master upper link is always the first item in list. 
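+ * (netdev_master_upper_dev_get() and its _rcu variant rely on this
+ * ordering: they only ever inspect the first entry of the list)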
*/ + if (master) + list_add_rcu(&upper->list, &dev->upper_dev_list); + else + list_add_tail_rcu(&upper->list, &dev->upper_dev_list); + dev_hold(upper_dev); return 0; } -static const struct seq_operations ptype_seq_ops = { - .start = ptype_seq_start, - .next = ptype_seq_next, - .stop = ptype_seq_stop, - .show = ptype_seq_show, -}; - -static int ptype_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ptype_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, - .open = ptype_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - - -static int __net_init dev_proc_net_init(struct net *net) -{ - int rc = -ENOMEM; - - if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) - goto out; - if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) - goto out_dev; - if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) - goto out_softnet; - - if (wext_proc_init(net)) - goto out_ptype; - rc = 0; -out: - return rc; -out_ptype: - proc_net_remove(net, "ptype"); -out_softnet: - proc_net_remove(net, "softnet_stat"); -out_dev: - proc_net_remove(net, "dev"); - goto out; -} - -static void __net_exit dev_proc_net_exit(struct net *net) -{ - wext_proc_exit(net); - - proc_net_remove(net, "ptype"); - proc_net_remove(net, "softnet_stat"); - proc_net_remove(net, "dev"); -} - -static struct pernet_operations __net_initdata dev_proc_ops = { - .init = dev_proc_net_init, - .exit = dev_proc_net_exit, -}; - -static int __init dev_proc_init(void) +/** + * netdev_upper_dev_link - Add a link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. The caller must hold + * the RTNL lock. On a failure a negative errno code is returned. + * On success the reference counts are adjusted and the function + * returns zero. + */ +int netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - return register_pernet_subsys(&dev_proc_ops); + return __netdev_upper_dev_link(dev, upper_dev, false); } -#else -#define dev_proc_init() 0 -#endif /* CONFIG_PROC_FS */ - +EXPORT_SYMBOL(netdev_upper_dev_link); /** - * netdev_set_master - set up master pointer - * @slave: slave device - * @master: new master device + * netdev_master_upper_dev_link - Add a master link to the upper device + * @dev: device + * @upper_dev: new upper device * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success the reference counts - * are adjusted and the function returns zero. + * Adds a link to device which is upper to this one. In this case, only + * one master upper device can be linked, although other non-master devices + * might be linked as well. The caller must hold the RTNL lock. + * On a failure a negative errno code is returned. On success the reference + * counts are adjusted and the function returns zero. 
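+ *
+ * A bonding-style caller would use the pair roughly like this
+ * (bond_dev and slave_dev are illustrative names, not kernel
+ * symbols):
+ *
+ *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
+ *	if (err)
+ *		return err;
+ *	...
+ *	netdev_upper_dev_unlink(slave_dev, bond_dev);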
*/ -int netdev_set_master(struct net_device *slave, struct net_device *master) +int netdev_master_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - struct net_device *old = slave->master; - - ASSERT_RTNL(); - - if (master) { - if (old) - return -EBUSY; - dev_hold(master); - } - - slave->master = master; - - if (old) - dev_put(old); - return 0; + return __netdev_upper_dev_link(dev, upper_dev, true); } -EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_master_upper_dev_link); /** - * netdev_set_bond_master - set up bonding master/slave pair - * @slave: slave device - * @master: new master device + * netdev_upper_dev_unlink - Removes a link to upper device + * @dev: device + * @upper_dev: new upper device * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success %RTM_NEWLINK is sent - * to the routing socket and the function returns zero. + * Removes a link to device which is upper to this one. The caller must hold + * the RTNL lock. */ -int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +void netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) { - int err; + struct netdev_upper *upper; ASSERT_RTNL(); - err = netdev_set_master(slave, master); - if (err) - return err; - if (master) - slave->flags |= IFF_SLAVE; - else - slave->flags &= ~IFF_SLAVE; - - rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); - return 0; + upper = __netdev_find_upper(dev, upper_dev); + if (!upper) + return; + list_del_rcu(&upper->list); + dev_put(upper_dev); + kfree_rcu(upper, rcu); } -EXPORT_SYMBOL(netdev_set_bond_master); +EXPORT_SYMBOL(netdev_upper_dev_unlink); static void dev_change_rx_flags(struct net_device *dev, int flags) { @@ -5020,381 +4794,33 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) if (!netif_device_present(dev)) return -ENODEV; err = ops->ndo_set_mac_address(dev, sa); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + if (err) + return err; + dev->addr_assign_type = NET_ADDR_SET; + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); - return err; + return 0; } EXPORT_SYMBOL(dev_set_mac_address); -/* - * Perform the SIOCxIFxxx calls, inside rcu_read_lock() - */ -static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ - int err; - struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); - - if (!dev) - return -ENODEV; - - switch (cmd) { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); - return 0; - - case SIOCGIFMETRIC: /* Get the metric on the interface - (currently unused) */ - ifr->ifr_metric = 0; - return 0; - - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; - - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - - case SIOCGIFSLAVE: - err = -EINVAL; - break; - - case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; - - case SIOCGIFINDEX: - ifr->ifr_ifindex 
= dev->ifindex; - return 0; - - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; - - default: - /* dev_ioctl() should ensure this case - * is never reached - */ - WARN_ON(1); - err = -ENOTTY; - break; - - } - return err; -} - -/* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() - */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ - int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - const struct net_device_ops *ops; - - if (!dev) - return -ENODEV; - - ops = dev->netdev_ops; - - switch (cmd) { - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); - - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ - return -EOPNOTSUPP; - - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); - - case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return 0; - - case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; - - case SIOCADDMULTI: - if (!ops->ndo_set_rx_mode || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); - - case SIOCDELMULTI: - if (!ops->ndo_set_rx_mode || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); - - case SIOCSIFTXQLEN: - if (ifr->ifr_qlen < 0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; - - case SIOCSIFNAME: - ifr->ifr_newname[IFNAMSIZ-1] = '\0'; - return dev_change_name(dev, ifr->ifr_newname); - - case SIOCSHWTSTAMP: - err = net_hwtstamp_validate(ifr); - if (err) - return err; - /* fall through */ - - /* - * Unknown or private ioctl - */ - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; - - } - return err; -} - -/* - * This function handles all "interface"-type I/O control requests. The actual - * 'doing' part of this is dev_ifsioc above. - */ - /** - * dev_ioctl - network device ioctl - * @net: the applicable net namespace - * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space + * dev_change_carrier - Change device carrier + * @dev: device + * @new_carrier: new value * - * Issue ioctl functions to devices. This is normally called by the - * user space syscall interfaces but can sometimes be useful for - * other purposes. The return value is the return from the syscall if - * positive or a negative errno code on error. 
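
With netdev_set_master() and netdev_set_bond_master() gone, the IFF_SLAVE bookkeeping and the RTM_NEWLINK notification move into the individual drivers. A minimal sketch of the new idiom for a bonding-style driver (hypothetical code; error handling abbreviated, and the rtmsg_ifinfo() signature assumed is this tree's three-argument form):

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical enslave path: the link is refcounted and tracked on
 * the slave's upper_dev_list instead of a single ->master pointer. */
static int example_enslave(struct net_device *master,
			   struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;

	slave->flags |= IFF_SLAVE;
	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
	return 0;
}

/* Matching release path. */
static void example_release(struct net_device *master,
			    struct net_device *slave)
{
	ASSERT_RTNL();

	netdev_upper_dev_unlink(slave, master);
	slave->flags &= ~IFF_SLAVE;
	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
}
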
+ * Change device carrier */ - -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_change_carrier(struct net_device *dev, bool new_carrier) { - struct ifreq ifr; - int ret; - char *colon; - - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } - if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; - - colon = strchr(ifr.ifr_name, ':'); - if (colon) - *colon = 0; - - /* - * See which interface the caller is talking about. - */ - - switch (cmd) { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. - * - return a value - */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); - rcu_read_lock(); - ret = dev_ifsioc_locked(net, &ifr, cmd); - rcu_read_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(net, &ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - return a value - */ - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSIFNAME: - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFMAP: - case SIOCSIFTXQLEN: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* fall through */ - /* - * These ioctl calls: - * - require local superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSMIIREG: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: - case SIOCSHWTSTAMP: - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - /* fall through */ - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - return ret; - - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but - * currently do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. - * Not applicable in our case */ - case SIOCSIFLINK: - return -ENOTTY; + const struct net_device_ops *ops = dev->netdev_ops; - /* - * Unknown or private ioctl. 
- */ - default: - if (cmd == SIOCWANDEV || - (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(net, &ifr, cmd, arg); - return -ENOTTY; - } - + if (!ops->ndo_change_carrier) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_carrier(dev, new_carrier); } - +EXPORT_SYMBOL(dev_change_carrier); /** * dev_new_index - allocate an ifindex @@ -5482,11 +4908,15 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); + /* Notifier chain MUST detach us from all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif } synchronize_net(); @@ -5514,20 +4944,25 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } - /* Fix illegal SG+CSUM combinations. */ - if ((features & NETIF_F_SG) && - !(features & NETIF_F_ALL_CSUM)) { - netdev_dbg(dev, - "Dropping NETIF_F_SG since no checksum feature.\n"); - features &= ~NETIF_F_SG; - } - /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } + if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IP_CSUM)) { + netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO; + features &= ~NETIF_F_TSO_ECN; + } + + if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IPV6_CSUM)) { + netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO6; + } + /* TSO ECN requires that TSO is present as well. 
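
dev_change_carrier() above is deliberately thin: it validates the device and forwards to the driver. A software device that wants its carrier settable this way needs little more than the following hook (hypothetical driver; the dummy driver in this cycle takes the same shape):

#include <linux/netdevice.h>

static int example_ndo_change_carrier(struct net_device *dev,
				      bool new_carrier)
{
	/* Reflect the requested state through the usual carrier helpers;
	 * link watch and operstate handling then follow automatically. */
	if (new_carrier)
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);
	return 0;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_change_carrier	= example_ndo_change_carrier,
};
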
*/ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; @@ -5664,10 +5099,9 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - pr_err("netdev: Unable to allocate %u rx queues\n", count); + if (!rx) return -ENOMEM; - } + dev->_rx = rx; for (i = 0; i < count; i++) @@ -5698,10 +5132,9 @@ static int netif_alloc_netdev_queues(struct net_device *dev) BUG_ON(count < 1); tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) { - pr_err("netdev: Unable to allocate %u tx queues\n", count); + if (!tx) return -ENOMEM; - } + dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -5760,6 +5193,15 @@ int register_netdevice(struct net_device *dev) } } + if (((dev->hw_features | dev->features) & + NETIF_F_HW_VLAN_CTAG_FILTER) && + (!dev->netdev_ops->ndo_vlan_rx_add_vid || + !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { + netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); + ret = -EINVAL; + goto err_uninit; + } + ret = -EBUSY; if (!dev->ifindex) dev->ifindex = dev_new_index(net); @@ -5789,6 +5231,10 @@ int register_netdevice(struct net_device *dev) */ dev->vlan_features |= NETIF_F_HIGHDMA; + /* Make NETIF_F_SG inheritable to tunnel devices. + */ + dev->hw_enc_features |= NETIF_F_SG; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) @@ -5815,6 +5261,13 @@ int register_netdevice(struct net_device *dev) list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); + /* If the device has permanent device address, driver should + * set dev_addr and also addr_assign_type should be set to + * NET_ADDR_PERM (default value). + */ + if (dev->addr_assign_type == NET_ADDR_PERM) + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); + /* Notify protocols, that a new device appeared. 
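
The register_netdevice() check added above turns an implicit convention into a hard rule: advertising NETIF_F_HW_VLAN_CTAG_FILTER without both VLAN filter callbacks now fails registration with -EINVAL. A sketch of the driver-side contract, assuming the protocol-aware ndo signatures from this same series (hypothetical driver):

#include <linux/netdevice.h>

static int example_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	/* program @vid into the hardware VLAN filter table here */
	return 0;
}

static int example_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	/* remove @vid from the hardware VLAN filter table here */
	return 0;
}

static const struct net_device_ops example_vlan_netdev_ops = {
	.ndo_vlan_rx_add_vid	= example_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= example_vlan_rx_kill_vid,
};

/* Only with both callbacks in place is this safe in the probe path:
 *	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 */
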
*/ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); @@ -6173,10 +5626,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, alloc_size += NETDEV_ALIGN - 1; p = kzalloc(alloc_size, GFP_KERNEL); - if (!p) { - pr_err("alloc_netdev: Unable to allocate device\n"); + if (!p) return NULL; - } dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; @@ -6199,6 +5650,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); + INIT_LIST_HEAD(&dev->upper_dev_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); @@ -6842,19 +6294,9 @@ static int __init net_dev_init(void) hotcpu_notifier(dev_cpu_callback, 0); dst_init(); - dev_mcast_init(); rc = 0; out: return rc; } subsys_initcall(net_dev_init); - -static int __init initialize_hashrnd(void) -{ - get_random_bytes(&hashrnd, sizeof(hashrnd)); - return 0; -} - -late_initcall_sync(initialize_hashrnd); - diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index b079c7bbc157..c013f38482a1 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -15,7 +15,6 @@ #include <linux/rtnetlink.h> #include <linux/export.h> #include <linux/list.h> -#include <linux/proc_fs.h> /* * General list handling functions @@ -23,7 +22,8 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global) + unsigned char addr_type, bool global, + bool sync) { struct netdev_hw_addr *ha; int alloc_size; @@ -38,7 +38,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, ha->type = addr_type; ha->refcount = 1; ha->global_use = global; - ha->synced = false; + ha->synced = sync; list_add_tail_rcu(&ha->list, &list->list); list->count++; @@ -47,7 +47,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global) + unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; @@ -64,43 +64,62 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, else ha->global_use = true; } + if (sync) { + if (ha->synced) + return 0; + else + ha->synced = true; + } ha->refcount++; return 0; } } - return __hw_addr_create_ex(list, addr, addr_len, addr_type, global); + return __hw_addr_create_ex(list, addr, addr_len, addr_type, global, + sync); } static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false); +} + +static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, + struct netdev_hw_addr *ha, bool global, + bool sync) +{ + if (global && !ha->global_use) + return -ENOENT; + + if (sync && !ha->synced) + return -ENOENT; + + if (global) + ha->global_use = false; + + if (sync) + ha->synced = false; + + if (--ha->refcount) + return 0; + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + return 0; } static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global) + unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; list_for_each_entry(ha, &list->list, list) { if (!memcmp(ha->addr, 
 addr, addr_len) && - (ha->type == addr_type || !addr_type)) { - if (global) { - if (!ha->global_use) - break; - else - ha->global_use = false; - } - if (--ha->refcount) - return 0; - list_del_rcu(&ha->list); - kfree_rcu(ha, rcu_head); - list->count--; - return 0; - } + (ha->type == addr_type || !addr_type)) + return __hw_addr_del_entry(list, ha, global, sync); } return -ENOENT; } @@ -109,7 +128,57 @@ static int __hw_addr_del(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); + return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false); +} + +static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr *ha, + int addr_len) +{ + int err; + + err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, + false, true); + if (err) + return err; + ha->sync_cnt++; + ha->refcount++; + + return 0; +} + +static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + struct netdev_hw_addr *ha, + int addr_len) +{ + int err; + + err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, + false, true); + if (err) + return; + ha->sync_cnt--; + __hw_addr_del_entry(from_list, ha, false, true); +} + +static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->sync_cnt == ha->refcount) { + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); + } else { + err = __hw_addr_sync_one(to_list, ha, addr_len); + if (err) + break; + } + } + return err; } int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, @@ -153,6 +222,11 @@ void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, } EXPORT_SYMBOL(__hw_addr_del_multiple); +/* This function only works where there is a strict 1-1 relationship + * between the source and destination of the sync. If you ever need to + * sync addresses to more than one destination, you need to use + * __hw_addr_sync_multiple(). 
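
The strict 1-1 case that the comment above describes is the classic VLAN-style pairing: one upper device syncing its lists into exactly one lower device from ndo_set_rx_mode. A hypothetical sketch of that caller:

#include <linux/netdevice.h>

struct example_priv {
	struct net_device *real_dev;	/* the single lower device */
};

static void example_set_rx_mode(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* 1-1: @dev is only ever synced into priv->real_dev. */
	dev_uc_sync(priv->real_dev, dev);
	dev_mc_sync(priv->real_dev, dev);
}
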
+ */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) @@ -161,17 +235,12 @@ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (!ha->synced) { - err = __hw_addr_add(to_list, ha->addr, - addr_len, ha->type); + if (!ha->sync_cnt) { + err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; - ha->synced = true; - ha->refcount++; - } else if (ha->refcount == 1) { - __hw_addr_del(to_list, ha->addr, addr_len, ha->type); - __hw_addr_del(from_list, ha->addr, addr_len, ha->type); - } + } else if (ha->refcount == 1) + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } return err; } @@ -184,13 +253,8 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (ha->synced) { - __hw_addr_del(to_list, ha->addr, - addr_len, ha->type); - ha->synced = false; - __hw_addr_del(from_list, ha->addr, - addr_len, ha->type); - } + if (ha->sync_cnt) + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } } EXPORT_SYMBOL(__hw_addr_unsync); @@ -407,7 +471,7 @@ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) } } err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST, true); + NETDEV_HW_ADDR_T_UNICAST, true, false); if (!err) __dev_set_rx_mode(dev); out: @@ -470,7 +534,8 @@ EXPORT_SYMBOL(dev_uc_del); * locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode - * function of layered software devices. + * function of layered software devices. This function assumes that + * addresses will only ever be synced to the @to devices and no other. */ int dev_uc_sync(struct net_device *to, struct net_device *from) { @@ -489,6 +554,36 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) EXPORT_SYMBOL(dev_uc_sync); /** + * dev_uc_sync_multiple - Synchronize device's unicast list to another + * device, but allow for multiple calls to sync to multiple devices. + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have been deleted from the source. The source device + * must be locked by netif_addr_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. It allows for a single source + * device to be synced to multiple destination devices. 
+ */ +int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_nested(to); + err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync_multiple); + +/** * dev_uc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device @@ -560,7 +655,7 @@ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) } } err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, true); + NETDEV_HW_ADDR_T_MULTICAST, true, false); if (!err) __dev_set_rx_mode(dev); out: @@ -576,7 +671,7 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global); + NETDEV_HW_ADDR_T_MULTICAST, global, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); @@ -616,7 +711,7 @@ static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global); + NETDEV_HW_ADDR_T_MULTICAST, global, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); @@ -680,6 +775,36 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) EXPORT_SYMBOL(dev_mc_sync); /** + * dev_mc_sync_multiple - Synchronize device's multicast list to another + * device, but allow for multiple calls to sync to multiple devices. + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the ndo_set_rx_mode + * function of layered software devices. It allows for a single + * source device to be synced to multiple destination devices. 
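
Both *_sync_multiple() variants (the unicast one above, the multicast one just below) exist for the opposite pattern: a single master pushing its lists to several lower ports, as a team- or bridge-style driver does from its ndo_set_rx_mode. A hypothetical sketch:

#include <linux/netdevice.h>
#include <linux/rculist.h>

struct example_port {			/* invented per-port bookkeeping */
	struct net_device	*dev;
	struct list_head	list;
};

struct example_master {
	struct list_head	port_list;
};

static void example_master_set_rx_mode(struct net_device *master)
{
	struct example_master *em = netdev_priv(master);
	struct example_port *port;

	/* One source list, many destinations: the case plain
	 * dev_uc_sync()/dev_mc_sync() cannot track. */
	rcu_read_lock();
	list_for_each_entry_rcu(port, &em->port_list, list) {
		dev_uc_sync_multiple(port->dev, master);
		dev_mc_sync_multiple(port->dev, master);
	}
	rcu_read_unlock();
}
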
+ */ +int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_nested(to); + err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync_multiple); + +/** * dev_mc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device @@ -727,76 +852,3 @@ void dev_mc_init(struct net_device *dev) __hw_addr_init(&dev->mc); } EXPORT_SYMBOL(dev_mc_init); - -#ifdef CONFIG_PROC_FS -#include <linux/seq_file.h> - -static int dev_mc_seq_show(struct seq_file *seq, void *v) -{ - struct netdev_hw_addr *ha; - struct net_device *dev = v; - - if (v == SEQ_START_TOKEN) - return 0; - - netif_addr_lock_bh(dev); - netdev_for_each_mc_addr(ha, dev) { - int i; - - seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, - dev->name, ha->refcount, ha->global_use); - - for (i = 0; i < dev->addr_len; i++) - seq_printf(seq, "%02x", ha->addr[i]); - - seq_putc(seq, '\n'); - } - netif_addr_unlock_bh(dev); - return 0; -} - -static const struct seq_operations dev_mc_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_mc_seq_show, -}; - -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, - .open = dev_mc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -#endif - -static int __net_init dev_mc_net_init(struct net *net) -{ - if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) - return -ENOMEM; - return 0; -} - -static void __net_exit dev_mc_net_exit(struct net *net) -{ - proc_net_remove(net, "dev_mcast"); -} - -static struct pernet_operations __net_initdata dev_mc_net_ops = { - .init = dev_mc_net_init, - .exit = dev_mc_net_exit, -}; - -void __init dev_mcast_init(void) -{ - register_pernet_subsys(&dev_mc_net_ops); -} - diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c new file mode 100644 index 000000000000..6cc0481faade --- /dev/null +++ b/net/core/dev_ioctl.c @@ -0,0 +1,576 @@ +#include <linux/kmod.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <linux/net_tstamp.h> +#include <linux/wireless.h> +#include <net/wext.h> + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct net *net, struct ifreq __user *arg) +{ + struct net_device *dev; + struct ifreq ifr; + unsigned seq; + + /* + * Fetch the caller's info block. 
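
For reference, the user-space side of the SIOCGIFNAME path being implemented here is essentially what if_indextoname(3) boils down to. A stand-alone example:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_ifindex = 1;		/* loopback is index 1 on most systems */
	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
		printf("ifindex 1 is %s\n", ifr.ifr_name);

	close(fd);
	return 0;
}
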
+ */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + +retry: + seq = read_seqcount_begin(&devnet_rename_seq); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + + strcpy(ifr.ifr_name, dev->name); + rcu_read_unlock(); + if (read_seqcount_retry(&devnet_rename_seq, seq)) + goto retry; + + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +static gifconf_func_t *gifconf_list[NPROTO]; + +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +{ + if (family >= NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} +EXPORT_SYMBOL(register_gifconf); + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(struct net *net, char __user *arg) +{ + struct ifconf ifc; + struct net_device *dev; + char __user *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for_each_netdev(net, dev) { + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (!pos) + done = gifconf_list[i](dev, NULL, 0); + else + done = gifconf_list[i](dev, pos + total, + len - total); + if (done < 0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? 
-EFAULT : 0; +} + +/* + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (short) dev_get_flags(dev); + return 0; + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; + + case SIOCGIFSLAVE: + err = -EINVAL; + break; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -ENOTTY; + break; + + } + return err; +} + +static int net_hwtstamp_validate(struct ifreq *ifr) +{ + struct hwtstamp_config cfg; + enum hwtstamp_tx_types tx_type; + enum hwtstamp_rx_filters rx_filter; + int tx_type_valid = 0; + int rx_filter_valid = 0; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + if (cfg.flags) /* reserved for future extensions */ + return -EINVAL; + + tx_type = cfg.tx_type; + rx_filter = cfg.rx_filter; + + switch (tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + case HWTSTAMP_TX_ONESTEP_SYNC: + tx_type_valid = 1; + break; + } + + switch (rx_filter) { + case HWTSTAMP_FILTER_NONE: + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + rx_filter_valid = 1; + break; + } + + if (!tx_type_valid || !rx_filter_valid) + return -ERANGE; + + return 0; +} + +/* + * Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + const struct net_device_ops *ops; + + if (!dev) + return -ENODEV; + + ops = dev->netdev_ops; + + switch (cmd) { + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return 
-EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCSIFMAP: + if (ops->ndo_set_config) { + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCDELMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + case SIOCSHWTSTAMP: + err = net_hwtstamp_validate(ifr); + if (err) + return err; + /* fall through */ + + /* + * Unknown or private ioctl + */ + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCSHWTSTAMP || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; + + } + return err; +} + +/** + * dev_load - load a network module + * @net: the applicable net namespace + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(struct net *net, const char *name) +{ + struct net_device *dev; + int no_module; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); + + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", + name); + } +} +EXPORT_SYMBOL(dev_load); + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @net: the applicable net namespace + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. 
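
From user space, the SIOCGIFCONF special case handled first in this function is the traditional way to enumerate configured interfaces (note it only reports interfaces with an IPv4 address). A stand-alone example with a fixed-size buffer:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq reqs[16];	/* fixed buffer; real code sizes this dynamically */
	struct ifconf ifc;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int i, n;

	if (fd < 0)
		return 1;

	memset(&ifc, 0, sizeof(ifc));
	ifc.ifc_len = sizeof(reqs);
	ifc.ifc_req = reqs;
	if (ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
		n = ifc.ifc_len / sizeof(struct ifreq);
		for (i = 0; i < n; i++)
			printf("%s\n", reqs[i].ifr_name);
	}

	close(fd);
	return 0;
}
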
+ */ + + if (cmd == SIOCGIFCONF) { + rtnl_lock(); + ret = dev_ifconf(net, (char __user *) arg); + rtnl_unlock(); + return ret; + } + if (cmd == SIOCGIFNAME) + return dev_ifname(net, (struct ifreq __user *)arg); + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(net, ifr.ifr_name); + rcu_read_lock(); + ret = dev_ifsioc_locked(net, &ifr, cmd); + rcu_read_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + case SIOCETHTOOL: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(net, &ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFMAP: + case SIOCSIFTXQLEN: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + /* + * These ioctl calls: + * - require local superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + case SIOCSHWTSTAMP: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -ENOTTY; + + /* + * Unknown or private ioctl. 
+ */ + default: + if (cmd == SIOCWANDEV || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) + return wext_handle_ioctl(net, &ifr, cmd, arg); + return -ENOTTY; + } +} diff --git a/net/core/dst.c b/net/core/dst.c index ee6153e2cf43..df9cc810ec8e 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -179,6 +179,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; + dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -319,27 +320,28 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) EXPORT_SYMBOL(__dst_destroy_metrics_generic); /** - * skb_dst_set_noref - sets skb dst, without a reference + * __skb_dst_set_noref - sets skb dst, without a reference * @skb: buffer * @dst: dst entry + * @force: if force is set, use noref version even for DST_NOCACHE entries * * Sets skb dst, assuming a reference was not taken on dst * skb_dst_drop() should not dst_release() this dst */ -void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - if (unlikely(dst->flags & DST_NOCACHE)) { + if (unlikely((dst->flags & DST_NOCACHE) && !force)) { dst_hold(dst); skb_dst_set(skb, dst); } else { skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } } -EXPORT_SYMBOL(skb_dst_set_noref); +EXPORT_SYMBOL(__skb_dst_set_noref); /* Dirty hack. 
We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a8705432e4b1..22efdaa76ebf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -60,10 +60,13 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", [NETIF_F_HIGHDMA_BIT] = "highdma", [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", - [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert", + [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-ctag-hw-insert", - [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse", - [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-ctag-hw-parse", + [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-ctag-filter", + [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", + [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", + [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", [NETIF_F_GSO_BIT] = "tx-generic-segmentation", [NETIF_F_LLTX_BIT] = "tx-lockless", @@ -77,6 +80,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", @@ -175,7 +180,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); - if (ops && ops->get_sset_count && ops->get_strings) + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else return -EOPNOTSUPP; @@ -265,18 +270,19 @@ static int ethtool_set_one_feature(struct net_device *dev, #define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) -#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \ - NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ + NETIF_F_RXHASH) static u32 __ethtool_get_flags(struct net_device *dev) { u32 flags = 0; - if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; - if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN; - if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN; - if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; - if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; + if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; return flags; } @@ -289,8 +295,8 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return -EINVAL; if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; - if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX; - if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX; + if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX; if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; if (data & ETH_FLAG_RXHASH) 
features |= NETIF_F_RXHASH; @@ -311,7 +317,7 @@ int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { ASSERT_RTNL(); - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) + if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; memset(cmd, 0, sizeof(struct ethtool_cmd)); @@ -355,7 +361,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; - if (ops && ops->get_drvinfo) { + if (ops->get_drvinfo) { ops->get_drvinfo(dev, &info); } else if (dev->dev.parent && dev->dev.parent->driver) { strlcpy(info.bus_info, dev_name(dev->dev.parent), @@ -370,7 +376,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. */ - if (ops && ops->get_sset_count) { + if (ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); @@ -383,9 +389,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops && ops->get_regs_len) + if (ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); - if (ops && ops->get_eeprom_len) + if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); if (copy_to_user(useraddr, &info, sizeof(info))) @@ -590,13 +596,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, struct ethtool_rxnfc rx_rings; u32 user_size, dev_size, i; u32 *indir; + const struct ethtool_ops *ops = dev->ethtool_ops; int ret; - if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->set_rxfh_indir || - !dev->ethtool_ops->get_rxnfc) + if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir || + !ops->get_rxnfc) return -EOPNOTSUPP; - dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + + dev_size = ops->get_rxfh_indir_size(dev); if (dev_size == 0) return -EOPNOTSUPP; @@ -613,7 +620,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL); + ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; @@ -639,7 +646,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } } - ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); + ret = ops->set_rxfh_indir(dev, indir); out: kfree(indir); @@ -1082,9 +1089,10 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; static bool busy; + const struct ethtool_ops *ops = dev->ethtool_ops; int rc; - if (!dev->ethtool_ops->set_phys_id) + if (!ops->set_phys_id) return -EOPNOTSUPP; if (busy) @@ -1093,7 +1101,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; - rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); + rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); if (rc < 0) return rc; @@ -1118,7 +1126,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) i = n; do { rtnl_lock(); - rc = dev->ethtool_ops->set_phys_id(dev, + rc = ops->set_phys_id(dev, (i & 1) ? 
ETHTOOL_ID_OFF : ETHTOOL_ID_ON); rtnl_unlock(); if (rc) @@ -1133,7 +1141,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) dev_put(dev); busy = false; - (void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); + (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); return rc; } @@ -1275,7 +1283,7 @@ static int ethtool_get_dump_flag(struct net_device *dev, struct ethtool_dump dump; const struct ethtool_ops *ops = dev->ethtool_ops; - if (!dev->ethtool_ops->get_dump_flag) + if (!ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) @@ -1299,8 +1307,7 @@ static int ethtool_get_dump_data(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; void *data = NULL; - if (!dev->ethtool_ops->get_dump_data || - !dev->ethtool_ops->get_dump_flag) + if (!ops->get_dump_data || !ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) @@ -1346,13 +1353,9 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) info.cmd = ETHTOOL_GET_TS_INFO; if (phydev && phydev->drv && phydev->drv->ts_info) { - err = phydev->drv->ts_info(phydev, &info); - - } else if (dev->ethtool_ops && dev->ethtool_ops->get_ts_info) { - + } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, &info); - } else { info.so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | @@ -1418,7 +1421,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) void __user *useraddr = ifr->ifr_data; u32 ethcmd; int rc; - u32 old_features; + netdev_features_t old_features; if (!dev || !netif_device_present(dev)) return -ENODEV; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 58a4ba27dfe3..d5a9f8ead0d8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -266,7 +266,7 @@ errout: return err; } -static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -415,7 +415,7 @@ errout: return err; } -static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); diff --git a/net/core/filter.c b/net/core/filter.c index c23543cba132..dad2a178f9f8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -348,6 +348,9 @@ load_b: case BPF_S_ANC_VLAN_TAG_PRESENT: A = !!vlan_tx_tag_present(skb); continue; + case BPF_S_ANC_PAY_OFFSET: + A = __skb_get_poff(skb); + continue; case BPF_S_ANC_NLATTR: { struct nlattr *nla; @@ -532,6 +535,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, }; int pc; + bool anc_found; if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; @@ -592,8 +596,10 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_LD_W_ABS: case BPF_S_LD_H_ABS: case BPF_S_LD_B_ABS: + anc_found = false; #define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ code = BPF_S_ANC_##CODE; \ + anc_found = true; \ break switch (ftest->k) { ANCILLARY(PROTOCOL); @@ -609,7 +615,12 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) ANCILLARY(ALU_XOR_X); ANCILLARY(VLAN_TAG); ANCILLARY(VLAN_TAG_PRESENT); + ANCILLARY(PAY_OFFSET); } + + /* ancillary operation unknown or unsupported */ + if (anc_found == false && ftest->k >= SKF_AD_OFF) + return -EINVAL; } ftest->code = code; 
} @@ -714,6 +725,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) unsigned int fsize = sizeof(struct sock_filter) * fprog->len; int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) return -EINVAL; @@ -750,6 +764,9 @@ int sk_detach_filter(struct sock *sk) int ret = -ENOENT; struct sk_filter *filter; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + filter = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); if (filter) { @@ -801,6 +818,7 @@ static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, diff --git a/net/core/flow.c b/net/core/flow.c index b0901ee5a002..7102f166482d 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -132,14 +132,14 @@ static void __flow_cache_shrink(struct flow_cache *fc, int shrink_to) { struct flow_cache_entry *fle; - struct hlist_node *entry, *tmp; + struct hlist_node *tmp; LIST_HEAD(gc_list); int i, deleted = 0; for (i = 0; i < flow_cache_hash_size(fc); i++) { int saved = 0; - hlist_for_each_entry_safe(fle, entry, tmp, + hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { if (saved < shrink_to && flow_entry_valid(fle)) { @@ -211,7 +211,6 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, struct flow_cache *fc = &flow_cache_global; struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, *tfle; - struct hlist_node *entry; struct flow_cache_object *flo; size_t keysize; unsigned int hash; @@ -235,7 +234,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, flow_new_hash_rnd(fc, fcp); hash = flow_hash_code(fc, fcp, key, keysize); - hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { + hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) { if (tfle->net == net && tfle->family == family && tfle->dir == dir && @@ -286,7 +285,7 @@ nocache: else fle->genid--; } else { - if (flo && !IS_ERR(flo)) + if (!IS_ERR_OR_NULL(flo)) flo->ops->delete(flo); } ret_object: @@ -301,13 +300,13 @@ static void flow_cache_flush_tasklet(unsigned long data) struct flow_cache *fc = info->cache; struct flow_cache_percpu *fcp; struct flow_cache_entry *fle; - struct hlist_node *entry, *tmp; + struct hlist_node *tmp; LIST_HEAD(gc_list); int i, deleted = 0; fcp = this_cpu_ptr(fc->percpu); for (i = 0; i < flow_cache_hash_size(fc); i++) { - hlist_for_each_entry_safe(fle, entry, tmp, + hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { if (flow_entry_valid(fle)) continue; @@ -324,12 +323,30 @@ static void flow_cache_flush_tasklet(unsigned long data) complete(&info->completion); } +/* + * Return whether a cpu needs flushing. Conservatively, we assume + * the presence of any entries means the core may require flushing, + * since the flow_cache_ops.check() function may assume it's running + * on the same core as the per-cpu cache component. 
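
Looping back to the filter.c changes above: with the new PAY_OFFSET ancillary, a classic BPF program can ask the flow dissector for the payload offset and truncate a capture to headers only. A minimal user-space sketch (requires CAP_NET_RAW; error handling abbreviated):

#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/filter.h>
#include <linux/if_ether.h>

int main(void)
{
	struct sock_filter insns[] = {
		/* A = payload offset, via the SKF_AD_PAY_OFFSET ancillary */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 SKF_AD_OFF + SKF_AD_PAY_OFFSET),
		/* accept exactly that many bytes: headers only */
		BPF_STMT(BPF_RET | BPF_A, 0),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
				 &prog, sizeof(prog)))
		perror("attach");
	return 0;
}
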
+ */ +static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) +{ + struct flow_cache_percpu *fcp; + int i; + + fcp = per_cpu_ptr(fc->percpu, cpu); + for (i = 0; i < flow_cache_hash_size(fc); i++) + if (!hlist_empty(&fcp->hash_table[i])) + return 0; + return 1; +} + static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; struct tasklet_struct *tasklet; - tasklet = this_cpu_ptr(&info->cache->percpu->flush_tasklet); + tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet; tasklet->data = (unsigned long)info; tasklet_schedule(tasklet); } @@ -338,22 +355,40 @@ void flow_cache_flush(void) { struct flow_flush_info info; static DEFINE_MUTEX(flow_flush_sem); + cpumask_var_t mask; + int i, self; + + /* Track which cpus need flushing to avoid disturbing all cores. */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return; + cpumask_clear(mask); /* Don't want cpus going down or up during this. */ get_online_cpus(); mutex_lock(&flow_flush_sem); info.cache = &flow_cache_global; - atomic_set(&info.cpuleft, num_online_cpus()); + for_each_online_cpu(i) + if (!flow_cache_percpu_empty(info.cache, i)) + cpumask_set_cpu(i, mask); + atomic_set(&info.cpuleft, cpumask_weight(mask)); + if (atomic_read(&info.cpuleft) == 0) + goto done; + init_completion(&info.completion); local_bh_disable(); - smp_call_function(flow_cache_flush_per_cpu, &info, 0); - flow_cache_flush_tasklet((unsigned long)&info); + self = cpumask_test_and_clear_cpu(smp_processor_id(), mask); + on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0); + if (self) + flow_cache_flush_tasklet((unsigned long)&info); local_bh_enable(); wait_for_completion(&info.completion); + +done: mutex_unlock(&flow_flush_sem); put_online_cpus(); + free_cpumask_var(mask); } static void flow_cache_flush_task(struct work_struct *work) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 466820b6e344..00ee068efc1c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,10 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/ipv6.h> +#include <linux/igmp.h> +#include <linux/icmp.h> +#include <linux/sctp.h> +#include <linux/dccp.h> #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> @@ -119,6 +123,17 @@ ipv6: nhoff += 4; if (hdr->flags & GRE_SEQ) nhoff += 4; + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = skb_header_pointer(skb, nhoff, + sizeof(_eth), &_eth); + if (!eth) + return false; + proto = eth->h_proto; + nhoff += sizeof(*eth); + } goto again; } break; @@ -140,6 +155,234 @@ ipv6: flow->ports = *ports; } + flow->thoff = (u16) nhoff; + return true; } EXPORT_SYMBOL(skb_flow_dissect); + +static u32 hashrnd __read_mostly; + +/* + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Sets rxhash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb + * if hash is a canonical 4-tuple hash over transport ports. 
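
Two pieces of arithmetic in the hunk below are worth isolating: the source/destination normalization that makes the rxhash direction-insensitive, and the multiply-shift that __skb_tx_hash() uses to map a hash onto a queue range without a modulo. A stand-alone illustration (mix32() is only a placeholder for the kernel's keyed jhash):

#include <stdint.h>
#include <stdio.h>

static uint32_t mix32(uint32_t a, uint32_t b, uint32_t c)
{
	uint32_t h = a * 0x9e3779b1u ^ b * 0x85ebca6bu ^ c * 0xc2b2ae35u;

	h ^= h >> 16;
	return h ? h : 1;	/* 0 is reserved for "no valid hash" */
}

/* Normalize flow direction before hashing, as __skb_get_rxhash() does. */
static uint32_t flow_hash(uint32_t src, uint32_t dst,
			  uint16_t sport, uint16_t dport)
{
	uint32_t t32;
	uint16_t t16;

	if (dst < src || (dst == src && dport < sport)) {
		t32 = src; src = dst; dst = t32;
		t16 = sport; sport = dport; dport = t16;
	}
	return mix32(dst, src, ((uint32_t)sport << 16) | dport);
}

/* The __skb_tx_hash() queue mapping: uniform on [0, range), no modulo. */
static uint16_t scale(uint32_t hash, uint16_t range)
{
	return (uint16_t)(((uint64_t)hash * range) >> 32);
}

int main(void)
{
	printf("fwd %u rev %u (equal)\n",
	       flow_hash(0x0a000001, 0x0a000002, 1234, 80),
	       flow_hash(0x0a000002, 0x0a000001, 80, 1234));
	printf("queues: %u %u %u\n", scale(0x0u, 8),
	       scale(0x80000000u, 8), scale(0xffffffffu, 8)); /* 0 4 7 */
	return 0;
}
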
+ */ +void __skb_get_rxhash(struct sk_buff *skb) +{ + struct flow_keys keys; + u32 hash; + + if (!skb_flow_dissect(skb, &keys)) + return; + + if (keys.ports) + skb->l4_rxhash = 1; + + /* get a consistent hash (same value on both flow directions) */ + if (((__force u32)keys.dst < (__force u32)keys.src) || + (((__force u32)keys.dst == (__force u32)keys.src) && + ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { + swap(keys.dst, keys.src); + swap(keys.port16[0], keys.port16[1]); + } + + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, hashrnd); + if (!hash) + hash = 1; + + skb->rxhash = hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +/* + * Returns a Tx hash based on the given packet descriptor and the number + * of Tx queues to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol; + hash = jhash_1word(hash, hashrnd); + + return (u16) (((u64) hash * qcount) >> 32) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + +/* __skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 __skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + u32 poff = 0; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + poff += keys.thoff; + switch (keys.ip_proto) { + case IPPROTO_TCP: { + const struct tcphdr *tcph; + struct tcphdr _tcph; + + tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); + if (!tcph) + return poff; + + poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + poff += sizeof(struct udphdr); + break; + /* For the rest, we do not really care about header + * extensions at this point for now. 
+ */ + case IPPROTO_ICMP: + poff += sizeof(struct icmphdr); + break; + case IPPROTO_ICMPV6: + poff += sizeof(struct icmp6hdr); + break; + case IPPROTO_IGMP: + poff += sizeof(struct igmphdr); + break; + case IPPROTO_DCCP: + poff += sizeof(struct dccp_hdr); + break; + case IPPROTO_SCTP: + poff += sizeof(struct sctphdr); + break; + } + + return poff; +} + +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + return 0; + } + return queue_index; +} + +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->rxhash; + hash = jhash_1word(hash, hashrnd); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, skb); + if (new_index < 0) + new_index = skb_tx_hash(dev, skb); + + if (queue_index != new_index && sk) { + struct dst_entry *dst = + rcu_dereference_check(sk->sk_dst_cache, 1); + + if (dst && skb_dst(skb) == dst) + sk_tx_queue_set(sk, queue_index); + + } + + queue_index = new_index; + } + + return queue_index; +} +EXPORT_SYMBOL(__netdev_pick_tx); + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + int queue_index = 0; + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb); + else + queue_index = __netdev_pick_tx(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + +static int __init initialize_hashrnd(void) +{ + get_random_bytes(&hashrnd, sizeof(hashrnd)); + return 0; +} + +late_initcall_sync(initialize_hashrnd); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c815f285e5ab..5c56b217b999 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -39,21 +39,13 @@ #include <linux/string.h> #include <linux/log2.h> +#define DEBUG #define NEIGH_DEBUG 1 - -#define NEIGH_PRINTK(x...) printk(x) -#define NEIGH_NOPRINTK(x...) do { ; } while(0) -#define NEIGH_PRINTK1 NEIGH_NOPRINTK -#define NEIGH_PRINTK2 NEIGH_NOPRINTK - -#if NEIGH_DEBUG >= 1 -#undef NEIGH_PRINTK1 -#define NEIGH_PRINTK1 NEIGH_PRINTK -#endif -#if NEIGH_DEBUG >= 2 -#undef NEIGH_PRINTK2 -#define NEIGH_PRINTK2 NEIGH_PRINTK -#endif +#define neigh_dbg(level, fmt, ...) 
\ +do { \ + if (level <= NEIGH_DEBUG) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) #define PNEIGH_HASHMASK 0xF @@ -246,7 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; - NEIGH_PRINTK2("neigh %p is stray.\n", n); + neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -290,15 +282,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device goto out_entries; } - if (tbl->entry_size) - n = kzalloc(tbl->entry_size, GFP_ATOMIC); - else { - int sz = sizeof(*n) + tbl->key_len; - - sz = ALIGN(sz, NEIGH_PRIV_ALIGN); - sz += dev->neigh_priv_len; - n = kzalloc(sz, GFP_ATOMIC); - } + n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -550,7 +534,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, lockdep_is_held(&tbl->lock))); rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); - NEIGH_PRINTK2("neigh %p is created.\n", n); + neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; @@ -733,7 +717,7 @@ void neigh_destroy(struct neighbour *neigh) dev_put(dev); neigh_parms_put(neigh->parms); - NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); @@ -747,7 +731,7 @@ EXPORT_SYMBOL(neigh_destroy); */ static void neigh_suspect(struct neighbour *neigh) { - NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->output = neigh->ops->output; } @@ -759,7 +743,7 @@ static void neigh_suspect(struct neighbour *neigh) */ static void neigh_connect(struct neighbour *neigh) { - NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + neigh_dbg(2, "neigh %p is connected\n", neigh); neigh->output = neigh->ops->connected_output; } @@ -778,6 +762,9 @@ static void neigh_periodic_work(struct work_struct *work) nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); + if (atomic_read(&tbl->entries) < tbl->gc_thresh1) + goto out; + /* * periodically recompute ReachableTime from random function */ @@ -832,6 +819,7 @@ next_elt: nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } +out: /* Cycle through all hash buckets every base_reachable_time/2 ticks. * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 * base_reachable_time. @@ -856,7 +844,7 @@ static void neigh_invalidate(struct neighbour *neigh) struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); - NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. 
report_unreachable is very complicated @@ -908,17 +896,17 @@ static void neigh_timer_handler(unsigned long arg) if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { - NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + neigh->parms->delay_probe_time)) { - NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_suspect(neigh); next = now + neigh->parms->delay_probe_time; } else { - NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->nud_state = NUD_STALE; neigh->updated = jiffies; neigh_suspect(neigh); @@ -927,14 +915,14 @@ static void neigh_timer_handler(unsigned long arg) } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + neigh->parms->delay_probe_time)) { - NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); + neigh_dbg(2, "neigh %p is now reachable\n", neigh); neigh->nud_state = NUD_REACHABLE; neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { - NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh_dbg(2, "neigh %p is probed\n", neigh); neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); @@ -1001,7 +989,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) return 1; } } else if (neigh->nud_state & NUD_STALE) { - NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, @@ -1324,8 +1312,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) out: return rc; discard: - NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", - dst, neigh); + neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh); out_kfree_skb: rc = -EINVAL; kfree_skb(skb); @@ -1502,7 +1489,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } } write_unlock_bh(&tbl->lock); - NEIGH_PRINTK1("neigh_parms_release: not found\n"); + neigh_dbg(1, "%s: not found\n", __func__); } EXPORT_SYMBOL(neigh_parms_release); @@ -1542,6 +1529,12 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); + if (!tbl->entry_size) + tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + + tbl->key_len, NEIGH_PRIV_ALIGN); + else + WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); + rwlock_init(&tbl->lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); @@ -1611,7 +1604,7 @@ int neigh_table_clear(struct neigh_table *tbl) } EXPORT_SYMBOL(neigh_table_clear); -static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -1675,7 +1668,7 @@ out: return err; } -static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -1953,7 +1946,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { 
[NDTPA_LOCKTIME] = { .type = NLA_U64 }, }; -static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; @@ -2712,7 +2705,7 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode)->data; + sf->private = PDE_DATA(inode); } return ret; }; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c new file mode 100644 index 000000000000..569d355fec3e --- /dev/null +++ b/net/core/net-procfs.c @@ -0,0 +1,411 @@ +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/wext.h> + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +extern struct list_head ptype_all __read_mostly; +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct net_device *dev; + struct hlist_head *h; + unsigned int count = 0, offset = get_offset(*pos); + + h = &net->dev_name_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, name_hlist) { + if (++count == offset) + return dev; + } + + return NULL; +} + +static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net_device *dev; + unsigned int bucket; + + do { + dev = dev_from_same_bucket(seq, pos); + if (dev) + return dev; + + bucket = get_bucket(*pos) + 1; + *pos = set_bucket_offset(bucket, 1); + } while (bucket < NETDEV_HASHENTRIES); + + return NULL; +} + +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. + */ +static void *dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + if (!*pos) + return SEQ_START_TOKEN; + + if (get_bucket(*pos) >= NETDEV_HASHENTRIES) + return NULL; + + return dev_from_bucket(seq, pos); +} + +static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return dev_from_bucket(seq, pos); +} + +static void dev_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); +} + +/* + * Called from the PROCfs module. 
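+ * It prints the fixed two-line column header for the SEQ_START_TOKEN + * entry and one dev_seq_printf_stats() row per network device.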
This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static struct softnet_data *softnet_get_online(loff_t *pos) +{ + struct softnet_data *sd = NULL; + + while (*pos < nr_cpu_ids) + if (cpu_online(*pos)) { + sd = &per_cpu(softnet_data, *pos); + break; + } else + ++*pos; + return sd; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct softnet_data *sd = v; + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + sd->processed, sd->dropped, sd->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + sd->cpu_collision, sd->received_rps); + return 0; +} + +static const struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_seq_fops = { + .owner = THIS_MODULE, + .open = dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, + .show = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &softnet_seq_ops); +} + +static const struct file_operations softnet_seq_fops = { + .owner = THIS_MODULE, + .open = softnet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *ptype_get_idx(loff_t pos) +{ + struct packet_type *pt = NULL; + loff_t i = 0; + int t; + + list_for_each_entry_rcu(pt, &ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + for (t = 0; t < PTYPE_HASH_SIZE; t++) { + list_for_each_entry_rcu(pt, &ptype_base[t], list) { + if (i == pos) + return pt; + ++i; + } + } + return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct packet_type *pt; + struct list_head *nxt; + int hash; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ptype_get_idx(0); + + pt = v; + nxt = pt->list.next; + if (pt->type == htons(ETH_P_ALL)) { + if (nxt != &ptype_all) + goto found; + hash = 0; + nxt = ptype_base[0].next; + } else + hash = ntohs(pt->type) & PTYPE_HASH_MASK; + + while (nxt == &ptype_base[hash]) { + if (++hash >= PTYPE_HASH_SIZE) + return NULL; + nxt = ptype_base[hash].next; + } +found: + return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ + struct packet_type *pt = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); + else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else + seq_printf(seq, "%04x", ntohs(pt->type)); + + seq_printf(seq, " %-8s %pf\n", + pt->dev ? pt->dev->name : "", pt->func); + } + + return 0; +} + +static const struct seq_operations ptype_seq_ops = { + .start = ptype_seq_start, + .next = ptype_seq_next, + .stop = ptype_seq_stop, + .show = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ptype_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ptype_seq_fops = { + .owner = THIS_MODULE, + .open = ptype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + + +static int __net_init dev_proc_net_init(struct net *net) +{ + int rc = -ENOMEM; + + if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops)) + goto out; + if (!proc_create("softnet_stat", S_IRUGO, net->proc_net, + &softnet_seq_fops)) + goto out_dev; + if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops)) + goto out_softnet; + + if (wext_proc_init(net)) + goto out_ptype; + rc = 0; +out: + return rc; +out_ptype: + remove_proc_entry("ptype", net->proc_net); +out_softnet: + remove_proc_entry("softnet_stat", net->proc_net); +out_dev: + remove_proc_entry("dev", net->proc_net); + goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ + wext_proc_exit(net); + + remove_proc_entry("ptype", net->proc_net); + remove_proc_entry("softnet_stat", net->proc_net); + remove_proc_entry("dev", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { + .init = dev_proc_net_init, + .exit = dev_proc_net_exit, +}; + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct netdev_hw_addr *ha; + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + return 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(ha, dev) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, ha->refcount, ha->global_use); + + for (i = 0; i < dev->addr_len; i++) + seq_printf(seq, "%02x", ha->addr[i]); + + seq_putc(seq, '\n'); + } + netif_addr_unlock_bh(dev); + return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_mc_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations 
dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + remove_proc_entry("dev_mcast", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + +int __init dev_proc_init(void) +{ + int ret = register_pernet_subsys(&dev_proc_ops); + if (!ret) + return register_pernet_subsys(&dev_mc_net_ops); + return ret; +} diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 28c5f5aa7ca7..981fed397d1d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -21,6 +21,7 @@ #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/jiffies.h> +#include <linux/pm_runtime.h> #include "net-sysfs.h" @@ -126,6 +127,19 @@ static ssize_t show_broadcast(struct device *dev, return -EINVAL; } +static int change_carrier(struct net_device *net, unsigned long new_carrier) +{ + if (!netif_running(net)) + return -EINVAL; + return dev_change_carrier(net, (bool) new_carrier); +} + +static ssize_t store_carrier(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_carrier); +} + static ssize_t show_carrier(struct device *dev, struct device_attribute *attr, char *buf) { @@ -331,7 +345,7 @@ static struct device_attribute net_class_attributes[] = { __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), __ATTR(address, S_IRUGO, show_address, NULL), __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), - __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(carrier, S_IRUGO | S_IWUSR, show_carrier, store_carrier), __ATTR(speed, S_IRUGO, show_speed, NULL), __ATTR(duplex, S_IRUGO, show_duplex, NULL), __ATTR(dormant, S_IRUGO, show_dormant, NULL), @@ -592,21 +606,11 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return sprintf(buf, "%lu\n", val); } -static void rps_dev_flow_table_release_work(struct work_struct *work) -{ - struct rps_dev_flow_table *table = container_of(work, - struct rps_dev_flow_table, free_work); - - vfree(table); -} - static void rps_dev_flow_table_release(struct rcu_head *rcu) { struct rps_dev_flow_table *table = container_of(rcu, struct rps_dev_flow_table, rcu); - - INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); - schedule_work(&table->free_work); + vfree(table); } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, @@ -989,68 +993,14 @@ static ssize_t show_xps_map(struct netdev_queue *queue, return len; } -static DEFINE_MUTEX(xps_map_mutex); -#define xmap_dereference(P) \ - rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) - -static void xps_queue_release(struct netdev_queue *queue) -{ - struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - struct xps_map *map; - unsigned long index; - int i, pos, nonempty = 0; - - index = get_netdev_queue_index(queue); - - mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - - if (dev_maps) { - for_each_possible_cpu(i) { - map = xmap_dereference(dev_maps->cpu_map[i]); - if (!map) - continue; - - for (pos = 0; pos < map->len; pos++) - if (map->queues[pos] == index) - break; - - if (pos < map->len) { - if (map->len > 1) - map->queues[pos] = - 
map->queues[--map->len]; - else { - RCU_INIT_POINTER(dev_maps->cpu_map[i], - NULL); - kfree_rcu(map, rcu); - map = NULL; - } - } - if (map) - nonempty = 1; - } - - if (!nonempty) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } - } - mutex_unlock(&xps_map_mutex); -} - static ssize_t store_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, const char *buf, size_t len) { struct net_device *dev = queue->dev; - cpumask_var_t mask; - int err, i, cpu, pos, map_len, alloc_len, need_set; unsigned long index; - struct xps_map *map, *new_map; - struct xps_dev_maps *dev_maps, *new_dev_maps; - int nonempty = 0; - int numa_node_id = -2; + cpumask_var_t mask; + int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1066,105 +1016,11 @@ static ssize_t store_xps_map(struct netdev_queue *queue, return err; } - new_dev_maps = kzalloc(max_t(unsigned int, - XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); - if (!new_dev_maps) { - free_cpumask_var(mask); - return -ENOMEM; - } - - mutex_lock(&xps_map_mutex); - - dev_maps = xmap_dereference(dev->xps_maps); - - for_each_possible_cpu(cpu) { - map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; - new_map = map; - if (map) { - for (pos = 0; pos < map->len; pos++) - if (map->queues[pos] == index) - break; - map_len = map->len; - alloc_len = map->alloc_len; - } else - pos = map_len = alloc_len = 0; - - need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); -#ifdef CONFIG_NUMA - if (need_set) { - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; - } -#endif - if (need_set && pos >= map_len) { - /* Need to add queue to this CPU's map */ - if (map_len >= alloc_len) { - alloc_len = alloc_len ? - 2 * alloc_len : XPS_MIN_MAP_ALLOC; - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), - GFP_KERNEL, - cpu_to_node(cpu)); - if (!new_map) - goto error; - new_map->alloc_len = alloc_len; - for (i = 0; i < map_len; i++) - new_map->queues[i] = map->queues[i]; - new_map->len = map_len; - } - new_map->queues[new_map->len++] = index; - } else if (!need_set && pos < map_len) { - /* Need to remove queue from this CPU's map */ - if (map_len > 1) - new_map->queues[pos] = - new_map->queues[--new_map->len]; - else - new_map = NULL; - } - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); - } - - /* Cleanup old maps */ - for_each_possible_cpu(cpu) { - map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; - if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) - kfree_rcu(map, rcu); - if (new_dev_maps->cpu_map[cpu]) - nonempty = 1; - } - - if (nonempty) { - rcu_assign_pointer(dev->xps_maps, new_dev_maps); - } else { - kfree(new_dev_maps); - RCU_INIT_POINTER(dev->xps_maps, NULL); - } - - if (dev_maps) - kfree_rcu(dev_maps, rcu); - - netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); - - mutex_unlock(&xps_map_mutex); + err = netif_set_xps_queue(dev, mask, index); free_cpumask_var(mask); - return len; -error: - mutex_unlock(&xps_map_mutex); - - if (new_dev_maps) - for_each_possible_cpu(i) - kfree(rcu_dereference_protected( - new_dev_maps->cpu_map[i], - 1)); - kfree(new_dev_maps); - free_cpumask_var(mask); - return -ENOMEM; + return err ? 
: len; } static struct netdev_queue_attribute xps_cpus_attribute = @@ -1183,10 +1039,6 @@ static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); -#ifdef CONFIG_XPS - xps_queue_release(queue); -#endif - memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); } @@ -1396,6 +1248,8 @@ void netdev_unregister_kobject(struct net_device * net) remove_queue_kobjects(net); + pm_runtime_set_memalloc_noio(dev, false); + device_del(dev); } @@ -1440,6 +1294,8 @@ int netdev_register_kobject(struct net_device *net) return error; } + pm_runtime_set_memalloc_noio(dev, true); + return error; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 8acce01b6dab..f97652036754 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -10,7 +10,8 @@ #include <linux/idr.h> #include <linux/rculist.h> #include <linux/nsproxy.h> -#include <linux/proc_fs.h> +#include <linux/fs.h> +#include <linux/proc_ns.h> #include <linux/file.h> #include <linux/export.h> #include <linux/user_namespace.h> @@ -336,7 +337,7 @@ EXPORT_SYMBOL_GPL(__put_net); struct net *get_net_ns_by_fd(int fd) { - struct proc_inode *ei; + struct proc_ns *ei; struct file *file; struct net *net; @@ -344,7 +345,7 @@ struct net *get_net_ns_by_fd(int fd) if (IS_ERR(file)) return ERR_CAST(file); - ei = PROC_I(file->f_dentry->d_inode); + ei = get_proc_ns(file_inode(file)); if (ei->ns_ops == &netns_operations) net = get_net(ei->ns); else diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3151acf5ec13..cec074be8c43 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -29,6 +29,9 @@ #include <linux/if_vlan.h> #include <net/tcp.h> #include <net/udp.h> +#include <net/addrconf.h> +#include <net/ndisc.h> +#include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <trace/events/napi.h> @@ -44,6 +47,8 @@ static struct sk_buff_head skb_pool; static atomic_t trapped; +DEFINE_STATIC_SRCU(netpoll_srcu); + #define USEC_PER_POLL 50 #define NETPOLL_RX_ENABLED 1 #define NETPOLL_RX_DROP 2 @@ -55,7 +60,8 @@ static atomic_t trapped; MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); +static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo); +static void netpoll_async_cleanup(struct work_struct *work); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -181,13 +187,13 @@ static void poll_napi(struct net_device *dev) } } -static void service_arp_queue(struct netpoll_info *npi) +static void service_neigh_queue(struct netpoll_info *npi) { if (npi) { struct sk_buff *skb; - while ((skb = skb_dequeue(&npi->arp_tx))) - netpoll_arp_reply(skb, npi); + while ((skb = skb_dequeue(&npi->neigh_tx))) + netpoll_neigh_reply(skb, npi); } } @@ -196,35 +202,76 @@ static void netpoll_poll_dev(struct net_device *dev) const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); - if (!dev || !netif_running(dev)) + /* Don't do any rx activity if the dev_lock mutex is held + * the dev_open/close paths use this to block netpoll activity + * while changing device state + */ + if (down_trylock(&ni->dev_lock)) return; + if (!netif_running(dev)) { + up(&ni->dev_lock); + return; + } + ops = dev->netdev_ops; - if (!ops->ndo_poll_controller) + if (!ops->ndo_poll_controller) { + up(&ni->dev_lock); return; + } /* Process pending work on NIC */ ops->ndo_poll_controller(dev); poll_napi(dev); + up(&ni->dev_lock); + if (dev->flags & 
IFF_SLAVE) { if (ni) { - struct net_device *bond_dev = dev->master; + struct net_device *bond_dev; struct sk_buff *skb; - struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); - while ((skb = skb_dequeue(&ni->arp_tx))) { + struct netpoll_info *bond_ni; + + bond_dev = netdev_master_upper_dev_get_rcu(dev); + bond_ni = rcu_dereference_bh(bond_dev->npinfo); + while ((skb = skb_dequeue(&ni->neigh_tx))) { skb->dev = bond_dev; - skb_queue_tail(&bond_ni->arp_tx, skb); + skb_queue_tail(&bond_ni->neigh_tx, skb); } } } - service_arp_queue(ni); + service_neigh_queue(ni); zap_completion_queue(); } +int netpoll_rx_disable(struct net_device *dev) +{ + struct netpoll_info *ni; + int idx; + might_sleep(); + idx = srcu_read_lock(&netpoll_srcu); + ni = srcu_dereference(dev->npinfo, &netpoll_srcu); + if (ni) + down(&ni->dev_lock); + srcu_read_unlock(&netpoll_srcu, idx); + return 0; +} +EXPORT_SYMBOL(netpoll_rx_disable); + +void netpoll_rx_enable(struct net_device *dev) +{ + struct netpoll_info *ni; + rcu_read_lock(); + ni = rcu_dereference(dev->npinfo); + if (ni) + up(&ni->dev_lock); + rcu_read_unlock(); +} +EXPORT_SYMBOL(netpoll_rx_enable); + static void refill_skbs(void) { struct sk_buff *skb; @@ -336,8 +383,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (__netif_tx_trylock(txq)) { if (!netif_xmit_stopped(txq)) { if (vlan_tx_tag_present(skb) && - !(netif_skb_features(skb) & NETIF_F_HW_VLAN_TX)) { - skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + !vlan_hw_offload_capable(netif_skb_features(skb), + skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb)); if (unlikely(!skb)) break; skb->vlan_tci = 0; @@ -381,9 +429,14 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) struct iphdr *iph; struct ethhdr *eth; static atomic_t ip_ident; + struct ipv6hdr *ip6h; udp_len = len + sizeof(*udph); - ip_len = udp_len + sizeof(*iph); + if (np->ipv6) + ip_len = udp_len + sizeof(*ip6h); + else + ip_len = udp_len + sizeof(*iph); + total_len = ip_len + LL_RESERVED_SPACE(np->dev); skb = find_skb(np, total_len + np->dev->needed_tailroom, @@ -400,34 +453,66 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); - udph->check = 0; - udph->check = csum_tcpudp_magic(np->local_ip, - np->remote_ip, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - - /* iph->version = 4; iph->ihl = 5; */ - put_unaligned(0x45, (unsigned char *)iph); - iph->tos = 0; - put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = htons(atomic_inc_return(&ip_ident)); - iph->frag_off = 0; - iph->ttl = 64; - iph->protocol = IPPROTO_UDP; - iph->check = 0; - put_unaligned(np->local_ip, &(iph->saddr)); - put_unaligned(np->remote_ip, &(iph->daddr)); - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IP); + + if (np->ipv6) { + udph->check = 0; + udph->check = csum_ipv6_magic(&np->local_ip.in6, + &np->remote_ip.in6, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + + /* ip6h->version = 6; ip6h->priority = 0; */ + 
put_unaligned(0x60, (unsigned char *)ip6h); + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + + ip6h->payload_len = htons(sizeof(struct udphdr) + len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 32; + ip6h->saddr = np->local_ip.in6; + ip6h->daddr = np->remote_ip.in6; + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IPV6); + } else { + udph->check = 0; + udph->check = csum_tcpudp_magic(np->local_ip.ip, + np->remote_ip.ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + put_unaligned(0x45, (unsigned char *)iph); + iph->tos = 0; + put_unaligned(htons(ip_len), &(iph->tot_len)); + iph->id = htons(atomic_inc_return(&ip_ident)); + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip.ip, &(iph->saddr)); + put_unaligned(np->remote_ip.ip, &(iph->daddr)); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IP); + } + memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); @@ -437,18 +522,16 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) } EXPORT_SYMBOL(netpoll_send_udp); -static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) +static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo) { - struct arphdr *arp; - unsigned char *arp_ptr; - int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; + int size, type = ARPOP_REPLY; __be32 sip, tip; unsigned char *sha; struct sk_buff *send_skb; struct netpoll *np, *tmp; unsigned long flags; int hlen, tlen; - int hits = 0; + int hits = 0, proto; if (list_empty(&npinfo->rx_np)) return; @@ -466,94 +549,214 @@ static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) if (!hits) return; - /* No arp on this interface */ - if (skb->dev->flags & IFF_NOARP) - return; - - if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) - return; + proto = ntohs(eth_hdr(skb)->h_proto); + if (proto == ETH_P_IP) { + struct arphdr *arp; + unsigned char *arp_ptr; + /* No arp on this interface */ + if (skb->dev->flags & IFF_NOARP) + return; - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - arp = arp_hdr(skb); + if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) + return; - if ((arp->ar_hrd != htons(ARPHRD_ETHER) && - arp->ar_hrd != htons(ARPHRD_IEEE802)) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_op != htons(ARPOP_REQUEST)) - return; + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + arp = arp_hdr(skb); - arp_ptr = (unsigned char *)(arp+1); - /* save the location of the src hw addr */ - sha = arp_ptr; - arp_ptr += skb->dev->addr_len; - memcpy(&sip, arp_ptr, 4); - arp_ptr += 4; - /* If we actually cared about dst hw addr, - it would get copied here */ - arp_ptr += skb->dev->addr_len; - memcpy(&tip, arp_ptr, 4); - - /* Should we ignore arp? 
*/ - if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) - return; + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_op != htons(ARPOP_REQUEST)) + return; - size = arp_hdr_len(skb->dev); + arp_ptr = (unsigned char *)(arp+1); + /* save the location of the src hw addr */ + sha = arp_ptr; + arp_ptr += skb->dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; + /* If we actually cared about dst hw addr, + it would get copied here */ + arp_ptr += skb->dev->addr_len; + memcpy(&tip, arp_ptr, 4); - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (tip != np->local_ip) - continue; + /* Should we ignore arp? */ + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + return; - hlen = LL_RESERVED_SPACE(np->dev); - tlen = np->dev->needed_tailroom; - send_skb = find_skb(np, size + hlen + tlen, hlen); - if (!send_skb) - continue; + size = arp_hdr_len(skb->dev); - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (tip != np->local_ip.ip) + continue; + + hlen = LL_RESERVED_SPACE(np->dev); + tlen = np->dev->needed_tailroom; + send_skb = find_skb(np, size + hlen + tlen, hlen); + if (!send_skb) + continue; + + skb_reset_network_header(send_skb); + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); + + /* Fill the device header for the ARP frame */ + if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP, + sha, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ptype, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - continue; + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should + * always equal 1 (Ethernet). + */ + + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, sha, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } else if( proto == ETH_P_IPV6) { +#if IS_ENABLED(CONFIG_IPV6) + struct nd_msg *msg; + u8 *lladdr = NULL; + struct ipv6hdr *hdr; + struct icmp6hdr *icmp6h; + const struct in6_addr *saddr; + const struct in6_addr *daddr; + struct inet6_dev *in6_dev = NULL; + struct in6_addr *target; + + in6_dev = in6_dev_get(skb->dev); + if (!in6_dev || !in6_dev->cnf.accept_ra) + return; - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should - * always equal 1 (Ethernet). 
- */ + if (!pskb_may_pull(skb, skb->len)) + return; - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); + msg = (struct nd_msg *)skb_transport_header(skb); - arp_ptr = (unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); + __skb_push(skb, skb->data - skb_transport_header(skb)); - netpoll_send_skb(np, send_skb); + if (ipv6_hdr(skb)->hop_limit != 255) + return; + if (msg->icmph.icmp6_code != 0) + return; + if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) + return; + + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; - /* If there are several rx_hooks for the same address, - we're fine by sending a single reply */ - break; + size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (!ipv6_addr_equal(daddr, &np->local_ip.in6)) + continue; + + hlen = LL_RESERVED_SPACE(np->dev); + tlen = np->dev->needed_tailroom; + send_skb = find_skb(np, size + hlen + tlen, hlen); + if (!send_skb) + continue; + + send_skb->protocol = htons(ETH_P_IPV6); + send_skb->dev = skb->dev; + + skb_reset_network_header(send_skb); + skb_put(send_skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(send_skb); + + *(__be32*)hdr = htonl(0x60000000); + + hdr->payload_len = htons(size); + hdr->nexthdr = IPPROTO_ICMPV6; + hdr->hop_limit = 255; + hdr->saddr = *saddr; + hdr->daddr = *daddr; + + send_skb->transport_header = send_skb->tail; + skb_put(send_skb, size); + + icmp6h = (struct icmp6hdr *)skb_transport_header(skb); + icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; + icmp6h->icmp6_router = 0; + icmp6h->icmp6_solicited = 1; + target = (struct in6_addr *)(skb_transport_header(send_skb) + sizeof(struct icmp6hdr)); + *target = msg->target; + icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size, + IPPROTO_ICMPV6, + csum_partial(icmp6h, + size, 0)); + + if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6, + lladdr, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); +#endif } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); +} + +static bool pkt_is_ns(struct sk_buff *skb) +{ + struct nd_msg *msg; + struct ipv6hdr *hdr; + + if (skb->protocol != htons(ETH_P_ARP)) + return false; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg))) + return false; + + msg = (struct nd_msg *)skb_transport_header(skb); + __skb_push(skb, skb->data - skb_transport_header(skb)); + hdr = ipv6_hdr(skb); + + if (hdr->nexthdr != IPPROTO_ICMPV6) + return false; + if (hdr->hop_limit != 255) + return false; + if (msg->icmph.icmp6_code != 0) + return false; + if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) + return false; + + return true; } int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) @@ -571,9 +774,11 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) goto out; /* check if netpoll clients need ARP */ - if (skb->protocol == htons(ETH_P_ARP) && - atomic_read(&trapped)) { - skb_queue_tail(&npinfo->arp_tx, skb); + if 
(skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { + skb_queue_tail(&npinfo->neigh_tx, skb); + return 1; + } else if (pkt_is_ns(skb) && atomic_read(&trapped)) { + skb_queue_tail(&npinfo->neigh_tx, skb); return 1; } @@ -584,60 +789,100 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) } proto = ntohs(eth_hdr(skb)->h_proto); - if (proto != ETH_P_IP) + if (proto != ETH_P_IP && proto != ETH_P_IPV6) goto out; if (skb->pkt_type == PACKET_OTHERHOST) goto out; if (skb_shared(skb)) goto out; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - iph = (struct iphdr *)skb->data; - if (iph->ihl < 5 || iph->version != 4) - goto out; - if (!pskb_may_pull(skb, iph->ihl*4)) - goto out; - iph = (struct iphdr *)skb->data; - if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) - goto out; - - len = ntohs(iph->tot_len); - if (skb->len < len || len < iph->ihl*4) - goto out; + if (proto == ETH_P_IP) { + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + iph = (struct iphdr *)skb->data; + if (iph->ihl < 5 || iph->version != 4) + goto out; + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + iph = (struct iphdr *)skb->data; + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; - /* - * Our transport medium may have padded the buffer out. - * Now We trim to the true length of the frame. - */ - if (pskb_trim_rcsum(skb, len)) - goto out; + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; - iph = (struct iphdr *)skb->data; - if (iph->protocol != IPPROTO_UDP) - goto out; + /* + * Our transport medium may have padded the buffer out. + * Now We trim to the true length of the frame. + */ + if (pskb_trim_rcsum(skb, len)) + goto out; - len -= iph->ihl*4; - uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); - ulen = ntohs(uh->len); + iph = (struct iphdr *)skb->data; + if (iph->protocol != IPPROTO_UDP) + goto out; - if (ulen != len) - goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) - goto out; + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (np->local_ip && np->local_ip != iph->daddr) - continue; - if (np->remote_ip && np->remote_ip != iph->saddr) - continue; - if (np->local_port && np->local_port != ntohs(uh->dest)) - continue; + if (ulen != len) + goto out; + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) + goto out; + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->local_ip.ip && np->local_ip.ip != iph->daddr) + continue; + if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } + } else { +#if IS_ENABLED(CONFIG_IPV6) + const struct ipv6hdr *ip6h; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); - hits++; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + ip6h = (struct ipv6hdr *)skb->data; + if (ip6h->version != 6) + goto out; + len = ntohs(ip6h->payload_len); + if (!len) + goto out; + if (len + sizeof(struct ipv6hdr) > skb->len) + goto out; + if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr))) + goto out; + ip6h = ipv6_hdr(skb); + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + uh = udp_hdr(skb); + ulen = ntohs(uh->len); + if (ulen != skb->len) + goto out; + if (udp6_csum_init(skb, uh, IPPROTO_UDP)) + goto out; + 
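+ /* Deliver the payload to every netpoll instance bound to this + * IPv6 address/port pair. + */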
list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr)) + continue; + if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr)) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } +#endif } if (!hits) @@ -658,17 +903,44 @@ out: void netpoll_print_options(struct netpoll *np) { np_info(np, "local port %d\n", np->local_port); - np_info(np, "local IP %pI4\n", &np->local_ip); + if (np->ipv6) + np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); + else + np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); np_info(np, "interface '%s'\n", np->dev_name); np_info(np, "remote port %d\n", np->remote_port); - np_info(np, "remote IP %pI4\n", &np->remote_ip); + if (np->ipv6) + np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); + else + np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); np_info(np, "remote ethernet address %pM\n", np->remote_mac); } EXPORT_SYMBOL(netpoll_print_options); +static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) +{ + const char *end; + + if (!strchr(str, ':') && + in4_pton(str, -1, (void *)addr, -1, &end) > 0) { + if (!*end) + return 0; + } + if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { +#if IS_ENABLED(CONFIG_IPV6) + if (!*end) + return 1; +#else + return -1; +#endif + } + return -1; +} + int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; + int ipv6; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) @@ -684,7 +956,11 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; - np->local_ip = in_aton(cur); + ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); + if (ipv6 < 0) + goto parse_failed; + else + np->ipv6 = (bool)ipv6; cur = delim; } cur++; @@ -716,7 +992,13 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; - np->remote_ip = in_aton(cur); + ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); + if (ipv6 < 0) + goto parse_failed; + else if (np->ipv6 != (bool)ipv6) + goto parse_failed; + else + np->ipv6 = (bool)ipv6; cur = delim + 1; if (*cur != 0) { @@ -744,6 +1026,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) np->dev = ndev; strlcpy(np->dev_name, ndev->name, IFNAMSIZ); + INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || !ndev->netdev_ops->ndo_poll_controller) { @@ -764,7 +1047,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) INIT_LIST_HEAD(&npinfo->rx_np); spin_lock_init(&npinfo->rx_lock); - skb_queue_head_init(&npinfo->arp_tx); + sema_init(&npinfo->dev_lock, 1); + skb_queue_head_init(&npinfo->neigh_tx); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); @@ -777,7 +1061,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) goto free_npinfo; } } else { - npinfo = ndev->npinfo; + npinfo = rtnl_dereference(ndev->npinfo); atomic_inc(&npinfo->refcnt); } @@ -808,14 +1092,19 @@ int netpoll_setup(struct netpoll *np) struct in_device *in_dev; int err; - if (np->dev_name) - ndev = dev_get_by_name(&init_net, np->dev_name); + rtnl_lock(); + if (np->dev_name) { + struct net *net = current->nsproxy->net_ns; + ndev = __dev_get_by_name(net, np->dev_name); + } if 
(!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); - return -ENODEV; + err = -ENODEV; + goto unlock; } + dev_hold(ndev); - if (ndev->master) { + if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); err = -EBUSY; goto put; @@ -826,15 +1115,14 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - rtnl_lock(); err = dev_open(ndev); - rtnl_unlock(); if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } + rtnl_unlock(); atleast = jiffies + HZ/10; atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { @@ -854,39 +1142,70 @@ int netpoll_setup(struct netpoll *np) np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); msleep(4000); } + rtnl_lock(); } - if (!np->local_ip) { - rcu_read_lock(); - in_dev = __in_dev_get_rcu(ndev); + if (!np->local_ip.ip) { + if (!np->ipv6) { + in_dev = __in_dev_get_rtnl(ndev); + + if (!in_dev || !in_dev->ifa_list) { + np_err(np, "no IP address for %s, aborting\n", + np->dev_name); + err = -EDESTADDRREQ; + goto put; + } + + np->local_ip.ip = in_dev->ifa_list->ifa_local; + np_info(np, "local IP %pI4\n", &np->local_ip.ip); + } else { +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *idev; - if (!in_dev || !in_dev->ifa_list) { - rcu_read_unlock(); - np_err(np, "no IP address for %s, aborting\n", - np->dev_name); err = -EDESTADDRREQ; + idev = __in6_dev_get(ndev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + continue; + np->local_ip.in6 = ifp->addr; + err = 0; + break; + } + read_unlock_bh(&idev->lock); + } + if (err) { + np_err(np, "no IPv6 address for %s, aborting\n", + np->dev_name); + goto put; + } else + np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); +#else + np_err(np, "IPv6 is not supported %s, aborting\n", + np->dev_name); + err = -EINVAL; goto put; +#endif } - - np->local_ip = in_dev->ifa_list->ifa_local; - rcu_read_unlock(); - np_info(np, "local IP %pI4\n", &np->local_ip); } /* fill up the skb queue */ refill_skbs(); - rtnl_lock(); err = __netpoll_setup(np, ndev, GFP_KERNEL); - rtnl_unlock(); - if (err) goto put; + rtnl_unlock(); return 0; put: dev_put(ndev); +unlock: + rtnl_unlock(); return err; } EXPORT_SYMBOL(netpoll_setup); @@ -903,7 +1222,7 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); - skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->neigh_tx); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ @@ -921,7 +1240,11 @@ void __netpoll_cleanup(struct netpoll *np) struct netpoll_info *npinfo; unsigned long flags; - npinfo = np->dev->npinfo; + /* rtnl_dereference would be preferable here but + * rcu_cleanup_netpoll path can put us in here safely without + * holding the rtnl, so plain rcu_dereference it is + */ + npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; @@ -933,6 +1256,8 @@ void __netpoll_cleanup(struct netpoll *np) spin_unlock_irqrestore(&npinfo->rx_lock, flags); } + synchronize_srcu(&netpoll_srcu); + if (atomic_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -940,25 +1265,27 @@ void __netpoll_cleanup(struct netpoll *np) if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); - RCU_INIT_POINTER(np->dev->npinfo, NULL); + 
rcu_assign_pointer(np->dev->npinfo, NULL); call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); } } EXPORT_SYMBOL_GPL(__netpoll_cleanup); -static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +static void netpoll_async_cleanup(struct work_struct *work) { - struct netpoll *np = container_of(rcu_head, struct netpoll, rcu); + struct netpoll *np = container_of(work, struct netpoll, cleanup_work); + rtnl_lock(); __netpoll_cleanup(np); + rtnl_unlock(); kfree(np); } -void __netpoll_free_rcu(struct netpoll *np) +void __netpoll_free_async(struct netpoll *np) { - call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); + schedule_work(&np->cleanup_work); } -EXPORT_SYMBOL_GPL(__netpoll_free_rcu); +EXPORT_SYMBOL_GPL(__netpoll_free_async); void netpoll_cleanup(struct netpoll *np) { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 5e67defe2cb0..0777d0aa18c3 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -69,10 +69,8 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) /* allocate & copy */ new = kzalloc(new_sz, GFP_KERNEL); - if (!new) { - pr_warn("Unable to alloc new priomap!\n"); + if (!new) return -ENOMEM; - } if (old) memcpy(new->priomap, old->priomap, diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e6e1cbe863f5..11f2704c3810 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -164,6 +164,7 @@ #ifdef CONFIG_XFRM #include <net/xfrm.h> #endif +#include <net/netns/generic.h> #include <asm/byteorder.h> #include <linux/rcupdate.h> #include <linux/bitops.h> @@ -212,7 +213,6 @@ #define PKTGEN_MAGIC 0xbe9be955 #define PG_PROC_DIR "pktgen" #define PGCTRL "pgctrl" -static struct proc_dir_entry *pg_proc_dir; #define MAX_CFLOWS 65536 @@ -397,7 +397,15 @@ struct pktgen_hdr { __be32 tv_usec; }; -static bool pktgen_exiting __read_mostly; + +static int pg_net_id __read_mostly; + +struct pktgen_net { + struct net *net; + struct proc_dir_entry *proc_dir; + struct list_head pktgen_threads; + bool pktgen_exiting; +}; struct pktgen_thread { spinlock_t if_lock; /* for list of devices */ @@ -414,6 +422,7 @@ struct pktgen_thread { wait_queue_head_t queue; struct completion start_done; + struct pktgen_net *net; }; #define REMOVE 1 @@ -428,9 +437,9 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char *ifname, bool exact); static int pktgen_device_event(struct notifier_block *, unsigned long, void *); -static void pktgen_run_all_threads(void); -static void pktgen_reset_all_threads(void); -static void pktgen_stop_all_threads_ifs(void); +static void pktgen_run_all_threads(struct pktgen_net *pn); +static void pktgen_reset_all_threads(struct pktgen_net *pn); +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); @@ -442,7 +451,6 @@ static int pg_clone_skb_d __read_mostly; static int debug __read_mostly; static DEFINE_MUTEX(pktgen_thread_lock); -static LIST_HEAD(pktgen_threads); static struct notifier_block pktgen_notifier_block = { .notifier_call = pktgen_device_event, @@ -464,6 +472,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, { int err = 0; char data[128]; + struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); if (!capable(CAP_NET_ADMIN)) { err = -EPERM; @@ -480,13 +489,13 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, data[count - 1] = 0; /* Make 
string */ if (!strcmp(data, "stop")) - pktgen_stop_all_threads_ifs(); + pktgen_stop_all_threads_ifs(pn); else if (!strcmp(data, "start")) - pktgen_run_all_threads(); + pktgen_run_all_threads(pn); else if (!strcmp(data, "reset")) - pktgen_reset_all_threads(); + pktgen_reset_all_threads(pn); else pr_warning("Unknown command: %s\n", data); @@ -499,7 +508,7 @@ out: static int pgctrl_open(struct inode *inode, struct file *file) { - return single_open(file, pgctrl_show, PDE(inode)->data); + return single_open(file, pgctrl_show, PDE_DATA(inode)); } static const struct file_operations pktgen_fops = { @@ -1676,7 +1685,7 @@ static ssize_t pktgen_if_write(struct file *file, static int pktgen_if_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_if_show, PDE(inode)->data); + return single_open(file, pktgen_if_show, PDE_DATA(inode)); } static const struct file_operations pktgen_if_fops = { @@ -1814,7 +1823,7 @@ out: static int pktgen_thread_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_thread_show, PDE(inode)->data); + return single_open(file, pktgen_thread_show, PDE_DATA(inode)); } static const struct file_operations pktgen_thread_fops = { @@ -1827,13 +1836,14 @@ static const struct file_operations pktgen_thread_fops = { }; /* Think find or remove for NN */ -static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) +static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, + const char *ifname, int remove) { struct pktgen_thread *t; struct pktgen_dev *pkt_dev = NULL; bool exact = (remove == FIND); - list_for_each_entry(t, &pktgen_threads, th_list) { + list_for_each_entry(t, &pn->pktgen_threads, th_list) { pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { @@ -1851,7 +1861,7 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) /* * mark a device for removal */ -static void pktgen_mark_device(const char *ifname) +static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) { struct pktgen_dev *pkt_dev = NULL; const int max_tries = 10, msec_per_try = 125; @@ -1862,7 +1872,7 @@ static void pktgen_mark_device(const char *ifname) while (1) { - pkt_dev = __pktgen_NN_threads(ifname, REMOVE); + pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE); if (pkt_dev == NULL) break; /* success */ @@ -1883,21 +1893,21 @@ static void pktgen_mark_device(const char *ifname) mutex_unlock(&pktgen_thread_lock); } -static void pktgen_change_name(struct net_device *dev) +static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev) { struct pktgen_thread *t; - list_for_each_entry(t, &pktgen_threads, th_list) { + list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; list_for_each_entry(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; - remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + proc_remove(pkt_dev->entry); pkt_dev->entry = proc_create_data(dev->name, 0600, - pg_proc_dir, + pn->proc_dir, &pktgen_if_fops, pkt_dev); if (!pkt_dev->entry) @@ -1912,8 +1922,9 @@ static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = ptr; + struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id); - if (!net_eq(dev_net(dev), &init_net) || pktgen_exiting) + if (pn->pktgen_exiting) return NOTIFY_DONE; /* It is OK that we do not hold the group lock right now, @@ -1922,18 +1933,19 @@ static int pktgen_device_event(struct notifier_block 
*unused, switch (event) { case NETDEV_CHANGENAME: - pktgen_change_name(dev); + pktgen_change_name(pn, dev); break; case NETDEV_UNREGISTER: - pktgen_mark_device(dev->name); + pktgen_mark_device(pn, dev->name); break; } return NOTIFY_DONE; } -static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, +static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn, + struct pktgen_dev *pkt_dev, const char *ifname) { char b[IFNAMSIZ+5]; @@ -1947,13 +1959,14 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, } b[i] = 0; - return dev_get_by_name(&init_net, b); + return dev_get_by_name(pn->net, b); } /* Associate pktgen_dev with a device. */ -static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +static int pktgen_setup_dev(const struct pktgen_net *pn, + struct pktgen_dev *pkt_dev, const char *ifname) { struct net_device *odev; int err; @@ -1964,7 +1977,7 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) pkt_dev->odev = NULL; } - odev = pktgen_dev_get_by_name(pkt_dev, ifname); + odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname); if (!odev) { pr_err("no such netdevice: \"%s\"\n", ifname); return -ENODEV; @@ -2185,7 +2198,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = random32() % pkt_dev->cflows; + flow = prandom_u32() % pkt_dev->cflows; pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2206,9 +2219,10 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { struct xfrm_state *x = pkt_dev->flows[flow].x; + struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); if (!x) { /*slow path: we dont already have xfrm_state*/ - x = xfrm_stateonly_find(&init_net, DUMMY_MARK, + x = xfrm_stateonly_find(pn->net, DUMMY_MARK, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, AF_INET, @@ -2232,7 +2246,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = random32() % + t = prandom_u32() % (pkt_dev->queue_map_max - pkt_dev->queue_map_min + 1) + pkt_dev->queue_map_min; @@ -2264,7 +2278,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = random32() % pkt_dev->src_mac_count; + mc = prandom_u32() % pkt_dev->src_mac_count; else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2290,7 +2304,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = random32() % pkt_dev->dst_mac_count; + mc = prandom_u32() % pkt_dev->dst_mac_count; else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2317,21 +2331,21 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)random32() & + ((__force __be32)prandom_u32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = random32() & (4096-1); + pkt_dev->vlan_id = prandom_u32() & (4096 - 1); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = random32() & (4096 - 1); + pkt_dev->svlan_id = prandom_u32() & (4096 - 1); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if 
(pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = random32() % + pkt_dev->cur_udp_src = prandom_u32() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min) + pkt_dev->udp_src_min; @@ -2344,7 +2358,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = random32() % + pkt_dev->cur_udp_dst = prandom_u32() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + pkt_dev->udp_dst_min; } else { @@ -2361,7 +2375,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = random32() % (imx - imn) + imn; + t = prandom_u32() % (imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2382,17 +2396,15 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __be32 s; if (pkt_dev->flags & F_IPDST_RND) { - t = random32() % (imx - imn) + imn; - s = htonl(t); - - while (ipv4_is_loopback(s) || - ipv4_is_multicast(s) || - ipv4_is_lbcast(s) || - ipv4_is_zeronet(s) || - ipv4_is_local_multicast(s)) { - t = random32() % (imx - imn) + imn; + do { + t = prandom_u32() % + (imx - imn) + imn; s = htonl(t); - } + } while (ipv4_is_loopback(s) || + ipv4_is_multicast(s) || + ipv4_is_lbcast(s) || + ipv4_is_zeronet(s) || + ipv4_is_local_multicast(s)); pkt_dev->cur_daddr = s; } else { t = ntohl(pkt_dev->cur_daddr); @@ -2423,7 +2435,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)random32() | + (((__force __be32)prandom_u32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2433,7 +2445,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = random32() % + t = prandom_u32() % (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) + pkt_dev->min_pkt_size; } else { @@ -2915,7 +2927,7 @@ static void pktgen_run(struct pktgen_thread *t) t->control &= ~(T_STOP); } -static void pktgen_stop_all_threads_ifs(void) +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn) { struct pktgen_thread *t; @@ -2923,7 +2935,7 @@ static void pktgen_stop_all_threads_ifs(void) mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= T_STOP; mutex_unlock(&pktgen_thread_lock); @@ -2959,28 +2971,28 @@ signal: return 0; } -static int pktgen_wait_all_threads_run(void) +static int pktgen_wait_all_threads_run(struct pktgen_net *pn) { struct pktgen_thread *t; int sig = 1; mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) { + list_for_each_entry(t, &pn->pktgen_threads, th_list) { sig = pktgen_wait_thread_run(t); if (sig == 0) break; } if (sig == 0) - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= (T_STOP); mutex_unlock(&pktgen_thread_lock); return sig; } -static void pktgen_run_all_threads(void) +static void pktgen_run_all_threads(struct pktgen_net *pn) { struct pktgen_thread *t; @@ -2988,7 +3000,7 @@ static void pktgen_run_all_threads(void) mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= (T_RUN); mutex_unlock(&pktgen_thread_lock); @@ -2996,10 +3008,10 @@ static void pktgen_run_all_threads(void) /* Propagate thread->control */ 
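The per-netns rework above changes who owns the thread list, but not the signalling protocol itself: the controller sets request bits in each worker's control word under pktgen_thread_lock, then sleeps briefly so the kthreads can observe and act on them. A condensed sketch of that handshake (illustrative only; pktgen_signal_all is a hypothetical helper, not code from this patch):

	static void pktgen_signal_all(struct pktgen_net *pn, u32 bit)
	{
		struct pktgen_thread *t;

		mutex_lock(&pktgen_thread_lock);
		list_for_each_entry(t, &pn->pktgen_threads, th_list)
			t->control |= bit;	/* e.g. T_RUN, T_STOP, T_REMDEVALL */
		mutex_unlock(&pktgen_thread_lock);

		/* give the workers a window to pick up the new bits */
		schedule_timeout_interruptible(msecs_to_jiffies(125));
	}

Each worker wakes in pktgen_thread_worker(), acts on whatever bits it finds, and clears them; the 125 ms sleep only has to be long enough for the workers to start, since pktgen_wait_all_threads_run() does the actual synchronization.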
schedule_timeout_interruptible(msecs_to_jiffies(125)); - pktgen_wait_all_threads_run(); + pktgen_wait_all_threads_run(pn); } -static void pktgen_reset_all_threads(void) +static void pktgen_reset_all_threads(struct pktgen_net *pn) { struct pktgen_thread *t; @@ -3007,7 +3019,7 @@ static void pktgen_reset_all_threads(void) mutex_lock(&pktgen_thread_lock); - list_for_each_entry(t, &pktgen_threads, th_list) + list_for_each_entry(t, &pn->pktgen_threads, th_list) t->control |= (T_REMDEVALL); mutex_unlock(&pktgen_thread_lock); @@ -3015,7 +3027,7 @@ static void pktgen_reset_all_threads(void) /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); - pktgen_wait_all_threads_run(); + pktgen_wait_all_threads_run(pn); } static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) @@ -3157,9 +3169,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) static void pktgen_rem_thread(struct pktgen_thread *t) { /* Remove from the thread list */ - - remove_proc_entry(t->tsk->comm, pg_proc_dir); - + remove_proc_entry(t->tsk->comm, t->net->proc_dir); } static void pktgen_resched(struct pktgen_dev *pkt_dev) @@ -3305,7 +3315,7 @@ static int pktgen_thread_worker(void *arg) pkt_dev = next_to_run(t); if (unlikely(!pkt_dev && t->control == 0)) { - if (pktgen_exiting) + if (t->net->pktgen_exiting) break; wait_event_interruptible_timeout(t->queue, t->control != 0, @@ -3427,7 +3437,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) /* We don't allow a device to be on several threads */ - pkt_dev = __pktgen_NN_threads(ifname, FIND); + pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND); if (pkt_dev) { pr_err("ERROR: interface already used\n"); return -EBUSY; @@ -3462,13 +3472,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_id = 0xffff; pkt_dev->node = -1; - err = pktgen_setup_dev(pkt_dev, ifname); + err = pktgen_setup_dev(t->net, pkt_dev, ifname); if (err) goto out1; if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING) pkt_dev->clone_skb = pg_clone_skb_d; - pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir, + pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir, &pktgen_if_fops, pkt_dev); if (!pkt_dev->entry) { pr_err("cannot create %s/%s procfs entry\n", @@ -3493,7 +3503,7 @@ out1: return err; } -static int __init pktgen_create_thread(int cpu) +static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) { struct pktgen_thread *t; struct proc_dir_entry *pe; @@ -3511,7 +3521,7 @@ static int __init pktgen_create_thread(int cpu) INIT_LIST_HEAD(&t->if_list); - list_add_tail(&t->th_list, &pktgen_threads); + list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); p = kthread_create_on_node(pktgen_thread_worker, @@ -3527,7 +3537,7 @@ static int __init pktgen_create_thread(int cpu) kthread_bind(p, cpu); t->tsk = p; - pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir, + pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, &pktgen_thread_fops, t); if (!pe) { pr_err("cannot create %s/%s procfs entry\n", @@ -3538,6 +3548,7 @@ static int __init pktgen_create_thread(int cpu) return -EINVAL; } + t->net = pn; wake_up_process(p); wait_for_completion(&t->start_done); @@ -3563,7 +3574,6 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t, static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) { - pr_debug("remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { @@ -3583,7 +3593,7 @@ static int 
pktgen_remove_device(struct pktgen_thread *t, _rem_dev_from_if_list(t, pkt_dev); if (pkt_dev->entry) - remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + proc_remove(pkt_dev->entry); #ifdef CONFIG_XFRM free_SAs(pkt_dev); @@ -3595,63 +3605,63 @@ static int pktgen_remove_device(struct pktgen_thread *t, return 0; } -static int __init pg_init(void) +static int __net_init pg_net_init(struct net *net) { - int cpu; + struct pktgen_net *pn = net_generic(net, pg_net_id); struct proc_dir_entry *pe; - int ret = 0; - - pr_info("%s", version); - - pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); - if (!pg_proc_dir) + int cpu, ret = 0; + + pn->net = net; + INIT_LIST_HEAD(&pn->pktgen_threads); + pn->pktgen_exiting = false; + pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net); + if (!pn->proc_dir) { + pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR); return -ENODEV; - - pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); + } + pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops); if (pe == NULL) { - pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); + pr_err("cannot create %s procfs entry\n", PGCTRL); ret = -EINVAL; - goto remove_dir; + goto remove; } - register_netdevice_notifier(&pktgen_notifier_block); - for_each_online_cpu(cpu) { int err; - err = pktgen_create_thread(cpu); + err = pktgen_create_thread(cpu, pn); if (err) - pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n", + pr_warn("Cannot create thread for cpu %d (%d)\n", cpu, err); } - if (list_empty(&pktgen_threads)) { - pr_err("ERROR: Initialization failed for all threads\n"); + if (list_empty(&pn->pktgen_threads)) { + pr_err("Initialization failed for all threads\n"); ret = -ENODEV; - goto unregister; + goto remove_entry; } return 0; - unregister: - unregister_netdevice_notifier(&pktgen_notifier_block); - remove_proc_entry(PGCTRL, pg_proc_dir); - remove_dir: - proc_net_remove(&init_net, PG_PROC_DIR); +remove_entry: + remove_proc_entry(PGCTRL, pn->proc_dir); +remove: + remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); return ret; } -static void __exit pg_cleanup(void) +static void __net_exit pg_net_exit(struct net *net) { + struct pktgen_net *pn = net_generic(net, pg_net_id); struct pktgen_thread *t; struct list_head *q, *n; LIST_HEAD(list); /* Stop all interfaces & threads */ - pktgen_exiting = true; + pn->pktgen_exiting = true; mutex_lock(&pktgen_thread_lock); - list_splice_init(&pktgen_threads, &list); + list_splice_init(&pn->pktgen_threads, &list); mutex_unlock(&pktgen_thread_lock); list_for_each_safe(q, n, &list) { @@ -3661,12 +3671,36 @@ static void __exit pg_cleanup(void) kfree(t); } - /* Un-register us from receiving netdevice events */ - unregister_netdevice_notifier(&pktgen_notifier_block); + remove_proc_entry(PGCTRL, pn->proc_dir); + remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); +} + +static struct pernet_operations pg_net_ops = { + .init = pg_net_init, + .exit = pg_net_exit, + .id = &pg_net_id, + .size = sizeof(struct pktgen_net), +}; + +static int __init pg_init(void) +{ + int ret = 0; - /* Clean up proc file system */ - remove_proc_entry(PGCTRL, pg_proc_dir); - proc_net_remove(&init_net, PG_PROC_DIR); + pr_info("%s", version); + ret = register_pernet_subsys(&pg_net_ops); + if (ret) + return ret; + ret = register_netdevice_notifier(&pktgen_notifier_block); + if (ret) + unregister_pernet_subsys(&pg_net_ops); + + return ret; +} + +static void __exit pg_cleanup(void) +{ + unregister_netdevice_notifier(&pktgen_notifier_block); + unregister_pernet_subsys(&pg_net_ops); } 
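The tail of this conversion is the canonical pernet pattern: because pg_net_ops sets both .id and .size, the core allocates and zeroes one struct pktgen_net per namespace before calling the .init hook, and module load reduces to registering the pernet ops plus the netdevice notifier (unregistering the pernet ops again if the notifier hookup fails). A minimal sketch of the same pattern for a hypothetical subsystem (the foo_* names are placeholders, not from this patch):

	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	static int foo_net_id __read_mostly;

	struct foo_net {
		struct list_head objects;	/* per-namespace state */
	};

	static int __net_init foo_net_init(struct net *net)
	{
		/* core has already kzalloc'ed sizeof(struct foo_net) */
		struct foo_net *fn = net_generic(net, foo_net_id);

		INIT_LIST_HEAD(&fn->objects);
		return 0;
	}

	static void __net_exit foo_net_exit(struct net *net)
	{
		/* undo foo_net_init() and anything added since */
	}

	static struct pernet_operations foo_net_ops = {
		.init = foo_net_init,
		.exit = foo_net_exit,
		.id   = &foo_net_id,
		.size = sizeof(struct foo_net),
	};

After register_pernet_subsys(&foo_net_ops), every existing and future namespace gets its own instance, and any code holding a struct net pointer reaches it with net_generic(net, foo_net_id), exactly as the pktgen paths above do with pg_net_id.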
module_init(pg_init); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1868625af25e..a08bd2b7fe3f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -496,8 +496,10 @@ static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) } if (ops->fill_info) { data = nla_nest_start(skb, IFLA_INFO_DATA); - if (data == NULL) + if (data == NULL) { + err = -EMSGSIZE; goto err_cancel_link; + } err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; @@ -515,32 +517,6 @@ out: return err; } -static const int rtm_min[RTM_NR_FAMILIES] = -{ - [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), - [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), - [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), - [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), - [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), - [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), -}; - -static const int rta_max[RTM_NR_FAMILIES] = -{ - [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, - [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, - [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX, - [RTM_FAM(RTM_NEWRULE)] = FRA_MAX, - [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX, - [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, - [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, - [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, -}; - int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; @@ -780,6 +756,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ @@ -879,6 +856,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_stats64 *stats; struct nlattr *attr, *af_spec; struct rtnl_af_ops *af_ops; + struct net_device *upper_dev = netdev_master_upper_dev_get(dev); ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -907,8 +885,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #endif (dev->ifindex != dev->iflink && nla_put_u32(skb, IFLA_LINK, dev->iflink)) || - (dev->master && - nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) || + (upper_dev && + nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || + nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || (dev->ifalias && @@ -976,6 +955,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, * report anything. 
*/ ivi.spoofchk = -1; + memset(ivi.mac, 0, sizeof(ivi.mac)); if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) break; vf_mac.vf = @@ -1057,7 +1037,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) int idx = 0, s_idx; struct net_device *dev; struct hlist_head *head; - struct hlist_node *node; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; @@ -1067,7 +1046,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->dev_base_seq; - if (nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy) >= 0) { if (tb[IFLA_EXT_MASK]) @@ -1077,7 +1056,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -1108,6 +1087,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, + [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, @@ -1270,16 +1250,16 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) static int do_set_master(struct net_device *dev, int ifindex) { - struct net_device *master_dev; + struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; - if (dev->master) { - if (dev->master->ifindex == ifindex) + if (upper_dev) { + if (upper_dev->ifindex == ifindex) return 0; - ops = dev->master->netdev_ops; + ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { - err = ops->ndo_del_slave(dev->master, dev); + err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; } else { @@ -1288,12 +1268,12 @@ static int do_set_master(struct net_device *dev, int ifindex) } if (ifindex) { - master_dev = __dev_get_by_index(dev_net(dev), ifindex); - if (!master_dev) + upper_dev = __dev_get_by_index(dev_net(dev), ifindex); + if (!upper_dev) return -EINVAL; - ops = master_dev->netdev_ops; + ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(master_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev); if (err) return err; } else { @@ -1307,7 +1287,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { const struct net_device_ops *ops = dev->netdev_ops; - int send_addr_notify = 0; int err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { @@ -1360,16 +1339,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct sockaddr *sa; int len; - if (!ops->ndo_set_mac_address) { - err = -EOPNOTSUPP; - goto errout; - } - - if (!netif_device_present(dev)) { - err = -ENODEV; - goto errout; - } - len = sizeof(sa_family_t) + dev->addr_len; sa = kmalloc(len, GFP_KERNEL); if (!sa) { @@ -1379,13 +1348,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = ops->ndo_set_mac_address(dev, sa); + err = dev_set_mac_address(dev, sa); kfree(sa); if (err) goto errout; - send_addr_notify = 1; modified = 1; - add_device_randomness(dev->dev_addr, 
dev->addr_len); } if (tb[IFLA_MTU]) { @@ -1422,7 +1389,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); - send_addr_notify = 1; + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (ifm->ifi_flags || ifm->ifi_change) { @@ -1438,6 +1405,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, modified = 1; } + if (tb[IFLA_CARRIER]) { + err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); + if (err) + goto errout; + modified = 1; + } + if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); @@ -1536,13 +1510,10 @@ errout: net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); - if (send_addr_notify) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; } -static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -1583,7 +1554,7 @@ errout: return err; } -static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; @@ -1672,9 +1643,11 @@ struct net_device *rtnl_create_link(struct net *net, if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); - if (tb[IFLA_ADDRESS]) + if (tb[IFLA_ADDRESS]) { memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); + dev->addr_assign_type = NET_ADDR_SET; + } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); @@ -1712,7 +1685,7 @@ static int rtnl_group_changelink(struct net *net, int group, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; @@ -1867,7 +1840,7 @@ out: } } -static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -1923,7 +1896,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) u32 ext_filter_mask = 0; u16 min_ifinfo_dump_size = 0; - if (nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + if (nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -1958,8 +1931,11 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (rtnl_msg_handlers[idx] == NULL || rtnl_msg_handlers[idx][type].dumpit == NULL) continue; - if (idx > s_idx) + if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); + cb->prev_seq = 0; + cb->seq = 0; + } if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) break; } @@ -1992,6 +1968,7 @@ errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); } +EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, @@ -2051,19 +2028,47 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +/** + * ndo_dflt_fdb_add - default netdevice operation to add an FDB 
entry + */ +int ndo_dflt_fdb_add(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, + u16 flags) +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_add_excl(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_add_excl(dev, addr); + + /* Only return duplicate errors if NLM_F_EXCL is set */ + if (err == -EEXIST && !(flags & NLM_F_EXCL)) + err = 0; + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_add); + +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - struct net_device *master = NULL; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; int err; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); if (err < 0) return err; @@ -2086,7 +2091,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } addr = nla_data(tb[NDA_LLADDR]); - if (!is_valid_ether_addr(addr)) { + if (is_zero_ether_addr(addr)) { pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ether address\n"); return -EINVAL; } @@ -2096,10 +2101,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && (dev->priv_flags & IFF_BRIDGE_PORT)) { - master = dev->master; - err = master->netdev_ops->ndo_fdb_add(ndm, tb, - dev, addr, - nlh->nlmsg_flags); + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + const struct net_device_ops *ops = br_dev->netdev_ops; + + err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); if (err) goto out; else @@ -2107,10 +2112,13 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } /* Embedded bridge, macvlan, and any other device support */ - if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) { - err = dev->netdev_ops->ndo_fdb_add(ndm, tb, - dev, addr, - nlh->nlmsg_flags); + if ((ndm->ndm_flags & NTF_SELF)) { + if (dev->netdev_ops->ndo_fdb_add) + err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, + nlh->nlmsg_flags); + else + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, + nlh->nlmsg_flags); if (!err) { rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH); @@ -2121,11 +2129,40 @@ out: return err; } -static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +/** + * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry + */ +int ndo_dflt_fdb_del(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr) +{ + int err = -EOPNOTSUPP; + + /* If aging addresses are supported device will need to + * implement its own handler for this. 
+ */ + if (ndm->ndm_state & NUD_PERMANENT) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return -EINVAL; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_del(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_del(dev, addr); + else + err = -EINVAL; + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_del); + +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; - struct nlattr *llattr; + struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; int err = -EINVAL; __u8 *addr; @@ -2133,8 +2170,9 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (nlmsg_len(nlh) < sizeof(*ndm)) - return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (err < 0) + return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { @@ -2148,22 +2186,27 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -ENODEV; } - llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR); - if (llattr == NULL || nla_len(llattr) != ETH_ALEN) { - pr_info("PF_BRIGDE: RTM_DELNEIGH with invalid address\n"); + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); + return -EINVAL; + } + + addr = nla_data(tb[NDA_LLADDR]); + if (is_zero_ether_addr(addr)) { + pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ether address\n"); return -EINVAL; } - addr = nla_data(llattr); err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && (dev->priv_flags & IFF_BRIDGE_PORT)) { - struct net_device *master = dev->master; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + const struct net_device_ops *ops = br_dev->netdev_ops; - if (master->netdev_ops->ndo_fdb_del) - err = master->netdev_ops->ndo_fdb_del(ndm, dev, addr); + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr); if (err) goto out; @@ -2172,8 +2215,11 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } /* Embedded bridge, macvlan, and any other device support */ - if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_del) { - err = dev->netdev_ops->ndo_fdb_del(ndm, dev, addr); + if (ndm->ndm_flags & NTF_SELF) { + if (dev->netdev_ops->ndo_fdb_del) + err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); + else + err = ndo_dflt_fdb_del(ndm, tb, dev, addr); if (!err) { rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); @@ -2218,7 +2264,7 @@ skip: * @dev: netdevice * * Default netdevice operation to dump the existing unicast address list. - * Returns zero on success. + * Returns number of addresses from list put in skb. 
*/ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -2247,15 +2293,19 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_BRIDGE_PORT) { - struct net_device *master = dev->master; - const struct net_device_ops *ops = master->netdev_ops; + struct net_device *br_dev; + const struct net_device_ops *ops; + br_dev = netdev_master_upper_dev_get(dev); + ops = br_dev->netdev_ops; if (ops->ndo_fdb_dump) idx = ops->ndo_fdb_dump(skb, cb, dev, idx); } if (dev->netdev_ops->ndo_fdb_dump) idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); + else + idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); } rcu_read_unlock(); @@ -2270,6 +2320,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct ifinfomsg *ifm; struct nlattr *br_afspec; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); if (nlh == NULL) @@ -2287,8 +2338,8 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || - (dev->master && - nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) || + (br_dev && + nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev->iflink && @@ -2320,23 +2371,31 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; + struct nlattr *extfilt; + u32 filter_mask = 0; + + extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct rtgenmsg), + IFLA_EXT_MASK); + if (extfilt) + filter_mask = nla_get_u32(extfilt); rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; - struct net_device *master = dev->master; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); - if (master && master->netdev_ops->ndo_bridge_getlink) { + if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && - master->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev) < 0) + br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, filter_mask) < 0) break; idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev) < 0) + ops->ndo_bridge_getlink(skb, portid, seq, dev, + filter_mask) < 0) break; idx++; } @@ -2365,7 +2424,7 @@ static inline size_t bridge_nlmsg_size(void) static int rtnl_bridge_notify(struct net_device *dev, u16 flags) { struct net *net = dev_net(dev); - struct net_device *master = dev->master; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; @@ -2376,15 +2435,15 @@ static int rtnl_bridge_notify(struct net_device *dev, u16 flags) } if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) && - master && master->netdev_ops->ndo_bridge_getlink) { - err = master->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); + br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { + err = br_dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); if (err < 0) goto errout; } if ((flags & BRIDGE_FLAGS_SELF) && dev->netdev_ops->ndo_bridge_getlink) { - err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); + err = 
dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); if (err < 0) goto errout; } @@ -2398,8 +2457,7 @@ errout: return err; } -static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, - void *arg) +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -2436,13 +2494,14 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, oflags = flags; if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { - if (!dev->master || - !dev->master->netdev_ops->ndo_bridge_setlink) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } - err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh); if (err) goto out; @@ -2468,9 +2527,75 @@ out: return err; } -/* Protected by RTNL sempahore. */ -static struct rtattr **rta_buf; -static int rtattr_max; +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + struct nlattr *br_spec, *attr = NULL; + int rem, err = -EOPNOTSUPP; + u16 oflags, flags = 0; + bool have_flags = false; + + if (nlmsg_len(nlh) < sizeof(*ifm)) + return -EINVAL; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + dev = __dev_get_by_index(net, ifm->ifi_index); + if (!dev) { + pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + return -ENODEV; + } + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (br_spec) { + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + have_flags = true; + flags = nla_get_u16(attr); + break; + } + } + } + + oflags = flags; + + if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { + err = -EOPNOTSUPP; + goto out; + } + + err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + if (err) + goto out; + + flags &= ~BRIDGE_FLAGS_MASTER; + } + + if ((flags & BRIDGE_FLAGS_SELF)) { + if (!dev->netdev_ops->ndo_bridge_dellink) + err = -EOPNOTSUPP; + else + err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + + if (!err) + flags &= ~BRIDGE_FLAGS_SELF; + } + + if (have_flags) + memcpy(nla_data(attr), &flags, sizeof(flags)); + /* Generate event to notify upper layer of bridge change */ + if (!err) + err = rtnl_bridge_notify(dev, oflags); +out: + return err; +} /* Process one rtnetlink message. 
*/ @@ -2479,7 +2604,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct net *net = sock_net(skb->sk); rtnl_doit_func doit; int sz_idx, kind; - int min_len; int family; int type; int err; @@ -2491,10 +2615,10 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) type -= RTM_BASE; /* All the messages must have at least 1 byte length */ - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) return 0; - family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; sz_idx = type>>2; kind = type&3; @@ -2527,32 +2651,11 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } - memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); - - min_len = rtm_min[sz_idx]; - if (nlh->nlmsg_len < min_len) - return -EINVAL; - - if (nlh->nlmsg_len > min_len) { - int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); - - while (RTA_OK(attr, attrlen)) { - unsigned int flavor = attr->rta_type; - if (flavor) { - if (flavor > rta_max[sz_idx]) - return -EINVAL; - rta_buf[flavor-1] = attr; - } - attr = RTA_NEXT(attr, attrlen); - } - } - doit = rtnl_get_doit(family, type); if (doit == NULL) return -EOPNOTSUPP; - return doit(skb, nlh, (void *)&rta_buf[0]); + return doit(skb, nlh); } static void rtnetlink_rcv(struct sk_buff *skb) @@ -2622,16 +2725,6 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { - int i; - - rtattr_max = 0; - for (i = 0; i < ARRAY_SIZE(rta_max); i++) - if (rta_max[i] > rtattr_max) - rtattr_max = rta_max[i]; - rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); - if (!rta_buf) - panic("rtnetlink_init: cannot allocate rta_buf\n"); - if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); @@ -2651,6 +2744,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); + rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); } diff --git a/net/core/scm.c b/net/core/scm.c index 905dcc6ad1e3..03795d0147f2 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -24,6 +24,7 @@ #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/security.h> +#include <linux/pid_namespace.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/slab.h> @@ -52,7 +53,8 @@ static __inline__ int scm_check_creds(struct ucred *creds) if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; - if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || + ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || @@ -185,22 +187,6 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) p->creds.uid = uid; p->creds.gid = gid; - - if (!p->cred || - !uid_eq(p->cred->euid, uid) || - !gid_eq(p->cred->egid, gid)) { - struct cred *cred; - err = -ENOMEM; - cred = prepare_creds(); - if (!cred) - goto error; - - cred->uid = cred->euid = uid; - cred->gid = cred->egid = gid; - if (p->cred) - 
put_cred(p->cred); - p->cred = cred; - } break; } default: @@ -304,8 +290,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) /* Bump the usage count and install the file. */ sock = sock_from_file(fp[i], &err); if (sock) { - sock_update_netprioidx(sock->sk, current); - sock_update_classid(sock->sk, current); + sock_update_netprioidx(sock->sk); + sock_update_classid(sock->sk); } fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index e61a8bb7fce7..6a2f13cee86a 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -12,12 +12,10 @@ static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; -static int __init net_secret_init(void) +void net_secret_init(void) { get_random_bytes(net_secret, sizeof(net_secret)); - return 0; } -late_initcall(net_secret_init); #ifdef CONFIG_INET static u32 seq_scale(u32 seq) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 32443ebc3e89..af9185d0be6a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -104,47 +104,37 @@ static const struct pipe_buf_operations sock_pipe_buf_ops = { .get = sock_pipe_buf_get, }; -/* - * Keep out-of-line to prevent kernel bloat. - * __builtin_return_address is not used because it is not always - * reliable. - */ - /** - * skb_over_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_put(). Not user callable. + * skb_panic - private function for out-of-line support + * @skb: buffer + * @sz: size + * @addr: address + * @msg: skb_over_panic or skb_under_panic + * + * Out-of-line support for skb_put() and skb_push(). + * Called via the wrapper skb_over_panic() or skb_under_panic(). + * Keep out of line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always reliable. */ -static void skb_over_panic(struct sk_buff *skb, int sz, void *here) +static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, + const char msg[]) { pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", - __func__, here, skb->len, sz, skb->head, skb->data, + msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); BUG(); } -/** - * skb_under_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_push(). Not user callable. - */ - -static void skb_under_panic(struct sk_buff *skb, int sz, void *here) +static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", - __func__, here, skb->len, sz, skb->head, skb->data, - (unsigned long)skb->tail, (unsigned long)skb->end, - skb->dev ? 
skb->dev->name : "<NULL>"); - BUG(); + skb_panic(skb, sz, addr, __func__); } +static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) +{ + skb_panic(skb, sz, addr, __func__); +} /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells @@ -155,8 +145,9 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) */ #define kmalloc_reserve(size, gfp, node, pfmemalloc) \ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) -void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip, - bool *pfmemalloc) + +static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, + unsigned long ip, bool *pfmemalloc) { void *obj; bool ret_pfmemalloc = false; @@ -188,6 +179,33 @@ out: * */ +struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) +{ + struct sk_buff *skb; + + /* Get the HEAD */ + skb = kmem_cache_alloc_node(skbuff_head_cache, + gfp_mask & ~__GFP_DMA, node); + if (!skb) + goto out; + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->data = NULL; + skb->truesize = sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->mac_header = ~0U; +#endif +out: + return skb; +} + /** * __alloc_skb - allocate a network buffer * @size: size to allocate @@ -259,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->end = skb->tail + size; #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->mac_header = ~0U; + skb->transport_header = ~0U; #endif /* make sure we initialize shinfo sequentially */ @@ -327,6 +346,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) skb->end = skb->tail + size; #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->mac_header = ~0U; + skb->transport_header = ~0U; #endif /* make sure we initialize shinfo sequentially */ @@ -348,10 +368,6 @@ struct netdev_alloc_cache { }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) -#define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) -#define NETDEV_PAGECNT_MAX_BIAS NETDEV_FRAG_PAGE_MAX_SIZE - static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct netdev_alloc_cache *nc; @@ -595,7 +611,8 @@ static void skb_release_head_state(struct sk_buff *skb) static void skb_release_all(struct sk_buff *skb) { skb_release_head_state(skb); - skb_release_data(skb); + if (likely(skb->data)) + skb_release_data(skb); } /** @@ -684,6 +701,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mac_header = old->mac_header; new->inner_transport_header = old->inner_transport_header; new->inner_network_header = old->inner_network_header; + new->inner_mac_header = old->inner_mac_header; skb_dst_copy(new, old); new->rxhash = old->rxhash; new->ooo_okay = old->ooo_okay; @@ -717,6 +735,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->tc_verd = old->tc_verd; #endif #endif + new->vlan_proto = old->vlan_proto; new->vlan_tci = old->vlan_tci; skb_copy_secmark(new, old); @@ -878,6 +897,18 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) } EXPORT_SYMBOL(skb_clone); +static void skb_headers_offset_update(struct sk_buff *skb, int off) +{ + /* {transport,network,mac}_header and tail are relative to skb->head */ + skb->transport_header += off; + 
skb->network_header += off; + if (skb_mac_header_was_set(skb)) + skb->mac_header += off; + skb->inner_transport_header += off; + skb->inner_network_header += off; + skb->inner_mac_header += off; +} + static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { #ifndef NET_SKBUFF_DATA_USES_OFFSET @@ -890,13 +921,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) __copy_skb_header(new, old); #ifndef NET_SKBUFF_DATA_USES_OFFSET - /* {transport,network,mac}_header are relative to skb->head */ - new->transport_header += offset; - new->network_header += offset; - if (skb_mac_header_was_set(new)) - new->mac_header += offset; - new->inner_transport_header += offset; - new->inner_network_header += offset; + skb_headers_offset_update(new, offset); #endif skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; @@ -1088,14 +1113,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, #else skb->end = skb->head + size; #endif - /* {transport,network,mac}_header and tail are relative to skb->head */ skb->tail += off; - skb->transport_header += off; - skb->network_header += off; - if (skb_mac_header_was_set(skb)) - skb->mac_header += off; - skb->inner_transport_header += off; - skb->inner_network_header += off; + skb_headers_offset_update(skb, off); /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += nhead; @@ -1191,12 +1210,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, if (n->ip_summed == CHECKSUM_PARTIAL) n->csum_start += off; #ifdef NET_SKBUFF_DATA_USES_OFFSET - n->transport_header += off; - n->network_header += off; - if (skb_mac_header_was_set(skb)) - n->mac_header += off; - n->inner_transport_header += off; - n->inner_network_header += off; + skb_headers_offset_update(n, off); #endif return n; @@ -2337,6 +2351,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2668,48 +2683,37 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, int len, int odd, struct sk_buff *skb), void *from, int length) { - int frg_cnt = 0; - skb_frag_t *frag = NULL; - struct page *page = NULL; - int copy, left; + int frg_cnt = skb_shinfo(skb)->nr_frags; + int copy; int offset = 0; int ret; + struct page_frag *pfrag = &current->task_frag; do { /* Return error if we don't have space for new frag */ - frg_cnt = skb_shinfo(skb)->nr_frags; if (frg_cnt >= MAX_SKB_FRAGS) - return -EFAULT; - - /* allocate a new page for next frag */ - page = alloc_pages(sk->sk_allocation, 0); + return -EMSGSIZE; - /* If alloc_page fails just return failure and caller will - * free previous allocated pages by doing kfree_skb() - */ - if (page == NULL) + if (!sk_page_frag_refill(sk, pfrag)) return -ENOMEM; - /* initialize the next frag */ - skb_fill_page_desc(skb, frg_cnt, page, 0, 0); - skb->truesize += PAGE_SIZE; - atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); - - /* get the new initialized frag */ - frg_cnt = skb_shinfo(skb)->nr_frags; - frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; - /* copy the user data to page */ - left = PAGE_SIZE - frag->page_offset; - copy = (length > left)?
left : length; + copy = min_t(int, length, pfrag->size - pfrag->offset); - ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), - offset, copy, 0, skb); + ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, + offset, copy, 0, skb); if (ret < 0) return -EFAULT; /* copy was successful so update the size parameters */ - skb_frag_size_add(frag, copy); + skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, + copy); + frg_cnt++; + pfrag->offset += copy; + get_page(pfrag->page); + + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); skb->len += copy; skb->data_len += copy; offset += copy; @@ -2759,14 +2763,22 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) unsigned int mss = skb_shinfo(skb)->gso_size; unsigned int doffset = skb->data - skb_mac_header(skb); unsigned int offset = doffset; + unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int headroom; unsigned int len; + __be16 proto; + bool csum; int sg = !!(features & NETIF_F_SG); int nfrags = skb_shinfo(skb)->nr_frags; int err = -ENOMEM; int i = 0; int pos; + proto = skb_network_protocol(skb); + if (unlikely(!proto)) + return ERR_PTR(-EINVAL); + + csum = !!can_checksum_protocol(features, proto); __skb_push(skb, doffset); headroom = skb_headroom(skb); pos = skb_headlen(skb); @@ -2835,7 +2847,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_set_network_header(nskb, skb->mac_len); nskb->transport_header = (nskb->network_header + skb_network_header_len(skb)); - skb_copy_from_linear_data(skb, nskb->data, doffset); + + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + doffset + tnl_hlen); if (fskb != skb_shinfo(skb)->frag_list) continue; @@ -2853,6 +2868,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_copy_from_linear_data_offset(skb, offset, skb_put(nskb, hsize), hsize); + skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + while (pos < offset + len && i < nfrags) { *frag = skb_shinfo(skb)->frags[i]; __skb_frag_ref(frag); @@ -2899,6 +2916,12 @@ skip_fraglist: nskb->data_len = len - hsize; nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; + + if (!csum) { + nskb->csum = skb_checksum(nskb, doffset, + nskb->len - doffset, 0); + nskb->ip_summed = CHECKSUM_NONE; + } } while ((offset += len) < skb->len); return segs; @@ -3304,12 +3327,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; - if (hwtstamps) { - *skb_hwtstamps(skb) = + *skb_hwtstamps(orig_skb) = *hwtstamps; } else { /* @@ -3317,9 +3336,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, * so keep the shared tx_flags and only * store software time stamp */ - skb->tstamp = ktime_get_real(); + orig_skb->tstamp = ktime_get_real(); } + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; @@ -3376,6 +3399,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + start; skb->csum_offset = off; + skb_set_transport_header(skb, start); return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); diff --git a/net/core/sock.c b/net/core/sock.c index bc131d419683..d4f4cea726e7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -186,8 +186,10 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) static struct lock_class_key 
af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; +#if defined(CONFIG_MEMCG_KMEM) struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); +#endif /* * Make lock validator output more readable. (we pre-construct these @@ -665,6 +667,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname, case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: @@ -861,6 +866,13 @@ set_rcvbuf: ret = sk_detach_filter(sk); break; + case SO_LOCK_FILTER: + if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) + ret = -EPERM; + else + sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); + break; + case SO_PASSSEC: if (valbool) set_bit(SOCK_PASSSEC, &sock->flags); @@ -895,6 +907,10 @@ set_rcvbuf: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; + case SO_SELECT_ERR_QUEUE: + sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); + break; + default: ret = -ENOPROTOOPT; break; @@ -965,6 +981,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_reuse; break; + case SO_REUSEPORT: + v.val = sk->sk_reuseport; + break; + case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; @@ -1140,6 +1160,14 @@ int sock_getsockopt(struct socket *sock, int level, int optname, goto lenout; + case SO_LOCK_FILTER: + v.val = sock_flag(sk, SOCK_FILTER_LOCKED); + break; + + case SO_SELECT_ERR_QUEUE: + v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); + break; + default: return -ENOPROTOOPT; } @@ -1278,13 +1306,12 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#ifdef CONFIG_CGROUPS #if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk, struct task_struct *task) +void sock_update_classid(struct sock *sk) { u32 classid; - classid = task_cls_classid(task); + classid = task_cls_classid(current); if (classid != sk->sk_classid) sk->sk_classid = classid; } @@ -1292,16 +1319,15 @@ EXPORT_SYMBOL(sock_update_classid); #endif #if IS_ENABLED(CONFIG_NETPRIO_CGROUP) -void sock_update_netprioidx(struct sock *sk, struct task_struct *task) +void sock_update_netprioidx(struct sock *sk) { if (in_interrupt()) return; - sk->sk_cgrp_prioidx = task_netprioidx(task); + sk->sk_cgrp_prioidx = task_netprioidx(current); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif -#endif /** * sk_alloc - All socket objects are allocated here @@ -1327,8 +1353,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, get_net(net)); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk, current); - sock_update_netprioidx(sk, current); + sock_update_classid(sk); + sock_update_netprioidx(sk); } return sk; @@ -2212,7 +2238,7 @@ EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { - if (timer_pending(timer) && del_timer(timer)) + if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); @@ -2818,7 +2844,7 @@ static const struct file_operations proto_seq_fops = { static __net_init int proto_init_net(struct net *net) { - if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) + if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) return -ENOMEM; return 0; @@ -2826,7 +2852,7 @@ static __net_init int proto_init_net(struct net *net) static __net_exit void proto_exit_net(struct net *net) { - proc_net_remove(net, "protocols"); + remove_proc_entry("protocols", 
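/*
 * This conversion repeats throughout the series: the old
 * proc_net_fops_create()/proc_net_remove() wrappers give way to the
 * generic proc API with an explicit net->proc_net parent, i.e.:
 *
 *	if (!proc_create("protocols", S_IRUGO, net->proc_net,
 *			 &proto_seq_fops))
 *		return -ENOMEM;
 *
 * paired on teardown with:
 *
 *	remove_proc_entry("protocols", net->proc_net);
 */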
net->proc_net); } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 602cd637182e..d5bef0b0f639 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -49,6 +49,39 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); +int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, + struct sk_buff *skb, int attrtype) +{ + struct nlattr *attr; + struct sk_filter *filter; + unsigned int len; + int err = 0; + + if (!ns_capable(user_ns, CAP_NET_ADMIN)) { + nla_reserve(skb, attrtype, 0); + return 0; + } + + rcu_read_lock(); + + filter = rcu_dereference(sk->sk_filter); + len = filter ? filter->len * sizeof(struct sock_filter) : 0; + + attr = nla_reserve(skb, attrtype, len); + if (attr == NULL) { + err = -EMSGSIZE; + goto out; + } + + if (filter) + memcpy(nla_data(attr), filter->insns, len); + +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(sock_diag_put_filterinfo); + void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); @@ -97,21 +130,6 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld) } EXPORT_SYMBOL_GPL(sock_diag_unregister); -static const inline struct sock_diag_handler *sock_diag_lock_handler(int family) -{ - if (sock_diag_handlers[family] == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, family); - - mutex_lock(&sock_diag_table_mutex); - return sock_diag_handlers[family]; -} - -static inline void sock_diag_unlock_handler(const struct sock_diag_handler *h) -{ - mutex_unlock(&sock_diag_table_mutex); -} - static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; @@ -121,12 +139,20 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(*req)) return -EINVAL; - hndl = sock_diag_lock_handler(req->sdiag_family); + if (req->sdiag_family >= AF_MAX) + return -EINVAL; + + if (sock_diag_handlers[req->sdiag_family] == NULL) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, req->sdiag_family); + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[req->sdiag_family]; if (hndl == NULL) err = -ENOENT; else err = hndl->dump(skb, nlh); - sock_diag_unlock_handler(hndl); + mutex_unlock(&sock_diag_table_mutex); return err; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d1b08045a9df..cfdb46ab3a7f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -20,6 +20,8 @@ #include <net/sock.h> #include <net/net_ratelimit.h> +static int one = 1; + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -92,28 +94,32 @@ static struct ctl_table net_core_table[] = { .data = &sysctl_wmem_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = 
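/*
 * All four {w,r}mem_{max,default} entries in this table move from
 * proc_dointvec to proc_dointvec_minmax with a lower bound of one, so
 * these limits can no longer be written as zero or negative. Only
 * .extra1 is set; there is still no upper bound:
 *
 *	static int one = 1;
 *
 *	{
 *		.procname	= "rmem_default",
 *		.data		= &sysctl_rmem_default,
 *		.maxlen		= sizeof(int),
 *		.mode		= 0644,
 *		.proc_handler	= proc_dointvec_minmax,
 *		.extra1		= &one,
 *	},
 */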
proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "dev_weight", diff --git a/net/core/utils.c b/net/core/utils.c index e3487e461939..3c7f5b51b979 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/ctype.h> #include <linux/inet.h> #include <linux/mm.h> #include <linux/net.h> @@ -348,9 +349,7 @@ int mac_pton(const char *s, u8 *mac) /* Don't dirty result unless string is valid MAC. */ for (i = 0; i < ETH_ALEN; i++) { - if (!strchr("0123456789abcdefABCDEF", s[i * 3])) - return 0; - if (!strchr("0123456789abcdefABCDEF", s[i * 3 + 1])) + if (!isxdigit(s[i * 3]) || !isxdigit(s[i * 3 + 1])) return 0; if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':') return 0; diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c index 1d9eb7c60a68..4f72fc40bf02 100644 --- a/net/dcb/dcbevent.c +++ b/net/dcb/dcbevent.c @@ -20,6 +20,7 @@ #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/export.h> +#include <net/dcbevent.h> static ATOMIC_NOTIFIER_HEAD(dcbevent_notif_chain); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 1b588e23cf80..40d5829ed36a 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -284,6 +284,7 @@ static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh, if (!netdev->dcbnl_ops->getpermhwaddr) return -EOPNOTSUPP; + memset(perm_addr, 0, sizeof(perm_addr)); netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr); @@ -1042,6 +1043,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) if (ops->ieee_getets) { struct ieee_ets ets; + memset(&ets, 0, sizeof(ets)); err = ops->ieee_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets)) @@ -1050,6 +1052,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) if (ops->ieee_getmaxrate) { struct ieee_maxrate maxrate; + memset(&maxrate, 0, sizeof(maxrate)); err = ops->ieee_getmaxrate(netdev, &maxrate); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE, @@ -1061,6 +1064,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) if (ops->ieee_getpfc) { struct ieee_pfc pfc; + memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc)) @@ -1094,6 +1098,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; + memset(&ets, 0, sizeof(ets)); err = ops->ieee_peer_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets)) @@ -1102,6 +1107,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) if (ops->ieee_peer_getpfc) { struct ieee_pfc pfc; + memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc)) @@ -1280,6 +1286,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) /* peer info if available */ if (ops->cee_peer_getpg) { struct cee_pg pg; + memset(&pg, 0, sizeof(pg)); err = ops->cee_peer_getpg(netdev, &pg); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg)) @@ -1288,6 +1295,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) if (ops->cee_peer_getpfc) { struct cee_pfc pfc; + memset(&pfc, 0, sizeof(pfc)); err = ops->cee_peer_getpfc(netdev, &pfc); if (!err 
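/*
 * The memset() additions throughout dcbnl.c close one recurring hole:
 * these structs live on the kernel stack, the driver callback is not
 * obliged to fill every field, and nla_put() copies sizeof(struct) raw
 * bytes to userspace, padding included. Zeroing first prevents a stack
 * infoleak:
 *
 *	struct ieee_pfc pfc;
 *
 *	memset(&pfc, 0, sizeof(pfc));
 *	err = ops->ieee_getpfc(netdev, &pfc);
 *	if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc))
 *		return -EMSGSIZE;
 */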
&& nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc)) @@ -1650,7 +1658,7 @@ static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; -static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct net_device *netdev; diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index b75968a04017..8c0ef71bed2f 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -1,6 +1,6 @@ menuconfig IP_DCCP - tristate "The DCCP Protocol (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "The DCCP Protocol" + depends on INET ---help--- Datagram Congestion Control Protocol (RFC 4340) diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 0581143cb800..8ba3fc9d6d16 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,5 +1,4 @@ -menu "DCCP CCIDs Configuration (EXPERIMENTAL)" - depends on EXPERIMENTAL +menu "DCCP CCIDs Configuration" config IP_DCCP_CCID2_DEBUG bool "CCID-2 debugging messages" @@ -12,7 +11,7 @@ config IP_DCCP_CCID2_DEBUG If in doubt, say N. config IP_DCCP_CCID3 - bool "CCID-3 (TCP-Friendly) (EXPERIMENTAL)" + bool "CCID-3 (TCP-Friendly)" def_bool y if (IP_DCCP = y || IP_DCCP = m) ---help--- CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 4f9f5eb478f1..ebc54fef85a5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -500,8 +500,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, return &rt->dst; } -static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, - struct request_values *rv_unused) +static int dccp_v4_send_response(struct sock *sk, struct request_sock *req) { int err = -1; struct sk_buff *skb; @@ -658,7 +657,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_gss = dreq->dreq_iss; dreq->dreq_service = service; - if (dccp_v4_send_response(sk, req, NULL)) + if (dccp_v4_send_response(sk, req)) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6e05981f271e..9c61f9c02fdb 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -213,8 +213,7 @@ out: } -static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, - struct request_values *rv_unused) +static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) { struct inet6_request_sock *ireq6 = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -428,7 +427,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_gss = dreq->dreq_iss; dreq->dreq_service = service; - if (dccp_v6_send_response(sk, req, NULL)) + if (dccp_v6_send_response(sk, req)) goto drop_and_free; inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 0a8d6ebd9b45..4c6bdf97a657 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -171,7 +171,7 @@ static __init int dccpprobe_init(void) spin_lock_init(&dccpw.lock); if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL)) return ret; - if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops)) + if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops)) goto err0; ret = setup_jprobe(); @@ -181,7 +181,7 @@ static __init int dccpprobe_init(void) pr_info("DCCP watch registered (port=%d)\n", port); return 0; err1: - proc_net_remove(&init_net, 
procname); + remove_proc_entry(procname, init_net.proc_net); err0: kfifo_free(&dccpw.fifo); return ret; @@ -191,7 +191,7 @@ module_init(dccpprobe_init); static __exit void dccpprobe_exit(void) { kfifo_free(&dccpw.fifo); - proc_net_remove(&init_net, procname); + remove_proc_entry(procname, init_net.proc_net); unregister_jprobe(&dccp_send_probe); } diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 7914fd619c5c..f3393e154f0f 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -25,8 +25,8 @@ config DECNET The module is called decnet. config DECNET_ROUTER - bool "DECnet: router support (EXPERIMENTAL)" - depends on DECNET && EXPERIMENTAL + bool "DECnet: router support" + depends on DECNET select FIB_RULES ---help--- Add support for turning your DECnet Endnode into a level 1 or 2 diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 307c322d53bb..c21f200eed93 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -175,12 +175,11 @@ static struct hlist_head *dn_find_list(struct sock *sk) static int check_port(__le16 port) { struct sock *sk; - struct hlist_node *node; if (port == 0) return -1; - sk_for_each(sk, node, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) { + sk_for_each(sk, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) { struct dn_scp *scp = DN_SK(sk); if (scp->addrloc == port) return -1; @@ -374,11 +373,10 @@ int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr) { struct hlist_head *list = listen_hash(addr); - struct hlist_node *node; struct sock *sk; read_lock(&dn_hash_lock); - sk_for_each(sk, node, list) { + sk_for_each(sk, list) { struct dn_scp *scp = DN_SK(sk); if (sk->sk_state != TCP_LISTEN) continue; @@ -414,11 +412,10 @@ struct sock *dn_find_by_skb(struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); struct sock *sk; - struct hlist_node *node; struct dn_scp *scp; read_lock(&dn_hash_lock); - sk_for_each(sk, node, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) { + sk_for_each(sk, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) { scp = DN_SK(sk); if (cb->src != dn_saddr2dn(&scp->peer)) continue; @@ -909,6 +906,7 @@ static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, struct dn_scp *scp = DN_SK(sk); int err = -EISCONN; struct flowidn fld; + struct dst_entry *dst; if (sock->state == SS_CONNECTED) goto out; @@ -955,10 +953,11 @@ static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, fld.flowidn_proto = DNPROTO_NSP; if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0) goto out; - sk->sk_route_caps = sk->sk_dst_cache->dev->features; + dst = __sk_dst_get(sk); + sk->sk_route_caps = dst->dev->features; sock->state = SS_CONNECTING; scp->state = DN_CI; - scp->segsize_loc = dst_metric_advmss(sk->sk_dst_cache); + scp->segsize_loc = dst_metric_advmss(dst); dn_nsp_send_conninit(sk, NSP_CI); err = -EINPROGRESS; @@ -2382,7 +2381,7 @@ static int __init decnet_init(void) dev_add_pack(&dn_dix_packet_type); register_netdevice_notifier(&dn_dev_notifier); - proc_net_fops_create(&init_net, "decnet", S_IRUGO, &dn_socket_seq_fops); + proc_create("decnet", S_IRUGO, init_net.proc_net, &dn_socket_seq_fops); dn_register_sysctl(); out: return rc; @@ -2411,7 +2410,7 @@ static void __exit decnet_exit(void) dn_neigh_cleanup(); dn_fib_cleanup(); - proc_net_remove(&init_net, "decnet"); + remove_proc_entry("decnet", init_net.proc_net); proto_unregister(&dn_proto); diff --git 
a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index e47ba9fc4a0e..7d9197063ebb 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -563,7 +563,7 @@ static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = { .len = IFNAMSIZ - 1 }, }; -static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -607,7 +607,7 @@ errout: return err; } -static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -1412,7 +1412,7 @@ void __init dn_dev_init(void) rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, NULL); rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, NULL); - proc_net_fops_create(&init_net, "decnet_dev", S_IRUGO, &dn_dev_seq_fops); + proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops); #ifdef CONFIG_SYSCTL { @@ -1433,7 +1433,7 @@ void __exit dn_dev_cleanup(void) } #endif /* CONFIG_SYSCTL */ - proc_net_remove(&init_net, "decnet_dev"); + remove_proc_entry("decnet_dev", init_net.proc_net); dn_dev_devices_off(); } diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index e36614eccc04..57dc159245ec 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -145,22 +145,10 @@ static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi return NULL; } -__le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type) +static int dn_fib_count_nhs(const struct nlattr *attr) { - while(RTA_OK(attr,attrlen)) { - if (attr->rta_type == type) - return *(__le16*)RTA_DATA(attr); - attr = RTA_NEXT(attr, attrlen); - } - - return 0; -} - -static int dn_fib_count_nhs(struct rtattr *rta) -{ - int nhs = 0; - struct rtnexthop *nhp = RTA_DATA(rta); - int nhlen = RTA_PAYLOAD(rta); + struct rtnexthop *nhp = nla_data(attr); + int nhs = 0, nhlen = nla_len(attr); while(nhlen >= (int)sizeof(struct rtnexthop)) { if ((nhlen -= nhp->rtnh_len) < 0) @@ -172,10 +160,11 @@ static int dn_fib_count_nhs(struct rtattr *rta) return nhs; } -static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) +static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr, + const struct rtmsg *r) { - struct rtnexthop *nhp = RTA_DATA(rta); - int nhlen = RTA_PAYLOAD(rta); + struct rtnexthop *nhp = nla_data(attr); + int nhlen = nla_len(attr); change_nexthops(fi) { int attrlen = nhlen - sizeof(struct rtnexthop); @@ -187,7 +176,10 @@ static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, cons nh->nh_weight = nhp->rtnh_hops + 1; if (attrlen) { - nh->nh_gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + struct nlattr *gw_attr; + + gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY); + nh->nh_gw = gw_attr ? 
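/*
 * The DECnet hunks from here on convert the last rtattr-based rtnetlink
 * handlers to the nlattr API: attributes nested behind each struct
 * rtnexthop are located with nla_find(), as in dn_fib_get_nhs() here,
 * and top-level attributes are validated once up front against
 * rtm_dn_policy (defined later in dn_route.c). The parse-then-access
 * pattern, sketched:
 *
 *	struct nlattr *attrs[RTA_MAX + 1];
 *	int err;
 *
 *	err = nlmsg_parse(nlh, sizeof(struct rtmsg), attrs, RTA_MAX,
 *			  rtm_dn_policy);
 *	if (err < 0)
 *		return err;
 *
 *	if (attrs[RTA_GATEWAY])
 *		gw = nla_get_le16(attrs[RTA_GATEWAY]);
 */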
nla_get_le16(gw_attr) : 0; } nhp = RTNH_NEXT(nhp); } endfor_nexthops(fi); @@ -268,7 +260,8 @@ out: } -struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta *rta, const struct nlmsghdr *nlh, int *errp) +struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *attrs[], + const struct nlmsghdr *nlh, int *errp) { int err; struct dn_fib_info *fi = NULL; @@ -281,11 +274,9 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta if (dn_fib_props[r->rtm_type].scope > r->rtm_scope) goto err_inval; - if (rta->rta_mp) { - nhs = dn_fib_count_nhs(rta->rta_mp); - if (nhs == 0) - goto err_inval; - } + if (attrs[RTA_MULTIPATH] && + (nhs = dn_fib_count_nhs(attrs[RTA_MULTIPATH])) == 0) + goto err_inval; fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL); err = -ENOBUFS; @@ -295,53 +286,65 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta fi->fib_protocol = r->rtm_protocol; fi->fib_nhs = nhs; fi->fib_flags = r->rtm_flags; - if (rta->rta_priority) - fi->fib_priority = *rta->rta_priority; - if (rta->rta_mx) { - int attrlen = RTA_PAYLOAD(rta->rta_mx); - struct rtattr *attr = RTA_DATA(rta->rta_mx); - while(RTA_OK(attr, attrlen)) { - unsigned int flavour = attr->rta_type; + if (attrs[RTA_PRIORITY]) + fi->fib_priority = nla_get_u32(attrs[RTA_PRIORITY]); + + if (attrs[RTA_METRICS]) { + struct nlattr *attr; + int rem; - if (flavour) { - if (flavour > RTAX_MAX) + nla_for_each_nested(attr, attrs[RTA_METRICS], rem) { + int type = nla_type(attr); + + if (type) { + if (type > RTAX_MAX || nla_len(attr) < 4) goto err_inval; - fi->fib_metrics[flavour-1] = *(unsigned int *)RTA_DATA(attr); + + fi->fib_metrics[type-1] = nla_get_u32(attr); } - attr = RTA_NEXT(attr, attrlen); } } - if (rta->rta_prefsrc) - memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 2); - if (rta->rta_mp) { - if ((err = dn_fib_get_nhs(fi, rta->rta_mp, r)) != 0) + if (attrs[RTA_PREFSRC]) + fi->fib_prefsrc = nla_get_le16(attrs[RTA_PREFSRC]); + + if (attrs[RTA_MULTIPATH]) { + if ((err = dn_fib_get_nhs(fi, attrs[RTA_MULTIPATH], r)) != 0) goto failure; - if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) + + if (attrs[RTA_OIF] && + fi->fib_nh->nh_oif != nla_get_u32(attrs[RTA_OIF])) goto err_inval; - if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 2)) + + if (attrs[RTA_GATEWAY] && + fi->fib_nh->nh_gw != nla_get_le16(attrs[RTA_GATEWAY])) goto err_inval; } else { struct dn_fib_nh *nh = fi->fib_nh; - if (rta->rta_oif) - nh->nh_oif = *rta->rta_oif; - if (rta->rta_gw) - memcpy(&nh->nh_gw, rta->rta_gw, 2); + + if (attrs[RTA_OIF]) + nh->nh_oif = nla_get_u32(attrs[RTA_OIF]); + + if (attrs[RTA_GATEWAY]) + nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]); + nh->nh_flags = r->rtm_flags; nh->nh_weight = 1; } if (r->rtm_type == RTN_NAT) { - if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif) + if (!attrs[RTA_GATEWAY] || nhs != 1 || attrs[RTA_OIF]) goto err_inval; - memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 2); + + fi->fib_nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]); goto link_it; } if (dn_fib_props[r->rtm_type].error) { - if (rta->rta_gw || rta->rta_oif || rta->rta_mp) + if (attrs[RTA_GATEWAY] || attrs[RTA_OIF] || attrs[RTA_MULTIPATH]) goto err_inval; + goto link_it; } @@ -367,8 +370,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta } if (fi->fib_prefsrc) { - if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || - memcmp(&fi->fib_prefsrc, rta->rta_dst, 2)) + if (r->rtm_type != RTN_LOCAL || !attrs[RTA_DST] || + 
fi->fib_prefsrc != nla_get_le16(attrs[RTA_DST])) if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) goto err_inval; } @@ -486,39 +489,21 @@ void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res) spin_unlock_bh(&dn_fib_multipath_lock); } - -static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta) -{ - int i; - - for(i = 1; i <= RTA_MAX; i++) { - struct rtattr *attr = rta[i-1]; - if (attr) { - if (RTA_PAYLOAD(attr) < 4 && RTA_PAYLOAD(attr) != 2) - return -EINVAL; - if (i != RTA_MULTIPATH && i != RTA_METRICS && - i != RTA_TABLE) - rta[i-1] = (struct rtattr *)RTA_DATA(attr); - } - } - - return 0; -} - -static inline u32 rtm_get_table(struct rtattr **rta, u8 table) +static inline u32 rtm_get_table(struct nlattr *attrs[], u8 table) { - if (rta[RTA_TABLE - 1]) - table = nla_get_u32((struct nlattr *) rta[RTA_TABLE - 1]); + if (attrs[RTA_TABLE]) + table = nla_get_u32(attrs[RTA_TABLE]); return table; } -static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct dn_fib_table *tb; - struct rtattr **rta = arg; - struct rtmsg *r = NLMSG_DATA(nlh); + struct rtmsg *r = nlmsg_data(nlh); + struct nlattr *attrs[RTA_MAX+1]; + int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -526,22 +511,24 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void * if (!net_eq(net, &init_net)) return -EINVAL; - if (dn_fib_check_attr(r, rta)) - return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy); + if (err < 0) + return err; - tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 0); - if (tb) - return tb->delete(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb)); + tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 0); + if (!tb) + return -ESRCH; - return -ESRCH; + return tb->delete(tb, r, attrs, nlh, &NETLINK_CB(skb)); } -static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct dn_fib_table *tb; - struct rtattr **rta = arg; - struct rtmsg *r = NLMSG_DATA(nlh); + struct rtmsg *r = nlmsg_data(nlh); + struct nlattr *attrs[RTA_MAX+1]; + int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -549,14 +536,15 @@ static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void * if (!net_eq(net, &init_net)) return -EINVAL; - if (dn_fib_check_attr(r, rta)) - return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy); + if (err < 0) + return err; - tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 1); - if (tb) - return tb->insert(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb)); + tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 1); + if (!tb) + return -ENOBUFS; - return -ENOBUFS; + return tb->insert(tb, r, attrs, nlh, &NETLINK_CB(skb)); } static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa) @@ -566,10 +554,31 @@ static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifad struct nlmsghdr nlh; struct rtmsg rtm; } req; - struct dn_kern_rta rta; + struct { + struct nlattr hdr; + __le16 dst; + } dst_attr = { + .dst = dst, + }; + struct { + struct nlattr hdr; + __le16 prefsrc; + } prefsrc_attr = { + .prefsrc = ifa->ifa_local, + }; + struct { + struct nlattr hdr; + u32 oif; + } oif_attr = { + .oif = 
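/*
 * fib_magic() has no netlink message to parse, so it fabricates the
 * attribute array by hand: each on-stack struct pairs an nlattr header
 * with its payload, and attrs[] then has the same shape as the output
 * of nlmsg_parse(). Trimmed to a single attribute:
 *
 *	struct {
 *		struct nlattr hdr;
 *		__le16 dst;
 *	} dst_attr = {
 *		.dst = dst,
 *	};
 *	struct nlattr *attrs[RTA_MAX + 1] = {
 *		[RTA_DST] = (struct nlattr *)&dst_attr,
 *	};
 */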
ifa->ifa_dev->dev->ifindex, + }; + struct nlattr *attrs[RTA_MAX+1] = { + [RTA_DST] = (struct nlattr *) &dst_attr, + [RTA_PREFSRC] = (struct nlattr * ) &prefsrc_attr, + [RTA_OIF] = (struct nlattr *) &oif_attr, + }; memset(&req.rtm, 0, sizeof(req.rtm)); - memset(&rta, 0, sizeof(rta)); if (type == RTN_UNICAST) tb = dn_fib_get_table(RT_MIN_TABLE, 1); @@ -591,14 +600,10 @@ static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifad req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); req.rtm.rtm_type = type; - rta.rta_dst = &dst; - rta.rta_prefsrc = &ifa->ifa_local; - rta.rta_oif = &ifa->ifa_dev->dev->ifindex; - if (cmd == RTM_NEWROUTE) - tb->insert(tb, &req.rtm, &rta, &req.nlh, NULL); + tb->insert(tb, &req.rtm, attrs, &req.nlh, NULL); else - tb->delete(tb, &req.rtm, &rta, &req.nlh, NULL); + tb->delete(tb, &req.rtm, attrs, &req.nlh, NULL); } static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa) diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 3aede1b459fd..f8637f93d318 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -95,7 +95,7 @@ static u32 dn_neigh_hash(const void *pkey, struct neigh_table dn_neigh_table = { .family = PF_DECnet, - .entry_size = sizeof(struct dn_neigh), + .entry_size = NEIGH_ENTRY_SIZE(sizeof(struct dn_neigh)), .key_len = sizeof(__le16), .hash = dn_neigh_hash, .constructor = dn_neigh_construct, @@ -590,11 +590,12 @@ static const struct file_operations dn_neigh_seq_fops = { void __init dn_neigh_init(void) { neigh_table_init(&dn_neigh_table); - proc_net_fops_create(&init_net, "decnet_neigh", S_IRUGO, &dn_neigh_seq_fops); + proc_create("decnet_neigh", S_IRUGO, init_net.proc_net, + &dn_neigh_seq_fops); } void __exit dn_neigh_cleanup(void) { - proc_net_remove(&init_net, "decnet_neigh"); + remove_proc_entry("decnet_neigh", init_net.proc_net); neigh_table_clear(&dn_neigh_table); } diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 8a96047c7c94..1aaa51ebbda6 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -598,7 +598,7 @@ void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg, if (reason == 0) reason = le16_to_cpu(scp->discdata_out.opt_status); - dn_nsp_do_disc(sk, msgflg, reason, gfp, sk->sk_dst_cache, ddl, + dn_nsp_do_disc(sk, msgflg, reason, gfp, __sk_dst_get(sk), ddl, scp->discdata_out.opt_data, scp->addrrem, scp->addrloc); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index b57419cc41a4..fe32388ea24f 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1282,7 +1282,7 @@ static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int return err; } -int dn_route_output_sock(struct dst_entry **pprt, struct flowidn *fl, struct sock *sk, int flags) +int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *fl, struct sock *sk, int flags) { int err; @@ -1613,23 +1613,41 @@ errout: return -EMSGSIZE; } +const struct nla_policy rtm_dn_policy[RTA_MAX + 1] = { + [RTA_DST] = { .type = NLA_U16 }, + [RTA_SRC] = { .type = NLA_U16 }, + [RTA_IIF] = { .type = NLA_U32 }, + [RTA_OIF] = { .type = NLA_U32 }, + [RTA_GATEWAY] = { .type = NLA_U16 }, + [RTA_PRIORITY] = { .type = NLA_U32 }, + [RTA_PREFSRC] = { .type = NLA_U16 }, + [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_MULTIPATH] = { .type = NLA_NESTED }, + [RTA_TABLE] = { .type = NLA_U32 }, + [RTA_MARK] = { .type = NLA_U32 }, +}; + /* * This is called by both endnodes and routers now. 
*/ -static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) +static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); - struct rtattr **rta = arg; struct rtmsg *rtm = nlmsg_data(nlh); struct dn_route *rt = NULL; struct dn_skb_cb *cb; int err; struct sk_buff *skb; struct flowidn fld; + struct nlattr *tb[RTA_MAX+1]; if (!net_eq(net, &init_net)) return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_dn_policy); + if (err < 0) + return err; + memset(&fld, 0, sizeof(fld)); fld.flowidn_proto = DNPROTO_NSP; @@ -1639,12 +1657,14 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void skb_reset_mac_header(skb); cb = DN_SKB_CB(skb); - if (rta[RTA_SRC-1]) - memcpy(&fld.saddr, RTA_DATA(rta[RTA_SRC-1]), 2); - if (rta[RTA_DST-1]) - memcpy(&fld.daddr, RTA_DATA(rta[RTA_DST-1]), 2); - if (rta[RTA_IIF-1]) - memcpy(&fld.flowidn_iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); + if (tb[RTA_SRC]) + fld.saddr = nla_get_le16(tb[RTA_SRC]); + + if (tb[RTA_DST]) + fld.daddr = nla_get_le16(tb[RTA_DST]); + + if (tb[RTA_IIF]) + fld.flowidn_iif = nla_get_u32(tb[RTA_IIF]); if (fld.flowidn_iif) { struct net_device *dev; @@ -1669,10 +1689,9 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void if (!err && -rt->dst.error) err = rt->dst.error; } else { - int oif = 0; - if (rta[RTA_OIF - 1]) - memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); - fld.flowidn_oif = oif; + if (tb[RTA_OIF]) + fld.flowidn_oif = nla_get_u32(tb[RTA_OIF]); + err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0); } @@ -1901,7 +1920,8 @@ void __init dn_route_init(void) dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); - proc_net_fops_create(&init_net, "decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops); + proc_create("decnet_cache", S_IRUGO, init_net.proc_net, + &dn_rt_cache_seq_fops); #ifdef CONFIG_DECNET_ROUTER rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, @@ -1917,7 +1937,7 @@ void __exit dn_route_cleanup(void) del_timer(&dn_route_timer); dn_run_flush(0); - proc_net_remove(&init_net, "decnet_cache"); + remove_proc_entry("decnet_cache", init_net.proc_net); dst_entries_destroy(&dn_dst_ops); } diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index f968c1b58f47..86e3807052e9 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -19,7 +19,6 @@ #include <linux/sockios.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/netlink.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/netdevice.h> @@ -224,26 +223,27 @@ static struct dn_zone *dn_new_zone(struct dn_hash *table, int z) } -static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern_rta *rta, struct dn_fib_info *fi) +static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct nlattr *attrs[], struct dn_fib_info *fi) { struct rtnexthop *nhp; int nhlen; - if (rta->rta_priority && *rta->rta_priority != fi->fib_priority) + if (attrs[RTA_PRIORITY] && + nla_get_u32(attrs[RTA_PRIORITY]) != fi->fib_priority) return 1; - if (rta->rta_oif || rta->rta_gw) { - if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && - (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 2) == 0)) + if (attrs[RTA_OIF] || attrs[RTA_GATEWAY]) { + if ((!attrs[RTA_OIF] || nla_get_u32(attrs[RTA_OIF]) == fi->fib_nh->nh_oif) && + (!attrs[RTA_GATEWAY] || nla_get_le16(attrs[RTA_GATEWAY]) != fi->fib_nh->nh_gw)) return 0; return 1; } - if (rta->rta_mp == NULL) + if 
(!attrs[RTA_MULTIPATH]) return 0; - nhp = RTA_DATA(rta->rta_mp); - nhlen = RTA_PAYLOAD(rta->rta_mp); + nhp = nla_data(attrs[RTA_MULTIPATH]); + nhlen = nla_len(attrs[RTA_MULTIPATH]); for_nexthops(fi) { int attrlen = nhlen - sizeof(struct rtnexthop); @@ -254,7 +254,10 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) return 1; if (attrlen) { - gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + struct nlattr *gw_attr; + + gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY); + gw = gw_attr ? nla_get_le16(gw_attr) : 0; if (gw && gw != nh->nh_gw) return 1; @@ -483,13 +486,12 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) unsigned int h, s_h; unsigned int e = 0, s_e; struct dn_fib_table *tb; - struct hlist_node *node; int dumped = 0; if (!net_eq(net, &init_net)) return 0; - if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && + if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED) return dn_cache_dump(skb, cb); @@ -498,7 +500,7 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) { e = 0; - hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist) { + hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) { if (e < s_e) goto next; if (dumped) @@ -518,7 +520,8 @@ out: return skb->len; } -static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) +static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[], + struct nlmsghdr *n, struct netlink_skb_parms *req) { struct dn_hash *table = (struct dn_hash *)tb->data; struct dn_fib_node *new_f, *f, **fp, **del_fp; @@ -537,15 +540,14 @@ static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct return -ENOBUFS; dz_key_0(key); - if (rta->rta_dst) { - __le16 dst; - memcpy(&dst, rta->rta_dst, 2); + if (attrs[RTA_DST]) { + __le16 dst = nla_get_le16(attrs[RTA_DST]); if (dst & ~DZ_MASK(dz)) return -EINVAL; key = dz_key(dst, dz); } - if ((fi = dn_fib_create_info(r, rta, n, &err)) == NULL) + if ((fi = dn_fib_create_info(r, attrs, n, &err)) == NULL) return err; if (dz->dz_nent > (dz->dz_divisor << 2) && @@ -655,7 +657,8 @@ out: } -static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) +static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[], + struct nlmsghdr *n, struct netlink_skb_parms *req) { struct dn_hash *table = (struct dn_hash*)tb->data; struct dn_fib_node **fp, **del_fp, *f; @@ -672,9 +675,8 @@ static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct return -ESRCH; dz_key_0(key); - if (rta->rta_dst) { - __le16 dst; - memcpy(&dst, rta->rta_dst, 2); + if (attrs[RTA_DST]) { + __le16 dst = nla_get_le16(attrs[RTA_DST]); if (dst & ~DZ_MASK(dz)) return -EINVAL; key = dz_key(dst, dz); @@ -704,7 +706,7 @@ static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) && (!r->rtm_protocol || fi->fib_protocol == r->rtm_protocol) && - dn_fib_nh_match(r, n, rta, fi) == 0) + dn_fib_nh_match(r, n, attrs, fi) == 0) del_fp = fp; } @@ -828,7 +830,6 @@ out: struct dn_fib_table *dn_fib_get_table(u32 n, int create) { 
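/*
 * The iterator changes in this file track the tree-wide
 * hlist_for_each_entry() signature change: the macros now derive the
 * cursor from the entry itself, so the scratch struct hlist_node *
 * disappears from every caller. As used in dn_fib_get_table() below,
 *
 *	struct hlist_node *node;
 *	hlist_for_each_entry_rcu(t, node, &dn_fib_table_hash[h], hlist)
 *
 * becomes simply
 *
 *	hlist_for_each_entry_rcu(t, &dn_fib_table_hash[h], hlist)
 */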
struct dn_fib_table *t; - struct hlist_node *node; unsigned int h; if (n < RT_TABLE_MIN) @@ -839,7 +840,7 @@ struct dn_fib_table *dn_fib_get_table(u32 n, int create) h = n & (DN_FIB_TABLE_HASHSZ - 1); rcu_read_lock(); - hlist_for_each_entry_rcu(t, node, &dn_fib_table_hash[h], hlist) { + hlist_for_each_entry_rcu(t, &dn_fib_table_hash[h], hlist) { if (t->n == n) { rcu_read_unlock(); return t; @@ -885,11 +886,10 @@ void dn_fib_flush(void) { int flushed = 0; struct dn_fib_table *tb; - struct hlist_node *node; unsigned int h; for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist) + hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) flushed += tb->flush(tb); } @@ -908,12 +908,12 @@ void __init dn_fib_table_init(void) void __exit dn_fib_table_cleanup(void) { struct dn_fib_table *t; - struct hlist_node *node, *next; + struct hlist_node *next; unsigned int h; write_lock(&dn_fib_tables_lock); for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry_safe(t, node, next, &dn_fib_table_hash[h], + hlist_for_each_entry_safe(t, next, &dn_fib_table_hash[h], hlist) { hlist_del(&t->hlist); kfree(t); diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig index 2f81de5e752f..8d7c109d5109 100644 --- a/net/decnet/netfilter/Kconfig +++ b/net/decnet/netfilter/Kconfig @@ -3,7 +3,7 @@ # menu "DECnet: Netfilter Configuration" - depends on DECNET && NETFILTER && EXPERIMENTAL + depends on DECNET && NETFILTER depends on NETFILTER_ADVANCED config DECNET_NF_GRABULATOR diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index dfe42012a044..2a7efe388344 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -19,7 +19,7 @@ #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/spinlock.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter_decnet.h> #include <net/sock.h> @@ -39,21 +39,21 @@ static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp) unsigned char *ptr; struct nf_dn_rtmsg *rtm; - size = NLMSG_SPACE(rt_skb->len); - size += NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg)); - skb = alloc_skb(size, GFP_ATOMIC); + size = NLMSG_ALIGN(rt_skb->len) + + NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg)); + skb = nlmsg_new(size, GFP_ATOMIC); if (!skb) { *errp = -ENOMEM; return NULL; } old_tail = skb->tail; - nlh = nlmsg_put(skb, 0, 0, 0, size - sizeof(*nlh), 0); + nlh = nlmsg_put(skb, 0, 0, 0, size, 0); if (!nlh) { kfree_skb(skb); *errp = -ENOMEM; return NULL; } - rtm = (struct nf_dn_rtmsg *)NLMSG_DATA(nlh); + rtm = (struct nf_dn_rtmsg *)nlmsg_data(nlh); rtm->nfdn_ifindex = rt_skb->dev->ifindex; ptr = NFDN_RTMSG(rtm); skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 45295ca09571..0eb5d5e76dfb 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1,6 +1,7 @@ /* * net/dsa/dsa.c - Hardware switch handling * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -14,6 +15,9 @@ #include <linux/slab.h> #include <linux/module.h> #include <net/dsa.h> +#include <linux/of.h> +#include <linux/of_mdio.h> +#include <linux/of_platform.h> #include "dsa_priv.h" char dsa_driver_version[] = "0.1"; @@ -80,6 +84,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, int ret; char *name; int i; + bool 
valid_name_found = false; /* * Probe for switch model. */ @@ -131,8 +136,13 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, } else { ds->phys_port_mask |= 1 << i; } + valid_name_found = true; } + if (!valid_name_found && i == DSA_MAX_PORTS) { + ret = -EINVAL; + goto out; + } /* * If the CPU connects to this switch, set the switch tree @@ -281,34 +291,239 @@ static struct net_device *dev_to_net_device(struct device *dev) return NULL; } +#ifdef CONFIG_OF +static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, + struct dsa_chip_data *cd, + int chip_index, + struct device_node *link) +{ + int ret; + const __be32 *reg; + int link_port_addr; + int link_sw_addr; + struct device_node *parent_sw; + int len; + + parent_sw = of_get_parent(link); + if (!parent_sw) + return -EINVAL; + + reg = of_get_property(parent_sw, "reg", &len); + if (!reg || (len != sizeof(*reg) * 2)) + return -EINVAL; + + link_sw_addr = be32_to_cpup(reg + 1); + + if (link_sw_addr >= pd->nr_chips) + return -EINVAL; + + /* First time routing table allocation */ + if (!cd->rtable) { + cd->rtable = kmalloc(pd->nr_chips * sizeof(s8), GFP_KERNEL); + if (!cd->rtable) + return -ENOMEM; + + /* default to no valid uplink/downlink */ + memset(cd->rtable, -1, pd->nr_chips * sizeof(s8)); + } + + reg = of_get_property(link, "reg", NULL); + if (!reg) { + ret = -EINVAL; + goto out; + } + + link_port_addr = be32_to_cpup(reg); + + cd->rtable[link_sw_addr] = link_port_addr; + + return 0; +out: + kfree(cd->rtable); + return ret; +} + +static void dsa_of_free_platform_data(struct dsa_platform_data *pd) +{ + int i; + int port_index; + + for (i = 0; i < pd->nr_chips; i++) { + port_index = 0; + while (port_index < DSA_MAX_PORTS) { + if (pd->chip[i].port_names[port_index]) + kfree(pd->chip[i].port_names[port_index]); + port_index++; + } + kfree(pd->chip[i].rtable); + } + kfree(pd->chip); +} + +static int dsa_of_probe(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct device_node *child, *mdio, *ethernet, *port, *link; + struct mii_bus *mdio_bus; + struct platform_device *ethernet_dev; + struct dsa_platform_data *pd; + struct dsa_chip_data *cd; + const char *port_name; + int chip_index, port_index; + const unsigned int *sw_addr, *port_reg; + int ret; + + mdio = of_parse_phandle(np, "dsa,mii-bus", 0); + if (!mdio) + return -EINVAL; + + mdio_bus = of_mdio_find_bus(mdio); + if (!mdio_bus) + return -EINVAL; + + ethernet = of_parse_phandle(np, "dsa,ethernet", 0); + if (!ethernet) + return -EINVAL; + + ethernet_dev = of_find_device_by_node(ethernet); + if (!ethernet_dev) + return -ENODEV; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + pdev->dev.platform_data = pd; + pd->netdev = &ethernet_dev->dev; + pd->nr_chips = of_get_child_count(np); + if (pd->nr_chips > DSA_MAX_SWITCHES) + pd->nr_chips = DSA_MAX_SWITCHES; + + pd->chip = kzalloc(pd->nr_chips * sizeof(struct dsa_chip_data), + GFP_KERNEL); + if (!pd->chip) { + ret = -ENOMEM; + goto out_free; + } + + chip_index = 0; + for_each_available_child_of_node(np, child) { + cd = &pd->chip[chip_index]; + + cd->mii_bus = &mdio_bus->dev; + + sw_addr = of_get_property(child, "reg", NULL); + if (!sw_addr) + continue; + + cd->sw_addr = be32_to_cpup(sw_addr); + if (cd->sw_addr > PHY_MAX_ADDR) + continue; + + for_each_available_child_of_node(child, port) { + port_reg = of_get_property(port, "reg", NULL); + if (!port_reg) + continue; + + port_index = be32_to_cpup(port_reg); + + port_name = of_get_property(port, "label", NULL); + if 
(!port_name) + continue; + + cd->port_names[port_index] = kstrdup(port_name, + GFP_KERNEL); + if (!cd->port_names[port_index]) { + ret = -ENOMEM; + goto out_free_chip; + } + + link = of_parse_phandle(port, "link", 0); + + if (!strcmp(port_name, "dsa") && link && + pd->nr_chips > 1) { + ret = dsa_of_setup_routing_table(pd, cd, + chip_index, link); + if (ret) + goto out_free_chip; + } + + if (port_index == DSA_MAX_PORTS) + break; + } + } + + return 0; + +out_free_chip: + dsa_of_free_platform_data(pd); +out_free: + kfree(pd); + pdev->dev.platform_data = NULL; + return ret; +} + +static void dsa_of_remove(struct platform_device *pdev) +{ + struct dsa_platform_data *pd = pdev->dev.platform_data; + + if (!pdev->dev.of_node) + return; + + dsa_of_free_platform_data(pd); + kfree(pd); +} +#else +static inline int dsa_of_probe(struct platform_device *pdev) +{ + return 0; +} + +static inline void dsa_of_remove(struct platform_device *pdev) +{ +} +#endif + static int dsa_probe(struct platform_device *pdev) { static int dsa_version_printed; struct dsa_platform_data *pd = pdev->dev.platform_data; struct net_device *dev; struct dsa_switch_tree *dst; - int i; + int i, ret; if (!dsa_version_printed++) printk(KERN_NOTICE "Distributed Switch Architecture " "driver version %s\n", dsa_driver_version); + if (pdev->dev.of_node) { + ret = dsa_of_probe(pdev); + if (ret) + return ret; + + pd = pdev->dev.platform_data; + } + if (pd == NULL || pd->netdev == NULL) return -EINVAL; dev = dev_to_net_device(pd->netdev); - if (dev == NULL) - return -EINVAL; + if (dev == NULL) { + ret = -EINVAL; + goto out; + } if (dev->dsa_ptr != NULL) { dev_put(dev); - return -EEXIST; + ret = -EEXIST; + goto out; } dst = kzalloc(sizeof(*dst), GFP_KERNEL); if (dst == NULL) { dev_put(dev); - return -ENOMEM; + ret = -ENOMEM; + goto out; } platform_set_drvdata(pdev, dst); @@ -360,6 +575,11 @@ static int dsa_probe(struct platform_device *pdev) } return 0; + +out: + dsa_of_remove(pdev); + + return ret; } static int dsa_remove(struct platform_device *pdev) @@ -379,6 +599,8 @@ static int dsa_remove(struct platform_device *pdev) dsa_switch_destroy(ds); } + dsa_of_remove(pdev); + return 0; } @@ -386,6 +608,12 @@ static void dsa_shutdown(struct platform_device *pdev) { } +static const struct of_device_id dsa_of_match_table[] = { + { .compatible = "marvell,dsa", }, + {} +}; +MODULE_DEVICE_TABLE(of, dsa_of_match_table); + static struct platform_driver dsa_driver = { .probe = dsa_probe, .remove = dsa_remove, @@ -393,6 +621,7 @@ static struct platform_driver dsa_driver = { .driver = { .name = "dsa", .owner = THIS_MODULE, + .of_match_table = dsa_of_match_table, }, }; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e32083d5d8f8..6ebd8fbd9285 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -41,8 +41,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->name = "dsa slave smi"; ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; - snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x", - ds->master_mii_bus->id, ds->pd->sw_addr); + snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", + ds->index, ds->pd->sw_addr); ds->slave_mii_bus->parent = &ds->master_mii_bus->dev; } @@ -203,10 +203,10 @@ dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { - strncpy(drvinfo->driver, "dsa", 32); - strncpy(drvinfo->version, dsa_driver_version, 32); - strncpy(drvinfo->fw_version, 
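/*
 * strncpy() leaves the destination unterminated whenever the source
 * fills the buffer; strlcpy() always NUL-terminates and is sized from
 * the destination itself, which also removes the magic "32" constants:
 *
 *	strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
 */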
"N/A", 32); - strncpy(drvinfo->bus_info, "platform", 32); + strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version)); + strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); + strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } static int dsa_slave_nway_reset(struct net_device *dev) @@ -391,7 +391,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, if (p->phy != NULL) { phy_attach(slave_dev, dev_name(&p->phy->dev), - 0, PHY_INTERFACE_MODE_GMII); + PHY_INTERFACE_MODE_GMII); p->phy->autoneg = AUTONEG_ENABLE; p->phy->speed = 0; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 4efad533e5f6..5359560926bc 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -195,7 +195,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (netdev_uses_trailer_tags(dev)) return htons(ETH_P_TRAILER); - if (ntohs(eth->h_proto) >= 1536) + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) return eth->h_proto; /* @@ -272,6 +272,36 @@ void eth_header_cache_update(struct hh_cache *hh, EXPORT_SYMBOL(eth_header_cache_update); /** + * eth_prepare_mac_addr_change - prepare for mac change + * @dev: network device + * @p: socket address + */ +int eth_prepare_mac_addr_change(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + return 0; +} +EXPORT_SYMBOL(eth_prepare_mac_addr_change); + +/** + * eth_commit_mac_addr_change - commit mac change + * @dev: network device + * @p: socket address + */ +void eth_commit_mac_addr_change(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); +} +EXPORT_SYMBOL(eth_commit_mac_addr_change); + +/** * eth_mac_addr - set new Ethernet hardware address * @dev: network device * @p: socket address @@ -283,15 +313,12 @@ EXPORT_SYMBOL(eth_header_cache_update); */ int eth_mac_addr(struct net_device *dev, void *p) { - struct sockaddr *addr = p; + int ret; - if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) - return -EBUSY; - if (!is_valid_ether_addr(addr->sa_data)) - return -EADDRNOTAVAIL; - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); - /* if device marked as NET_ADDR_RANDOM, reset it */ - dev->addr_assign_type &= ~NET_ADDR_RANDOM; + ret = eth_prepare_mac_addr_change(dev, p); + if (ret < 0) + return ret; + eth_commit_mac_addr_change(dev, p); return 0; } EXPORT_SYMBOL(eth_mac_addr); diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index f651da60f161..55e1fd5b3e56 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -104,6 +104,7 @@ static const u8 lowpan_llprefix[] = {0xfe, 0x80}; struct lowpan_dev_info { struct net_device *real_dev; /* real WPAN device ptr */ struct mutex dev_list_mtx; /* mutex for list ops */ + unsigned short fragment_tag; }; struct lowpan_dev_record { @@ -120,7 +121,6 @@ struct lowpan_fragment { struct list_head list; /* fragments list */ }; -static unsigned short fragment_tag; static LIST_HEAD(lowpan_fragments); static DEFINE_SPINLOCK(flist_lock); @@ -284,6 +284,9 @@ lowpan_compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb) /* checksum is always inline */ memcpy(*hc06_ptr, &uh->check, 2); *hc06_ptr += 2; + + /* skip the UDP header */ + skb_pull(skb, sizeof(struct udphdr)); } static inline int lowpan_fetch_skb_u8(struct sk_buff *skb, u8 *val) 
@@ -309,9 +312,8 @@ static inline int lowpan_fetch_skb_u16(struct sk_buff *skb, u16 *val) } static int -lowpan_uncompress_udp_header(struct sk_buff *skb) +lowpan_uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) { - struct udphdr *uh = udp_hdr(skb); u8 tmp; if (!uh) @@ -358,6 +360,14 @@ lowpan_uncompress_udp_header(struct sk_buff *skb) /* copy checksum */ memcpy(&uh->check, &skb->data[0], 2); skb_pull(skb, 2); + + /* + * UDP length needs to be inferred from the lower layers + * here, we obtain the hint from the remaining size of the + * frame + */ + uh->len = htons(skb->len + sizeof(struct udphdr)); + pr_debug("uncompressed UDP length: src = %d", uh->len); } else { pr_debug("ERROR: unsupported NH format\n"); goto err; @@ -377,17 +387,14 @@ static int lowpan_header_create(struct sk_buff *skb, struct ipv6hdr *hdr; const u8 *saddr = _saddr; const u8 *daddr = _daddr; - u8 *head; + u8 head[100]; struct ieee802154_addr sa, da; + /* TODO: + * if this packet isn't an ipv6 one, where should it be routed? + */ if (type != ETH_P_IPV6) return 0; - /* TODO: - * if this package isn't ipv6 one, where should it be routed? - */ - head = kzalloc(100, GFP_KERNEL); - if (head == NULL) - return -ENOMEM; hdr = ipv6_hdr(skb); hc06_ptr = head + 2; @@ -561,8 +568,6 @@ static int lowpan_header_create(struct sk_buff *skb, skb_pull(skb, sizeof(struct ipv6hdr)); memcpy(skb_push(skb, hc06_ptr - head), head, hc06_ptr - head); - kfree(head); - lowpan_raw_dump_table(__func__, "raw skb data dump", skb->data, skb->len); @@ -577,27 +582,63 @@ static int lowpan_header_create(struct sk_buff *skb, * this isn't implemented in mainline yet, so currently we assign 0xff */ { + mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA; + mac_cb(skb)->seq = ieee802154_mlme_ops(dev)->get_dsn(dev); + /* prepare wpan address data */ sa.addr_type = IEEE802154_ADDR_LONG; - sa.pan_id = 0xff; + sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - da.addr_type = IEEE802154_ADDR_LONG; - da.pan_id = 0xff; - - memcpy(&(da.hwaddr), daddr, 8); memcpy(&(sa.hwaddr), saddr, 8); + /* intra-PAN communications */ + da.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA; + /* + * if the destination address is the broadcast address, use the + * corresponding short address + */ + if (lowpan_is_addr_broadcast(daddr)) { + da.addr_type = IEEE802154_ADDR_SHORT; + da.short_addr = IEEE802154_ADDR_BROADCAST; + } else { + da.addr_type = IEEE802154_ADDR_LONG; + memcpy(&(da.hwaddr), daddr, IEEE802154_ADDR_LEN); + + /* request acknowledgment */ + mac_cb(skb)->flags |= MAC_CB_FLAG_ACKREQ; + } return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, type, (void *)&da, (void *)&sa, skb->len); } } +static int lowpan_give_skb_to_devices(struct sk_buff *skb) +{ + struct lowpan_dev_record *entry; + struct sk_buff *skb_cp; + int stat = NET_RX_SUCCESS; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &lowpan_devices, list) + if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) { + skb_cp = skb_copy(skb, GFP_ATOMIC); + if (!skb_cp) { + stat = -ENOMEM; + break; + } + + skb_cp->dev = entry->ldev; + stat = netif_rx(skb_cp); + } + rcu_read_unlock(); + + return stat; +} + static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr) { struct sk_buff *new; - struct lowpan_dev_record *entry; int stat = NET_RX_SUCCESS; new = skb_copy_expand(skb, sizeof(struct ipv6hdr), skb_tailroom(skb), @@ -614,19 +655,7 @@ static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr) new->protocol = htons(ETH_P_IPV6); 
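/*
 * lowpan_give_skb_to_devices() above factors the delivery loop out so
 * that the IPHC path here and the new raw-IPv6 path in lowpan_rcv()
 * share it. Every matching 6lowpan device gets its own copy, because
 * netif_rx() consumes the buffer it is handed:
 *
 *	skb_cp = skb_copy(skb, GFP_ATOMIC);
 *	if (!skb_cp) {
 *		stat = -ENOMEM;
 *		break;
 *	}
 *	skb_cp->dev = entry->ldev;
 *	stat = netif_rx(skb_cp);
 */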
new->pkt_type = PACKET_HOST; - rcu_read_lock(); - list_for_each_entry_rcu(entry, &lowpan_devices, list) - if (lowpan_dev_info(entry->ldev)->real_dev == new->dev) { - skb = skb_copy(new, GFP_ATOMIC); - if (!skb) { - stat = -ENOMEM; - break; - } - - skb->dev = entry->ldev; - stat = netif_rx(skb); - } - rcu_read_unlock(); + stat = lowpan_give_skb_to_devices(new); kfree_skb(new); @@ -645,7 +674,7 @@ static void lowpan_fragment_timer_expired(unsigned long entry_addr) } static struct lowpan_fragment * -lowpan_alloc_new_frame(struct sk_buff *skb, u8 len, u16 tag) +lowpan_alloc_new_frame(struct sk_buff *skb, u16 len, u16 tag) { struct lowpan_fragment *frame; @@ -715,7 +744,7 @@ lowpan_process_data(struct sk_buff *skb) { struct lowpan_fragment *frame; /* slen stores the rightmost 8 bits of the 11 bits length */ - u8 slen, offset; + u8 slen, offset = 0; u16 len, tag; bool found = false; @@ -726,6 +755,18 @@ lowpan_process_data(struct sk_buff *skb) /* adds the 3 MSB to the 8 LSB to retrieve the 11 bits length */ len = ((iphc0 & 7) << 8) | slen; + if ((iphc0 & LOWPAN_DISPATCH_MASK) == LOWPAN_DISPATCH_FRAG1) { + pr_debug("%s received a FRAG1 packet (tag: %d, " + "size of the entire IP packet: %d)", + __func__, tag, len); + } else { /* FRAGN */ + if (lowpan_fetch_skb_u8(skb, &offset)) + goto unlock_and_drop; + pr_debug("%s received a FRAGN packet (tag: %d, " + "size of the entire IP packet: %d, " + "offset: %d)", __func__, tag, len, offset * 8); + } + /* * check if frame assembling with the same tag is * already in progress @@ -740,17 +781,13 @@ lowpan_process_data(struct sk_buff *skb) /* alloc new frame structure */ if (!found) { + pr_debug("%s first fragment received for tag %d, " + "begin packet reassembly", __func__, tag); frame = lowpan_alloc_new_frame(skb, len, tag); if (!frame) goto unlock_and_drop; } - if ((iphc0 & LOWPAN_DISPATCH_MASK) == LOWPAN_DISPATCH_FRAG1) - goto unlock_and_drop; - - if (lowpan_fetch_skb_u8(skb, &offset)) /* fetch offset */ - goto unlock_and_drop; - /* if payload fits buffer, copy it */ if (likely((offset * 8 + skb->len) <= frame->length)) skb_copy_to_linear_data_offset(frame->skb, offset * 8, @@ -768,6 +805,9 @@ lowpan_process_data(struct sk_buff *skb) list_del(&frame->list); spin_unlock_bh(&flist_lock); + pr_debug("%s successfully reassembled fragment " + "(tag %d)", __func__, tag); + dev_kfree_skb(skb); skb = frame->skb; kfree(frame); @@ -913,10 +953,35 @@ lowpan_process_data(struct sk_buff *skb) } /* UDP data uncompression */ - if (iphc0 & LOWPAN_IPHC_NH_C) - if (lowpan_uncompress_udp_header(skb)) + if (iphc0 & LOWPAN_IPHC_NH_C) { + struct udphdr uh; + struct sk_buff *new; + if (lowpan_uncompress_udp_header(skb, &uh)) goto drop; + /* + * replace the compressed UDP head by the uncompressed UDP + * header + */ + new = skb_copy_expand(skb, sizeof(struct udphdr), + skb_tailroom(skb), GFP_ATOMIC); + kfree_skb(skb); + + if (!new) + return -ENOMEM; + + skb = new; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr)); + + lowpan_raw_dump_table(__func__, "raw UDP header dump", + (u8 *)&uh, sizeof(uh)); + + hdr.nexthdr = UIP_PROTO_UDP; + } + /* Not fragmented package */ hdr.payload_len = htons(skb->len); @@ -964,13 +1029,13 @@ static int lowpan_get_mac_header_length(struct sk_buff *skb) static int lowpan_fragment_xmit(struct sk_buff *skb, u8 *head, - int mlen, int plen, int offset) + int mlen, int plen, int offset, int type) { struct sk_buff *frag; int hlen, ret; - /* if payload length is zero, 
therefore it's a first fragment */ - hlen = (plen == 0 ? LOWPAN_FRAG1_HEAD_SIZE : LOWPAN_FRAGN_HEAD_SIZE); + hlen = (type == LOWPAN_DISPATCH_FRAG1) ? + LOWPAN_FRAG1_HEAD_SIZE : LOWPAN_FRAGN_HEAD_SIZE; lowpan_raw_dump_inline(__func__, "6lowpan fragment header", head, hlen); @@ -998,14 +1063,14 @@ lowpan_fragment_xmit(struct sk_buff *skb, u8 *head, } static int -lowpan_skb_fragmentation(struct sk_buff *skb) +lowpan_skb_fragmentation(struct sk_buff *skb, struct net_device *dev) { int err, header_length, payload_length, tag, offset = 0; u8 head[5]; header_length = lowpan_get_mac_header_length(skb); payload_length = skb->len - header_length; - tag = fragment_tag++; + tag = lowpan_dev_info(dev)->fragment_tag++; /* first fragment header */ head[0] = LOWPAN_DISPATCH_FRAG1 | ((payload_length >> 8) & 0x7); @@ -1013,7 +1078,16 @@ lowpan_skb_fragmentation(struct sk_buff *skb) head[2] = tag >> 8; head[3] = tag & 0xff; - err = lowpan_fragment_xmit(skb, head, header_length, 0, 0); + err = lowpan_fragment_xmit(skb, head, header_length, LOWPAN_FRAG_SIZE, + 0, LOWPAN_DISPATCH_FRAG1); + + if (err) { + pr_debug("%s unable to send FRAG1 packet (tag: %d)", + __func__, tag); + goto exit; + } + + offset = LOWPAN_FRAG_SIZE; /* next fragment header */ head[0] &= ~LOWPAN_DISPATCH_FRAG1; @@ -1028,10 +1102,17 @@ lowpan_skb_fragmentation(struct sk_buff *skb) len = payload_length - offset; err = lowpan_fragment_xmit(skb, head, header_length, - len, offset); + len, offset, LOWPAN_DISPATCH_FRAGN); + if (err) { + pr_debug("%s unable to send a subsequent FRAGN packet " + "(tag: %d, offset: %d)", __func__, tag, offset); + goto exit; + } + offset += len; } +exit: return err; } @@ -1054,14 +1135,14 @@ static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) } pr_debug("frame is too big, fragmentation is needed\n"); - err = lowpan_skb_fragmentation(skb); + err = lowpan_skb_fragmentation(skb, dev); error: dev_kfree_skb(skb); out: - if (err < 0) + if (err) pr_debug("ERROR: xmit failed\n"); - return (err < 0 ? NETDEV_TX_BUSY : NETDEV_TX_OK); + return (err < 0) ? NET_XMIT_DROP : err; } static struct wpan_phy *lowpan_get_phy(const struct net_device *dev) @@ -1082,6 +1163,12 @@ static u16 lowpan_get_short_addr(const struct net_device *dev) return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); } +static u8 lowpan_get_dsn(const struct net_device *dev) +{ + struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); +} + static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; @@ -1095,6 +1182,7 @@ static struct ieee802154_mlme_ops lowpan_mlme = { .get_pan_id = lowpan_get_pan_id, .get_phy = lowpan_get_phy, .get_short_addr = lowpan_get_short_addr, + .get_dsn = lowpan_get_dsn, }; static void lowpan_setup(struct net_device *dev) @@ -1137,19 +1225,42 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; /* check that it's our buffer */ - switch (skb->data[0] & 0xe0) { - case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ - case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ - case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */ - local_skb = skb_clone(skb, GFP_ATOMIC); + if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { + /* Copy the packet so that the IPv6 header is + * properly aligned.
+ */ + local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1, + skb_tailroom(skb), GFP_ATOMIC); if (!local_skb) goto drop; - lowpan_process_data(local_skb); + local_skb->protocol = htons(ETH_P_IPV6); + local_skb->pkt_type = PACKET_HOST; + + /* Pull off the 1-byte of 6lowpan header. */ + skb_pull(local_skb, 1); + skb_reset_network_header(local_skb); + skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); + + lowpan_give_skb_to_devices(local_skb); + + kfree_skb(local_skb); kfree_skb(skb); - break; - default: - break; + } else { + switch (skb->data[0] & 0xe0) { + case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ + case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ + case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */ + local_skb = skb_clone(skb, GFP_ATOMIC); + if (!local_skb) + goto drop; + lowpan_process_data(local_skb); + + kfree_skb(skb); + break; + default: + break; + } } return NET_RX_SUCCESS; @@ -1175,6 +1286,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, return -ENODEV; lowpan_dev_info(dev)->real_dev = real_dev; + lowpan_dev_info(dev)->fragment_tag = 0; mutex_init(&lowpan_dev_info(dev)->dev_list_mtx); entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL); @@ -1234,7 +1346,7 @@ static inline int __init lowpan_netlink_init(void) return rtnl_link_register(&lowpan_link_ops); } -static inline void __init lowpan_netlink_fini(void) +static inline void lowpan_netlink_fini(void) { rtnl_link_unregister(&lowpan_link_ops); } diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h index 8c2251fb0a3f..4b8f917658b5 100644 --- a/net/ieee802154/6lowpan.h +++ b/net/ieee802154/6lowpan.h @@ -84,7 +84,7 @@ (memcmp(addr1, addr2, length >> 3) == 0) /* local link, i.e. FE80::/10 */ -#define is_addr_link_local(a) (((a)->s6_addr16[0]) == 0x80FE) +#define is_addr_link_local(a) (((a)->s6_addr16[0]) == htons(0xFE80)) /* * check whether we can compress the IID to 16 bits, @@ -92,9 +92,10 @@ */ #define lowpan_is_iid_16_bit_compressable(a) \ ((((a)->s6_addr16[4]) == 0) && \ - (((a)->s6_addr16[5]) == 0) && \ - (((a)->s6_addr16[6]) == 0) && \ - ((((a)->s6_addr[14]) & 0x80) == 0)) + (((a)->s6_addr[10]) == 0) && \ + (((a)->s6_addr[11]) == 0xff) && \ + (((a)->s6_addr[12]) == 0xfe) && \ + (((a)->s6_addr[13]) == 0)) /* multicast address */ #define is_addr_mcast(a) (((a)->s6_addr[0]) == 0xFF) diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig index 7dee65052925..b2e06df0076c 100644 --- a/net/ieee802154/Kconfig +++ b/net/ieee802154/Kconfig @@ -1,6 +1,5 @@ config IEEE802154 - tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support" ---help--- IEEE Std 802.15.4 defines a low data rate, low power and low complexity short range wireless personal area networks. 
It was diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index 16705611589a..581a59504bd5 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -291,6 +291,9 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; + struct sockaddr_ieee802154 *saddr; + + saddr = (struct sockaddr_ieee802154 *)msg->msg_name; skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) @@ -309,6 +312,13 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, sock_recv_ts_and_drops(msg, sk, skb); + if (saddr) { + saddr->family = AF_IEEE802154; + saddr->addr = mac_cb(skb)->sa; + } + if (addr_len) + *addr_len = sizeof(*saddr); + if (flags & MSG_TRUNC) copied = skb->len; done: @@ -350,7 +360,6 @@ static inline int ieee802154_match_sock(u8 *hw_addr, u16 pan_id, int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk, *prev = NULL; - struct hlist_node *node; int ret = NET_RX_SUCCESS; u16 pan_id, short_addr; @@ -361,7 +370,7 @@ int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev); read_lock(&dgram_lock); - sk_for_each(sk, node, &dgram_head) { + sk_for_each(sk, &dgram_head) { if (ieee802154_match_sock(dev->dev_addr, pan_id, short_addr, dgram_sk(sk))) { if (prev) { diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 97351e1d07a4..7e49bbcc6967 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -64,8 +64,8 @@ struct sk_buff *ieee802154_nl_create(int flags, u8 req) int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group) { - /* XXX: nlh is right at the start of msg */ - void *hdr = genlmsg_data(NLMSG_DATA(msg->data)); + struct nlmsghdr *nlh = nlmsg_hdr(msg); + void *hdr = genlmsg_data(nlmsg_data(nlh)); if (genlmsg_end(msg, hdr) < 0) goto out; @@ -97,8 +97,8 @@ struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info) { - /* XXX: nlh is right at the start of msg */ - void *hdr = genlmsg_data(NLMSG_DATA(msg->data)); + struct nlmsghdr *nlh = nlmsg_hdr(msg); + void *hdr = genlmsg_data(nlmsg_data(nlh)); if (genlmsg_end(msg, hdr) < 0) goto out; diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 96bb08abece2..b0bdd8c51e9c 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -315,7 +315,7 @@ static int ieee802154_associate_req(struct sk_buff *skb, struct net_device *dev; struct ieee802154_addr addr; u8 page; - int ret = -EINVAL; + int ret = -EOPNOTSUPP; if (!info->attrs[IEEE802154_ATTR_CHANNEL] || !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || @@ -327,6 +327,8 @@ static int ieee802154_associate_req(struct sk_buff *skb, dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; + if (!ieee802154_mlme_ops(dev)->assoc_req) + goto out; if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) { addr.addr_type = IEEE802154_ADDR_LONG; @@ -350,6 +352,7 @@ static int ieee802154_associate_req(struct sk_buff *skb, page, nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY])); +out: dev_put(dev); return ret; } @@ -359,7 +362,7 @@ static int ieee802154_associate_resp(struct sk_buff *skb, { struct net_device *dev; struct ieee802154_addr addr; - int ret = -EINVAL; + int ret = -EOPNOTSUPP; if (!info->attrs[IEEE802154_ATTR_STATUS] || !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] || @@ -369,6 +372,8 @@ static int ieee802154_associate_resp(struct sk_buff *skb, dev = 
ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; + if (!ieee802154_mlme_ops(dev)->assoc_resp) + goto out; addr.addr_type = IEEE802154_ADDR_LONG; nla_memcpy(addr.hwaddr, info->attrs[IEEE802154_ATTR_DEST_HW_ADDR], @@ -380,6 +385,7 @@ static int ieee802154_associate_resp(struct sk_buff *skb, nla_get_u16(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS])); +out: dev_put(dev); return ret; } @@ -389,7 +395,7 @@ static int ieee802154_disassociate_req(struct sk_buff *skb, { struct net_device *dev; struct ieee802154_addr addr; - int ret = -EINVAL; + int ret = -EOPNOTSUPP; if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] && !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || @@ -399,6 +405,8 @@ static int ieee802154_disassociate_req(struct sk_buff *skb, dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; + if (!ieee802154_mlme_ops(dev)->disassoc_req) + goto out; if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) { addr.addr_type = IEEE802154_ADDR_LONG; @@ -415,6 +423,7 @@ static int ieee802154_disassociate_req(struct sk_buff *skb, ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); +out: dev_put(dev); return ret; } @@ -432,7 +441,7 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) u8 channel, bcn_ord, sf_ord; u8 page; int pan_coord, blx, coord_realign; - int ret; + int ret = -EOPNOTSUPP; if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] || @@ -448,6 +457,8 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; + if (!ieee802154_mlme_ops(dev)->start_req) + goto out; addr.addr_type = IEEE802154_ADDR_SHORT; addr.short_addr = nla_get_u16( @@ -476,6 +487,7 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page, bcn_ord, sf_ord, pan_coord, blx, coord_realign); +out: dev_put(dev); return ret; } @@ -483,7 +495,7 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; - int ret; + int ret = -EOPNOTSUPP; u8 type; u32 channels; u8 duration; @@ -497,6 +509,8 @@ static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; + if (!ieee802154_mlme_ops(dev)->scan_req) + goto out; type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]); channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]); @@ -511,6 +525,7 @@ static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page, duration); +out: dev_put(dev); return ret; } diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 50e823927d49..41f538b8e59c 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -221,10 +221,9 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk; - struct hlist_node *node; read_lock(&raw_lock); - sk_for_each(sk, node, &raw_head) { + sk_for_each(sk, &raw_head) { bh_lock_sock(sk); if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) { diff --git a/net/ieee802154/wpan-class.c b/net/ieee802154/wpan-class.c index 1627ef2e8522..13571eae6bae 100644 --- 
a/net/ieee802154/wpan-class.c +++ b/net/ieee802154/wpan-class.c @@ -91,7 +91,7 @@ static struct class wpan_phy_class = { static DEFINE_MUTEX(wpan_phy_mutex); static int wpan_phy_idx; -static int wpan_phy_match(struct device *dev, void *data) +static int wpan_phy_match(struct device *dev, const void *data) { return !strcmp(dev_name(dev), (const char *)data); } @@ -103,8 +103,7 @@ struct wpan_phy *wpan_phy_find(const char *str) if (WARN_ON(!str)) return NULL; - dev = class_find_device(&wpan_phy_class, NULL, - (void *)str, wpan_phy_match); + dev = class_find_device(&wpan_phy_class, NULL, str, wpan_phy_match); if (!dev) return NULL; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 5a19aeb86094..8603ca827104 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -166,6 +166,7 @@ config IP_PNP_RARP config NET_IPIP tristate "IP: tunneling" select INET_TUNNEL + select NET_IP_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -186,9 +187,14 @@ config NET_IPGRE_DEMUX This is helper module to demultiplex GRE packets on GRE version field criteria. Required by ip_gre and pptp modules. +config NET_IP_TUNNEL + tristate + default n + config NET_IPGRE tristate "IP: GRE tunnels over IP" depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX + select NET_IP_TUNNEL help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -313,6 +319,7 @@ config SYN_COOKIES config NET_IPVTI tristate "Virtual (secure) IP: tunneling" select INET_TUNNEL + select NET_IP_TUNNEL depends on INET_XFRM_MODE_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within @@ -488,7 +495,6 @@ config TCP_CONG_HTCP config TCP_CONG_HSTCP tristate "High Speed TCP" - depends on EXPERIMENTAL default n ---help--- Sally Floyd's High Speed TCP (RFC 3649) congestion control. 
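The net/ipv4/Kconfig hunks around this point drop the EXPERIMENTAL dependency from the pluggable TCP congestion-control algorithms (HSTCP, Hybla, Vegas, and so on). Each of these options builds a small module around struct tcp_congestion_ops. For context, here is a hypothetical minimal module of that shape; it assumes the 3.x-era callback signatures and simply reuses the exported Reno helpers rather than implementing any algorithm from this patch series:

/* example_cong.c - hypothetical minimal congestion-control module,
 * assuming the 3.x-era struct tcp_congestion_ops layout. It reuses
 * the stock Reno callbacks exported by net/ipv4/tcp_cong.c instead
 * of providing a new algorithm; "example" is a made-up name.
 */
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_example __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,	/* halve snd_cwnd on loss */
	.cong_avoid	= tcp_reno_cong_avoid,	/* slow start + AIMD growth */
	.name		= "example",
	.owner		= THIS_MODULE,
};

static int __init tcp_example_init(void)
{
	/* adds "example" to the set selectable by name */
	return tcp_register_congestion_control(&tcp_example);
}

static void __exit tcp_example_exit(void)
{
	tcp_unregister_congestion_control(&tcp_example);
}

module_init(tcp_example_init);
module_exit(tcp_example_exit);
MODULE_LICENSE("GPL");

Once loaded, such a module would be selectable system-wide via the net.ipv4.tcp_congestion_control sysctl or per socket via setsockopt(TCP_CONGESTION), which is why these Kconfig entries can stay tristate.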
@@ -499,7 +505,6 @@ config TCP_CONG_HSTCP config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" - depends on EXPERIMENTAL default n ---help--- TCP-Hybla is a sender-side only change that eliminates penalization of @@ -509,7 +514,6 @@ config TCP_CONG_HYBLA config TCP_CONG_VEGAS tristate "TCP Vegas" - depends on EXPERIMENTAL default n ---help--- TCP Vegas is a sender-side only change to TCP that anticipates @@ -520,7 +524,6 @@ config TCP_CONG_VEGAS config TCP_CONG_SCALABLE tristate "Scalable TCP" - depends on EXPERIMENTAL default n ---help--- Scalable TCP is a sender-side only change to TCP which uses a @@ -530,7 +533,6 @@ config TCP_CONG_SCALABLE config TCP_CONG_LP tristate "TCP Low Priority" - depends on EXPERIMENTAL default n ---help--- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is @@ -540,7 +542,6 @@ config TCP_CONG_LP config TCP_CONG_VENO tristate "TCP Veno" - depends on EXPERIMENTAL default n ---help--- TCP Veno is a sender-side only enhancement of TCP to obtain better @@ -552,7 +553,6 @@ config TCP_CONG_VENO config TCP_CONG_YEAH tristate "YeAH TCP" - depends on EXPERIMENTAL select TCP_CONG_VEGAS default n ---help--- @@ -567,7 +567,6 @@ config TCP_CONG_YEAH config TCP_CONG_ILLINOIS tristate "TCP Illinois" - depends on EXPERIMENTAL default n ---help--- TCP-Illinois is a sender-side modification of TCP Reno for @@ -631,8 +630,7 @@ config DEFAULT_TCP_CONG default "cubic" config TCP_MD5SIG - bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "TCP: MD5 Signature Option support (RFC2385)" select CRYPTO select CRYPTO_MD5 ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 15ca63ec604e..089cb9f36387 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -13,6 +13,7 @@ obj-y := route.o inetpeer.o protocol.o \ fib_frontend.o fib_semantics.o fib_trie.o \ inet_fragment.o ping.o +obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 24b384b7903e..d01be2a3ae53 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -111,10 +111,10 @@ #include <net/sock.h> #include <net/raw.h> #include <net/icmp.h> -#include <net/ipip.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/net_namespace.h> +#include <net/secure_seq.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif @@ -248,8 +248,12 @@ EXPORT_SYMBOL(inet_listen); u32 inet_ehash_secret __read_mostly; EXPORT_SYMBOL(inet_ehash_secret); +u32 ipv6_hash_secret __read_mostly; +EXPORT_SYMBOL(ipv6_hash_secret); + /* - * inet_ehash_secret must be set exactly once + * inet_ehash_secret must be set exactly once, and to a non-null value + * ipv6_hash_secret must be set exactly once.
*/ void build_ehash_secret(void) { @@ -259,24 +263,12 @@ void build_ehash_secret(void) get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - cmpxchg(&inet_ehash_secret, 0, rnd); -} -EXPORT_SYMBOL(build_ehash_secret); - -static inline int inet_netns_ok(struct net *net, __u8 protocol) -{ - const struct net_protocol *ipprot; - - if (net_eq(net, &init_net)) - return 1; - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot == NULL) { - /* raw IP is OK */ - return 1; + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) { + get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); + net_secret_init(); } - return ipprot->netns_ok; } +EXPORT_SYMBOL(build_ehash_secret); /* * Create an inet socket. @@ -350,10 +342,6 @@ lookup_protocol: !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; - err = -EAFNOSUPPORT; - if (!inet_netns_ok(net, protocol)) - goto out_rcu_unlock; - sock->ops = answer->ops; answer_prot = answer->prot; answer_no_check = answer->no_check; @@ -1297,15 +1285,16 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int ihl; int id; unsigned int offset = 0; - - if (!(features & NETIF_F_V4_CSUM)) - features &= ~NETIF_F_SG; + bool tunnel; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_TCPV6 | + SKB_GSO_UDP_TUNNEL | 0))) goto out; @@ -1320,6 +1309,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, ihl))) goto out; + tunnel = !!skb->encapsulation; + __skb_pull(skb, ihl); skb_reset_transport_header(skb); iph = ip_hdr(skb); @@ -1333,20 +1324,21 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ops->callbacks.gso_segment(skb, features); rcu_read_unlock(); - if (!segs || IS_ERR(segs)) + if (IS_ERR_OR_NULL(segs)) goto out; skb = segs; do { iph = ip_hdr(skb); - if (proto == IPPROTO_UDP) { + if (!tunnel && proto == IPPROTO_UDP) { iph->id = htons(id); iph->frag_off = htons(offset >> 3); if (skb->next != NULL) iph->frag_off |= htons(IP_MF); offset += (skb->len - skb->mac_len - iph->ihl * 4); - } else + } else { iph->id = htons(id++); + } iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); @@ -1590,7 +1582,7 @@ static const struct net_offload udp_offload = { static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, - .err_handler = ping_err, + .err_handler = icmp_err, .no_policy = 1, .netns_ok = 1, }; @@ -1705,12 +1697,11 @@ static struct packet_type ip_packet_type __read_mostly = { static int __init inet_init(void) { - struct sk_buff *dummy_skb; struct inet_protosw *q; struct list_head *r; int rc = -EINVAL; - BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); if (!sysctl_local_reserved_ports) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a69b4e4a02b5..2e7f1948216f 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -321,8 +321,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. 
*/ - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ded146b217f1..247ec1951c35 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -654,11 +654,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; - if (target_hw != NULL) - memcpy(arp_ptr, target_hw, dev->addr_len); - else - memset(arp_ptr, 0, dev->addr_len); - arp_ptr += dev->addr_len; + + switch (dev->type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) + case ARPHRD_IEEE1394: + break; +#endif + default: + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr += dev->addr_len; + } memcpy(arp_ptr, &dest_ip, 4); return skb; @@ -781,7 +789,14 @@ static int arp_process(struct sk_buff *skb) arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; - arp_ptr += dev->addr_len; + switch (dev_type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) + case ARPHRD_IEEE1394: + break; +#endif + default: + arp_ptr += dev->addr_len; + } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast @@ -1405,14 +1420,14 @@ static const struct file_operations arp_seq_fops = { static int __net_init arp_net_init(struct net *net) { - if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops)) + if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops)) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { - proc_net_remove(net, "arp"); + remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a8e4f2665d5e..dfc39d4d48b7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -63,6 +63,7 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <net/addrconf.h> #include "fib_lookup.h" @@ -93,6 +94,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, }; #define IN4_ADDR_HSIZE_SHIFT 8 @@ -137,10 +139,9 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; - struct hlist_node *node; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { if (ifa->ifa_local == addr) { struct net_device *dev = ifa->ifa_dev->dev; @@ -417,6 +418,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } +static void check_lifetime(struct work_struct *work); + +static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); + static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { @@ -462,6 +467,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, inet_hash_insert(dev_net(in_dev->dev), ifa); + cancel_delayed_work(&check_lifetime_work); + schedule_delayed_work(&check_lifetime_work, 0); + /* Send message first, then call notifier. 
Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ @@ -528,7 +536,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } -static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -573,7 +581,131 @@ errout: return err; } -static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) +#define INFINITY_LIFE_TIME 0xFFFFFFFF + +static void check_lifetime(struct work_struct *work) +{ + unsigned long now, next, next_sec, next_sched; + struct in_ifaddr *ifa; + struct hlist_node *n; + int i; + + now = jiffies; + next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); + + for (i = 0; i < IN4_ADDR_HSIZE; i++) { + bool change_needed = false; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { + unsigned long age; + + if (ifa->ifa_flags & IFA_F_PERMANENT) + continue; + + /* We try to batch several events at once. */ + age = (now - ifa->ifa_tstamp + + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && + age >= ifa->ifa_valid_lft) { + change_needed = true; + } else if (ifa->ifa_preferred_lft == + INFINITY_LIFE_TIME) { + continue; + } else if (age >= ifa->ifa_preferred_lft) { + if (time_before(ifa->ifa_tstamp + + ifa->ifa_valid_lft * HZ, next)) + next = ifa->ifa_tstamp + + ifa->ifa_valid_lft * HZ; + + if (!(ifa->ifa_flags & IFA_F_DEPRECATED)) + change_needed = true; + } else if (time_before(ifa->ifa_tstamp + + ifa->ifa_preferred_lft * HZ, + next)) { + next = ifa->ifa_tstamp + + ifa->ifa_preferred_lft * HZ; + } + } + rcu_read_unlock(); + if (!change_needed) + continue; + rtnl_lock(); + hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) { + unsigned long age; + + if (ifa->ifa_flags & IFA_F_PERMANENT) + continue; + + /* We try to batch several events at once. */ + age = (now - ifa->ifa_tstamp + + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && + age >= ifa->ifa_valid_lft) { + struct in_ifaddr **ifap; + + for (ifap = &ifa->ifa_dev->ifa_list; + *ifap != NULL; ifap = &(*ifap)->ifa_next) { + if (*ifap == ifa) { + inet_del_ifa(ifa->ifa_dev, + ifap, 1); + break; + } + } + } else if (ifa->ifa_preferred_lft != + INFINITY_LIFE_TIME && + age >= ifa->ifa_preferred_lft && + !(ifa->ifa_flags & IFA_F_DEPRECATED)) { + ifa->ifa_flags |= IFA_F_DEPRECATED; + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); + } + } + rtnl_unlock(); + } + + next_sec = round_jiffies_up(next); + next_sched = next; + + /* If rounded timeout is accurate enough, accept it. */ + if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) + next_sched = next_sec; + + now = jiffies; + /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. 
*/ + if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) + next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; + + schedule_delayed_work(&check_lifetime_work, next_sched - now); +} + +static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, + __u32 prefered_lft) +{ + unsigned long timeout; + + ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); + + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) + ifa->ifa_valid_lft = timeout; + else + ifa->ifa_flags |= IFA_F_PERMANENT; + + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa->ifa_flags |= IFA_F_DEPRECATED; + ifa->ifa_preferred_lft = timeout; + } + ifa->ifa_tstamp = jiffies; + if (!ifa->ifa_cstamp) + ifa->ifa_cstamp = ifa->ifa_tstamp; +} + +static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, + __u32 *pvalid_lft, __u32 *pprefered_lft) { struct nlattr *tb[IFA_MAX+1]; struct in_ifaddr *ifa; @@ -633,24 +765,77 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + if (tb[IFA_CACHEINFO]) { + struct ifa_cacheinfo *ci; + + ci = nla_data(tb[IFA_CACHEINFO]); + if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { + err = -EINVAL; + goto errout; + } + *pvalid_lft = ci->ifa_valid; + *pprefered_lft = ci->ifa_prefered; + } + return ifa; errout: return ERR_PTR(err); } -static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct in_ifaddr *ifa1, **ifap; + + if (!ifa->ifa_local) + return NULL; + + for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; + ifap = &ifa1->ifa_next) { + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa) && + ifa1->ifa_local == ifa->ifa_local) + return ifa1; + } + return NULL; +} + +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa; + struct in_ifaddr *ifa_existing; + __u32 valid_lft = INFINITY_LIFE_TIME; + __u32 prefered_lft = INFINITY_LIFE_TIME; ASSERT_RTNL(); - ifa = rtm_to_ifaddr(net, nlh); + ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft); if (IS_ERR(ifa)) return PTR_ERR(ifa); - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + ifa_existing = find_matching_ifa(ifa); + if (!ifa_existing) { + /* It would be best to check for !NLM_F_CREATE here but + * userspace already relies on not having to provide this.
+ */ + set_ifa_lifetime(ifa, valid_lft, prefered_lft); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + } else { + inet_free_ifa(ifa); + + if (nlh->nlmsg_flags & NLM_F_EXCL || + !(nlh->nlmsg_flags & NLM_F_REPLACE)) + return -EEXIST; + ifa = ifa_existing; + set_ifa_lifetime(ifa, valid_lft, prefered_lft); + cancel_delayed_work(&check_lifetime_work); + schedule_delayed_work(&check_lifetime_work, 0); + rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); + blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + } + return 0; } /* @@ -852,6 +1037,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } + set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; @@ -1190,6 +1376,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, ifa->ifa_dev = in_dev; ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, + INFINITY_LIFE_TIME); inet_insert_ifa(ifa); } } @@ -1246,11 +1434,30 @@ static size_t inet_nlmsg_size(void) + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ } +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} + +static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, + unsigned long tstamp, u32 preferred, u32 valid) +{ + struct ifa_cacheinfo ci; + + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); + ci.ifa_prefered = preferred; + ci.ifa_valid = valid; + + return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); +} + static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, u32 portid, u32 seq, int event, unsigned int flags) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; + u32 preferred, valid; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); if (nlh == NULL) @@ -1259,10 +1466,31 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; - ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; + ifm->ifa_flags = ifa->ifa_flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (!(ifm->ifa_flags & IFA_F_PERMANENT)) { + preferred = ifa->ifa_preferred_lft; + valid = ifa->ifa_valid_lft; + if (preferred != INFINITY_LIFE_TIME) { + long tval = (jiffies - ifa->ifa_tstamp) / HZ; + + if (preferred > tval) + preferred -= tval; + else + preferred = 0; + if (valid != INFINITY_LIFE_TIME) { + if (valid > tval) + valid -= tval; + else + valid = 0; + } + } + } else { + preferred = INFINITY_LIFE_TIME; + valid = INFINITY_LIFE_TIME; + } if ((ifa->ifa_address && nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && @@ -1270,7 +1498,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, (ifa->ifa_broadcast && nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && - nla_put_string(skb, IFA_LABEL, ifa->ifa_label))) + nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, + preferred, valid)) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -1290,7 +1520,6 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) struct in_device *in_dev; struct in_ifaddr *ifa; struct hlist_head *head; - struct hlist_node *node; s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -1300,7 +1529,9 @@ static int 
inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; head = &net->dev_index_head[h]; rcu_read_lock(); - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ + net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; if (h > s_h || idx > s_idx) @@ -1320,6 +1551,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_unlock(); goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } cont: idx++; @@ -1531,8 +1763,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, - struct nlmsghdr *nlh, - void *arg) + struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; @@ -1592,6 +1823,77 @@ errout: return err; } +static int inet_netconf_dump_devconf(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, s_idx; + struct net_device *dev; + struct in_device *in_dev; + struct hlist_head *head; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ + net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto cont; + + if (inet_netconf_fill_devconf(skb, dev->ifindex, + &in_dev->cnf, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + -1) <= 0) { + rcu_read_unlock(); + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + rcu_read_unlock(); + } + if (h == NETDEV_HASHENTRIES) { + if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv4.devconf_all, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } + if (h == NETDEV_HASHENTRIES + 1) { + if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv4.devconf_dflt, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } +done: + cb->args[0] = h; + cb->args[1] = idx; + + return skb->len; +} + #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) @@ -1988,12 +2290,14 @@ void __init devinet_init(void) register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); + schedule_delayed_work(&check_lifetime_work, 0); + rtnl_af_register(&inet_af_ops); rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, - NULL, NULL); + inet_netconf_dump_devconf, NULL); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 3b4f0cd2e63e..4cfe34d4cc96 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -139,8 +139,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) /* skb is pure payload to encrypt */ - err = -ENOMEM; - esp = x->data; aead = esp->aead; alen = crypto_aead_authsize(aead); @@ -176,8 +174,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); - if (!tmp) + if (!tmp) { + err 
= -ENOMEM; goto error; + } seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 5cd75e2dab2c..c7629a209f9d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -112,7 +112,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id) struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; - struct hlist_node *node; struct hlist_head *head; unsigned int h; @@ -122,7 +121,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id) rcu_read_lock(); head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (tb->tb_id == id) { rcu_read_unlock(); return tb; @@ -137,13 +136,12 @@ static void fib_flush(struct net *net) { int flushed = 0; struct fib_table *tb; - struct hlist_node *node; struct hlist_head *head; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, node, head, tb_hlist) + hlist_for_each_entry(tb, head, tb_hlist) flushed += fib_table_flush(tb); } @@ -606,7 +604,7 @@ errout: return err; } -static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -628,7 +626,7 @@ errout: return err; } -static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -656,7 +654,6 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; - struct hlist_node *node; struct hlist_head *head; int dumped = 0; @@ -670,7 +667,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, node, head, tb_hlist) { + hlist_for_each_entry(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) @@ -960,8 +957,8 @@ static void nl_fib_input(struct sk_buff *skb) net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); - if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || - nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) + if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(*frn)) return; skb = skb_clone(skb, GFP_KERNEL); @@ -969,12 +966,12 @@ static void nl_fib_input(struct sk_buff *skb) return; nlh = nlmsg_hdr(skb); - frn = (struct fib_result_nl *) NLMSG_DATA(nlh); + frn = (struct fib_result_nl *) nlmsg_data(nlh); tb = fib_get_table(net, frn->tb_id_in); nl_fib_lookup(frn, tb); - portid = NETLINK_CB(skb).portid; /* pid of sending process */ + portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT); @@ -1117,11 +1114,11 @@ static void ip_fib_net_exit(struct net *net) for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; head = &net->ipv4.fib_table_hash[i]; - hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { - hlist_del(node); + hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { + hlist_del(&tb->tb_hlist); fib_table_flush(tb); 
fib_free_table(tb); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4797a800faf8..8f6cb7a87cd6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -298,14 +298,13 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) static struct fib_info *fib_find_info(const struct fib_info *nfi) { struct hlist_head *head; - struct hlist_node *node; struct fib_info *fi; unsigned int hash; hash = fib_info_hashfn(nfi); head = &fib_info_hash[hash]; - hlist_for_each_entry(fi, node, head, fib_hash) { + hlist_for_each_entry(fi, head, fib_hash) { if (!net_eq(fi->fib_net, nfi->fib_net)) continue; if (fi->fib_nhs != nfi->fib_nhs) @@ -331,7 +330,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; - struct hlist_node *node; struct fib_nh *nh; unsigned int hash; @@ -339,7 +337,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; - hlist_for_each_entry(nh, node, head, nh_hash) { + hlist_for_each_entry(nh, head, nh_hash) { if (nh->nh_dev == dev && nh->nh_gw == gw && !(nh->nh_flags & RTNH_F_DEAD)) { @@ -721,10 +719,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; - struct hlist_node *node, *n; + struct hlist_node *n; struct fib_info *fi; - hlist_for_each_entry_safe(fi, node, n, head, fib_hash) { + hlist_for_each_entry_safe(fi, n, head, fib_hash) { struct hlist_head *dest; unsigned int new_hash; @@ -739,10 +737,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, for (i = 0; i < old_size; i++) { struct hlist_head *lhead = &fib_info_laddrhash[i]; - struct hlist_node *node, *n; + struct hlist_node *n; struct fib_info *fi; - hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) { + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { struct hlist_head *ldest; unsigned int new_hash; @@ -1096,13 +1094,12 @@ int fib_sync_down_addr(struct net *net, __be32 local) int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; - struct hlist_node *node; struct fib_info *fi; if (fib_info_laddrhash == NULL || local == 0) return 0; - hlist_for_each_entry(fi, node, head, fib_lhash) { + hlist_for_each_entry(fi, head, fib_lhash) { if (!net_eq(fi->fib_net, net)) continue; if (fi->fib_prefsrc == local) { @@ -1120,13 +1117,12 @@ int fib_sync_down_dev(struct net_device *dev, int force) struct fib_info *prev_fi = NULL; unsigned int hash = fib_devindex_hashfn(dev->ifindex); struct hlist_head *head = &fib_info_devhash[hash]; - struct hlist_node *node; struct fib_nh *nh; if (force) scope = -1; - hlist_for_each_entry(nh, node, head, nh_hash) { + hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int dead; @@ -1232,7 +1228,6 @@ int fib_sync_up(struct net_device *dev) struct fib_info *prev_fi; unsigned int hash; struct hlist_head *head; - struct hlist_node *node; struct fib_nh *nh; int ret; @@ -1244,7 +1239,7 @@ int fib_sync_up(struct net_device *dev) head = &fib_info_devhash[hash]; ret = 0; - hlist_for_each_entry(nh, node, head, nh_hash) { + hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int alive; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 31d771ca9a70..49616fed9340 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -125,7 +125,6 @@ struct 
tnode { unsigned int empty_children; /* KEYLENGTH bits needed */ union { struct rcu_head rcu; - struct work_struct work; struct tnode *tnode_free; }; struct rt_trie_node __rcu *child[0]; @@ -383,12 +382,6 @@ static struct tnode *tnode_alloc(size_t size) return vzalloc(size); } -static void __tnode_vfree(struct work_struct *arg) -{ - struct tnode *tn = container_of(arg, struct tnode, work); - vfree(tn); -} - static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); @@ -397,10 +390,8 @@ static void __tnode_free_rcu(struct rcu_head *head) if (size <= PAGE_SIZE) kfree(tn); - else { - INIT_WORK(&tn->work, __tnode_vfree); - schedule_work(&tn->work); - } + else + vfree(tn); } static inline void tnode_free(struct tnode *tn) @@ -920,10 +911,9 @@ nomem: static struct leaf_info *find_leaf_info(struct leaf *l, int plen) { struct hlist_head *head = &l->list; - struct hlist_node *node; struct leaf_info *li; - hlist_for_each_entry_rcu(li, node, head, hlist) + hlist_for_each_entry_rcu(li, head, hlist) if (li->plen == plen) return li; @@ -943,12 +933,11 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen) static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { struct leaf_info *li = NULL, *last = NULL; - struct hlist_node *node; if (hlist_empty(head)) { hlist_add_head_rcu(&new->hlist, head); } else { - hlist_for_each_entry(li, node, head, hlist) { + hlist_for_each_entry(li, head, hlist) { if (new->plen > li->plen) break; @@ -1354,9 +1343,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, { struct leaf_info *li; struct hlist_head *hhead = &l->list; - struct hlist_node *node; - hlist_for_each_entry_rcu(li, node, hhead, hlist) { + hlist_for_each_entry_rcu(li, hhead, hlist) { struct fib_alias *fa; if (l->key != (key & li->mask_plen)) @@ -1740,10 +1728,10 @@ static int trie_flush_leaf(struct leaf *l) { int found = 0; struct hlist_head *lih = &l->list; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; struct leaf_info *li = NULL; - hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { + hlist_for_each_entry_safe(li, tmp, lih, hlist) { found += trie_flush_list(&li->falh); if (list_empty(&li->falh)) { @@ -1895,14 +1883,13 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { struct leaf_info *li; - struct hlist_node *node; int i, s_i; s_i = cb->args[4]; i = 0; /* rcu_read_lock is hold by caller */ - hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + hlist_for_each_entry_rcu(li, &l->list, hlist) { if (i < s_i) { i++; continue; @@ -2092,14 +2079,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) if (IS_LEAF(n)) { struct leaf *l = (struct leaf *)n; struct leaf_info *li; - struct hlist_node *tmp; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; - hlist_for_each_entry_rcu(li, tmp, &l->list, hlist) + hlist_for_each_entry_rcu(li, &l->list, hlist) ++s->prefixes; } else { const struct tnode *tn = (const struct tnode *) n; @@ -2200,10 +2186,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; - struct hlist_node *node; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = (struct trie *) tb->tb_data; struct trie_stat stat; @@ -2245,10 +2230,9 @@ 
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; - struct hlist_node *node; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct rt_trie_node *n; for (n = fib_trie_get_first(iter, @@ -2298,7 +2282,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; @@ -2381,13 +2365,12 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) } else { struct leaf *l = (struct leaf *) n; struct leaf_info *li; - struct hlist_node *node; __be32 val = htonl(l->key); seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); - hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + hlist_for_each_entry_rcu(li, &l->list, hlist) { struct fib_alias *fa; list_for_each_entry_rcu(fa, &li->falh, fa_list) { @@ -2532,7 +2515,6 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct leaf *l = v; struct leaf_info *li; - struct hlist_node *node; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2541,7 +2523,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } - hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + hlist_for_each_entry_rcu(li, &l->list, hlist) { struct fib_alias *fa; __be32 mask, prefix; @@ -2607,31 +2589,31 @@ static const struct file_operations fib_route_fops = { int __net_init fib_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops)) + if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops)) goto out1; - if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO, - &fib_triestat_fops)) + if (!proc_create("fib_triestat", S_IRUGO, net->proc_net, + &fib_triestat_fops)) goto out2; - if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops)) + if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops)) goto out3; return 0; out3: - proc_net_remove(net, "fib_triestat"); + remove_proc_entry("fib_triestat", net->proc_net); out2: - proc_net_remove(net, "fib_trie"); + remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { - proc_net_remove(net, "fib_trie"); - proc_net_remove(net, "fib_triestat"); - proc_net_remove(net, "route"); + remove_proc_entry("fib_trie", net->proc_net); + remove_proc_entry("fib_triestat", net->proc_net); + remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index 42a491055c76..b2e805af9b87 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -19,6 +19,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/netdevice.h> +#include <linux/if_tunnel.h> #include <linux/spinlock.h> #include <net/protocol.h> #include <net/gre.h> @@ -112,12 +113,113 @@ static void gre_err(struct sk_buff *skb, u32 info) rcu_read_unlock(); } +static struct sk_buff *gre_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t enc_features; + int ghl = GRE_HEADER_SECTION; + struct gre_base_hdr *greh; + int mac_len = 
skb->mac_len; + __be16 protocol = skb->protocol; + int tnl_hlen; + bool csum; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_TCPV6 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_GRE))) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) + goto out; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + + if (greh->flags & GRE_KEY) + ghl += GRE_HEADER_SECTION; + if (greh->flags & GRE_SEQ) + ghl += GRE_HEADER_SECTION; + if (greh->flags & GRE_CSUM) { + ghl += GRE_HEADER_SECTION; + csum = true; + } else + csum = false; + + /* setup inner skb. */ + skb->protocol = greh->protocol; + skb->encapsulation = 0; + + if (unlikely(!pskb_may_pull(skb, ghl))) + goto out; + __skb_pull(skb, ghl); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) + goto out; + + skb = segs; + tnl_hlen = skb_tnl_header_len(skb); + do { + __skb_push(skb, ghl); + if (csum) { + __be32 *pcsum; + + if (skb_has_shared_frag(skb)) { + int err; + + err = __skb_linearize(skb); + if (err) { + kfree_skb(segs); + segs = ERR_PTR(err); + goto out; + } + } + + greh = (struct gre_base_hdr *)(skb->data); + pcsum = (__be32 *)(greh + 1); + *pcsum = 0; + *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); + } + __skb_push(skb, tnl_hlen - ghl); + + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb->mac_len = mac_len; + skb->protocol = protocol; + } while ((skb = skb->next)); +out: + return segs; +} + +static int gre_gso_send_check(struct sk_buff *skb) +{ + if (!skb->encapsulation) + return -EINVAL; + return 0; +} + static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, .netns_ok = 1, }; +static const struct net_offload gre_offload = { + .callbacks = { + .gso_send_check = gre_gso_send_check, + .gso_segment = gre_gso_segment, + }, +}; + static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); @@ -127,11 +229,18 @@ static int __init gre_init(void) return -EAGAIN; } + if (inet_add_offload(&gre_offload, IPPROTO_GRE)) { + pr_err("can't add protocol offload\n"); + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); + return -EAGAIN; + } + return 0; } static void __exit gre_exit(void) { + inet_del_offload(&gre_offload, IPPROTO_GRE); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 17ff9fd7cdda..76e10b47e053 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -881,7 +881,7 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_NONE: skb->csum = 0; if (__skb_checksum_complete(skb)) - goto error; + goto csum_error; } if (!pskb_pull(skb, sizeof(*icmph))) @@ -929,11 +929,36 @@ int icmp_rcv(struct sk_buff *skb) drop: kfree_skb(skb); return 0; +csum_error: + ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS); error: ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto drop; } +void icmp_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; + struct net *net = dev_net(skb->dev); + + /* + * Use ping_err to handle all icmp errors except those + * triggered by ICMP_ECHOREPLY, which is sent by the kernel.
+ */ + if (icmph->type != ICMP_ECHOREPLY) { + ping_err(skb, info); + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + else if (type == ICMP_REDIRECT) + ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); +} + /* * This table is the definition of how we handle ICMP. */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 736ab70fd179..d8c232794bcb 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2646,24 +2646,25 @@ static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops); + pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); if (!pde) goto out_igmp; - pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + pde = proc_create("mcfilter", S_IRUGO, net->proc_net, + &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; return 0; out_mcfilter: - proc_net_remove(net, "igmp"); + remove_proc_entry("igmp", net->proc_net); out_igmp: return -ENOMEM; } static void __net_exit igmp_net_exit(struct net *net) { - proc_net_remove(net, "mcfilter"); - proc_net_remove(net, "igmp"); + remove_proc_entry("mcfilter", net->proc_net); + remove_proc_entry("igmp", net->proc_net); } static struct pernet_operations igmp_net_ops = { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d0670f00d524..6acb541c9091 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -57,8 +57,9 @@ int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) { struct sock *sk2; - struct hlist_node *node; int reuse = sk->sk_reuse; + int reuseport = sk->sk_reuseport; + kuid_t uid = sock_i_uid((struct sock *)sk); /* * Unlike other sk lookup places we do not check @@ -67,14 +68,17 @@ int inet_csk_bind_conflict(const struct sock *sk, * one this bucket belongs to. 
*/ - sk_for_each_bound(sk2, node, &tb->owners) { + sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if (!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) { + if ((!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + (!reuseport || !sk2->sk_reuseport || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, sock_i_uid(sk2))))) { const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || sk2_rcv_saddr == sk_rcv_saddr(sk)) @@ -90,7 +94,7 @@ int inet_csk_bind_conflict(const struct sock *sk, } } } - return node != NULL; + return sk2 != NULL; } EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); @@ -101,11 +105,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_bind_hashbucket *head; - struct hlist_node *node; struct inet_bind_bucket *tb; int ret, attempts = 5; struct net *net = sock_net(sk); int smallest_size = -1, smallest_rover; + kuid_t uid = sock_i_uid(sk); local_bh_disable(); if (!snum) { @@ -123,11 +127,14 @@ again: head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == rover) { - if (tb->fastreuse > 0 && - sk->sk_reuse && - sk->sk_state != TCP_LISTEN && + if (((tb->fastreuse > 0 && + sk->sk_reuse && + sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + uid_eq(tb->fastuid, uid))) && (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; @@ -174,7 +181,7 @@ have_snum: head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == snum) goto tb_found; } @@ -185,14 +192,18 @@ tb_found: if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN && + if (((tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && smallest_size == -1) { goto success; } else { ret = 1; if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && + if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && smallest_size != -1 && --attempts >= 0) { spin_unlock(&head->lock); goto again; @@ -212,9 +223,19 @@ tb_not_found: tb->fastreuse = 1; else tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; + if (sk->sk_reuseport) { + tb->fastreuseport = 1; + tb->fastuid = uid; + } else + tb->fastreuseport = 0; + } else { + if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; + if (tb->fastreuseport && + (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + tb->fastreuseport = 0; + } success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); @@ -538,7 +559,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) { - int err = 
req->rsk_ops->rtx_syn_ack(parent, req, NULL); + int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) req->num_retrans++; @@ -714,6 +735,7 @@ EXPORT_SYMBOL(inet_csk_destroy_sock); * tcp/dccp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) + __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7afa2c3c788f..5f648751fce2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -158,7 +158,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, #define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS || + icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); @@ -322,7 +324,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s } err = sk_diag_fill(sk, rep, req, - sk_user_ns(NETLINK_CB(in_skb).ssk), + sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, nlh); if (err < 0) { @@ -628,7 +630,7 @@ static int inet_csk_diag_dump(struct sock *sk, return 0; return inet_csk_diag_fill(sk, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).ssk), + sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } @@ -803,7 +805,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } err = inet_diag_fill_req(skb, sk, req, - sk_user_ns(NETLINK_CB(cb->skb).ssk), + sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh); if (err < 0) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 4750d2b74d79..7e06641e36ae 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -21,7 +21,30 @@ #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <net/sock.h> #include <net/inet_frag.h> +#include <net/inet_ecn.h> + +/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. 
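How the table defined below gets fed: every fragment contributes a single bit derived from the ECN field of its TOS byte, the bits are OR-ed together in the queue, and reassembly does one table lookup. A standalone sketch; the constants restate the IPFRAG_* values that ip_fragment.c stops defining privately further down:

#include <stdint.h>

#define ECN_MASK	0x03	/* INET_ECN_MASK: Not-ECT=0 ECT(1)=1 ECT(0)=2 CE=3 */
#define FRAG_NOT_ECT	0x01	/* 1 << 0 */
#define FRAG_ECT_1	0x02	/* 1 << 1 */
#define FRAG_ECT_0	0x04	/* 1 << 2 */
#define FRAG_CE		0x08	/* 1 << 3 */

/* One bit per fragment, i.e. ip4_frag_ecn(tos) accumulated into
 * qp->ecn; the result indexes ip_frag_ecn_table[] exactly once. */
static uint8_t frag_ecn_bits(const uint8_t *tos, int nfrags)
{
	uint8_t acc = 0;
	int i;

	for (i = 0; i < nfrags; i++)
		acc |= 1u << (tos[i] & ECN_MASK);
	return acc;
}

A CE fragment mixed with ECT ones yields INET_ECN_CE for the reassembled header; any mix of Not-ECT with marked fragments indexes to 0xff and the frame is dropped.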
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field + */ +const u8 ip_frag_ecn_table[16] = { + /* at least one fragment had CE, and others ECT_0 or ECT_1 */ + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + + /* invalid combinations : drop frame */ + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, +}; +EXPORT_SYMBOL(ip_frag_ecn_table); static void inet_frag_secret_rebuild(unsigned long dummy) { @@ -29,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy) unsigned long now = jiffies; int i; + /* Per bucket lock NOT needed here, due to write lock protection */ write_lock(&f->lock); + get_random_bytes(&f->rnd, sizeof(u32)); for (i = 0; i < INETFRAGS_HASHSZ; i++) { + struct inet_frag_bucket *hb; struct inet_frag_queue *q; - struct hlist_node *p, *n; + struct hlist_node *n; - hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) { + hb = &f->hash[i]; + hlist_for_each_entry_safe(q, n, &hb->chain, list) { unsigned int hval = f->hashfn(q); if (hval != i) { + struct inet_frag_bucket *hb_dest; + hlist_del(&q->list); /* Relink to new hash chain. */ - hlist_add_head(&q->list, &f->hash[hval]); + hb_dest = &f->hash[hval]; + hlist_add_head(&q->list, &hb_dest->chain); } } } @@ -55,9 +85,12 @@ void inet_frags_init(struct inet_frags *f) { int i; - for (i = 0; i < INETFRAGS_HASHSZ; i++) - INIT_HLIST_HEAD(&f->hash[i]); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { + struct inet_frag_bucket *hb = &f->hash[i]; + spin_lock_init(&hb->chain_lock); + INIT_HLIST_HEAD(&hb->chain); + } rwlock_init(&f->lock); f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ @@ -73,8 +106,9 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { nf->nqueues = 0; - atomic_set(&nf->mem, 0); + init_frag_mem_limit(nf); INIT_LIST_HEAD(&nf->lru_list); + spin_lock_init(&nf->lru_lock); } EXPORT_SYMBOL(inet_frags_init_net); @@ -91,16 +125,26 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) local_bh_disable(); inet_frag_evictor(nf, f, true); local_bh_enable(); + + percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) { - write_lock(&f->lock); + struct inet_frag_bucket *hb; + unsigned int hash; + + read_lock(&f->lock); + hash = f->hashfn(fq); + hb = &f->hash[hash]; + + spin_lock(&hb->chain_lock); hlist_del(&fq->list); - list_del(&fq->lru_list); - fq->net->nqueues--; - write_unlock(&f->lock); + spin_unlock(&hb->chain_lock); + + read_unlock(&f->lock); + inet_frag_lru_del(fq); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -117,12 +161,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) EXPORT_SYMBOL(inet_frag_kill); static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb, int *work) + struct sk_buff *skb) { - if (work) - *work -= skb->truesize; - - atomic_sub(skb->truesize, &nf->mem); if (f->skb_free) f->skb_free(skb); kfree_skb(skb); @@ -133,6 +173,7 @@ void 
inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, { struct sk_buff *fp; struct netns_frags *nf; + unsigned int sum, sum_truesize = 0; WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -143,13 +184,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(nf, f, fp, work); + sum_truesize += fp->truesize; + frag_kfree_skb(nf, f, fp); fp = xp; } - + sum = sum_truesize + f->qsize; if (work) - *work -= f->qsize; - atomic_sub(f->qsize, &nf->mem); + *work -= sum; + sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); @@ -164,22 +206,26 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) int work, evicted = 0; if (!force) { - if (atomic_read(&nf->mem) <= nf->high_thresh) + if (frag_mem_limit(nf) <= nf->high_thresh) return 0; } - work = atomic_read(&nf->mem) - nf->low_thresh; + work = frag_mem_limit(nf) - nf->low_thresh; while (work > 0) { - read_lock(&f->lock); + spin_lock(&nf->lru_lock); + if (list_empty(&nf->lru_list)) { - read_unlock(&f->lock); + spin_unlock(&nf->lru_lock); break; } q = list_first_entry(&nf->lru_list, struct inet_frag_queue, lru_list); atomic_inc(&q->refcnt); - read_unlock(&f->lock); + /* Remove q from list to avoid several CPUs grabbing it */ + list_del_init(&q->lru_list); + + spin_unlock(&nf->lru_lock); spin_lock(&q->lock); if (!(q->last_in & INET_FRAG_COMPLETE)) @@ -199,28 +245,32 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, void *arg) { + struct inet_frag_bucket *hb; struct inet_frag_queue *qp; #ifdef CONFIG_SMP - struct hlist_node *n; #endif unsigned int hash; - write_lock(&f->lock); + read_lock(&f->lock); /* Protects against hash rebuild */ /* * While we stayed w/o the lock other CPU could update * the rnd seed, so we need to re-calculate the hash * chain. Fortunatelly the qp_in can be used to get one. */ hash = f->hashfn(qp_in); + hb = &f->hash[hash]; + spin_lock(&hb->chain_lock); + #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we - * promoted read lock to write lock. + * released the hash bucket lock. 
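The recheck that follows is the classic lookup, unlock, allocate, relock, search-again shape. A compressed pthread rendition with all names invented; the kernel version differs in using a read lock plus per-bucket spinlocks and refcounts:

#include <pthread.h>
#include <stdlib.h>

struct entry { int key; struct entry *next; };
struct bucket { pthread_mutex_t lock; struct entry *head; };

static struct entry *find(struct bucket *b, int key)
{
	struct entry *e;

	for (e = b->head; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

static struct entry *get_or_create(struct bucket *b, int key)
{
	struct entry *e, *other;

	pthread_mutex_lock(&b->lock);
	e = find(b, key);
	pthread_mutex_unlock(&b->lock);
	if (e)
		return e;

	e = malloc(sizeof(*e));		/* may block: done unlocked */
	if (!e)
		return NULL;
	e->key = key;

	pthread_mutex_lock(&b->lock);
	other = find(b, key);		/* lost the race to another thread? */
	if (other) {
		pthread_mutex_unlock(&b->lock);
		free(e);		/* discard our duplicate */
		return other;
	}
	e->next = b->head;
	b->head = e;
	pthread_mutex_unlock(&b->lock);
	return e;
}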
*/ - hlist_for_each_entry(qp, n, &f->hash[hash], list) { + hlist_for_each_entry(qp, &hb->chain, list) { if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); - write_unlock(&f->lock); + spin_unlock(&hb->chain_lock); + read_unlock(&f->lock); qp_in->last_in |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; @@ -232,10 +282,10 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - hlist_add_head(&qp->list, &f->hash[hash]); - list_add_tail(&qp->lru_list, &nf->lru_list); - nf->nqueues++; - write_unlock(&f->lock); + hlist_add_head(&qp->list, &hb->chain); + spin_unlock(&hb->chain_lock); + read_unlock(&f->lock); + inet_frag_lru_add(nf, qp); return qp; } @@ -250,10 +300,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, q->net = nf; f->constructor(q, arg); - atomic_add(f->qsize, &nf->mem); + add_frag_mem_limit(q, f->qsize); + setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); + INIT_LIST_HEAD(&q->lru_list); return q; } @@ -274,18 +326,40 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) __releases(&f->lock) { + struct inet_frag_bucket *hb; struct inet_frag_queue *q; - struct hlist_node *n; + int depth = 0; - hlist_for_each_entry(q, n, &f->hash[hash], list) { + hb = &f->hash[hash]; + + spin_lock(&hb->chain_lock); + hlist_for_each_entry(q, &hb->chain, list) { if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); + spin_unlock(&hb->chain_lock); read_unlock(&f->lock); return q; } + depth++; } + spin_unlock(&hb->chain_lock); read_unlock(&f->lock); - return inet_frag_create(nf, f, key); + if (depth <= INETFRAGS_MAXDEPTH) + return inet_frag_create(nf, f, key); + else + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); + +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix) +{ + static const char msg[] = "inet_frag_find: Fragment hash bucket" + " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) + ". Dropping fragment.\n"; + + if (PTR_ERR(q) == -ENOBUFS) + LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); +} +EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fa3ae8148710..6af375afeeef 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, write_pnet(&tb->ib_net, hold_net(net)); tb->port = snum; tb->fastreuse = 0; + tb->fastreuseport = 0; tb->num_owners = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); @@ -119,13 +120,12 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ - struct hlist_node *node; - inet_bind_bucket_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), sock_net(sk)) && tb->port == port) break; } - if (!node) { + if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, sock_net(sk), head, port); if (!tb) { @@ -151,16 +151,16 @@ static inline int compute_score(struct sock *sk, struct net *net, if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { __be32 rcv_saddr = inet->inet_rcv_saddr; - score = sk->sk_family == PF_INET ? 
1 : 0; + score = sk->sk_family == PF_INET ? 2 : 1; if (rcv_saddr) { if (rcv_saddr != daddr) return -1; - score += 2; + score += 4; } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) return -1; - score += 2; + score += 4; } } return score; @@ -176,6 +176,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) { @@ -183,17 +184,29 @@ struct sock *__inet_lookup_listener(struct net *net, struct hlist_nulls_node *node; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore; + int score, hiscore, matches = 0, reuseport = 0; + u32 phash = 0; rcu_read_lock(); begin: result = NULL; - hiscore = -1; + hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { result = sk; hiscore = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == hiscore && reuseport) { + matches++; + if (((u64)phash * matches) >> 32 == 0) + result = sk; + phash = next_pseudo_random32(phash); } } /* @@ -479,7 +492,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, int i, remaining, low, high, port; static u32 hint; u32 offset = hint + port_offset; - struct hlist_node *node; struct inet_timewait_sock *tw = NULL; inet_get_local_port_range(&low, &high); @@ -498,10 +510,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, * because the established check is already * unique enough. */ - inet_bind_bucket_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), net) && tb->port == port) { - if (tb->fastreuse >= 0) + if (tb->fastreuse >= 0 || + tb->fastreuseport >= 0) goto next_port; WARN_ON(hlist_empty(&tb->owners)); if (!check_established(death_row, sk, @@ -518,6 +531,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, break; } tb->fastreuse = -1; + tb->fastreuseport = -1; goto ok; next_port: diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index cc280a3f4f96..1975f52933c5 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/if_vlan.h> #include <linux/inet_lro.h> +#include <net/checksum.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); @@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) *(p+2) = lro_desc->tcp_rcv_tsecr; } + csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); iph->tot_len = htons(lro_desc->ip_tot_len); - iph->check = 0; - iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); - tcph->check = 0; tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2784db3155fb..1f27c9f4afd0 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -216,7 +216,6 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, const int slot) { struct inet_timewait_sock *tw; - struct hlist_node *node; unsigned int killed; int ret; @@ -229,7 +228,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, killed 
= 0; ret = 0; rescan: - inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { + inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { __inet_twsk_del_dead_node(tw); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); @@ -438,10 +437,10 @@ void inet_twdr_twcal_tick(unsigned long data) for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { if (time_before_eq(j, now)) { - struct hlist_node *node, *safe; + struct hlist_node *safe; struct inet_timewait_sock *tw; - inet_twsk_for_each_inmate_safe(tw, node, safe, + inet_twsk_for_each_inmate_safe(tw, safe, &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb9d63a570cd..b66910aaef4d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -79,40 +79,11 @@ struct ipq { struct inet_peer *peer; }; -/* RFC 3168 support : - * We want to check ECN values of all fragments, do detect invalid combinations. - * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. - */ -#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ -#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ -#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ -#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ - static inline u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } -/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements - * Value : 0xff if frame should be dropped. - * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field - */ -static const u8 ip4_frag_ecn_table[16] = { - /* at least one fragment had CE, and others ECT_0 or ECT_1 */ - [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, - [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, - [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, - - /* invalid combinations : drop frame */ - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, - [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, -}; - static struct inet_frags ip4_frags; int ip_frag_nqueues(struct net *net) @@ -122,7 +93,7 @@ int ip_frag_nqueues(struct net *net) int ip_frag_mem(struct net *net) { - return atomic_read(&net->ipv4.frags.mem); + return sum_frag_mem_limit(&net->ipv4.frags); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -161,13 +132,6 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -/* Memory Tracking Functions. 
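The accounting change running through this file: instead of one atomic subtraction per freed fragment (frag_kfree_skb() below goes away), truesize is summed on the walk and charged once. A toy version with a plain C11 atomic standing in for the percpu counter:

#include <stdatomic.h>
#include <stdlib.h>

struct frag { struct frag *next; long truesize; };

static _Atomic long frag_mem;		/* stands in for percpu nf->mem */

static long drop_frags(struct frag *head)
{
	long sum = 0;

	while (head) {
		struct frag *next = head->next;

		sum += head->truesize;	/* tally while freeing */
		free(head);
		head = next;
	}
	atomic_fetch_sub(&frag_mem, sum);	/* one sub_frag_mem_limit() */
	return sum;
}

Batching matters because the counter is now a percpu_counter; touching it once per skb would defeat the point of making it cheap.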
*/ -static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) -{ - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); -} - static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); @@ -255,8 +219,7 @@ static void ip_expire(unsigned long arg) if (!head->dev) goto out_rcu_unlock; - /* skb dst is stale, drop it, and perform route lookup again */ - skb_dst_drop(head); + /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); @@ -299,14 +262,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (q == NULL) - goto out_nomem; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct ipq, q); - -out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); - return NULL; } /* Is the fragment too far ahead to be part of ipq? */ @@ -340,6 +300,7 @@ static inline int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; + unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); @@ -349,9 +310,12 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp); + + sum_truesize += fp->truesize; + kfree_skb(fp); fp = xp; } while (fp); + sub_frag_mem_limit(&qp->q, sum_truesize); qp->q.last_in = 0; qp->q.len = 0; @@ -496,7 +460,8 @@ found: qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it); + sub_frag_mem_limit(&qp->q, free_it->truesize); + kfree_skb(free_it); } } @@ -519,7 +484,7 @@ found: qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - atomic_add(skb->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; @@ -528,12 +493,17 @@ found: qp->q.max_size = skb->len + ihl; if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - qp->q.meat == qp->q.len) - return ip_frag_reasm(qp, prev, dev); + qp->q.meat == qp->q.len) { + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + err = ip_frag_reasm(qp, prev, dev); + skb->_skb_refdst = orefdst; + return err; + } - write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); - write_unlock(&ip4_frags.lock); + skb_dst_drop(skb); + inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -558,7 +528,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, ipq_kill(qp); - ecn = ip4_frag_ecn_table[qp->ecn]; + ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; @@ -594,7 +564,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. 
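For orientation, the gate that decides when ip_frag_reasm() above runs, restated on its own: both terminal fragments must have arrived and the byte count must add up. Constants are illustrative, not the kernel's INET_FRAG_* values:

#include <stdbool.h>
#include <stdint.h>

#define FRAG_FIRST_IN	0x01	/* offset-0 fragment seen */
#define FRAG_LAST_IN	0x02	/* fragment with MF clear seen */

static bool reasm_ready(unsigned last_in, uint32_t meat, uint32_t len)
{
	/* every byte between first and last accounted for? */
	return last_in == (FRAG_FIRST_IN | FRAG_LAST_IN) && meat == len;
}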
*/ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split @@ -617,7 +587,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, clone->truesize); } skb_push(head, head->data - skb_network_header(head)); @@ -645,7 +615,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } fp = next; } - atomic_sub(sum_truesize, &qp->q.net->mem); + sub_frag_mem_limit(&qp->q, sum_truesize); head->next = NULL; head->dev = dev; @@ -851,14 +821,22 @@ static inline void ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. + /* Fragment cache limits. + * + * The fragment memory accounting code, (tries to) account for + * the real memory usage, by measuring both the size of frag + * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) + * and the SKB's truesize. + * + * A 64K fragment consumes 129736 bytes (44*2944)+200 + * (1500 truesize == 2944, sizeof(struct ipq) == 200) + * + * We will commit 4MB at one time. Should we cross that limit + * we will prune down to 3MB, making room for approx 8 big 64K + * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 256 * 1024; - net->ipv4.frags.low_thresh = 192 * 1024; + net->ipv4.frags.high_thresh = 4 * 1024 * 1024; + net->ipv4.frags.low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e81b1caf2ea2..c625e4dad4b0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -37,7 +37,7 @@ #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> @@ -108,15 +108,6 @@ fatal route to network, even if it were you who configured fatal static route: you are innocent. :-) - - - 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain - practically identical code. It would be good to glue them - together, but it is not very evident, how to make them modular. - sit is integral part of IPv6, ipip and gre are naturally modular. - We could extract common parts (hash table, ioctl etc) - to a separate module (ip_tunnel.c). - Alexey Kuznetsov. 
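The arithmetic in the new limits comment, made executable; every figure comes straight from that comment (1500-byte fragment skb truesize 2944, sizeof(struct ipq) 200):

#include <stdio.h>

int main(void)
{
	int frags = 44;			/* ~64K split at 1500-byte MTU */
	int truesize = 2944;		/* one fragment skb */
	int qstruct = 200;		/* sizeof(struct ipq) */
	int datagram = frags * truesize + qstruct;

	printf("one 64K datagram: %d bytes\n", datagram);	/* 129736 */
	printf("4M high_thresh holds: %d\n",
	       4 * 1024 * 1024 / datagram);			/* 32 */
	printf("prune band (4M - 3M) frees room for: %d\n",
	       1024 * 1024 / datagram);				/* 8 */
	return 0;
}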
*/ @@ -126,400 +117,137 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); -static void ipgre_tunnel_setup(struct net_device *dev); -static int ipgre_tunnel_bind_dev(struct net_device *dev); - -/* Fallback tunnel: no source, no destination, no key, no options */ - -#define HASH_SIZE 16 static int ipgre_net_id __read_mostly; -struct ipgre_net { - struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; - - struct net_device *fb_tunnel_dev; -}; - -/* Tunnel hash table */ - -/* - 4 hash tables: - - 3: (remote,local) - 2: (remote,*) - 1: (*,local) - 0: (*,*) +static int gre_tap_net_id __read_mostly; - We require exact key match i.e. if a key is present in packet - it will match only tunnel with the same key; if it is not present, - it will match only keyless tunnel. - - All keysless packets, if not matched configured keyless tunnels - will match fallback tunnel. - */ +static __sum16 check_checksum(struct sk_buff *skb) +{ + __sum16 csum = 0; -#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + csum = csum_fold(skb->csum); -#define tunnels_r_l tunnels[3] -#define tunnels_r tunnels[2] -#define tunnels_l tunnels[1] -#define tunnels_wc tunnels[0] + if (!csum) + break; + /* Fall through. */ -static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); + skb->ip_summed = CHECKSUM_COMPLETE; + break; } - tot->multicast = dev->stats.multicast; - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - return tot; + return csum; } -/* Does key in tunnel parameters match packet */ -static bool ipgre_key_match(const struct ip_tunnel_parm *p, - __be16 flags, __be32 key) +static int ip_gre_calc_hlen(__be16 o_flags) { - if (p->i_flags & GRE_KEY) { - if (flags & GRE_KEY) - return key == p->i_key; - else - return false; /* key expected, none present */ - } else - return !(flags & GRE_KEY); -} + int addend = 4; -/* Given src, dst and key, find appropriate for input tunnel. 
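ip_gre_calc_hlen() in this hunk is pure flag counting. The same computation with illustrative flag values (the real TUNNEL_* constants live in ip_tunnels.h):

#include <stdio.h>

#define F_CSUM	0x1	/* illustrative stand-ins for TUNNEL_CSUM/KEY/SEQ */
#define F_KEY	0x2
#define F_SEQ	0x4

static int gre_hlen(unsigned flags)
{
	int hlen = 4;			/* base: flags + protocol */

	if (flags & F_CSUM)
		hlen += 4;		/* checksum + reserved1 */
	if (flags & F_KEY)
		hlen += 4;
	if (flags & F_SEQ)
		hlen += 4;
	return hlen;
}

int main(void)
{
	/* keyed, checksummed tunnel: 12-byte GRE header */
	printf("%d\n", gre_hlen(F_CSUM | F_KEY));
	return 0;
}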
*/ + if (o_flags&TUNNEL_CSUM) + addend += 4; + if (o_flags&TUNNEL_KEY) + addend += 4; + if (o_flags&TUNNEL_SEQ) + addend += 4; + return addend; +} -static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, - __be32 remote, __be32 local, - __be16 flags, __be32 key, - __be16 gre_proto) +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err, int *hdr_len) { - struct net *net = dev_net(dev); - int link = dev->ifindex; - unsigned int h0 = HASH(remote); - unsigned int h1 = HASH(key); - struct ip_tunnel *t, *cand = NULL; - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB)) ? - ARPHRD_ETHER : ARPHRD_IPGRE; - int score, cand_score = 4; - - for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { - if (local != t->parms.iph.saddr || - remote != t->parms.iph.daddr || - !(t->dev->flags & IFF_UP)) - continue; - - if (!ipgre_key_match(&t->parms, flags, key)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } + unsigned int ip_hlen = ip_hdrlen(skb); + const struct gre_base_hdr *greh; + __be32 *options; - for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { - if (remote != t->parms.iph.daddr || - !(t->dev->flags & IFF_UP)) - continue; - - if (!ipgre_key_match(&t->parms, flags, key)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) + return -EINVAL; - for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { - if ((local != t->parms.iph.saddr && - (local != t->parms.iph.daddr || - !ipv4_is_multicast(local))) || - !(t->dev->flags & IFF_UP)) - continue; - - if (!ipgre_key_match(&t->parms, flags, key)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } + greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; - for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { - if (t->parms.i_key != key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IPGRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + *hdr_len = ip_gre_calc_hlen(tpi->flags); - if (cand != NULL) - return cand; + if (!pskb_may_pull(skb, *hdr_len)) + return -EINVAL; - dev = ign->fb_tunnel_dev; - if (dev->flags & IFF_UP) - return netdev_priv(dev); + greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); - return NULL; -} + tpi->proto = greh->protocol; -static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, - struct ip_tunnel_parm *parms) -{ - __be32 remote = parms->iph.daddr; - __be32 
local = parms->iph.saddr; - __be32 key = parms->i_key; - unsigned int h = HASH(key); - int prio = 0; - - if (local) - prio |= 1; - if (remote && !ipv4_is_multicast(remote)) { - prio |= 2; - h ^= HASH(remote); + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (check_checksum(skb)) { + *csum_err = true; + return -EINVAL; + } + options++; } - return &ign->tunnels[prio][h]; -} - -static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, - struct ip_tunnel *t) -{ - return __ipgre_bucket(ign, &t->parms); -} - -static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else + tpi->key = 0; - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); - rcu_assign_pointer(*tp, t); -} + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else + tpi->seq = 0; -static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp; - struct ip_tunnel *iter; - - for (tp = ipgre_bucket(ign, t); - (iter = rtnl_dereference(*tp)) != NULL; - tp = &iter->next) { - if (t == iter) { - rcu_assign_pointer(*tp, t->next); - break; + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = htons(ETH_P_IP); + if ((*(u8 *)options & 0xF0) != 0x40) { + *hdr_len += 4; + if (!pskb_may_pull(skb, *hdr_len)) + return -EINVAL; } } -} - -static struct ip_tunnel *ipgre_tunnel_find(struct net *net, - struct ip_tunnel_parm *parms, - int type) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - __be32 key = parms->i_key; - int link = parms->link; - struct ip_tunnel *t; - struct ip_tunnel __rcu **tp; - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - for (tp = __ipgre_bucket(ign, parms); - (t = rtnl_dereference(*tp)) != NULL; - tp = &t->next) - if (local == t->parms.iph.saddr && - remote == t->parms.iph.daddr && - key == t->parms.i_key && - link == t->parms.link && - type == t->dev->type) - break; - - return t; -} - -static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) -{ - struct ip_tunnel *t, *nt; - struct net_device *dev; - char name[IFNAMSIZ]; - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE); - if (t || !create) - return t; - - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else - strcpy(name, "gre%d"); - - dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); - if (!dev) - return NULL; - dev_net_set(dev, net); - - nt = netdev_priv(dev); - nt->parms = *parms; - dev->rtnl_link_ops = &ipgre_link_ops; - - dev->mtu = ipgre_tunnel_bind_dev(dev); - - if (register_netdevice(dev) < 0) - goto failed_free; - - /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & GRE_SEQ)) - dev->features |= NETIF_F_LLTX; - - dev_hold(dev); - ipgre_tunnel_link(ign, nt); - return nt; - -failed_free: - free_netdev(dev); - return NULL; -} - -static void ipgre_tunnel_uninit(struct net_device *dev) -{ - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - ipgre_tunnel_unlink(ign, netdev_priv(dev)); - dev_put(dev); + return 0; } - static void ipgre_err(struct sk_buff *skb, u32 info) { -/* All the routers (except for 
Linux) return only - 8 bytes of packet payload. It means, that precise relaying of - ICMP in the real Internet is absolutely infeasible. + /* All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. - Moreover, Cisco "wise men" put GRE key to the third word - in GRE header. It makes impossible maintaining even soft state for keyed - GRE tunnels with enabled checksum. Tell them "thank you". - - Well, I wonder, rfc1812 was written by Cisco employee, - what the hell these idiots break standards established - by themselves??? - */ + Moreover, Cisco "wise men" put GRE key to the third word + in GRE header. It makes impossible maintaining even soft + state for keyed GRE tunnels with enabled checksum. Tell + them "thank you". + Well, I wonder, rfc1812 was written by Cisco employee, + what the hell these idiots break standards established + by themselves??? + */ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn; const struct iphdr *iph = (const struct iphdr *)skb->data; - __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2)); - int grehlen = (iph->ihl<<2) + 4; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; - __be16 flags; - __be32 key = 0; + struct tnl_ptk_info tpi; + int hdr_len; + bool csum_err = false; - flags = p[0]; - if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { - if (flags&(GRE_VERSION|GRE_ROUTING)) + if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) { + if (!csum_err) /* ignore csum errors. */ return; - if (flags&GRE_KEY) { - grehlen += 4; - if (flags&GRE_CSUM) - grehlen += 4; - } } - /* If only 8 bytes returned, keyed message will be dropped here */ - if (skb_headlen(skb) < grehlen) - return; - - if (flags & GRE_KEY) - key = *(((__be32 *)p) + (grehlen / 4) - 1); - switch (type) { default: case ICMP_PARAMETERPROB: @@ -548,8 +276,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info) break; } - t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, - flags, key, p[1]); + if (tpi.proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else + itn = net_generic(net, ipgre_net_id); + + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags, + iph->daddr, iph->saddr, tpi.key); if (t == NULL) return; @@ -578,612 +311,214 @@ static void ipgre_err(struct sk_buff *skb, u32 info) t->err_time = jiffies; } -static inline u8 -ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb) -{ - u8 inner = 0; - if (skb->protocol == htons(ETH_P_IP)) - inner = old_iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) - inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); - return INET_ECN_encapsulate(tos, inner); -} - static int ipgre_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn; const struct iphdr *iph; - u8 *h; - __be16 flags; - __sum16 csum = 0; - __be32 key = 0; - u32 seqno = 0; struct ip_tunnel *tunnel; - int offset = 4; - __be16 gre_proto; - int err; + struct tnl_ptk_info tpi; + int hdr_len; + bool csum_err = false; - if (!pskb_may_pull(skb, 16)) + if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0) goto drop; - iph = ip_hdr(skb); - h = skb->data; - flags = *(__be16 *)h; - - if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { - /* - Version must be 0. - - We do not support routing headers. 
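The version/routing test that both the old receive path here and the new parse_gre_header() apply, shown against the wire layout. The masks are the GRE_* values rendered in host order for a standalone sketch:

#include <stdbool.h>
#include <stdint.h>

struct gre_base {		/* first 4 bytes of every GRE packet */
	uint16_t flags;		/* C R K S reserved ver, see RFC 2784/2890 */
	uint16_t protocol;	/* payload EtherType, 0x6558 for TEB */
};

#define G_CSUM		0x8000
#define G_ROUTING	0x4000
#define G_KEY		0x2000
#define G_SEQ		0x1000
#define G_VERSION	0x0007

static bool gre_acceptable(uint16_t flags)
{
	/* only version 0 without routing is handled */
	return !(flags & (G_VERSION | G_ROUTING));
}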
- */ - if (flags&(GRE_VERSION|GRE_ROUTING)) - goto drop; - - if (flags&GRE_CSUM) { - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - if (!csum) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - } - offset += 4; - } - if (flags&GRE_KEY) { - key = *(__be32 *)(h + offset); - offset += 4; - } - if (flags&GRE_SEQ) { - seqno = ntohl(*(__be32 *)(h + offset)); - offset += 4; - } - } + if (tpi.proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else + itn = net_generic(net, ipgre_net_id); - gre_proto = *(__be16 *)(h + 2); + iph = ip_hdr(skb); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags, + iph->saddr, iph->daddr, tpi.key); - tunnel = ipgre_tunnel_lookup(skb->dev, - iph->saddr, iph->daddr, flags, key, - gre_proto); if (tunnel) { - struct pcpu_tstats *tstats; - - secpath_reset(skb); - - skb->protocol = gre_proto; - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { - skb->protocol = htons(ETH_P_IP); - if ((*(h + offset) & 0xF0) != 0x40) - offset += 4; - } - - skb->mac_header = skb->network_header; - __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb_transport_header(skb), offset); - skb->pkt_type = PACKET_HOST; -#ifdef CONFIG_NET_IPGRE_BROADCAST - if (ipv4_is_multicast(iph->daddr)) { - /* Looped back packet, drop it! */ - if (rt_is_output_route(skb_rtable(skb))) - goto drop; - tunnel->dev->stats.multicast++; - skb->pkt_type = PACKET_BROADCAST; - } -#endif - - if (((flags&GRE_CSUM) && csum) || - (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - tunnel->dev->stats.rx_crc_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - if (tunnel->parms.i_flags&GRE_SEQ) { - if (!(flags&GRE_SEQ) || - (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - tunnel->dev->stats.rx_fifo_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - tunnel->i_seqno = seqno + 1; - } - - /* Warning: All skb pointers will be invalidated! 
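The sequence test being removed above, (s32)(seqno - tunnel->i_seqno) < 0, is standard serial-number arithmetic: subtract modulo 2^32 and read the sign, so ordering survives the wrap at 0xffffffff. Isolated:

#include <assert.h>
#include <stdint.h>

static int seq_stale(uint32_t seqno, uint32_t expected)
{
	return (int32_t)(seqno - expected) < 0;
}

int main(void)
{
	assert(!seq_stale(0x00000002, 0xfffffffe));	/* wrapped forward: ok */
	assert(seq_stale(0xfffffffe, 0x00000002));	/* genuinely old: drop */
	return 0;
}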
*/ - if (tunnel->dev->type == ARPHRD_ETHER) { - if (!pskb_may_pull(skb, ETH_HLEN)) { - tunnel->dev->stats.rx_length_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - - iph = ip_hdr(skb); - skb->protocol = eth_type_trans(skb, tunnel->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - } - - __skb_tunnel_rx(skb, tunnel->dev); - - skb_reset_network_header(skb); - err = IP_ECN_decapsulate(iph, skb); - if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &iph->saddr, iph->tos); - if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; - goto drop; - } - } - - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - gro_cells_receive(&tunnel->gro_cells, skb); + ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); return 0; } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - drop: kfree_skb(skb); return 0; } -static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb) { - struct ip_tunnel *tunnel = netdev_priv(dev); - const struct iphdr *old_iph = ip_hdr(skb); - const struct iphdr *tiph; - struct flowi4 fl4; - u8 tos; - __be16 df; - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int gre_hlen; - __be32 dst; - int mtu; - u8 ttl; - - if (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb)) - goto tx_error; - - if (dev->type == ARPHRD_ETHER) - IPCB(skb)->flags = 0; - - if (dev->header_ops && dev->type == ARPHRD_IPGRE) { - gre_hlen = 0; - if (skb->protocol == htons(ETH_P_IP)) - tiph = (const struct iphdr *)skb->data; - else - tiph = &tunnel->parms.iph; - } else { - gre_hlen = tunnel->hlen; - tiph = &tunnel->parms.iph; - } - - if ((dst = tiph->daddr) == 0) { - /* NBMA tunnel */ - - if (skb_dst(skb) == NULL) { - dev->stats.tx_fifo_errors++; - goto tx_error; - } - - if (skb->protocol == htons(ETH_P_IP)) { - rt = skb_rtable(skb); - dst = rt_nexthop(rt, old_iph->daddr); - } -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) { - const struct in6_addr *addr6; - struct neighbour *neigh; - bool do_tx_error_icmp; - int addr_type; - - neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); - if (neigh == NULL) - goto tx_error; - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); - } - - if ((addr_type & IPV6_ADDR_COMPATv4) == 0) - do_tx_error_icmp = true; - else { - do_tx_error_icmp = false; - dst = addr6->s6_addr32[3]; - } - neigh_release(neigh); - if (do_tx_error_icmp) - goto tx_error_icmp; - } -#endif - else - goto tx_error; - } + int err; - ttl = tiph->ttl; - tos = tiph->tos; - if (tos == 1) { - tos = 0; - if (skb->protocol == htons(ETH_P_IP)) - tos = old_iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) - tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); - } + if (skb_is_gso(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + goto error; + skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; + return skb; + } else if (skb->ip_summed == CHECKSUM_PARTIAL && + tunnel->parms.o_flags&TUNNEL_CSUM) { 
+ err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_NONE; + + return skb; + +error: + kfree_skb(skb); + return ERR_PTR(err); +} - rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, - tunnel->parms.o_key, RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; - } - tdev = rt->dst.dev; +static struct sk_buff *gre_build_header(struct sk_buff *skb, + const struct tnl_ptk_info *tpi, + int hdr_len) +{ + struct gre_base_hdr *greh; - if (tdev == dev) { - ip_rt_put(rt); - dev->stats.collisions++; - goto tx_error; - } + skb_push(skb, hdr_len); - df = tiph->frag_off; - if (df) - mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; - else - mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(tpi->flags); + greh->protocol = tpi->proto; - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - if (skb->protocol == htons(ETH_P_IP)) { - df |= (old_iph->frag_off&htons(IP_DF)); - - if ((old_iph->frag_off&htons(IP_DF)) && - mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - goto tx_error; + if (tpi->flags&TUNNEL_SEQ) { + *ptr = tpi->seq; + ptr--; } - } -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) { - struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); - - if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && - !ipv4_is_multicast(tunnel->parms.iph.daddr)) || - rt6->rt6i_dst.plen == 128) { - rt6->rt6i_flags |= RTF_MODIFIED; - dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); - } + if (tpi->flags&TUNNEL_KEY) { + *ptr = tpi->key; + ptr--; } - - if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - ip_rt_put(rt); - goto tx_error; + if (tpi->flags&TUNNEL_CSUM && + !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { + *(__sum16 *)ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); } } -#endif - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; + return skb; +} - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } +static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, + const struct iphdr *tnl_params, + __be16 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct tnl_ptk_info tpi; - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; - - if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| - (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { - struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; - if (!new_skb) { - ip_rt_put(rt); - dev->stats.tx_dropped++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - if (skb->sk) - skb_set_owner_w(new_skb, skb->sk); - dev_kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - /* Warning : tiph value might point to freed memory */ + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; } - skb_push(skb, gre_hlen); - skb_reset_network_header(skb); - skb_set_transport_header(skb, sizeof(*iph)); - 
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | - IPSKB_REROUTED); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - - /* - * Push down and install the IPIP header. - */ + tpi.flags = tunnel->parms.o_flags; + tpi.proto = proto; + tpi.key = tunnel->parms.o_key; + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; + tpi.seq = htonl(tunnel->o_seqno); - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; - iph->protocol = IPPROTO_GRE; - iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); - iph->daddr = fl4.daddr; - iph->saddr = fl4.saddr; - iph->ttl = ttl; - - if (ttl == 0) { - if (skb->protocol == htons(ETH_P_IP)) - iph->ttl = old_iph->ttl; -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) - iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; -#endif - else - iph->ttl = ip4_dst_hoplimit(&rt->dst); + /* Push GRE header. */ + skb = gre_build_header(skb, &tpi, tunnel->hlen); + if (unlikely(!skb)) { + dev->stats.tx_dropped++; + return; } - ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; - ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? - htons(ETH_P_TEB) : skb->protocol; + ip_tunnel_xmit(skb, dev, tnl_params); +} - if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4); +static netdev_tx_t ipgre_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *tnl_params; - if (tunnel->parms.o_flags&GRE_SEQ) { - ++tunnel->o_seqno; - *ptr = htonl(tunnel->o_seqno); - ptr--; - } - if (tunnel->parms.o_flags&GRE_KEY) { - *ptr = tunnel->parms.o_key; - ptr--; - } - if (tunnel->parms.o_flags&GRE_CSUM) { - int offset = skb_transport_offset(skb); + skb = handle_offloads(tunnel, skb); + if (IS_ERR(skb)) + goto out; - *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset, - skb->len - offset, - 0)); - } + if (dev->header_ops) { + /* Need space for new headers */ + if (skb_cow_head(skb, dev->needed_headroom - + (tunnel->hlen + sizeof(struct iphdr)))) + goto free_skb; + + tnl_params = (const struct iphdr *)skb->data; + + /* Pull skb since ip_tunnel_xmit() needs skb->data pointing + * to gre header. 
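gre_build_header() above fills the optional words from the tail of the header backwards; because the wire order is checksum, key, sequence, a pointer walking down from the last word lands each field in place. A userspace rendition over a byte buffer, flag values illustrative and byte-order conversion omitted:

#include <stdint.h>
#include <string.h>

#define T_CSUM	0x1
#define T_KEY	0x2
#define T_SEQ	0x4

static int build_gre(uint8_t *buf, unsigned flags,
		     uint32_t key, uint32_t seq, uint16_t proto)
{
	int hlen = 4 + 4 * !!(flags & T_CSUM)
		     + 4 * !!(flags & T_KEY)
		     + 4 * !!(flags & T_SEQ);
	uint8_t *ptr = buf + hlen - 4;		/* last 4-byte word */

	memset(buf, 0, hlen);			/* flag bits elided here */
	memcpy(buf + 2, &proto, sizeof(proto));
	if (flags & T_SEQ) {
		memcpy(ptr, &seq, 4);
		ptr -= 4;
	}
	if (flags & T_KEY) {
		memcpy(ptr, &key, 4);
		ptr -= 4;
	}
	/* with T_CSUM, ptr now names the checksum word: it stays zero
	 * until the sum over header + payload is folded in */
	return hlen;
}

The kernel additionally skips the checksum when the skb is GSO, deferring it to gre_gso_segment().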
+ */ + skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + } else { + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; + + tnl_params = &tunnel->parms.iph; } - iptunnel_xmit(skb, dev); + __gre_xmit(skb, dev, tnl_params, skb->protocol); + return NETDEV_TX_OK; -#if IS_ENABLED(CONFIG_IPV6) -tx_error_icmp: - dst_link_failure(skb); -#endif -tx_error: - dev->stats.tx_errors++; +free_skb: dev_kfree_skb(skb); +out: + dev->stats.tx_dropped++; return NETDEV_TX_OK; } -static int ipgre_tunnel_bind_dev(struct net_device *dev) +static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, + struct net_device *dev) { - struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; - const struct iphdr *iph; - int hlen = LL_MAX_HEADER; - int mtu = ETH_DATA_LEN; - int addend = sizeof(struct iphdr) + 4; - - tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; - - /* Guess output device to choose reasonable mtu and needed_headroom */ - - if (iph->daddr) { - struct flowi4 fl4; - struct rtable *rt; - - rt = ip_route_output_gre(dev_net(dev), &fl4, - iph->daddr, iph->saddr, - tunnel->parms.o_key, - RT_TOS(iph->tos), - tunnel->parms.link); - if (!IS_ERR(rt)) { - tdev = rt->dst.dev; - ip_rt_put(rt); - } - - if (dev->type != ARPHRD_ETHER) - dev->flags |= IFF_POINTOPOINT; - } + struct ip_tunnel *tunnel = netdev_priv(dev); - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + skb = handle_offloads(tunnel, skb); + if (IS_ERR(skb)) + goto out; - if (tdev) { - hlen = tdev->hard_header_len + tdev->needed_headroom; - mtu = tdev->mtu; - } - dev->iflink = tunnel->parms.link; - - /* Precalculate GRE options length */ - if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { - if (tunnel->parms.o_flags&GRE_CSUM) - addend += 4; - if (tunnel->parms.o_flags&GRE_KEY) - addend += 4; - if (tunnel->parms.o_flags&GRE_SEQ) - addend += 4; - } - dev->needed_headroom = addend + hlen; - mtu -= dev->hard_header_len + addend; + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; - if (mtu < 68) - mtu = 68; + __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); - tunnel->hlen = addend; + return NETDEV_TX_OK; - return mtu; +free_skb: + dev_kfree_skb(skb); +out: + dev->stats.tx_dropped++; + return NETDEV_TX_OK; } -static int -ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ioctl(struct net_device *dev, + struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - - switch (cmd) { - case SIOCGETTUNNEL: - t = NULL; - if (dev == ign->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipgre_tunnel_locate(net, &p, 0); - } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; - break; - - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || - ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) - goto done; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - if (!(p.i_flags&GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags&GRE_KEY)) - p.o_key = 0; - - t = 
ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - unsigned int nflags = 0; - - t = netdev_priv(dev); - - if (ipv4_is_multicast(p.iph.daddr)) - nflags = IFF_BROADCAST; - else if (p.iph.daddr) - nflags = IFF_POINTOPOINT; - - if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { - err = -EINVAL; - break; - } - ipgre_tunnel_unlink(ign, t); - synchronize_net(); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - t->parms.i_key = p.i_key; - t->parms.o_key = p.o_key; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - ipgre_tunnel_link(ign, t); - netdev_state_change(dev); - } - } - - if (t) { - err = 0; - if (cmd == SIOCCHGTUNNEL) { - t->parms.iph.ttl = p.iph.ttl; - t->parms.iph.tos = p.iph.tos; - t->parms.iph.frag_off = p.iph.frag_off; - if (t->parms.link != p.link) { - t->parms.link = p.link; - dev->mtu = ipgre_tunnel_bind_dev(dev); - netdev_state_change(dev); - } - } - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == ign->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL) - goto done; - err = -EPERM; - if (t == netdev_priv(ign->fb_tunnel_dev)) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; - - default: - err = -EINVAL; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || + ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) { + return -EINVAL; } + p.i_flags = gre_flags_to_tnl_flags(p.i_flags); + p.o_flags = gre_flags_to_tnl_flags(p.o_flags); -done: - return err; -} + err = ip_tunnel_ioctl(dev, &p, cmd); + if (err) + return err; -static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) - return -EINVAL; - dev->mtu = new_mtu; + p.i_flags = tnl_flags_to_gre_flags(p.i_flags); + p.o_flags = tnl_flags_to_gre_flags(p.o_flags); + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; return 0; } @@ -1213,25 +548,23 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) ... ftp fec0:6666:6666::193.233.7.65 ... - */ - static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); - struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(iph+1); + struct iphdr *iph; + struct gre_base_hdr *greh; - memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); - p[0] = t->parms.o_flags; - p[1] = htons(type); + iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); + greh = (struct gre_base_hdr *)(iph+1); + greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags); + greh->protocol = htons(type); - /* - * Set the source hardware address. - */ + memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); + /* Set the source hardware address. 
*/ if (saddr) memcpy(&iph->saddr, saddr, 4); if (daddr) @@ -1239,7 +572,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, if (iph->daddr) return t->hlen; - return -t->hlen; + return -(t->hlen + sizeof(*iph)); } static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) @@ -1293,31 +626,21 @@ static int ipgre_close(struct net_device *dev) } return 0; } - #endif static const struct net_device_ops ipgre_netdev_ops = { .ndo_init = ipgre_tunnel_init, - .ndo_uninit = ipgre_tunnel_uninit, + .ndo_uninit = ip_tunnel_uninit, #ifdef CONFIG_NET_IPGRE_BROADCAST .ndo_open = ipgre_open, .ndo_stop = ipgre_close, #endif - .ndo_start_xmit = ipgre_tunnel_xmit, + .ndo_start_xmit = ipgre_xmit, .ndo_do_ioctl = ipgre_tunnel_ioctl, - .ndo_change_mtu = ipgre_tunnel_change_mtu, - .ndo_get_stats64 = ipgre_get_stats64, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; -static void ipgre_dev_free(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - - gro_cells_destroy(&tunnel->gro_cells); - free_percpu(dev->tstats); - free_netdev(dev); -} - #define GRE_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ @@ -1326,35 +649,48 @@ static void ipgre_dev_free(struct net_device *dev) static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; - dev->destructor = ipgre_dev_free; + ip_tunnel_setup(dev, ipgre_net_id); +} - dev->type = ARPHRD_IPGRE; - dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; +static void __gre_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel; + + tunnel = netdev_priv(dev); + tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->parms.iph.protocol = IPPROTO_GRE; + + dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; - dev->flags = IFF_NOARP; - dev->iflink = 0; - dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; - dev->features |= GRE_FEATURES; + dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES; dev->hw_features |= GRE_FEATURES; + + if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported. 
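Each GSO segment would need its own outgoing sequence number, which segmentation cannot generate.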
*/ + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } } static int ipgre_tunnel_init(struct net_device *dev) { - struct ip_tunnel *tunnel; - struct iphdr *iph; - int err; + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; - tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; + __gre_tunnel_init(dev); - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + memcpy(dev->dev_addr, &iph->saddr, 4); + memcpy(dev->broadcast, &iph->daddr, 4); - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); - memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + dev->type = ARPHRD_IPGRE; + dev->flags = IFF_NOARP; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + dev->addr_len = 4; if (iph->daddr) { #ifdef CONFIG_NET_IPGRE_BROADCAST @@ -1368,106 +704,30 @@ static int ipgre_tunnel_init(struct net_device *dev) } else dev->header_ops = &ipgre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - err = gro_cells_init(&tunnel->gro_cells, dev); - if (err) { - free_percpu(dev->tstats); - return err; - } - - return 0; + return ip_tunnel_init(dev); } -static void ipgre_fb_tunnel_init(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *iph = &tunnel->parms.iph; - - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); - - iph->version = 4; - iph->protocol = IPPROTO_GRE; - iph->ihl = 5; - tunnel->hlen = sizeof(struct iphdr) + 4; - - dev_hold(dev); -} - - static const struct gre_protocol ipgre_protocol = { .handler = ipgre_rcv, .err_handler = ipgre_err, }; -static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) -{ - int prio; - - for (prio = 0; prio < 4; prio++) { - int h; - for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - - t = rtnl_dereference(ign->tunnels[prio][h]); - - while (t != NULL) { - unregister_netdevice_queue(t->dev, head); - t = rtnl_dereference(t->next); - } - } - } -} - static int __net_init ipgre_init_net(struct net *net) { - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - int err; - - ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", - ipgre_tunnel_setup); - if (!ign->fb_tunnel_dev) { - err = -ENOMEM; - goto err_alloc_dev; - } - dev_net_set(ign->fb_tunnel_dev, net); - - ipgre_fb_tunnel_init(ign->fb_tunnel_dev); - ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; - - if ((err = register_netdev(ign->fb_tunnel_dev))) - goto err_reg_dev; - - rcu_assign_pointer(ign->tunnels_wc[0], - netdev_priv(ign->fb_tunnel_dev)); - return 0; - -err_reg_dev: - ipgre_dev_free(ign->fb_tunnel_dev); -err_alloc_dev: - return err; + return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } static void __net_exit ipgre_exit_net(struct net *net) { - struct ipgre_net *ign; - LIST_HEAD(list); - - ign = net_generic(net, ipgre_net_id); - rtnl_lock(); - ipgre_destroy_tunnels(ign, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); + ip_tunnel_delete_net(itn); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, .exit = ipgre_exit_net, .id = &ipgre_net_id, - .size = sizeof(struct ipgre_net), + .size = sizeof(struct ip_tunnel_net), }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1512,8 +772,8 @@ out: return ipgre_tunnel_validate(tb, data); } -static 
void ipgre_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) +static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], + struct ip_tunnel_parm *parms) { memset(parms, 0, sizeof(*parms)); @@ -1526,10 +786,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1553,145 +813,46 @@ static void ipgre_netlink_parms(struct nlattr *data[], parms->iph.frag_off = htons(IP_DF); } -static int ipgre_tap_init(struct net_device *dev) +static int gre_tap_init(struct net_device *dev) { - struct ip_tunnel *tunnel; - - tunnel = netdev_priv(dev); - - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + __gre_tunnel_init(dev); - ipgre_tunnel_bind_dev(dev); - - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; + return ip_tunnel_init(dev); } -static const struct net_device_ops ipgre_tap_netdev_ops = { - .ndo_init = ipgre_tap_init, - .ndo_uninit = ipgre_tunnel_uninit, - .ndo_start_xmit = ipgre_tunnel_xmit, +static const struct net_device_ops gre_tap_netdev_ops = { + .ndo_init = gre_tap_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = gre_tap_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = ipgre_tunnel_change_mtu, - .ndo_get_stats64 = ipgre_get_stats64, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; static void ipgre_tap_setup(struct net_device *dev) { - ether_setup(dev); - - dev->netdev_ops = &ipgre_tap_netdev_ops; - dev->destructor = ipgre_dev_free; - - dev->iflink = 0; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netdev_ops = &gre_tap_netdev_ops; + ip_tunnel_setup(dev, gre_tap_net_id); } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) +static int ipgre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) { - struct ip_tunnel *nt; - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); - int mtu; - int err; - - nt = netdev_priv(dev); - ipgre_netlink_parms(data, &nt->parms); - - if (ipgre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; - - if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) - eth_hw_addr_random(dev); - - mtu = ipgre_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; - - /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & GRE_SEQ)) - dev->features |= NETIF_F_LLTX; - - err = register_netdevice(dev); - if (err) - goto out; - - dev_hold(dev); - ipgre_tunnel_link(ign, nt); + struct ip_tunnel_parm p; -out: - return err; + ipgre_netlink_parms(data, tb, &p); + return ip_tunnel_newlink(dev, tb, &p); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip_tunnel *t, *nt; - struct net *net = dev_net(dev); - struct ipgre_net *ign = net_generic(net, ipgre_net_id); struct ip_tunnel_parm p; - int mtu; - - if (dev == ign->fb_tunnel_dev) - return -EINVAL; - - nt = netdev_priv(dev); - 
ipgre_netlink_parms(data, &p); - - t = ipgre_tunnel_locate(net, &p, 0); - - if (t) { - if (t->dev != dev) - return -EEXIST; - } else { - t = nt; - - if (dev->type != ARPHRD_ETHER) { - unsigned int nflags = 0; - - if (ipv4_is_multicast(p.iph.daddr)) - nflags = IFF_BROADCAST; - else if (p.iph.daddr) - nflags = IFF_POINTOPOINT; - - if ((dev->flags ^ nflags) & - (IFF_POINTOPOINT | IFF_BROADCAST)) - return -EINVAL; - } - ipgre_tunnel_unlink(ign, t); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - t->parms.i_key = p.i_key; - if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - } - ipgre_tunnel_link(ign, t); - netdev_state_change(dev); - } - - t->parms.o_key = p.o_key; - t->parms.iph.ttl = p.iph.ttl; - t->parms.iph.tos = p.iph.tos; - t->parms.iph.frag_off = p.iph.frag_off; - - if (t->parms.link != p.link) { - t->parms.link = p.link; - mtu = ipgre_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; - netdev_state_change(dev); - } - - return 0; + ipgre_netlink_parms(data, tb, &p); + return ip_tunnel_changelink(dev, tb, &p); } static size_t ipgre_get_size(const struct net_device *dev) @@ -1726,8 +887,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || @@ -1765,6 +926,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = { .validate = ipgre_tunnel_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, + .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, }; @@ -1778,13 +940,28 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .validate = ipgre_tap_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, + .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, }; -/* - * And now the modules code and kernel interface. 
- */ +static int __net_init ipgre_tap_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); +} + +static void __net_exit ipgre_tap_exit_net(struct net *net) +{ + struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); + ip_tunnel_delete_net(itn); +} + +static struct pernet_operations ipgre_tap_net_ops = { + .init = ipgre_tap_init_net, + .exit = ipgre_tap_exit_net, + .id = &gre_tap_net_id, + .size = sizeof(struct ip_tunnel_net), +}; static int __init ipgre_init(void) { @@ -1796,6 +973,10 @@ static int __init ipgre_init(void) if (err < 0) return err; + err = register_pernet_device(&ipgre_tap_net_ops); + if (err < 0) + goto pnet_tap_failed; + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); @@ -1810,16 +991,17 @@ static int __init ipgre_init(void) if (err < 0) goto tap_ops_failed; -out: - return err; + return 0; tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: + unregister_pernet_device(&ipgre_tap_net_ops); +pnet_tap_failed: + unregister_pernet_device(&ipgre_net_ops); - goto out; + return err; } static void __exit ipgre_fini(void) @@ -1828,6 +1010,7 @@ static void __exit ipgre_fini(void) rtnl_link_unregister(&ipgre_link_ops); if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) pr_info("%s: can't remove protocol\n", __func__); + unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); } @@ -1837,3 +1020,4 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); MODULE_ALIAS_NETDEV("gre0"); +MODULE_ALIAS_NETDEV("gretap0"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f1395a6fb35f..3da817b89e9b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -208,13 +208,6 @@ static int ip_local_deliver_finish(struct sk_buff *skb) if (ipprot != NULL) { int ret; - if (!net_eq(net, &init_net) && !ipprot->netns_ok) { - net_info_ratelimited("%s: proto %d isn't netns-ready\n", - __func__, protocol); - kfree_skb(skb); - goto out; - } - if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb(skb); @@ -235,9 +228,11 @@ static int ip_local_deliver_finish(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } - } else + kfree_skb(skb); + } else { IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); + consume_skb(skb); + } } } out: @@ -424,7 +419,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - goto inhdr_error; + goto csum_error; len = ntohs(iph->tot_len); if (skb->len < len) { @@ -451,6 +446,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); +csum_error: + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); inhdr_error: IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); drop: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index f6289bf6f332..ec7264514a82 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -370,7 +370,6 @@ int ip_options_compile(struct net *net, } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: - opt->ts = optptr - iph; if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; @@ -381,7 +380,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } -
opt->ts = optptr - iph; if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); @@ -396,7 +394,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); @@ -423,18 +420,18 @@ int ip_options_compile(struct net *net, put_unaligned_be32(midtime, timeptr); opt->is_changed = 1; } - } else { + } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; } - opt->ts = optptr - iph; if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } + opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3e98ed2bff55..147abf5275aa 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -430,8 +430,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) to->nf_trace = from->nf_trace; #endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) @@ -598,6 +597,7 @@ slow_path: /* for offloaded checksums cleanup checksum before fragmentation */ if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) goto fail; + iph = ip_hdr(skb); left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c new file mode 100644 index 000000000000..e4147ec1665a --- /dev/null +++ b/net/ipv4/ip_tunnel.c @@ -0,0 +1,1035 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/rculist.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ip_tunnels.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> + +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#endif + +static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, + __be32 key, __be32 remote) +{ + return hash_32((__force u32)key ^ (__force u32)remote, + IP_TNL_HASH_BITS); +} + +/* Often modified stats are per cpu, others are shared (netdev->stats) */ +struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_bh(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; + } + + tot->multicast = dev->stats.multicast; + + tot->rx_crc_errors = dev->stats.rx_crc_errors; + tot->rx_fifo_errors = dev->stats.rx_fifo_errors; + tot->rx_length_errors = dev->stats.rx_length_errors; + tot->rx_frame_errors = dev->stats.rx_frame_errors; + tot->rx_errors = dev->stats.rx_errors; + + tot->tx_fifo_errors = dev->stats.tx_fifo_errors; + tot->tx_carrier_errors = dev->stats.tx_carrier_errors; + tot->tx_dropped = dev->stats.tx_dropped; + tot->tx_aborted_errors = dev->stats.tx_aborted_errors; + tot->tx_errors = dev->stats.tx_errors; + + tot->collisions = dev->stats.collisions; + + return tot; +} +EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); + +static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, + __be16 flags, __be32 key) +{ + if (p->i_flags & TUNNEL_KEY) { + if (flags & TUNNEL_KEY) + return key == p->i_key; + else + /* key expected, none present */ + return false; + } else + return !(flags & TUNNEL_KEY); +} + +/* Fallback tunnel: no source, no destination, no key, no options + + Tunnel hash table: + We require an exact key match, i.e. if a key is present in the packet + it will match only a tunnel with the same key; if it is not present, + it will match only a keyless tunnel.
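+ (for example, a packet carrying key 42 matches only a tunnel whose i_key is 42, never a keyless one).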
+ + All keyless packets, if not matched against a configured keyless tunnel, + will match the fallback tunnel. + Given src, dst and key, find the appropriate tunnel for input. +*/ +struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, + int link, __be16 flags, + __be32 remote, __be32 local, + __be32 key) +{ + unsigned int hash; + struct ip_tunnel *t, *cand = NULL; + struct hlist_head *head; + + hash = ip_tunnel_hash(itn, key, remote); + head = &itn->tunnels[hash]; + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (local != t->parms.iph.saddr || + remote != t->parms.iph.daddr || + !(t->dev->flags & IFF_UP)) + continue; + + if (!ip_tunnel_key_match(&t->parms, flags, key)) + continue; + + if (t->parms.link == link) + return t; + else + cand = t; + } + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (remote != t->parms.iph.daddr || + !(t->dev->flags & IFF_UP)) + continue; + + if (!ip_tunnel_key_match(&t->parms, flags, key)) + continue; + + if (t->parms.link == link) + return t; + else if (!cand) + cand = t; + } + + hash = ip_tunnel_hash(itn, key, 0); + head = &itn->tunnels[hash]; + + hlist_for_each_entry_rcu(t, head, hash_node) { + if ((local != t->parms.iph.saddr && + (local != t->parms.iph.daddr || + !ipv4_is_multicast(local))) || + !(t->dev->flags & IFF_UP)) + continue; + + if (!ip_tunnel_key_match(&t->parms, flags, key)) + continue; + + if (t->parms.link == link) + return t; + else if (!cand) + cand = t; + } + + if (flags & TUNNEL_NO_KEY) + goto skip_key_lookup; + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (t->parms.i_key != key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->parms.link == link) + return t; + else if (!cand) + cand = t; + } + +skip_key_lookup: + if (cand) + return cand; + + if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) + return netdev_priv(itn->fb_tunnel_dev); + + return NULL; +} +EXPORT_SYMBOL_GPL(ip_tunnel_lookup); + +static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, + struct ip_tunnel_parm *parms) +{ + unsigned int h; + __be32 remote; + + if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) + remote = parms->iph.daddr; + else + remote = 0; + + h = ip_tunnel_hash(itn, parms->i_key, remote); + return &itn->tunnels[h]; +} + +static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) +{ + struct hlist_head *head = ip_bucket(itn, &t->parms); + + hlist_add_head_rcu(&t->hash_node, head); +} + +static void ip_tunnel_del(struct ip_tunnel *t) +{ + hlist_del_init_rcu(&t->hash_node); +} + +static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, + struct ip_tunnel_parm *parms, + int type) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + __be32 key = parms->i_key; + int link = parms->link; + struct ip_tunnel *t = NULL; + struct hlist_head *head = ip_bucket(itn, parms); + + hlist_for_each_entry_rcu(t, head, hash_node) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + key == t->parms.i_key && + link == t->parms.link && + type == t->dev->type) + break; + } + return t; +} + +static struct net_device *__ip_tunnel_create(struct net *net, + const struct rtnl_link_ops *ops, + struct ip_tunnel_parm *parms) +{ + int err; + struct ip_tunnel *tunnel; + struct net_device *dev; + char name[IFNAMSIZ]; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + if (strlen(ops->kind) > (IFNAMSIZ - 3)) { + err = -E2BIG; + goto failed; + } + strlcpy(name, ops->kind, IFNAMSIZ); + strncat(name, "%d", 2); + } + + ASSERT_RTNL(); + dev =
alloc_netdev(ops->priv_size, name, ops->setup); + if (!dev) { + err = -ENOMEM; + goto failed; + } + dev_net_set(dev, net); + + dev->rtnl_link_ops = ops; + + tunnel = netdev_priv(dev); + tunnel->parms = *parms; + + err = register_netdevice(dev); + if (err) + goto failed_free; + + return dev; + +failed_free: + free_netdev(dev); +failed: + return ERR_PTR(err); +} + +static inline struct rtable *ip_route_output_tunnel(struct net *net, + struct flowi4 *fl4, + int proto, + __be32 daddr, __be32 saddr, + __be32 key, __u8 tos, int oif) +{ + memset(fl4, 0, sizeof(*fl4)); + fl4->flowi4_oif = oif; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->flowi4_tos = tos; + fl4->flowi4_proto = proto; + fl4->fl4_gre_key = key; + return ip_route_output_key(net, fl4); +} + +static int ip_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = ETH_DATA_LEN; + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + iph = &tunnel->parms.iph; + + /* Guess output device to choose reasonable mtu and needed_headroom */ + if (iph->daddr) { + struct flowi4 fl4; + struct rtable *rt; + + rt = ip_route_output_tunnel(dev_net(dev), &fl4, + tunnel->parms.iph.protocol, + iph->daddr, iph->saddr, + tunnel->parms.o_key, + RT_TOS(iph->tos), + tunnel->parms.link); + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; + ip_rt_put(rt); + } + if (dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len + tdev->needed_headroom; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + dev->needed_headroom = t_hlen + hlen; + mtu -= (dev->hard_header_len + t_hlen); + + if (mtu < 68) + mtu = 68; + + return mtu; +} + +static struct ip_tunnel *ip_tunnel_create(struct net *net, + struct ip_tunnel_net *itn, + struct ip_tunnel_parm *parms) +{ + struct ip_tunnel *nt, *fbt; + struct net_device *dev; + + BUG_ON(!itn->fb_tunnel_dev); + fbt = netdev_priv(itn->fb_tunnel_dev); + dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); + if (IS_ERR(dev)) + return NULL; + + dev->mtu = ip_tunnel_bind_dev(dev); + + nt = netdev_priv(dev); + ip_tunnel_add(itn, nt); + return nt; +} + +int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, + const struct tnl_ptk_info *tpi, bool log_ecn_error) +{ + struct pcpu_tstats *tstats; + const struct iphdr *iph = ip_hdr(skb); + int err; + + secpath_reset(skb); + + skb->protocol = tpi->proto; + + skb->mac_header = skb->network_header; + __pskb_pull(skb, tunnel->hlen); + skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen); +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(iph->daddr)) { + /* Looped back packet, drop it! 
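A multicast skb that still carries an output route is our own transmission echoed back by multicast loopback; accepting it would loop traffic.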
*/ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + tunnel->dev->stats.multicast++; + skb->pkt_type = PACKET_BROADCAST; + } +#endif + + if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || + ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + + if (tunnel->parms.i_flags&TUNNEL_SEQ) { + if (!(tpi->flags&TUNNEL_SEQ) || + (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + tunnel->i_seqno = ntohl(tpi->seq) + 1; + } + + /* Warning: All skb pointers will be invalidated! */ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + + iph = ip_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } + + skb->pkt_type = PACKET_HOST; + __skb_tunnel_rx(skb, tunnel->dev); + + skb_reset_network_header(skb); + err = IP_ECN_decapsulate(iph, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &iph->saddr, iph->tos); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto drop; + } + } + + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + gro_cells_receive(&tunnel->gro_cells, skb); + return 0; + +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_rcv); + +void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + const struct iphdr *tnl_params) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *inner_iph; + struct iphdr *iph; + struct flowi4 fl4; + u8 tos, ttl; + __be16 df; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + unsigned int max_headroom; /* The extra header space needed */ + __be32 dst; + int mtu; + + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + + dst = tnl_params->daddr; + if (dst == 0) { + /* NBMA tunnel */ + + if (skb_dst(skb) == NULL) { + dev->stats.tx_fifo_errors++; + goto tx_error; + } + + if (skb->protocol == htons(ETH_P_IP)) { + rt = skb_rtable(skb); + dst = rt_nexthop(rt, inner_iph->daddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) { + const struct in6_addr *addr6; + struct neighbour *neigh; + bool do_tx_error_icmp; + int addr_type; + + neigh = dst_neigh_lookup(skb_dst(skb), + &ipv6_hdr(skb)->daddr); + if (neigh == NULL) + goto tx_error; + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + do_tx_error_icmp = true; + else { + do_tx_error_icmp = false; + dst = addr6->s6_addr32[3]; + } + neigh_release(neigh); + if (do_tx_error_icmp) + goto tx_error_icmp; + } +#endif + else + goto tx_error; + } + + tos = tnl_params->tos; + if (tos & 0x1) { + tos &= ~0x1; + if (skb->protocol == htons(ETH_P_IP)) + tos = inner_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + } + + rt = ip_route_output_tunnel(dev_net(dev), &fl4, + 
tunnel->parms.iph.protocol, + dst, tnl_params->saddr, + tunnel->parms.o_key, + RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + tdev = rt->dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + + df = tnl_params->frag_off; + + if (df) + mtu = dst_mtu(&rt->dst) - dev->hard_header_len + - sizeof(struct iphdr); + else + mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; + + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + + if (skb->protocol == htons(ETH_P_IP)) { + df |= (inner_iph->frag_off&htons(IP_DF)); + + if (!skb_is_gso(skb) && + (inner_iph->frag_off&htons(IP_DF)) && + mtu < ntohs(inner_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + + if (rt6 && mtu < dst_mtu(skb_dst(skb)) && + mtu >= IPV6_MIN_MTU) { + if ((tunnel->parms.iph.daddr && + !ipv4_is_multicast(tunnel->parms.iph.daddr)) || + rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); + } + } + + if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && + mtu < skb->len) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ip_rt_put(rt); + goto tx_error; + } + } +#endif + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + ttl = tnl_params->ttl; + if (ttl == 0) { + if (skb->protocol == htons(ETH_P_IP)) + ttl = inner_iph->ttl; +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) + ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; +#endif + else + ttl = ip4_dst_hoplimit(&rt->dst); + } + + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr) + + rt->dst.header_len; + if (max_headroom > dev->needed_headroom) { + dev->needed_headroom = max_headroom; + if (skb_cow_head(skb, dev->needed_headroom)) { + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + return; + } + } + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + /* Push down and install the IP header. 
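The outer header fields come from the tunnel parameters; TOS, TTL and DF may have been derived from the inner packet above.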
*/ + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = tnl_params->protocol; + iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; + iph->ttl = ttl; + tunnel_ip_select_ident(skb, inner_iph, &rt->dst); + + iptunnel_xmit(skb, dev); + return; + +#if IS_ENABLED(CONFIG_IPV6) +tx_error_icmp: + dst_link_failure(skb); +#endif +tx_error: + dev->stats.tx_errors++; + dev_kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(ip_tunnel_xmit); + +static void ip_tunnel_update(struct ip_tunnel_net *itn, + struct ip_tunnel *t, + struct net_device *dev, + struct ip_tunnel_parm *p, + bool set_mtu) +{ + ip_tunnel_del(t); + t->parms.iph.saddr = p->iph.saddr; + t->parms.iph.daddr = p->iph.daddr; + t->parms.i_key = p->i_key; + t->parms.o_key = p->o_key; + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p->iph.saddr, 4); + memcpy(dev->broadcast, &p->iph.daddr, 4); + } + ip_tunnel_add(itn, t); + + t->parms.iph.ttl = p->iph.ttl; + t->parms.iph.tos = p->iph.tos; + t->parms.iph.frag_off = p->iph.frag_off; + + if (t->parms.link != p->link) { + int mtu; + + t->parms.link = p->link; + mtu = ip_tunnel_bind_dev(dev); + if (set_mtu) + dev->mtu = mtu; + } + netdev_state_change(dev); +} + +int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +{ + int err = 0; + struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + + BUG_ON(!itn->fb_tunnel_dev); + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == itn->fb_tunnel_dev) + t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + if (t == NULL) + t = netdev_priv(dev); + memcpy(p, &t->parms, sizeof(*p)); + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + if (p->iph.ttl) + p->iph.frag_off |= htons(IP_DF); + if (!(p->i_flags&TUNNEL_KEY)) + p->i_key = 0; + if (!(p->o_flags&TUNNEL_KEY)) + p->o_key = 0; + + t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + + if (!t && (cmd == SIOCADDTUNNEL)) + t = ip_tunnel_create(net, itn, p); + + if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + unsigned int nflags = 0; + + if (ipv4_is_multicast(p->iph.daddr)) + nflags = IFF_BROADCAST; + else if (p->iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { + err = -EINVAL; + break; + } + + t = netdev_priv(dev); + } + } + + if (t) { + err = 0; + ip_tunnel_update(itn, t, dev, p, true); + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + + if (dev == itn->fb_tunnel_dev) { + err = -ENOENT; + t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + if (t == NULL) + goto done; + err = -EPERM; + if (t == netdev_priv(itn->fb_tunnel_dev)) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + default: + err = -EINVAL; + } + +done: + return err; +} +EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); + +int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); + +static void ip_tunnel_dev_free(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + gro_cells_destroy(&tunnel->gro_cells); + free_percpu(dev->tstats); + free_netdev(dev); +} + +void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_net *itn; + + itn = net_generic(net, tunnel->ip_tnl_net_id); + + if (itn->fb_tunnel_dev != dev) { + ip_tunnel_del(netdev_priv(dev)); + unregister_netdevice_queue(dev, head); + } +} +EXPORT_SYMBOL_GPL(ip_tunnel_dellink); + +int __net_init ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, + struct rtnl_link_ops *ops, char *devname) +{ + struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); + struct ip_tunnel_parm parms; + + itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL); + if (!itn->tunnels) + return -ENOMEM; + + if (!ops) { + itn->fb_tunnel_dev = NULL; + return 0; + } + memset(&parms, 0, sizeof(parms)); + if (devname) + strlcpy(parms.name, devname, IFNAMSIZ); + + rtnl_lock(); + itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); + rtnl_unlock(); + if (IS_ERR(itn->fb_tunnel_dev)) { + kfree(itn->tunnels); + return PTR_ERR(itn->fb_tunnel_dev); + } + + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_init_net); + +static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head) +{ + int h; + + for (h = 0; h < IP_TNL_HASH_SIZE; h++) { + struct ip_tunnel *t; + struct hlist_node *n; + struct hlist_head *thead = &itn->tunnels[h]; + + hlist_for_each_entry_safe(t, n, thead, hash_node) + unregister_netdevice_queue(t->dev, head); + } + if (itn->fb_tunnel_dev) + unregister_netdevice_queue(itn->fb_tunnel_dev, head); +} + +void __net_exit ip_tunnel_delete_net(struct ip_tunnel_net *itn) +{ + LIST_HEAD(list); + + rtnl_lock(); + ip_tunnel_destroy(itn, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); + kfree(itn->tunnels); +} +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); + +int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], + struct ip_tunnel_parm *p) +{ + struct ip_tunnel *nt; + struct net *net = dev_net(dev); + struct ip_tunnel_net *itn; + int mtu; + int err; + + nt = netdev_priv(dev); + itn = net_generic(net, nt->ip_tnl_net_id); + + if (ip_tunnel_find(itn, p, dev->type)) + return -EEXIST; + + nt->parms = *p; + err = register_netdevice(dev); + if (err) + goto out; + + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + eth_hw_addr_random(dev); + + mtu = ip_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + + ip_tunnel_add(itn, nt); + +out: + return err; +} +EXPORT_SYMBOL_GPL(ip_tunnel_newlink); + +int 
ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], + struct ip_tunnel_parm *p) +{ + struct ip_tunnel *t, *nt; + struct net *net = dev_net(dev); + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + + if (dev == itn->fb_tunnel_dev) + return -EINVAL; + + nt = netdev_priv(dev); + + t = ip_tunnel_find(itn, p, dev->type); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + t = nt; + + if (dev->type != ARPHRD_ETHER) { + unsigned int nflags = 0; + + if (ipv4_is_multicast(p->iph.daddr)) + nflags = IFF_BROADCAST; + else if (p->iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + } + } + + ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]); + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_changelink); + +int ip_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + int err; + + dev->destructor = ip_tunnel_dev_free; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + err = gro_cells_init(&tunnel->gro_cells, dev); + if (err) { + free_percpu(dev->tstats); + return err; + } + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + iph->version = 4; + iph->ihl = 5; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_init); + +void ip_tunnel_uninit(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_net *itn; + + itn = net_generic(net, tunnel->ip_tnl_net_id); + /* fb_tunnel_dev will be unregistered in the net-exit call. */ + if (itn->fb_tunnel_dev != dev) + ip_tunnel_del(netdev_priv(dev)); +} +EXPORT_SYMBOL_GPL(ip_tunnel_uninit); + +/* Do the least required initialization; the rest is done in the tunnel_init call. */ +void ip_tunnel_setup(struct net_device *dev, int net_id) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + tunnel->ip_tnl_net_id = net_id; +} +EXPORT_SYMBOL_GPL(ip_tunnel_setup); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index c3a4233c0ac2..9d2bdb2c1d3f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -38,7 +38,7 @@ #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> @@ -82,44 +82,6 @@ static int vti_tunnel_bind_dev(struct net_device *dev); } while (0) -static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } - - tot->multicast = dev->stats.multicast; - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_errors = dev->stats.rx_errors; - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped =
dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - return tot; -} - static struct ip_tunnel *vti_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { @@ -597,7 +559,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_start_xmit = vti_tunnel_xmit, .ndo_do_ioctl = vti_tunnel_ioctl, .ndo_change_mtu = vti_tunnel_change_mtu, - .ndo_get_stats64 = vti_get_stats64, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; static void vti_dev_free(struct net_device *dev) diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 9a46daed2f3c..59cb8c769056 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -75,6 +75,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; + t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); if (xfrm_init_state(t)) @@ -163,6 +164,7 @@ static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, .err_handler = ipcomp4_err, .no_policy = 1, + .netns_ok = 1, }; static int __init ipcomp4_init(void) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index a2e50ae80b53..efa1138fa523 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -206,7 +206,7 @@ static int __init ic_open_devs(void) struct ic_device *d, **last; struct net_device *dev; unsigned short oflags; - unsigned long start; + unsigned long start, next_msg; last = &ic_first_dev; rtnl_lock(); @@ -263,12 +263,23 @@ static int __init ic_open_devs(void) /* wait for a carrier on at least one device */ start = jiffies; + next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { + int wait, elapsed; + for_each_netdev(&init_net, dev) if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) goto have_carrier; msleep(1); + + if (time_before(jiffies, next_msg)) + continue; + + elapsed = jiffies_to_msecs(jiffies - start); + wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000; + pr_info("Waiting up to %d more seconds for network.\n", wait); + next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); } have_carrier: rtnl_unlock(); @@ -1394,7 +1405,7 @@ static int __init ip_auto_config(void) unsigned int i; #ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); + proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) @@ -1522,7 +1533,8 @@ static int __init ip_auto_config(void) } for (i++; i < CONF_NAMESERVERS_MAX; i++) if (ic_nameservers[i] != NONE) - pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]); + pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); + pr_cont("\n"); #endif /* !SILENT */ return 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 191fc24a745a..77bfcce64fe5 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -111,227 +111,21 @@ #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> -#define HASH_SIZE 16 -#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) - static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int ipip_net_id __read_mostly; -struct ipip_net { - struct ip_tunnel __rcu
*tunnels_r_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_wc[1]; - struct ip_tunnel __rcu **tunnels[4]; - - struct net_device *fb_tunnel_dev; -}; static int ipip_tunnel_init(struct net_device *dev); -static void ipip_tunnel_setup(struct net_device *dev); -static void ipip_dev_free(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; -static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - tot->collisions = dev->stats.collisions; - - return tot; -} - -static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, - __be32 remote, __be32 local) -{ - unsigned int h0 = HASH(remote); - unsigned int h1 = HASH(local); - struct ip_tunnel *t; - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) - if (local == t->parms.iph.saddr && - remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) - return t; - - for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) - if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) - return t; - - for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) - if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) - return t; - - t = rcu_dereference(ipn->tunnels_wc[0]); - if (t && (t->dev->flags&IFF_UP)) - return t; - return NULL; -} - -static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, - struct ip_tunnel_parm *parms) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - unsigned int h = 0; - int prio = 0; - - if (remote) { - prio |= 2; - h ^= HASH(remote); - } - if (local) { - prio |= 1; - h ^= HASH(local); - } - return &ipn->tunnels[prio][h]; -} - -static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, - struct ip_tunnel *t) -{ - return __ipip_bucket(ipn, &t->parms); -} - -static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp; - struct ip_tunnel *iter; - - for (tp = ipip_bucket(ipn, t); - (iter = rtnl_dereference(*tp)) != NULL; - tp = &iter->next) { - if (t == iter) { - rcu_assign_pointer(*tp, t->next); - break; - } - } -} - -static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) -{ - struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); - rcu_assign_pointer(*tp, t); -} - -static int ipip_tunnel_create(struct net_device *dev) -{ - struct ip_tunnel *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ipip_net *ipn = net_generic(net, ipip_net_id); - int err; - - err = ipip_tunnel_init(dev); - if (err < 0) - goto out; - - err = register_netdevice(dev); - if (err 
< 0) - goto out; - - strcpy(t->parms.name, dev->name); - dev->rtnl_link_ops = &ipip_link_ops; - - dev_hold(dev); - ipip_tunnel_link(ipn, t); - return 0; - -out: - return err; -} - -static struct ip_tunnel *ipip_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) -{ - __be32 remote = parms->iph.daddr; - __be32 local = parms->iph.saddr; - struct ip_tunnel *t, *nt; - struct ip_tunnel __rcu **tp; - struct net_device *dev; - char name[IFNAMSIZ]; - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - for (tp = __ipip_bucket(ipn, parms); - (t = rtnl_dereference(*tp)) != NULL; - tp = &t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) - return t; - } - if (!create) - return NULL; - - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else - strcpy(name, "tunl%d"); - - dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); - if (dev == NULL) - return NULL; - - dev_net_set(dev, net); - - nt = netdev_priv(dev); - nt->parms = *parms; - - if (ipip_tunnel_create(dev) < 0) - goto failed_free; - - return nt; - -failed_free: - ipip_dev_free(dev); - return NULL; -} - -/* called with RTNL */ -static void ipip_tunnel_uninit(struct net_device *dev) -{ - struct net *net = dev_net(dev); - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - if (dev == ipn->fb_tunnel_dev) - RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL); - else - ipip_tunnel_unlink(ipn, netdev_priv(dev)); - dev_put(dev); -} - static int ipip_err(struct sk_buff *skb, u32 info) { @@ -339,41 +133,17 @@ static int ipip_err(struct sk_buff *skb, u32 info) 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. */ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err; - - switch (type) { - default: - case ICMP_PARAMETERPROB: - return 0; - - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: - /* Impossible event. */ - return 0; - default: - /* All others are translated to HOST_UNREACH. - rfc2003 contains "deep thoughts" about NET_UNREACH, - I believe they are just ether pollution. --ANK - */ - break; - } - break; - case ICMP_TIME_EXCEEDED: - if (code != ICMP_EXC_TTL) - return 0; - break; - case ICMP_REDIRECT: - break; - } + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; err = -ENOENT; - t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); if (t == NULL) goto out; @@ -403,53 +173,29 @@ static int ipip_err(struct sk_buff *skb, u32 info) else t->err_count = 1; t->err_time = jiffies; -out: +out: return err; } +static const struct tnl_ptk_info tpi = { + /* no tunnel info required for ipip. 
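IPIP carries no flags, key or sequence number, so only the inner protocol is set.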
*/ + .proto = htons(ETH_P_IP), +}; + static int ipip_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); - int err; - - tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); - if (tunnel != NULL) { - struct pcpu_tstats *tstats; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - - secpath_reset(skb); - - skb->mac_header = skb->network_header; - skb_reset_network_header(skb); - skb->protocol = htons(ETH_P_IP); - skb->pkt_type = PACKET_HOST; - - __skb_tunnel_rx(skb, tunnel->dev); - - err = IP_ECN_decapsulate(iph, skb); - if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &iph->saddr, iph->tos); - if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; - goto drop; - } - } - - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - netif_rx(skb); - return 0; + return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); } return -1; @@ -463,327 +209,64 @@ drop: * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ - static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - u8 tos = tunnel->parms.iph.tos; - __be16 df = tiph->frag_off; - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - const struct iphdr *old_iph = ip_hdr(skb); - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - __be32 dst = tiph->daddr; - struct flowi4 fl4; - int mtu; - - if (skb->protocol != htons(ETH_P_IP)) - goto tx_error; - - if (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb)) - goto tx_error; - if (tos & 1) - tos = old_iph->tos; - - if (!dst) { - /* NBMA tunnel */ - if ((rt = skb_rtable(skb)) == NULL) { - dev->stats.tx_fifo_errors++; - goto tx_error; - } - dst = rt_nexthop(rt, old_iph->daddr); - } - - rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, - dst, tiph->saddr, - 0, 0, - IPPROTO_IPIP, RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; - } - tdev = rt->dst.dev; - - if (tdev == dev) { - ip_rt_put(rt); - dev->stats.collisions++; + if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - } - df |= old_iph->frag_off & htons(IP_DF); - - if (df) { - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - - if (mtu < 68) { - dev->stats.collisions++; - ip_rt_put(rt); - goto tx_error; - } - - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - - if ((old_iph->frag_off & htons(IP_DF)) && - mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - ip_rt_put(rt); - goto tx_error; - } + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; } - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - - /* - * Okay, now see 
if we can stuff it in the buffer as-is. - */ - max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); - - if (skb_headroom(skb) < max_headroom || skb_shared(skb) || - (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { - struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - dev->stats.tx_dropped++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - if (skb->sk) - skb_set_owner_w(new_skb, skb->sk); - dev_kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - skb->transport_header = skb->network_header; - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | - IPSKB_REROUTED); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - - /* - * Push down and install the IPIP header. - */ - - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); - iph->daddr = fl4.daddr; - iph->saddr = fl4.saddr; - - if ((iph->ttl = tiph->ttl) == 0) - iph->ttl = old_iph->ttl; - - iptunnel_xmit(skb, dev); + ip_tunnel_xmit(skb, dev, tiph); return NETDEV_TX_OK; -tx_error_icmp: - dst_link_failure(skb); tx_error: dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } -static void ipip_tunnel_bind_dev(struct net_device *dev) -{ - struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; - const struct iphdr *iph; - - tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; - - if (iph->daddr) { - struct rtable *rt; - struct flowi4 fl4; - - rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, - iph->daddr, iph->saddr, - 0, 0, - IPPROTO_IPIP, - RT_TOS(iph->tos), - tunnel->parms.link); - if (!IS_ERR(rt)) { - tdev = rt->dst.dev; - ip_rt_put(rt); - } - dev->flags |= IFF_POINTOPOINT; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); - - if (tdev) { - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); - } - dev->iflink = tunnel->parms.link; -} - -static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) -{ - struct net *net = dev_net(t->dev); - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - ipip_tunnel_unlink(ipn, t); - synchronize_net(); - t->parms.iph.saddr = p->iph.saddr; - t->parms.iph.daddr = p->iph.daddr; - memcpy(t->dev->dev_addr, &p->iph.saddr, 4); - memcpy(t->dev->broadcast, &p->iph.daddr, 4); - ipip_tunnel_link(ipn, t); - t->parms.iph.ttl = p->iph.ttl; - t->parms.iph.tos = p->iph.tos; - t->parms.iph.frag_off = p->iph.frag_off; - if (t->parms.link != p->link) { - t->parms.link = p->link; - ipip_tunnel_bind_dev(t->dev); - } - netdev_state_change(t->dev); -} - static int -ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - switch (cmd) { - case SIOCGETTUNNEL: - t = NULL; - if (dev == ipn->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipip_tunnel_locate(net, &p, 0); - } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = 
-EFAULT; - break; - - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) - goto done; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { - err = -EINVAL; - break; - } - t = netdev_priv(dev); - } - - ipip_tunnel_update(t, &p); - } - - if (t) { - err = 0; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; - - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == ipn->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) - goto done; - err = -EPERM; - if (t->dev == ipn->fb_tunnel_dev) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; - default: - err = -EINVAL; - } - -done: - return err; -} + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; -static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + return -EINVAL; + if (p.i_key || p.o_key || p.i_flags || p.o_flags) return -EINVAL; - dev->mtu = new_mtu; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + err = ip_tunnel_ioctl(dev, &p, cmd); + if (err) + return err; + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; + return 0; } static const struct net_device_ops ipip_netdev_ops = { - .ndo_uninit = ipip_tunnel_uninit, + .ndo_init = ipip_tunnel_init, + .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, - .ndo_change_mtu = ipip_tunnel_change_mtu, - .ndo_get_stats64 = ipip_get_stats64, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; -static void ipip_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); - free_netdev(dev); -} - #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ @@ -792,11 +275,8 @@ static void ipip_dev_free(struct net_device *dev) static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; - dev->destructor = ipip_dev_free; dev->type = ARPHRD_TUNNEL; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; @@ -806,46 +286,19 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; + ip_tunnel_setup(dev, ipip_net_id); } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - tunnel->dev = dev; - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, 
&tunnel->parms.iph.daddr, 4); - ipip_tunnel_bind_dev(dev); - - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; -} - -static int __net_init ipip_fb_tunnel_init(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *iph = &tunnel->parms.iph; - struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); - - tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); - - iph->version = 4; - iph->protocol = IPPROTO_IPIP; - iph->ihl = 5; - - dev->tstats = alloc_percpu(struct pcpu_tstats); - if (!dev->tstats) - return -ENOMEM; - - dev_hold(dev); - rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); - return 0; + tunnel->hlen = 0; + tunnel->parms.iph.protocol = IPPROTO_IPIP; + return ip_tunnel_init(dev); } static void ipip_netlink_parms(struct nlattr *data[], @@ -885,28 +338,16 @@ static void ipip_netlink_parms(struct nlattr *data[], static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct net *net = dev_net(dev); - struct ip_tunnel *nt; - - nt = netdev_priv(dev); - ipip_netlink_parms(data, &nt->parms); - - if (ipip_tunnel_locate(net, &nt->parms, 0)) - return -EEXIST; + struct ip_tunnel_parm p; - return ipip_tunnel_create(dev); + ipip_netlink_parms(data, &p); + return ip_tunnel_newlink(dev, tb, &p); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip_tunnel *t; struct ip_tunnel_parm p; - struct net *net = dev_net(dev); - struct ipip_net *ipn = net_generic(net, ipip_net_id); - - if (dev == ipn->fb_tunnel_dev) - return -EINVAL; ipip_netlink_parms(data, &p); @@ -914,16 +355,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; - t = ipip_tunnel_locate(net, &p, 0); - - if (t) { - if (t->dev != dev) - return -EEXIST; - } else - t = netdev_priv(dev); - - ipip_tunnel_update(t, &p); - return 0; + return ip_tunnel_changelink(dev, tb, &p); } static size_t ipip_get_size(const struct net_device *dev) @@ -980,6 +412,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = { .setup = ipip_tunnel_setup, .newlink = ipip_newlink, .changelink = ipip_changelink, + .dellink = ip_tunnel_dellink, .get_size = ipip_get_size, .fill_info = ipip_fill_info, }; @@ -990,90 +423,29 @@ static struct xfrm_tunnel ipip_handler __read_mostly = { .priority = 1, }; -static const char banner[] __initconst = - KERN_INFO "IPv4 over IPv4 tunneling driver\n"; - -static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) -{ - int prio; - - for (prio = 1; prio < 4; prio++) { - int h; - for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - - t = rtnl_dereference(ipn->tunnels[prio][h]); - while (t != NULL) { - unregister_netdevice_queue(t->dev, head); - t = rtnl_dereference(t->next); - } - } - } -} - static int __net_init ipip_init_net(struct net *net) { - struct ipip_net *ipn = net_generic(net, ipip_net_id); - struct ip_tunnel *t; - int err; - - ipn->tunnels[0] = ipn->tunnels_wc; - ipn->tunnels[1] = ipn->tunnels_l; - ipn->tunnels[2] = ipn->tunnels_r; - ipn->tunnels[3] = ipn->tunnels_r_l; - - ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), - "tunl0", - ipip_tunnel_setup); - if (!ipn->fb_tunnel_dev) { - err = -ENOMEM; - goto err_alloc_dev; - } - dev_net_set(ipn->fb_tunnel_dev, net); - - err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); - if (err) - goto err_reg_dev; - - if ((err = 
register_netdev(ipn->fb_tunnel_dev))) - goto err_reg_dev; - - t = netdev_priv(ipn->fb_tunnel_dev); - - strcpy(t->parms.name, ipn->fb_tunnel_dev->name); - return 0; - -err_reg_dev: - ipip_dev_free(ipn->fb_tunnel_dev); -err_alloc_dev: - /* nothing */ - return err; + return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } static void __net_exit ipip_exit_net(struct net *net) { - struct ipip_net *ipn = net_generic(net, ipip_net_id); - LIST_HEAD(list); - - rtnl_lock(); - ipip_destroy_tunnels(ipn, &list); - unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + ip_tunnel_delete_net(itn); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit = ipip_exit_net, .id = &ipip_net_id, - .size = sizeof(struct ipip_net), + .size = sizeof(struct ip_tunnel_net), }; static int __init ipip_init(void) { int err; - printk(banner); + pr_info("ipip: IPv4 over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a9454cbd953c..9d9610ae7855 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -61,7 +61,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> @@ -626,9 +626,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - e = NLMSG_DATA(nlh); + e = nlmsg_data(nlh); e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); @@ -828,6 +828,49 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, return NULL; } +/* Look for a (*,*,oif) entry */ +static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, + int vifi) +{ + int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY)); + struct mfc_cache *c; + + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY) && + c->mfc_un.res.ttls[vifi] < 255) + return c; + + return NULL; +} + +/* Look for a (*,G) entry */ +static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, + __be32 mcastgrp, int vifi) +{ + int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY)); + struct mfc_cache *c, *proxy; + + if (mcastgrp == htonl(INADDR_ANY)) + goto skip; + + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == mcastgrp) { + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + /* It's ok if the vifi is part of the static tree */ + proxy = ipmr_cache_find_any_parent(mrt, + c->mfc_parent); + if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) + return c; + } + +skip: + return ipmr_cache_find_any_parent(mrt, vifi); +} + /* * Allocate a multicast cache entry */ @@ -867,14 +910,14 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); - if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { + if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { 
nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - e = NLMSG_DATA(nlh); + e = nlmsg_data(nlh); e->error = -EMSGSIZE; memset(&e->msg, 0, sizeof(e->msg)); } @@ -1053,7 +1096,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) * MFC cache manipulation by user space mroute daemon */ -static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) +static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { int line; struct mfc_cache *c, *next; @@ -1062,7 +1105,8 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && + (parent == -1 || parent == c->mfc_parent)) { list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_free(c); @@ -1073,7 +1117,7 @@ } static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, - struct mfcctl *mfc, int mrtsock) + struct mfcctl *mfc, int mrtsock, int parent) { bool found = false; int line; @@ -1086,7 +1130,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && + (parent == -1 || parent == c->mfc_parent)) { found = true; break; } @@ -1103,7 +1148,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, return 0; } - if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) + if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) && + !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c = ipmr_cache_alloc(); @@ -1218,7 +1264,7 @@ static void mrtsock_destruct(struct sock *sk) int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) { - int ret; + int ret, parent = 0; struct vifctl vif; struct mfcctl mfc; struct net *net = sock_net(sk); @@ -1287,16 +1333,22 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) */ case MRT_ADD_MFC: case MRT_DEL_MFC: + parent = -1; + case MRT_ADD_MFC_PROXY: + case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) return -EINVAL; if (copy_from_user(&mfc, optval, sizeof(mfc))) return -EFAULT; + if (parent == 0) + parent = mfc.mfcc_parent; rtnl_lock(); - if (optname == MRT_DEL_MFC) - ret = ipmr_mfc_delete(mrt, &mfc); + if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) + ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, - sk == rtnl_dereference(mrt->mroute_sk)); + sk == rtnl_dereference(mrt->mroute_sk), + parent); rtnl_unlock(); return ret; /* @@ -1749,17 +1801,28 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, { int psend = -1; int vif, ct; + int true_vifi = ipmr_find_vif(mrt, skb->dev); vif = cache->mfc_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { + struct mfc_cache *cache_proxy; + + /* For an (*,G) entry, we only check that the incoming + * interface is part of the static tree.
+ */ + cache_proxy = ipmr_cache_find_any_parent(mrt, vif); + if (cache_proxy && + cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + goto forward; + } + /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif_table[vif].dev != skb->dev) { - int true_vifi; - if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... @@ -1776,7 +1839,6 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, } cache->mfc_un.res.wrong_if++; - true_vifi = ipmr_find_vif(mrt, skb->dev); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -1794,15 +1856,34 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } +forward: mrt->vif_table[vif].pkt_in++; mrt->vif_table[vif].bytes_in += skb->len; /* * Forward the frame */ + if (cache->mfc_origin == htonl(INADDR_ANY) && + cache->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (true_vifi >= 0 && + true_vifi != cache->mfc_parent && + ip_hdr(skb)->ttl > + cache->mfc_un.res.ttls[cache->mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = cache->mfc_parent; + goto last_forward; + } + goto dont_forward; + } for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { - if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + /* For (*,G) entry, don't forward to the incoming interface */ + if ((cache->mfc_origin != htonl(INADDR_ANY) || + ct != true_vifi) && + ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -1813,6 +1894,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, psend = ct; } } +last_forward: if (psend != -1) { if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -1902,6 +1984,13 @@ int ip_mr_input(struct sk_buff *skb) /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + if (cache == NULL) { + int vif = ipmr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, + vif); + } /* * No usable cache entry @@ -2107,7 +2196,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, rcu_read_lock(); cache = ipmr_cache_find(mrt, saddr, daddr); + if (cache == NULL && skb->dev) { + int vif = ipmr_find_vif(mrt, skb->dev); + if (vif >= 0) + cache = ipmr_cache_find_any(mrt, daddr, vif); + } if (cache == NULL) { struct sk_buff *skb2; struct iphdr *iph; @@ -2609,16 +2703,16 @@ static int __net_init ipmr_net_init(struct net *net) #ifdef CONFIG_PROC_FS err = -ENOMEM; - if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops)) + if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops)) goto proc_vif_fail; - if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops)) + if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops)) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: - proc_net_remove(net, "ip_mr_vif"); + remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: ipmr_rules_exit(net); #endif @@ -2629,8 +2723,8 @@ fail: static void __net_exit ipmr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "ip_mr_cache"); - proc_net_remove(net, "ip_mr_vif"); + remove_proc_entry("ip_mr_cache", net->proc_net); + remove_proc_entry("ip_mr_vif", net->proc_net); #endif ipmr_rules_exit(net); } diff --git a/net/ipv4/netfilter.c 
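The ipmr changes above add (*,G) and (*,*) proxy entries to the multicast forwarding cache, keyed on (origin, group, parent) and driven from userspace through the new MRT_ADD_MFC_PROXY and MRT_DEL_MFC_PROXY socket options. A hypothetical userspace sketch of installing a (*,G) entry follows; it assumes a multicast-routing socket that has already done MRT_INIT and MRT_ADD_VIF for the VIFs it names, and the VIF numbers are purely illustrative.

#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>

static int add_star_g_proxy(int mrt_sock, const char *group, int parent_vif)
{
	struct mfcctl mc;

	memset(&mc, 0, sizeof(mc));
	mc.mfcc_origin.s_addr = htonl(INADDR_ANY);	/* (*,G): any source */
	inet_pton(AF_INET, group, &mc.mfcc_mcastgrp);
	mc.mfcc_parent = parent_vif;			/* upstream VIF */
	mc.mfcc_ttls[1] = 1;				/* forward out VIF 1 */

	/* Proxy entries are keyed on (origin, group, parent), so one
	 * group may carry several entries with different parent VIFs. */
	return setsockopt(mrt_sock, IPPROTO_IP, MRT_ADD_MFC_PROXY,
			  &mc, sizeof(mc));
}

Deletion mirrors this with MRT_DEL_MFC_PROXY and the same (origin, group, parent) triple, which is why ipmr_mfc_delete() above grows a parent argument.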
b/net/ipv4/netfilter.c index 4c0cf63dd92e..c3e0adea9c27 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -1,4 +1,9 @@ -/* IPv4 specific functions of netfilter core */ +/* + * IPv4 specific functions of netfilter core + * + * Rusty Russell (C) 2000 -- This code is GPL. + * Patrick McHardy (C) 2006-2012 + */ #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -40,14 +45,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) fl4.flowi4_flags = flags; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) - return -1; + return PTR_ERR(rt); /* Drop old route. */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); if (skb_dst(skb)->error) - return -1; + return skb_dst(skb)->error; #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -56,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); if (IS_ERR(dst)) - return -1; + return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif @@ -66,7 +71,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) - return -1; + return -ENOMEM; return 0; } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d8d6f2a5bf12..e7916c193932 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,19 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config IP_NF_QUEUE - tristate "IP Userspace queueing via NETLINK (OBSOLETE)" - depends on NETFILTER_ADVANCED - help - Netfilter has the ability to queue packets to user space: the - netlink device can be used to access them using this driver. - - This option enables the old IPv4-only "ip_queue" implementation - which has been obsoleted by the new "nfnetlink_queue" code (see - CONFIG_NETFILTER_NETLINK_QUEUE). - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" default m if NETFILTER_ADVANCED=n @@ -84,7 +71,7 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED + depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -241,8 +228,8 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. config IP_NF_TARGET_CLUSTERIP - tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_MANGLE && EXPERIMENTAL + tristate "CLUSTERIP target support" + depends on IP_NF_MANGLE depends on NF_CONNTRACK_IPV4 depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3ea4127404d6..85a4f21aac1a 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -6,6 +6,7 @@ * Some ARP specific bits are: * * Copyright (C) 2002 David S.
Miller (davem@redhat.com) + * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net> * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -901,7 +902,7 @@ static int get_info(struct net *net, void __user *user, #endif t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -958,7 +959,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, } t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", @@ -1001,7 +1002,7 @@ static int __do_replace(struct net *net, const char *name, t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } @@ -1158,7 +1159,7 @@ static int do_add_counters(struct net *net, const void __user *user, } t = xt_find_table_lock(net, NFPROTO_ARP, name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } @@ -1646,7 +1647,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 79ca5e70d497..eadab1ed6500 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -48,9 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net) net->ipv4.arptable_filter = arpt_register_table(net, &packet_filter, repl); kfree(repl); - if (IS_ERR(net->ipv4.arptable_filter)) - return PTR_ERR(net->ipv4.arptable_filter); - return 0; + return PTR_RET(net->ipv4.arptable_filter); } static void __net_exit arptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 17c5e06da662..d23118d95ff9 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -3,6 +3,7 @@ * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -182,8 +183,7 @@ ipt_get_target_c(const struct ipt_entry *e) return ipt_get_target((struct ipt_entry *)e); } -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", @@ -259,6 +259,7 @@ static void trace_packet(const struct sk_buff *skb, const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; + struct net *net = dev_net(in ? 
in : out); table_base = private->entries[smp_processor_id()]; root = get_entry(table_base, private->hook_entry[hook]); @@ -271,7 +272,7 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, + nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } @@ -361,8 +362,7 @@ ipt_do_table(struct sk_buff *skb, t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(skb, hook, in, out, @@ -1090,7 +1090,7 @@ static int get_info(struct net *net, void __user *user, #endif t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), "iptable_%s", name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1149,7 +1149,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, } t = xt_find_table_lock(net, AF_INET, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) @@ -1189,7 +1189,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), "iptable_%s", name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } @@ -1347,7 +1347,7 @@ do_add_counters(struct net *net, const void __user *user, } t = xt_find_table_lock(net, AF_INET, name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } @@ -1931,7 +1931,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; duprintf("t->private->number = %u\n", private->number); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 75e33a7048f8..0b732efd32e2 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -105,7 +105,7 @@ clusterip_config_entry_put(struct clusterip_config *c) * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. 
*/ #ifdef CONFIG_PROC_FS - remove_proc_entry(c->pde->name, c->pde->parent); + proc_remove(c->pde); #endif return; } @@ -631,7 +631,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - struct clusterip_config *c = PDE(inode)->data; + struct clusterip_config *c = PDE_DATA(inode); sf->private = c; @@ -643,7 +643,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) static int clusterip_proc_release(struct inode *inode, struct file *file) { - struct clusterip_config *c = PDE(inode)->data; + struct clusterip_config *c = PDE_DATA(inode); int ret; ret = seq_release(inode, file); @@ -657,7 +657,7 @@ static int clusterip_proc_release(struct inode *inode, struct file *file) static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { - struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data; + struct clusterip_config *c = PDE_DATA(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; @@ -736,7 +736,7 @@ static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); #ifdef CONFIG_PROC_FS - remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); + proc_remove(clusterip_procdir); #endif nf_unregister_hook(&cip_arp_ops); xt_unregister_target(&clusterip_tg_reg); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index b5ef3cba2250..f8a222cb6448 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -4,6 +4,7 @@ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2007 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -37,7 +38,7 @@ #include <linux/skbuff.h> #include <linux/kernel.h> #include <linux/timer.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netdevice.h> #include <linux/mm.h> #include <linux/moduleparam.h> @@ -45,6 +46,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_ULOG.h> #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <linux/bitops.h> #include <asm/unaligned.h> @@ -78,20 +80,26 @@ typedef struct { struct timer_list timer; /* the timer function */ } ulog_buff_t; -static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ +static int ulog_net_id __read_mostly; +struct ulog_net { + unsigned int nlgroup[ULOG_MAXNLGROUPS]; + ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; + struct sock *nflognl; + spinlock_t lock; +}; -static struct sock *nflognl; /* our socket */ -static DEFINE_SPINLOCK(ulog_lock); /* spinlock */ +static struct ulog_net *ulog_pernet(struct net *net) +{ + return net_generic(net, ulog_net_id); +} /* send one ulog_buff_t to userspace */ -static void ulog_send(unsigned int nlgroupnum) +static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum) { - ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; + ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum]; - if (timer_pending(&ub->timer)) { - pr_debug("ulog_send: timer was pending, deleting\n"); - del_timer(&ub->timer); - } + pr_debug("ulog_send: timer is deleting\n"); + del_timer(&ub->timer); if (!ub->skb) { pr_debug("ulog_send: nothing to send\n"); @@ -105,7 +113,8 @@ 
static void ulog_send(unsigned int nlgroupnum) NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; pr_debug("throwing %d packets to netlink group %u\n", ub->qlen, nlgroupnum + 1); - netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); + netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1, + GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; @@ -116,13 +125,16 @@ static void ulog_send(unsigned int nlgroupnum) /* timer function to flush queue in flushtimeout time */ static void ulog_timer(unsigned long data) { + struct ulog_net *ulog = container_of((void *)data, + struct ulog_net, + nlgroup[*(unsigned int *)data]); pr_debug("timer function called, calling ulog_send\n"); /* lock to protect against somebody modifying our structure * from ipt_ulog_target at the same time */ - spin_lock_bh(&ulog_lock); - ulog_send(data); - spin_unlock_bh(&ulog_lock); + spin_lock_bh(&ulog->lock); + ulog_send(ulog, data); + spin_unlock_bh(&ulog->lock); } static struct sk_buff *ulog_alloc_skb(unsigned int size) @@ -162,6 +174,8 @@ static void ipt_ulog_packet(unsigned int hooknum, size_t size, copy_len; struct nlmsghdr *nlh; struct timeval tv; + struct net *net = dev_net(in ? in : out); + struct ulog_net *ulog = ulog_pernet(net); /* ffs == find first bit set, necessary because userspace * is already shifting groupnumber, but we need unshifted. @@ -174,11 +188,11 @@ static void ipt_ulog_packet(unsigned int hooknum, else copy_len = loginfo->copy_range; - size = NLMSG_SPACE(sizeof(*pm) + copy_len); + size = nlmsg_total_size(sizeof(*pm) + copy_len); - ub = &ulog_buffers[groupnum]; + ub = &ulog->ulog_buffers[groupnum]; - spin_lock_bh(&ulog_lock); + spin_lock_bh(&ulog->lock); if (!ub->skb) { if (!(ub->skb = ulog_alloc_skb(size))) @@ -188,7 +202,7 @@ static void ipt_ulog_packet(unsigned int hooknum, /* either the queue len is too high or we don't have * enough room in nlskb left. send it to userspace. 
*/ - ulog_send(groupnum); + ulog_send(ulog, groupnum); if (!(ub->skb = ulog_alloc_skb(size))) goto alloc_failure; @@ -262,16 +276,16 @@ static void ipt_ulog_packet(unsigned int hooknum, if (ub->qlen >= loginfo->qthreshold) { if (loginfo->qthreshold > 1) nlh->nlmsg_type = NLMSG_DONE; - ulog_send(groupnum); + ulog_send(ulog, groupnum); } out_unlock: - spin_unlock_bh(&ulog_lock); + spin_unlock_bh(&ulog->lock); return; alloc_failure: pr_debug("Error building netlink message\n"); - spin_unlock_bh(&ulog_lock); + spin_unlock_bh(&ulog->lock); } static unsigned int @@ -378,58 +392,45 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { .me = THIS_MODULE, }; -static int __init ulog_tg_init(void) +static int __net_init ulog_tg_net_init(struct net *net) { - int ret, i; + int i; + struct ulog_net *ulog = ulog_pernet(net); struct netlink_kernel_cfg cfg = { .groups = ULOG_MAXNLGROUPS, }; - pr_debug("init module\n"); - - if (nlbufsiz > 128*1024) { - pr_warning("Netlink buffer has to be <= 128kB\n"); - return -EINVAL; - } - + spin_lock_init(&ulog->lock); /* initialize ulog_buffers */ for (i = 0; i < ULOG_MAXNLGROUPS; i++) - setup_timer(&ulog_buffers[i].timer, ulog_timer, i); + setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg); - if (!nflognl) + ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); + if (!ulog->nflognl) return -ENOMEM; - ret = xt_register_target(&ulog_tg_reg); - if (ret < 0) { - netlink_kernel_release(nflognl); - return ret; - } if (nflog) - nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); + nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger); return 0; } -static void __exit ulog_tg_exit(void) +static void __net_exit ulog_tg_net_exit(struct net *net) { ulog_buff_t *ub; int i; - - pr_debug("cleanup_module\n"); + struct ulog_net *ulog = ulog_pernet(net); if (nflog) - nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ulog_tg_reg); - netlink_kernel_release(nflognl); + nf_log_unset(net, &ipt_ulog_logger); + + netlink_kernel_release(ulog->nflognl); /* remove pending timers and free allocated skb's */ for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ub = &ulog_buffers[i]; - if (timer_pending(&ub->timer)) { - pr_debug("timer was pending, deleting\n"); - del_timer(&ub->timer); - } + ub = &ulog->ulog_buffers[i]; + pr_debug("timer is deleting\n"); + del_timer(&ub->timer); if (ub->skb) { kfree_skb(ub->skb); @@ -438,5 +439,50 @@ static void __exit ulog_tg_exit(void) } } +static struct pernet_operations ulog_tg_net_ops = { + .init = ulog_tg_net_init, + .exit = ulog_tg_net_exit, + .id = &ulog_net_id, + .size = sizeof(struct ulog_net), +}; + +static int __init ulog_tg_init(void) +{ + int ret; + pr_debug("init module\n"); + + if (nlbufsiz > 128*1024) { + pr_warn("Netlink buffer has to be <= 128kB\n"); + return -EINVAL; + } + + ret = register_pernet_subsys(&ulog_tg_net_ops); + if (ret) + goto out_pernet; + + ret = xt_register_target(&ulog_tg_reg); + if (ret < 0) + goto out_target; + + if (nflog) + nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); + + return 0; + +out_target: + unregister_pernet_subsys(&ulog_tg_net_ops); +out_pernet: + return ret; +} + +static void __exit ulog_tg_exit(void) +{ + pr_debug("cleanup_module\n"); + if (nflog) + nf_log_unregister(&ipt_ulog_logger); + xt_unregister_target(&ulog_tg_reg); + unregister_pernet_subsys(&ulog_tg_net_ops); +} + module_init(ulog_tg_init); module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 
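The ipt_ULOG rework above is an instance of the standard pernet conversion: global state (the buffers, the netlink socket, the spinlock) moves into a structure that the pernet core allocates once per network namespace and that is reached through net_generic(). A stripped-down sketch of that pattern follows, with illustrative names rather than the ULOG ones.

#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int example_net_id __read_mostly;

struct example_net {
	spinlock_t lock;	/* protects this namespace's private state */
};

static int __net_init example_net_init(struct net *net)
{
	/* the core has already allocated .size bytes for this net */
	struct example_net *en = net_generic(net, example_net_id);

	spin_lock_init(&en->lock);
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* release anything hanging off net_generic(net, example_net_id) */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
	.id   = &example_net_id,
	.size = sizeof(struct example_net),
};

static int __init example_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Because .init runs for every namespace as it is created (and .exit as it dies), the module init path shrinks to registration only, which is exactly the shape ulog_tg_init() takes after the conversion.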
c30130062cd6..c49dcd0284a0 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -66,6 +66,12 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, return dev_match; } +static bool rpfilter_is_local(const struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + return rt && (rt->rt_flags & RTCF_LOCAL); +} + static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rpfilter_info *info; @@ -76,7 +82,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) info = par->matchinfo; invert = info->flags & XT_RPFILTER_INVERT; - if (par->in->flags & IFF_LOOPBACK) + if (rpfilter_is_local(skb)) return true ^ invert; iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 85d88f206447..cba5658ec82c 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -44,6 +44,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) u_int8_t tos; __be32 saddr, daddr; u_int32_t mark; + int err; /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || @@ -66,9 +67,11 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; + iph->tos != tos) { + err = ip_route_me_harder(skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } } return ret; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index eeaff7e4acb5..6383273d54e1 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -176,6 +176,7 @@ nf_nat_ipv4_out(unsigned int hooknum, #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; + int err; #endif unsigned int ret; @@ -195,9 +196,11 @@ nf_nat_ipv4_out(unsigned int hooknum, ct->tuplehash[!dir].tuple.dst.u3.ip) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) - if (nf_xfrm_me_harder(skb, AF_INET) < 0) - ret = NF_DROP; + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } } #endif return ret; @@ -213,6 +216,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum, const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; + int err; /* root is playing with raw sockets. 
*/ if (skb->len < sizeof(struct iphdr) || @@ -226,16 +230,19 @@ nf_nat_ipv4_local_fn(unsigned int hooknum, if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; + err = ip_route_me_harder(skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (nf_xfrm_me_harder(skb, AF_INET) < 0) - ret = NF_DROP; + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } #endif } return ret; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fcdd0c2406e6..567d84168bd2 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -1,6 +1,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -100,7 +101,6 @@ static unsigned int ipv4_helper(unsigned int hooknum, enum ip_conntrack_info ctinfo; const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; - unsigned int ret; /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); @@ -116,13 +116,8 @@ static unsigned int ipv4_helper(unsigned int hooknum, if (!helper) return NF_ACCEPT; - ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); - if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { - nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, - "nf_ct_%s: dropping packet", helper->name); - } - return ret; + return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); } static unsigned int ipv4_confirm(unsigned int hooknum, @@ -420,54 +415,43 @@ static int ipv4_net_init(struct net *net) { int ret = 0; - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_tcp4); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4); if (ret < 0) { - pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n"); + pr_err("nf_conntrack_tcp4: pernet registration failed\n"); goto out_tcp; } - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_udp4); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4); if (ret < 0) { - pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n"); + pr_err("nf_conntrack_udp4: pernet registration failed\n"); goto out_udp; } - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_icmp); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp); if (ret < 0) { - pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n"); + pr_err("nf_conntrack_icmp4: pernet registration failed\n"); goto out_icmp; } - ret = nf_conntrack_l3proto_register(net, - &nf_conntrack_l3proto_ipv4); + ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); if (ret < 0) { - pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n"); + pr_err("nf_conntrack_ipv4: pernet registration failed\n"); goto out_ipv4; } return 0; out_ipv4: - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_icmp); + 
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); out_icmp: - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_udp4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); out_udp: - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_tcp4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); out_tcp: return ret; } static void ipv4_net_exit(struct net *net) { - nf_conntrack_l3proto_unregister(net, - &nf_conntrack_l3proto_ipv4); - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_icmp); - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_udp4); - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_tcp4); + nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); } static struct pernet_operations ipv4_net_ops = { @@ -500,16 +484,49 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) pr_err("nf_conntrack_ipv4: can't register hooks.\n"); goto cleanup_pernet; } + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n"); + goto cleanup_hooks; + } + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n"); + goto cleanup_tcp4; + } + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n"); + goto cleanup_udp4; + } + + ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); + goto cleanup_icmpv4; + } + #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) ret = nf_conntrack_ipv4_compat_init(); if (ret < 0) - goto cleanup_hooks; + goto cleanup_proto; #endif return ret; #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + cleanup_proto: + nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); +#endif + cleanup_icmpv4: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); + cleanup_udp4: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); + cleanup_tcp4: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); cleanup_hooks: nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); -#endif cleanup_pernet: unregister_pernet_subsys(&ipv4_net_ops); cleanup_sockopt: @@ -523,6 +540,10 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) nf_conntrack_ipv4_compat_fini(); #endif + nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); unregister_pernet_subsys(&ipv4_net_ops); nf_unregister_sockopt(&so_getorigdst); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 9682b36df38c..4c48e434bb1f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -2,6 +2,7 @@ * * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 
2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -417,12 +418,12 @@ static int __net_init ip_conntrack_net_init(struct net *net) { struct proc_dir_entry *proc, *proc_exp, *proc_stat; - proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); + proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, - &ip_exp_file_ops); + proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net, + &ip_exp_file_ops); if (!proc_exp) goto err2; @@ -433,9 +434,9 @@ static int __net_init ip_conntrack_net_init(struct net *net) return 0; err3: - proc_net_remove(net, "ip_conntrack_expect"); + remove_proc_entry("ip_conntrack_expect", net->proc_net); err2: - proc_net_remove(net, "ip_conntrack"); + remove_proc_entry("ip_conntrack", net->proc_net); err1: return -ENOMEM; } @@ -443,8 +444,8 @@ err1: static void __net_exit ip_conntrack_net_exit(struct net *net) { remove_proc_entry("ip_conntrack", net->proc_net_stat); - proc_net_remove(net, "ip_conntrack_expect"); - proc_net_remove(net, "ip_conntrack"); + remove_proc_entry("ip_conntrack_expect", net->proc_net); + remove_proc_entry("ip_conntrack", net->proc_net); } static struct pernet_operations ip_conntrack_net_ops = { diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 5241d997ab75..a338dad41b7d 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -187,8 +188,8 @@ icmp_error(struct net *net, struct nf_conn *tmpl, icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: short packet "); + nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, + NULL, "nf_ct_icmp: short packet "); return -NF_ACCEPT; } @@ -196,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; } @@ -209,7 +210,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, */ if (icmph->type > NR_ICMP_TYPES) { if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9c3db10b22d3..9eea059dd621 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -2,6 +2,7 @@ * H.323 extension for NAT alteration. * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * This source code is licensed under General Public License version 2. 
* diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index a06d7d74817d..657d2307f031 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -13,6 +13,8 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * * TODO: - NAT to a unique tuple, not to TCP source port * (needs netfilter tuple reservation) */ diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index ea44f02563b5..690d890111bb 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -21,6 +21,8 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * */ #include <linux/module.h> diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index bac712293fd6..5f011cc89cd9 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -38,6 +38,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: James Morris <jmorris@intercode.com.au> + * + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/moduleparam.h> diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 6f9c07268cf6..7d93d62cd5fd 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -322,8 +322,8 @@ void ping_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); struct inet_sock *inet_sock; - int type = icmph->type; - int code = icmph->code; + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; @@ -514,9 +514,8 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; ipc.tx_flags = 0; - err = sock_tx_timestamp(sk, &ipc.tx_flags); - if (err) - return err; + + sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc); @@ -889,7 +888,7 @@ static int ping_proc_register(struct net *net) struct proc_dir_entry *p; int rc = 0; - p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops); + p = proc_create("icmp", S_IRUGO, net->proc_net, &ping_seq_fops); if (!p) rc = -ENOMEM; return rc; @@ -897,7 +896,7 @@ static int ping_proc_register(struct net *net) static void ping_proc_unregister(struct net *net) { - proc_net_remove(net, "icmp"); + remove_proc_entry("icmp", net->proc_net); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8de53e1ddd54..2a5bf86d2415 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -125,6 +125,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -162,6 +163,7 @@ static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), + SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -172,6 +174,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), 
SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -224,6 +227,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), + SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), + SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), @@ -267,6 +272,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), + SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_SENTINEL }; @@ -319,15 +325,16 @@ static void icmp_put(struct seq_file *seq) struct net *net = seq->private; atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; - seq_puts(seq, "\nIcmp: InMsgs InErrors"); + seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_printf(seq, " OutMsgs OutErrors"); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); - seq_printf(seq, "\nIcmp: %lu %lu", + seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS), + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); @@ -471,28 +478,29 @@ static const struct file_operations netstat_seq_fops = { static __net_init int ip_proc_init_net(struct net *net) { - if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) + if (!proc_create("sockstat", S_IRUGO, net->proc_net, + &sockstat_seq_fops)) goto out_sockstat; - if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) + if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops)) goto out_netstat; - if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) + if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops)) goto out_snmp; return 0; out_snmp: - proc_net_remove(net, "netstat"); + remove_proc_entry("netstat", net->proc_net); out_netstat: - proc_net_remove(net, "sockstat"); + remove_proc_entry("sockstat", net->proc_net); out_sockstat: return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { - proc_net_remove(net, "snmp"); - proc_net_remove(net, "netstat"); - proc_net_remove(net, "sockstat"); + remove_proc_entry("snmp", net->proc_net); + remove_proc_entry("netstat", net->proc_net); + remove_proc_entry("sockstat", net->proc_net); } static __net_initdata struct pernet_operations ip_proc_ops = { diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0f9d09f54bd9..ce848461acbb 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -37,6 
+37,12 @@ const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { + if (!prot->netns_ok) { + pr_err("Protocol %u is not namespace aware, cannot register.\n", + protocol); + return -EINVAL; + } + return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6f08991409c3..dd44e0ab600c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -111,9 +111,7 @@ EXPORT_SYMBOL_GPL(raw_unhash_sk); static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif) { - struct hlist_node *node; - - sk_for_each_from(sk, node) { + sk_for_each_from(sk) { struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && inet->inet_num == num && @@ -914,9 +912,7 @@ static struct sock *raw_get_first(struct seq_file *seq) for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { - struct hlist_node *node; - - sk_for_each(sk, node, &state->h->ht[state->bucket]) + sk_for_each(sk, &state->h->ht[state->bucket]) if (sock_net(sk) == seq_file_net(seq)) goto found; } @@ -1050,7 +1046,7 @@ static const struct file_operations raw_seq_fops = { static __net_init int raw_init_net(struct net *net) { - if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) + if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops)) return -ENOMEM; return 0; @@ -1058,7 +1054,7 @@ static __net_init int raw_init_net(struct net *net) static __net_exit void raw_exit_net(struct net *net) { - proc_net_remove(net, "raw"); + remove_proc_entry("raw", net->proc_net); } static __net_initdata struct pernet_operations raw_net_ops = { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a0fcc47fee73..550781a17b34 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -117,15 +117,11 @@ #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; -static int ip_rt_gc_interval __read_mostly = 60 * HZ; -static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; -static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; @@ -384,8 +380,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, - &rt_cache_seq_fops); + pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + &rt_cache_seq_fops); if (!pde) goto err1; @@ -2315,7 +2311,7 @@ nla_put_failure: return -EMSGSIZE; } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; @@ -2423,6 +2419,11 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly = HZ / 2; +static int ip_rt_gc_elasticity __read_mostly = 8; + 
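The inet_add_protocol() hunk above turns namespace awareness from a convention into a hard requirement. A minimal sketch of what a conforming registration now has to look like; foo_rcv, foo_err and IPPROTO_FOO are illustrative names, not existing kernel symbols:

static int foo_rcv(struct sk_buff *skb);
static void foo_err(struct sk_buff *skb, u32 info);

static const struct net_protocol foo_protocol = {
	.handler	= foo_rcv,
	.err_handler	= foo_err,
	.no_policy	= 1,
	.netns_ok	= 1,	/* mandatory now: registration fails with -EINVAL otherwise */
};

static int __init foo_init(void)
{
	return inet_add_protocol(&foo_protocol, IPPROTO_FOO);
}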
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b236ef04914f..b05c96e7af8b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -232,7 +232,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * * return false if we decode an option that should not be. */ -bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) +bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, + struct net *net, bool *ecn_ok) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr & TSMASK; @@ -247,7 +248,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; *ecn_ok = (options >> 5) & 1; - if (*ecn_ok && !sysctl_tcp_ecn) + if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn) return false; if (tcp_opt->sack_ok && !sysctl_tcp_sack) @@ -266,7 +267,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) { struct tcp_options_received tcp_opt; - const u8 *hash_location; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -293,9 +293,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) goto out; ret = NULL; @@ -348,8 +348,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), - RT_SCOPE_UNIVERSE, IPPROTO_TCP, + flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), (opt && opt->srr) ? 
opt->faddr : ireq->rmt_addr, ireq->loc_addr, th->source, th->dest); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d84400b65049..fa2f63fc453b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -27,7 +27,8 @@ #include <net/tcp_memcontrol.h> static int zero; -static int two = 2; +static int one = 1; +static int four = 4; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -232,8 +233,8 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, return 0; } -int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) { ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; @@ -538,13 +539,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_ecn", - .data = &sysctl_tcp_ecn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, .maxlen = sizeof(int), @@ -556,14 +550,16 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_wmem, .maxlen = sizeof(sysctl_tcp_wmem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "tcp_app_win", @@ -596,13 +592,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_frto_response", - .data = &sysctl_tcp_frto_response, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), @@ -637,13 +626,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_congestion_control, }, { - .procname = "tcp_abc", - .data = &sysctl_tcp_abc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "tcp_mtu_probing", .data = &sysctl_tcp_mtu_probing, .maxlen = sizeof(int), @@ -744,13 +726,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "tcp_cookie_size", - .data = &sysctl_tcp_cookie_size, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_thin_linear_timeouts", .data = &sysctl_tcp_thin_linear_timeouts, .maxlen = sizeof(int), @@ -771,7 +746,7 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, - .extra2 = &two, + .extra2 = &four, }, { .procname = "udp_mem", @@ -786,7 +761,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = &one }, { .procname = "udp_wmem_min", @@ -794,7 +769,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .extra1 = &one }, { } }; @@ -850,6 +825,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = ipv4_ping_group_range, }, { + .procname = "tcp_ecn", + .data = &init_net.ipv4.sysctl_tcp_ecn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = 
proc_dointvec + }, + { .procname = "tcp_mem", .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), .mode = 0644, @@ -882,6 +864,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) &net->ipv4.sysctl_icmp_ratemask; table[6].data = &net->ipv4.sysctl_ping_group_range; + table[7].data = + &net->ipv4.sysctl_tcp_ecn; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2aa69c8ae60c..dcb116dde216 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -400,6 +400,8 @@ void tcp_init_sock(struct sock *sk) tcp_enable_early_retrans(tp); icsk->icsk_ca_ops = &tcp_init_congestion_ops; + tp->tsoffset = 0; + sk->sk_state = TCP_CLOSE; sk->sk_write_space = sk_stream_write_space; @@ -407,15 +409,6 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - /* TCP Cookie Transactions */ - if (sysctl_tcp_cookie_size > 0) { - /* Default, cookies without s_data_payload. */ - tp->cookie_values = - kzalloc(sizeof(*tp->cookie_values), - sk->sk_allocation); - if (tp->cookie_values != NULL) - kref_init(&tp->cookie_values->kref); - } /* Presumed zeroed, in order of appearance: * cookie_in_always, cookie_out_never, * s_data_constant, s_data_in, s_data_out @@ -773,7 +766,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ - skb->avail_size = size; + skb->reserved_tailroom = skb->end - skb->tail - size; return skb; } __kfree_skb(skb); @@ -895,6 +888,7 @@ new_segment: get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; skb->len += copy; skb->data_len += copy; @@ -1406,10 +1400,10 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) return; last_issued = tp->ucopy.dma_cookie; - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + dma_async_issue_pending(tp->ucopy.dma_chan); do { - if (dma_async_memcpy_complete(tp->ucopy.dma_chan, + if (dma_async_is_tx_complete(tp->ucopy.dma_chan, last_issued, &done, &used) == DMA_SUCCESS) { /* Safe to free early-copied skbs now */ @@ -1751,7 +1745,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, tcp_service_net_dma(sk, true); tcp_cleanup_rbuf(sk, copied); } else - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + dma_async_issue_pending(tp->ucopy.dma_chan); } #endif if (copied >= target) { @@ -1844,7 +1838,7 @@ do_prequeue: break; } - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + dma_async_issue_pending(tp->ucopy.dma_chan); if ((offset + used) == skb->len) copied_early = true; @@ -2287,7 +2281,6 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; - tp->bytes_acked = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); @@ -2395,92 +2388,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } - case TCP_COOKIE_TRANSACTIONS: { - struct tcp_cookie_transactions ctd; - struct tcp_cookie_values *cvp = NULL; - - if (sizeof(ctd) > optlen) - return -EINVAL; - if (copy_from_user(&ctd, optval, sizeof(ctd))) - return -EFAULT; - - if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || - ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) - return -EINVAL; - - if (ctd.tcpct_cookie_desired == 0) { - /* default to global value */ - } else if ((0x1 & ctd.tcpct_cookie_desired) || - ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || - 
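With tcp_ecn registered in ipv4_net_table just above and its storage moved into struct netns_ipv4, readers of the knob must go through the socket's namespace rather than the old global sysctl_tcp_ecn. A minimal consumer-side sketch, assuming the sysctl_tcp_ecn member the table wiring above implies; tcp_ecn_enabled is an illustrative helper, not an existing function:

static bool tcp_ecn_enabled(const struct sock *sk)
{
	/* per-netns knob replaces the former global sysctl_tcp_ecn */
	return sock_net(sk)->ipv4.sysctl_tcp_ecn != 0;
}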
ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { - return -EINVAL; - } - - if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { - /* Supercedes all other values */ - lock_sock(sk); - if (tp->cookie_values != NULL) { - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - tp->cookie_values = NULL; - } - tp->rx_opt.cookie_in_always = 0; /* false */ - tp->rx_opt.cookie_out_never = 1; /* true */ - release_sock(sk); - return err; - } - - /* Allocate ancillary memory before locking. - */ - if (ctd.tcpct_used > 0 || - (tp->cookie_values == NULL && - (sysctl_tcp_cookie_size > 0 || - ctd.tcpct_cookie_desired > 0 || - ctd.tcpct_s_data_desired > 0))) { - cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, - GFP_KERNEL); - if (cvp == NULL) - return -ENOMEM; - - kref_init(&cvp->kref); - } - lock_sock(sk); - tp->rx_opt.cookie_in_always = - (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); - tp->rx_opt.cookie_out_never = 0; /* false */ - - if (tp->cookie_values != NULL) { - if (cvp != NULL) { - /* Changed values are recorded by a changed - * pointer, ensuring the cookie will differ, - * without separately hashing each value later. - */ - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - } else { - cvp = tp->cookie_values; - } - } - - if (cvp != NULL) { - cvp->cookie_desired = ctd.tcpct_cookie_desired; - - if (ctd.tcpct_used > 0) { - memcpy(cvp->s_data_payload, ctd.tcpct_value, - ctd.tcpct_used); - cvp->s_data_desired = ctd.tcpct_used; - cvp->s_data_constant = 1; /* true */ - } else { - /* No constant payload data. */ - cvp->s_data_desired = ctd.tcpct_s_data_desired; - cvp->s_data_constant = 0; /* false */ - } - - tp->cookie_values = cvp; - } - release_sock(sk); - return err; - } default: /* fallthru */ break; @@ -2711,6 +2618,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else err = -EINVAL; break; + case TCP_TIMESTAMP: + if (!tp->repair) + err = -EPERM; + else + tp->tsoffset = val - tcp_time_stamp; + break; default: err = -ENOPROTOOPT; break; @@ -2894,41 +2807,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; - case TCP_COOKIE_TRANSACTIONS: { - struct tcp_cookie_transactions ctd; - struct tcp_cookie_values *cvp = tp->cookie_values; - - if (get_user(len, optlen)) - return -EFAULT; - if (len < sizeof(ctd)) - return -EINVAL; - - memset(&ctd, 0, sizeof(ctd)); - ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? - TCP_COOKIE_IN_ALWAYS : 0) - | (tp->rx_opt.cookie_out_never ? - TCP_COOKIE_OUT_NEVER : 0); - - if (cvp != NULL) { - ctd.tcpct_flags |= (cvp->s_data_in ? - TCP_S_DATA_IN : 0) - | (cvp->s_data_out ? 
- TCP_S_DATA_OUT : 0); - - ctd.tcpct_cookie_desired = cvp->cookie_desired; - ctd.tcpct_s_data_desired = cvp->s_data_desired; - - memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], - cvp->cookie_pair_size); - ctd.tcpct_used = cvp->cookie_pair_size; - } - - if (put_user(sizeof(ctd), optlen)) - return -EFAULT; - if (copy_to_user(optval, &ctd, sizeof(ctd))) - return -EFAULT; - return 0; - } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; @@ -2959,6 +2837,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_USER_TIMEOUT: val = jiffies_to_msecs(icsk->icsk_user_timeout); break; + case TCP_TIMESTAMP: + val = tcp_time_stamp + tp->tsoffset; + break; default: return -ENOPROTOOPT; } @@ -3004,6 +2885,8 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, __be32 delta; unsigned int oldlen; unsigned int mss; + struct sk_buff *gso_skb = skb; + __sum16 newcheck; if (!pskb_may_pull(skb, sizeof(*th))) goto out; @@ -3032,6 +2915,8 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | + SKB_GSO_GRE | + SKB_GSO_UDP_TUNNEL | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; @@ -3052,11 +2937,13 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, th = tcp_hdr(skb); seq = ntohl(th->seq); + newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + + (__force u32)delta)); + do { th->fin = th->psh = 0; + th->check = newcheck; - th->check = ~csum_fold((__force __wsum)((__force u32)th->check + - (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) th->check = csum_fold(csum_partial(skb_transport_header(skb), @@ -3070,6 +2957,17 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, th->cwr = 0; } while (skb->next); + /* Following permits TCP Small Queues to work well with GSO : + * The callback to TCP stack will be called at the time last frag + * is freed at TX completion, and not right now when gso_skb + * is freed by GSO engine + */ + if (gso_skb->destructor == tcp_wfree) { + swap(gso_skb->sk, skb->sk); + swap(gso_skb->destructor, skb->destructor); + swap(gso_skb->truesize, skb->truesize); + } + delta = htonl(oldlen + (skb->tail - skb->transport_header) + skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + @@ -3243,7 +3141,7 @@ __tcp_alloc_md5sig_pool(struct sock *sk) struct crypto_hash *hash; hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (!hash || IS_ERR(hash)) + if (IS_ERR_OR_NULL(hash)) goto out_free; per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; @@ -3396,134 +3294,6 @@ EXPORT_SYMBOL(tcp_md5_hash_key); #endif -/* Each Responder maintains up to two secret values concurrently for - * efficient secret rollover. Each secret value has 4 states: - * - * Generating. (tcp_secret_generating != tcp_secret_primary) - * Generates new Responder-Cookies, but not yet used for primary - * verification. This is a short-term state, typically lasting only - * one round trip time (RTT). - * - * Primary. (tcp_secret_generating == tcp_secret_primary) - * Used both for generation and primary verification. - * - * Retiring. (tcp_secret_retiring != tcp_secret_secondary) - * Used for verification, until the first failure that can be - * verified by the newer Generating secret. At that time, this - * cookie's state is changed to Secondary, and the Generating - * cookie's state is changed to Primary. This is a short-term state, - * typically lasting only one round trip time (RTT). - * - * Secondary. 
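The TCP_TIMESTAMP get/set paths added in the hunks above are readable on any TCP socket but writable only under TCP_REPAIR, which is what lets checkpoint/restore tools carry the timestamp clock across migration. A hedged userspace sketch; the fallback #define uses the option value from the kernel uapi header at this point:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_TIMESTAMP
#define TCP_TIMESTAMP	24	/* include/uapi/linux/tcp.h */
#endif

static int dump_tcp_timestamp(int fd)
{
	int val;
	socklen_t len = sizeof(val);

	/* kernel returns tcp_time_stamp + tp->tsoffset for this socket */
	if (getsockopt(fd, IPPROTO_TCP, TCP_TIMESTAMP, &val, &len) < 0)
		return -1;
	printf("timestamp clock: %d\n", val);
	return 0;
}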
(tcp_secret_retiring == tcp_secret_secondary) - * Used for secondary verification, after primary verification - * failures. This state lasts no more than twice the Maximum Segment - * Lifetime (2MSL). Then, the secret is discarded. - */ -struct tcp_cookie_secret { - /* The secret is divided into two parts. The digest part is the - * equivalent of previously hashing a secret and saving the state, - * and serves as an initialization vector (IV). The message part - * serves as the trailing secret. - */ - u32 secrets[COOKIE_WORKSPACE_WORDS]; - unsigned long expires; -}; - -#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) -#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) -#define TCP_SECRET_LIFE (HZ * 600) - -static struct tcp_cookie_secret tcp_secret_one; -static struct tcp_cookie_secret tcp_secret_two; - -/* Essentially a circular list, without dynamic allocation. */ -static struct tcp_cookie_secret *tcp_secret_generating; -static struct tcp_cookie_secret *tcp_secret_primary; -static struct tcp_cookie_secret *tcp_secret_retiring; -static struct tcp_cookie_secret *tcp_secret_secondary; - -static DEFINE_SPINLOCK(tcp_secret_locker); - -/* Select a pseudo-random word in the cookie workspace. - */ -static inline u32 tcp_cookie_work(const u32 *ws, const int n) -{ - return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; -} - -/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. - * Called in softirq context. - * Returns: 0 for success. - */ -int tcp_cookie_generator(u32 *bakery) -{ - unsigned long jiffy = jiffies; - - if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { - spin_lock_bh(&tcp_secret_locker); - if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { - /* refreshed by another */ - memcpy(bakery, - &tcp_secret_generating->secrets[0], - COOKIE_WORKSPACE_WORDS); - } else { - /* still needs refreshing */ - get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); - - /* The first time, paranoia assumes that the - * randomization function isn't as strong. But, - * this secret initialization is delayed until - * the last possible moment (packet arrival). - * Although that time is observable, it is - * unpredictably variable. Mash in the most - * volatile clock bits available, and expire the - * secret extra quickly. - */ - if (unlikely(tcp_secret_primary->expires == - tcp_secret_secondary->expires)) { - struct timespec tv; - - getnstimeofday(&tv); - bakery[COOKIE_DIGEST_WORDS+0] ^= - (u32)tv.tv_nsec; - - tcp_secret_secondary->expires = jiffy - + TCP_SECRET_1MSL - + (0x0f & tcp_cookie_work(bakery, 0)); - } else { - tcp_secret_secondary->expires = jiffy - + TCP_SECRET_LIFE - + (0xff & tcp_cookie_work(bakery, 1)); - tcp_secret_primary->expires = jiffy - + TCP_SECRET_2MSL - + (0x1f & tcp_cookie_work(bakery, 2)); - } - memcpy(&tcp_secret_secondary->secrets[0], - bakery, COOKIE_WORKSPACE_WORDS); - - rcu_assign_pointer(tcp_secret_generating, - tcp_secret_secondary); - rcu_assign_pointer(tcp_secret_retiring, - tcp_secret_primary); - /* - * Neither call_rcu() nor synchronize_rcu() needed. - * Retiring data is not freed. It is replaced after - * further (locked) pointer updates, and a quiet time - * (minimum 1MSL, maximum LIFE - 2MSL). 
- */ - } - spin_unlock_bh(&tcp_secret_locker); - } else { - rcu_read_lock_bh(); - memcpy(bakery, - &rcu_dereference(tcp_secret_generating)->secrets[0], - COOKIE_WORKSPACE_WORDS); - rcu_read_unlock_bh(); - } - return 0; -} -EXPORT_SYMBOL(tcp_cookie_generator); - void tcp_done(struct sock *sk) { struct request_sock *req = tcp_sk(sk)->fastopen_rsk; @@ -3578,7 +3348,6 @@ void __init tcp_init(void) unsigned long limit; int max_rshare, max_wshare, cnt; unsigned int i; - unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3654,13 +3423,5 @@ void __init tcp_init(void) tcp_register_congestion_control(&tcp_reno); - memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); - memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); - tcp_secret_one.expires = jiffy; /* past due */ - tcp_secret_two.expires = jiffy; /* past due */ - tcp_secret_generating = &tcp_secret_one; - tcp_secret_primary = &tcp_secret_one; - tcp_secret_retiring = &tcp_secret_two; - tcp_secret_secondary = &tcp_secret_two; tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index cdf2e707bb10..019c2389a341 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -317,28 +317,11 @@ void tcp_slow_start(struct tcp_sock *tp) snd_cwnd = 1U; } - /* RFC3465: ABC Slow start - * Increase only after a full MSS of bytes is acked - * - * TCP sender SHOULD increase cwnd by the number of - * previously unacknowledged bytes ACKed by each incoming - * acknowledgment, provided the increase is not more than L - */ - if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache) - return; - if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ else cnt = snd_cwnd; /* exponential increase */ - /* RFC3465: ABC - * We MAY increase by 2 if discovered delayed ack - */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) - cnt <<= 1; - tp->bytes_acked = 0; - tp->snd_cwnd_cnt += cnt; while (tp->snd_cwnd_cnt >= snd_cwnd) { tp->snd_cwnd_cnt -= snd_cwnd; @@ -378,20 +361,9 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - /* In dangerous area, increase slowly. 
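With the RFC3465 Appropriate Byte Counting branches coming out of tcp_slow_start() and tcp_reno_cong_avoid() in these tcp_cong.c hunks (the sysctl_tcp_abc knob was already dropped from the sysctl table above), Reno congestion avoidance always takes the tcp_cong_avoid_ai() path. A simplified sketch of what that amounts to per incoming ACK, assuming the usual snd_cwnd_clamp bound; the helper name is illustrative:

static void cong_avoid_ai_sketch(struct tcp_sock *tp)
{
	/* one cwnd increment per full window of ACKed segments */
	if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd_cnt = 0;
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	} else {
		tp->snd_cwnd_cnt++;
	}
}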
*/ - else if (sysctl_tcp_abc) { - /* RFC3465: Appropriate Byte Count - * increase once for each full cwnd acked - */ - if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { - tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } - } else { + else tcp_cong_avoid_ai(tp, tp->snd_cwnd); - } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ad70a962c20e..08bbe6096528 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -81,8 +81,6 @@ int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; EXPORT_SYMBOL(sysctl_tcp_reordering); -int sysctl_tcp_ecn __read_mostly = 2; -EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -95,13 +93,11 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; -int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_abc __read_mostly; -int sysctl_tcp_early_retrans __read_mostly = 2; +int sysctl_tcp_early_retrans __read_mostly = 3; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -111,17 +107,16 @@ int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ -#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ -#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) -#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -1162,10 +1157,8 @@ static u8 tcp_sacktag_one(struct sock *sk, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); - - /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(end_seq, tp->frto_highmark)) - state->flag |= FLAG_ONLY_ORIG_SACKED; + if (!after(end_seq, tp->high_seq)) + state->flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_LOST) { @@ -1558,7 +1551,6 @@ static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_snd_una) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); @@ -1731,12 +1723,6 @@ walk: start_seq, end_seq, dup_sack); advance_sp: - /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct - * due 
to in-order walk - */ - if (after(end_seq, tp->frto_highmark)) - state.flag &= ~FLAG_ONLY_ORIG_SACKED; - i++; } @@ -1753,8 +1739,7 @@ advance_sp: tcp_verify_left_out(tp); if ((state.reord < tp->fackets_out) && - ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && - (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) + ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); out: @@ -1828,198 +1813,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static int tcp_is_sackfrto(const struct tcp_sock *tp) -{ - return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); -} - -/* F-RTO can only be used if TCP has never retransmitted anything other than - * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) - */ -bool tcp_use_frto(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - - if (!sysctl_tcp_frto) - return false; - - /* MTU probe and F-RTO won't really play nicely along currently */ - if (icsk->icsk_mtup.probe_size) - return false; - - if (tcp_is_sackfrto(tp)) - return true; - - /* Avoid expensive walking of rexmit queue if possible */ - if (tp->retrans_out > 1) - return false; - - skb = tcp_write_queue_head(sk); - if (tcp_skb_is_last(sk, skb)) - return true; - skb = tcp_write_queue_next(sk, skb); /* Skips head */ - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - return false; - /* Short-circuit when first non-SACKed skb has been checked */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - break; - } - return true; -} - -/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO - * recovery a bit and use heuristics in tcp_process_frto() to detect if - * the RTO was spurious. Only clear SACKED_RETRANS of the head here to - * keep retrans_out counting accurate (with SACK F-RTO, other than head - * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS - * bits are handled if the Loss state is really to be entered (in - * tcp_enter_frto_loss). - * - * Do like tcp_enter_loss() would; when RTO expires the second time it - * does: - * "Reduce ssthresh if it has not yet been made inside this window." - */ -void tcp_enter_frto(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || - tp->snd_una == tp->high_seq || - ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && - !icsk->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(sk); - /* Our state is too optimistic in ssthresh() call because cwnd - * is not reduced until tcp_enter_frto_loss() when previous F-RTO - * recovery has not yet completed. Pattern would be this: RTO, - * Cumulative ACK, RTO (2xRTO for the same segment does not end - * up here twice). - * RFC4138 should be more specific on what to do, even though - * RTO is quite unlikely to occur after the first Cumulative ACK - * due to back-off and complexity of triggering events ... - */ - if (tp->frto_counter) { - u32 stored_cwnd; - stored_cwnd = tp->snd_cwnd; - tp->snd_cwnd = 2; - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = stored_cwnd; - } else { - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - } - /* ... 
in theory, cong.control module could do "any tricks" in - * ssthresh(), which means that ca_state, lost bits and lost_out - * counter would have to be faked before the call occurs. We - * consider that too expensive, unlikely and hacky, so modules - * using these in ssthresh() must deal these incompatibility - * issues if they receives CA_EVENT_FRTO and frto_counter != 0 - */ - tcp_ca_event(sk, CA_EVENT_FRTO); - } - - tp->undo_marker = tp->snd_una; - tp->undo_retrans = 0; - - skb = tcp_write_queue_head(sk); - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - tp->undo_marker = 0; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - } - tcp_verify_left_out(tp); - - /* Too bad if TCP was application limited */ - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); - - /* Earlier loss recovery underway (see RFC4138; Appendix B). - * The last condition is necessary at least in tp->frto_counter case. - */ - if (tcp_is_sackfrto(tp) && (tp->frto_counter || - ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && - after(tp->high_seq, tp->snd_una)) { - tp->frto_highmark = tp->high_seq; - } else { - tp->frto_highmark = tp->snd_nxt; - } - tcp_set_ca_state(sk, TCP_CA_Disorder); - tp->high_seq = tp->snd_nxt; - tp->frto_counter = 1; -} - -/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, - * which indicates that we should follow the traditional RTO recovery, - * i.e. mark everything lost and do go-back-N retransmission. - */ -static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - tp->lost_out = 0; - tp->retrans_out = 0; - if (tcp_is_reno(tp)) - tcp_reset_reno_sack(tp); - - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - /* - * Count the retransmission made on RTO correctly (only when - * waiting for the first ACK and did not get it)... - */ - if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { - /* For some reason this R-bit might get cleared? */ - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out += tcp_skb_pcount(skb); - /* ...enter this if branch just for the first segment */ - flag |= FLAG_DATA_ACKED; - } else { - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - } - - /* Marking forward transmissions that were made after RTO lost - * can cause unnecessary retransmissions in some scenarios, - * SACK blocks will mitigate that in some but not in all cases. - * We used to not mark them but it was causing break-ups with - * receivers that do only in-order receival. - * - * TODO: we could detect presence of such receiver and select - * different behavior per flow. 
- */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; - } - } - tcp_verify_left_out(tp); - - tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; - tp->frto_counter = 0; - tp->bytes_acked = 0; - - tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); - tcp_set_ca_state(sk, TCP_CA_Loss); - tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); - - tcp_clear_all_retrans_hints(tp); -} - static void tcp_clear_retrans_partial(struct tcp_sock *tp) { tp->retrans_out = 0; @@ -2046,10 +1839,13 @@ void tcp_enter_loss(struct sock *sk, int how) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + bool new_recovery = false; /* Reduce ssthresh if it has not yet been made inside this window. */ - if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + if (icsk->icsk_ca_state <= TCP_CA_Disorder || + !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + new_recovery = true; tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); @@ -2058,17 +1854,13 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->bytes_acked = 0; tcp_clear_retrans_partial(tp); if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - if (!how) { - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - tp->undo_marker = tp->snd_una; - } else { + tp->undo_marker = tp->snd_una; + if (how) { tp->sacked_out = 0; tp->fackets_out = 0; } @@ -2095,8 +1887,14 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); - /* Abort F-RTO algorithm if one is in progress */ - tp->frto_counter = 0; + + /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous + * loss recovery is underway except recurring timeout(s) on + * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing + */ + tp->frto = sysctl_tcp_frto && + (new_recovery || icsk->icsk_retransmits) && + !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2155,15 +1953,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples * available, or RTO is scheduled to fire first. */ - if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) + if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || + (flag & FLAG_ECE) || !tp->srtt) return false; delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); - tp->early_retrans_delayed = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, + TCP_RTO_MAX); return true; } @@ -2279,10 +2078,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; - /* Do not perform any recovery during F-RTO algorithm */ - if (tp->frto_counter) - return false; - /* Trick#1: The loss is proven. 
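The tcp_enter_loss() hunk above replaces the old frto_counter state machine with a single tp->frto flag armed at RTO time. A hedged restatement of the arming rule as a standalone predicate, assuming kernel context; frto_armed_sketch is an illustrative name:

static bool frto_armed_sketch(const struct sock *sk, bool new_recovery)
{
	/* F-RTO only for a fresh RTO episode (or recurring timeouts on the
	 * same SND.UNA), and never while an MTU probe is in flight */
	return sysctl_tcp_frto &&
	       (new_recovery || inet_csk(sk)->icsk_retransmits) &&
	       !inet_csk(sk)->icsk_mtup.probe_size;
}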
*/ if (tp->lost_out) return true; @@ -2326,7 +2121,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) * interval if appropriate. */ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && + (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && !tcp_may_send_now(sk)) return !tcp_pause_early_retransmit(sk, flag); @@ -2643,12 +2438,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) return failed; } -/* Undo during loss recovery after partial ACK. */ -static bool tcp_try_undo_loss(struct sock *sk) +/* Undo during loss recovery after partial ACK or using F-RTO. */ +static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_may_undo(tp)) { + if (frto_undo || tcp_may_undo(tp)) { struct sk_buff *skb; tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -2662,9 +2457,12 @@ static bool tcp_try_undo_loss(struct sock *sk) tp->lost_out = 0; tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); + if (frto_undo) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; - if (tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return true; } @@ -2686,7 +2484,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) struct tcp_sock *tp = tcp_sk(sk); tp->high_seq = tp->snd_nxt; - tp->bytes_acked = 0; + tp->tlp_high_seq = 0; tp->snd_cwnd_cnt = 0; tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; @@ -2737,7 +2535,6 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; - tp->bytes_acked = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; tcp_init_cwnd_reduction(sk, set_ssthresh); @@ -2765,7 +2562,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) tcp_verify_left_out(tp); - if (!tp->frto_counter && !tcp_any_retrans_done(sk)) + if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; if (flag & FLAG_ECE) @@ -2882,6 +2679,58 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tcp_set_ca_state(sk, TCP_CA_Recovery); } +/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are + * recovered or spurious. Otherwise retransmits more on partial ACKs. + */ +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + bool recovered = !before(tp->snd_una, tp->high_seq); + + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ + if (flag & FLAG_ORIG_SACK_ACKED) { + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. 
+ */ + tcp_try_undo_loss(sk, true); + return; + } + if (after(tp->snd_nxt, tp->high_seq) && + (flag & FLAG_DATA_SACKED || is_dupack)) { + tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ + } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { + tp->high_seq = tp->snd_nxt; + __tcp_push_pending_frames(sk, tcp_current_mss(sk), + TCP_NAGLE_OFF); + if (after(tp->snd_nxt, tp->high_seq)) + return; /* Step 2.b */ + tp->frto = 0; + } + } + + if (recovered) { + /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ + icsk->icsk_retransmits = 0; + tcp_try_undo_recovery(sk); + return; + } + if (flag & FLAG_DATA_ACKED) + icsk->icsk_retransmits = 0; + if (tcp_is_reno(tp)) { + /* A Reno DUPACK means new data in F-RTO step 2.b above are + * delivered. Lower inflight to clock out (re)tranmissions. + */ + if (after(tp->snd_nxt, tp->high_seq) && is_dupack) + tcp_add_reno_sack(sk); + else if (flag & FLAG_SND_UNA_ADVANCED) + tcp_reset_reno_sack(tp); + } + if (tcp_try_undo_loss(sk, false)) + return; + tcp_xmit_retransmit_queue(sk); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2928,12 +2777,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { - case TCP_CA_Loss: - icsk->icsk_retransmits = 0; - if (tcp_try_undo_recovery(sk)) - return; - break; - case TCP_CA_CWR: /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ @@ -2964,18 +2807,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; break; case TCP_CA_Loss: - if (flag & FLAG_DATA_ACKED) - icsk->icsk_retransmits = 0; - if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) - tcp_reset_reno_sack(tp); - if (!tcp_try_undo_loss(sk)) { - tcp_moderate_cwnd(tp); - tcp_xmit_retransmit_queue(sk); - return; - } + tcp_process_loss(sk, flag, is_dupack); if (icsk->icsk_ca_state != TCP_CA_Open) return; - /* Loss is undone; fall through to processing in Open state. */ + /* Fall through to processing in Open state. */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -3088,6 +2923,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ void tcp_rearm_rto(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* If the retrans timer is currently being used by Fast Open @@ -3101,12 +2937,13 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (tp->early_retrans_delayed) { + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked - * when the delayed ER timer fires and is rescheduled. + * when the retrans timer fires and is rescheduled. */ if (delta > 0) rto = delta; @@ -3114,7 +2951,6 @@ void tcp_rearm_rto(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); } - tp->early_retrans_delayed = 0; } /* This function is called when the delayed ER timer fires. 
TCP enters @@ -3202,8 +3038,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_RETRANS_DATA_ACKED; ca_seq_rtt = -1; seq_rtt = -1; - if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) - flag |= FLAG_NONHEAD_RETRANS_ACKED; } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; @@ -3212,6 +3046,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); + if (!after(scb->end_seq, tp->high_seq)) + flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_SACKED_ACKED) @@ -3412,166 +3248,74 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } -/* A very conservative spurious RTO response algorithm: reduce cwnd and - * continue in congestion avoidance. - */ -static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +/* RFC 5961 7 [ACK Throttling] */ +static void tcp_send_challenge_ack(struct sock *sk) { - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_cnt = 0; - tp->bytes_acked = 0; - TCP_ECN_queue_cwr(tp); - tcp_moderate_cwnd(tp); -} + /* unprotected vars, we dont care of overwrites */ + static u32 challenge_timestamp; + static unsigned int challenge_count; + u32 now = jiffies / HZ; -/* A conservative spurious RTO response algorithm: reduce cwnd using - * PRR and continue in congestion avoidance. - */ -static void tcp_cwr_spur_to_response(struct sock *sk) -{ - tcp_enter_cwr(sk, 0); + if (now != challenge_timestamp) { + challenge_timestamp = now; + challenge_count = 0; + } + if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + tcp_send_ack(sk); + } } -static void tcp_undo_spur_to_response(struct sock *sk, int flag) +static void tcp_store_ts_recent(struct tcp_sock *tp) { - if (flag & FLAG_ECE) - tcp_cwr_spur_to_response(sk); - else - tcp_undo_cwr(sk, true); + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); } -/* F-RTO spurious RTO detection algorithm (RFC4138) - * - * F-RTO affects during two new ACKs following RTO (well, almost, see inline - * comments). State (ACK number) is kept in frto_counter. When ACK advances - * window (but not to or beyond highest sequence sent before RTO): - * On First ACK, send two new segments out. - * On Second ACK, RTO was likely spurious. Do spurious response (response - * algorithm is not part of the F-RTO detection algorithm - * given in RFC4138 but can be selected separately). - * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. F-RTO allows overriding - * of Nagle, this is done using frto_counter states 2 and 3, when a new data - * segment of any size sent during F-RTO, state 2 is upgraded to 3. - * - * Rationale: if the RTO was spurious, new ACKs should arrive from the - * original window even after we transmit two new data segments. - * - * SACK version: - * on first step, wait until first cumulative ACK arrives, then move to - * the second step. In second step, the next ACK decides. 
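tcp_send_challenge_ack(), relocated above the spurious-RTO helpers it displaces, keeps RFC 5961's deliberately coarse per-second budget. The same pattern isolated as a standalone predicate, a sketch assuming kernel context (jiffies and the existing sysctl_tcp_challenge_ack_limit); the function name is illustrative:

static bool challenge_ack_allowed(void)
{
	/* intentionally unlocked: an approximate global cap is enough */
	static u32 stamp;
	static unsigned int count;
	u32 now = jiffies / HZ;

	if (now != stamp) {
		stamp = now;
		count = 0;
	}
	return ++count <= sysctl_tcp_challenge_ack_limit;
}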
- * - * F-RTO is implemented (mainly) in four functions: - * - tcp_use_frto() is used to determine if TCP is can use F-RTO - * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is - * called when tcp_use_frto() showed green light - * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm - * - tcp_enter_frto_loss() is called if there is not enough evidence - * to prove that the RTO is indeed spurious. It transfers the control - * from F-RTO to the conventional RTO recovery - */ -static bool tcp_process_frto(struct sock *sk, int flag) +static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) { - struct tcp_sock *tp = tcp_sk(sk); - - tcp_verify_left_out(tp); - - /* Duplicate the behavior from Loss state (fastretrans_alert) */ - if (flag & FLAG_DATA_ACKED) - inet_csk(sk)->icsk_retransmits = 0; - - if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || - ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) - tp->undo_marker = 0; - - if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); - return true; - } - - if (!tcp_is_sackfrto(tp)) { - /* RFC4138 shortcoming in step 2; should also have case c): - * ACK isn't duplicate nor advances window, e.g., opposite dir - * data, winupdate + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. */ - if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) - return true; - - if (!(flag & FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), - flag); - return true; - } - } else { - if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { - if (!tcp_packets_in_flight(tp)) { - tcp_enter_frto_loss(sk, 2, flag); - return true; - } - - /* Prevent sending of new data. */ - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp)); - return true; - } - - if ((tp->frto_counter >= 2) && - (!(flag & FLAG_FORWARD_PROGRESS) || - ((flag & FLAG_DATA_SACKED) && - !(flag & FLAG_ONLY_ORIG_SACKED)))) { - /* RFC4138 shortcoming (see comment above) */ - if (!(flag & FLAG_FORWARD_PROGRESS) && - (flag & FLAG_NOT_DUP)) - return true; - - tcp_enter_frto_loss(sk, 3, flag); - return true; - } - } - if (tp->frto_counter == 1) { - /* tcp_may_send_now needs to see updated state */ - tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; - tp->frto_counter = 2; - - if (!tcp_may_send_now(sk)) - tcp_enter_frto_loss(sk, 2, flag); - - return true; - } else { - switch (sysctl_tcp_frto_response) { - case 2: - tcp_undo_spur_to_response(sk, flag); - break; - case 1: - tcp_conservative_spur_to_response(tp); - break; - default: - tcp_cwr_spur_to_response(sk); - break; - } - tp->frto_counter = 0; - tp->undo_marker = 0; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); + if (tcp_paws_check(&tp->rx_opt, 0)) + tcp_store_ts_recent(tp); } - return false; } -/* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +/* This routine deals with acks during a TLP episode. + * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. 
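The TLP episode handling that follows pairs with the TCPLossProbes/TCPLossProbeRecovery counters added to snmp4_net_list earlier in this series. A small reader sketch: in /proc/net/netstat both the field-name line and the value line begin with "TcpExt:" and their columns pair up positionally:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f)
		return 1;
	/* first TcpExt: line carries field names, the next one values */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "TcpExt:", 7))
			fputs(line, stdout);
	fclose(f);
	return 0;
}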
+ */ +static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { - /* unprotected vars, we dont care of overwrites */ - static u32 challenge_timestamp; - static unsigned int challenge_count; - u32 now = jiffies / HZ; + struct tcp_sock *tp = tcp_sk(sk); + bool is_tlp_dupack = (ack == tp->tlp_high_seq) && + !(flag & (FLAG_SND_UNA_ADVANCED | + FLAG_NOT_DUP | FLAG_DATA_SACKED)); - if (now != challenge_timestamp) { - challenge_timestamp = now; - challenge_count = 0; + /* Mark the end of TLP episode on receiving TLP dupack or when + * ack is after tlp_high_seq. + */ + if (is_tlp_dupack) { + tp->tlp_high_seq = 0; + return; } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + + if (after(ack, tp->tlp_high_seq)) { + tp->tlp_high_seq = 0; + /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ + if (!(flag & FLAG_DSACKING_ACK)) { + tcp_init_cwnd_reduction(sk, true); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_Open); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBERECOVERY); + } } } @@ -3589,7 +3333,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets; int prior_sacked = tp->sacked_out; int pkts_acked = 0; - bool frto_cwnd = false; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3609,24 +3352,22 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - if (tp->early_retrans_delayed) + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; - if (sysctl_tcp_abc) { - if (icsk->icsk_ca_state < TCP_CA_CWR) - tp->bytes_acked += ack - prior_snd_una; - else if (icsk->icsk_ca_state == TCP_CA_Loss) - /* we assume just one segment left network */ - tp->bytes_acked += min(ack - prior_snd_una, - tp->mss_cache); - } - prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + if (flag & FLAG_UPDATE_TS_RECENT) + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -3671,30 +3412,29 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) pkts_acked = prior_packets - tp->packets_out; - if (tp->frto_counter) - frto_cwnd = tcp_process_frto(sk, flag); - /* Guarantee sacktag reordering detection against wrap-arounds */ - if (before(tp->frto_highmark, tp->snd_una)) - tp->frto_highmark = 0; - if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. 
*/ - if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && - tcp_may_raise_cwnd(sk, flag)) + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } else { - if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) + if (flag & FLAG_DATA_ACKED) tcp_cong_avoid(sk, ack, prior_in_flight); } + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) dst_confirm(dst); } + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) + tcp_schedule_loss_probe(sk); return 1; no_queue: @@ -3708,6 +3448,9 @@ no_queue: */ if (tcp_send_head(sk)) tcp_ack_probe(sk); + + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); return 1; invalid_ack: @@ -3732,8 +3475,8 @@ old_ack: * But, this can also be called on packets in the established flow when * the fast version below fails. */ -void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, - const u8 **hvpp, int estab, +void tcp_parse_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; @@ -3817,31 +3560,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o */ break; #endif - case TCPOPT_COOKIE: - /* This option is variable length. - */ - switch (opsize) { - case TCPOLEN_COOKIE_BASE: - /* not yet implemented */ - break; - case TCPOLEN_COOKIE_PAIR: - /* not yet implemented */ - break; - case TCPOLEN_COOKIE_MIN+0: - case TCPOLEN_COOKIE_MIN+2: - case TCPOLEN_COOKIE_MIN+4: - case TCPOLEN_COOKIE_MIN+6: - case TCPOLEN_COOKIE_MAX: - /* 16-bit multiple */ - opt_rx->cookie_plus = opsize; - *hvpp = ptr; - break; - default: - /* ignore option */ - break; - } - break; - case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. It's valid only in @@ -3877,7 +3595,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr); + tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; return true; } return false; @@ -3887,8 +3605,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr * If it is wrong it falls back on tcp_parse_options(). */ static bool tcp_fast_parse_options(const struct sk_buff *skb, - const struct tcphdr *th, - struct tcp_sock *tp, const u8 **hvpp) + const struct tcphdr *th, struct tcp_sock *tp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. 
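The rcv_tsecr adjustment in tcp_parse_aligned_timestamp() above (and its slow-path twin just below) is the receive half of the tp->tsoffset scheme: the offset is added to every timestamp the socket emits and stripped from every echoed value before RTT and PAWS logic see it. A two-helper sketch of the invariant, with illustrative names:

static inline u32 tsval_on_wire(const struct tcp_sock *tp)
{
	return tcp_time_stamp + tp->tsoffset;	/* what the peer will echo back */
}

static inline u32 tsecr_local(const struct tcp_sock *tp, u32 echoed)
{
	return echoed - tp->tsoffset;		/* back into local clock units */
}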
@@ -3901,7 +3618,11 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, if (tcp_parse_aligned_timestamp(tp, th)) return true; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); + + tcp_parse_options(skb, &tp->rx_opt, 1, NULL); + if (tp->rx_opt.saw_tstamp) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; + return true; } @@ -3943,27 +3664,6 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) EXPORT_SYMBOL(tcp_parse_md5sig_option); #endif -static inline void tcp_store_ts_recent(struct tcp_sock *tp) -{ - tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); -} - -static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) -{ - if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { - /* PAWS bug workaround wrt. ACK frames, the PAWS discard - * extra check below makes sure this can only happen - * for pure ACK frames. -DaveM - * - * Not only, also it occurs for expired timestamps. - */ - - if (tcp_paws_check(&tp->rx_opt, 0)) - tcp_store_ts_recent(tp); - } -} - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) @@ -5279,12 +4979,10 @@ out: static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { - const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(skb, th, tp, &hash_location) && - tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5498,6 +5196,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_checksum_complete_user(sk, skb)) goto csum_error; + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5509,9 +5210,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ @@ -5559,14 +5257,9 @@ slow_path: return 0; step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) + if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) goto discard; - /* ts_recent update must be made after we are sure that the packet - * is in window. - */ - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. 
*/ @@ -5580,6 +5273,7 @@ step5: return 0; csum_error: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: @@ -5638,12 +5332,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; - const u8 *hash_location; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ tcp_clear_options(&opt); opt.user_mss = opt.mss_clamp = 0; - tcp_parse_options(synack, &opt, &hash_location, 0, NULL); + tcp_parse_options(synack, &opt, 0, NULL); mss = opt.mss_clamp; } @@ -5674,14 +5367,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { - const u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct tcp_cookie_values *cvp = tp->cookie_values; struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); + tcp_parse_options(skb, &tp->rx_opt, 0, &foc); + if (tp->rx_opt.saw_tstamp) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { /* rfc793: @@ -5776,30 +5469,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. */ tp->copied_seq = tp->rcv_nxt; - if (cvp != NULL && - cvp->cookie_pair_size > 0 && - tp->rx_opt.cookie_plus > 0) { - int cookie_size = tp->rx_opt.cookie_plus - - TCPOLEN_COOKIE_BASE; - int cookie_pair_size = cookie_size - + cvp->cookie_desired; - - /* A cookie extension option was sent and returned. - * Note that each incoming SYNACK replaces the - * Responder cookie. The initial exchange is most - * fragile, as protection against spoofing relies - * entirely upon the sequence and timestamp (above). - * This replacement strategy allows the correct pair to - * pass through, while any others will be filtered via - * Responder verification later. - */ - if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { - memcpy(&cvp->cookie_pair[cvp->cookie_desired], - hash_location, cookie_size); - cvp->cookie_pair_size = cookie_pair_size; - } - } - smp_mb(); tcp_finish_connect(sk, skb); @@ -6000,7 +5669,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* step 5: check the ACK field */ if (true) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT) > 0; switch (sk->sk_state) { case TCP_SYN_RECV: @@ -6151,11 +5821,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } } - /* ts_recent update must be made after we are sure that the packet - * is in window. - */ - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - /* step 6: check the URG bit */ tcp_urg(sk, skb, th); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index eadb693eef55..719652305a29 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -274,13 +274,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 mtu = tcp_sk(sk)->mtu_info; - /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs - * send out by Linux are always <576bytes so they should go through - * unfragmented). 
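The tp->tsoffset handling threaded through these hunks randomizes the timestamp clock per connection: TSval goes on the wire with the offset added, and the echoed TSecr has the offset subtracted before RTT sampling or PAWS ever sees it. A sketch of the arithmetic, with illustrative names:

#include <stdint.h>

struct conn {
	uint32_t tsoffset;	/* per-connection random offset */
};

/* TSval actually placed on the wire (cf. tsval + tp->tsoffset). */
static uint32_t tsval_on_wire(const struct conn *c, uint32_t tcp_time_stamp)
{
	return tcp_time_stamp + c->tsoffset;
}

/* Undo the offset on the echo, as rcv_tsecr -= tp->tsoffset does. */
static uint32_t rtt_sample(const struct conn *c, uint32_t now, uint32_t tsecr)
{
	return now - (tsecr - c->tsoffset);	/* unsigned math wraps correctly */
}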
- */ - if (sk->sk_state == TCP_LISTEN) - return; - dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -408,6 +401,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; + tp->mtu_info = info; if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); @@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. */ sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, ip_hdr(skb)->daddr, + &tcp_hashinfo, ip_hdr(skb)->saddr, + th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ if (!sk1) @@ -725,7 +726,7 @@ release_sk1: */ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 ts, int oif, + u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) { @@ -746,12 +747,12 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - if (ts) { + if (tsecr) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - rep.opt[1] = htonl(tcp_time_stamp); - rep.opt[2] = htonl(ts); + rep.opt[1] = htonl(tsval); + rep.opt[2] = htonl(tsecr); arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; } @@ -766,7 +767,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, #ifdef CONFIG_TCP_MD5SIG if (key) { - int offset = (ts) ? 3 : 0; + int offset = (tsecr) ? 3 : 0; rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -801,6 +802,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -820,6 +822,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_time_stamp, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, @@ -835,7 +838,6 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp, u16 queue_mapping, bool nocache) { @@ -848,7 +850,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, rvp, NULL); + skb = tcp_make_synack(sk, dst, req, NULL); if (skb) { __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); @@ -865,10 +867,9 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, return err; } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, - struct request_values *rvp) +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); + int res = tcp_v4_send_synack(sk, NULL, req, 0, false); if (!res) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); @@ -951,7 +952,6 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; - struct hlist_node *pos; unsigned int size = sizeof(struct in_addr); struct tcp_md5sig_info *md5sig; @@ -965,7 +965,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, if (family == AF_INET6) size = sizeof(struct in6_addr); #endif - hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) { + hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; if (!memcmp(&key->addr, addr, size)) @@ -1066,14 +1066,14 @@ static void tcp_clear_md5_list(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; - struct hlist_node *pos, *n; + struct hlist_node *n; struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, 1); if (!hlist_empty(&md5sig->head)) tcp_free_md5sig_pool(); - hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) { + hlist_for_each_entry_safe(key, n, &md5sig->head, node) { hlist_del_rcu(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree_rcu(key, rcu); @@ -1369,8 +1369,7 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, static int tcp_v4_conn_req_fastopen(struct sock *sk, struct sk_buff *skb, struct sk_buff *skb_synack, - struct request_sock *req, - struct request_values *rvp) + struct request_sock *req) { struct tcp_sock *tp = tcp_sk(sk); struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; @@ -1465,9 +1464,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; - const u8 *hash_location; struct request_sock *req; struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); @@ -1517,42 +1514,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, - want_cookie ? 
NULL : &foc); - - if (tmp_opt.cookie_plus > 0 && - tmp_opt.saw_tstamp && - !tp->rx_opt.cookie_out_never && - (sysctl_tcp_cookie_size > 0 || - (tp->cookie_values != NULL && - tp->cookie_values->cookie_desired > 0))) { - u8 *c; - u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; - int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; - - if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) - goto drop_and_release; - - /* Secret recipe starts with IP addresses */ - *mess++ ^= (__force u32)daddr; - *mess++ ^= (__force u32)saddr; - - /* plus variable length Initiator Cookie */ - c = (u8 *)mess; - while (l-- > 0) - *c++ ^= *hash_location++; - - want_cookie = false; /* not our kind of cookie */ - tmp_ext.cookie_out_never = 0; /* false */ - tmp_ext.cookie_plus = tmp_opt.cookie_plus; - } else if (!tp->rx_opt.cookie_in_always) { - /* redundant indications, but ensure initialization. */ - tmp_ext.cookie_out_never = 1; /* true */ - tmp_ext.cookie_plus = 0; - } else { - goto drop_and_release; - } - tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1570,7 +1532,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb); + TCP_ECN_create_request(req, skb, sock_net(sk)); if (want_cookie) { isn = cookie_v4_init_sequence(sk, skb, &req->mss); @@ -1634,7 +1596,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * of tcp_v4_send_synack()->tcp_select_initial_window(). */ skb_synack = tcp_make_synack(sk, dst, req, - (struct request_values *)&tmp_ext, fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); if (skb_synack) { @@ -1658,8 +1619,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (fastopen_cookie_present(&foc) && foc.len != 0) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req, - (struct request_values *)&tmp_ext)) + } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) goto drop_and_free; return 0; @@ -1906,6 +1866,7 @@ discard: return 0; csum_err: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -1948,6 +1909,51 @@ void tcp_v4_early_demux(struct sk_buff *skb) } } +/* Packet is added to VJ-style prequeue for processing in process + * context, if a reader task is waiting. Apparently, this exciting + * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) + * failed somewhere. Latency? Burstiness? Well, at least now we will + * see, why it failed. 
8)8) --ANK + * + */ +bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sysctl_tcp_low_latency || !tp->ucopy.task) + return false; + + if (skb->len <= tcp_hdrlen(skb) && + skb_queue_len(&tp->ucopy.prequeue) == 0) + return false; + + skb_dst_force(skb); + __skb_queue_tail(&tp->ucopy.prequeue, skb); + tp->ucopy.memory += skb->truesize; + if (tp->ucopy.memory > sk->sk_rcvbuf) { + struct sk_buff *skb1; + + BUG_ON(sock_owned_by_user(sk)); + + while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { + sk_backlog_rcv(sk, skb1); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPPREQUEUEDROPPED); + } + + tp->ucopy.memory = 0; + } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { + wake_up_interruptible_sync_poll(sk_sleep(sk), + POLLIN | POLLRDNORM | POLLRDBAND); + if (!inet_csk_ack_scheduled(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + (3 * tcp_rto_min(sk)) / 4, + TCP_RTO_MAX); + } + return true; +} +EXPORT_SYMBOL(tcp_prequeue); + /* * From tcp_input.c */ @@ -1981,7 +1987,7 @@ int tcp_v4_rcv(struct sk_buff *skb) * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) - goto bad_packet; + goto csum_error; th = tcp_hdr(skb); iph = ip_hdr(skb); @@ -2047,6 +2053,8 @@ no_tcp_socket: goto discard_it; if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { +csum_error: + TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { @@ -2068,15 +2076,19 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + if (skb->len < (th->doff << 2)) { inet_twsk_put(inet_twsk(sk)); - goto discard_it; + goto bad_packet; + } + if (tcp_checksum_complete(skb)) { + inet_twsk_put(inet_twsk(sk)); + goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (sk2) { @@ -2194,12 +2206,6 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - /* TCP Cookie Transactions */ - if (tp->cookie_values != NULL) { - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - tp->cookie_values = NULL; - } BUG_ON(tp->fastopen_rsk != NULL); /* If socket is aborted during connect operation */ @@ -2577,7 +2583,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) int tcp_seq_open(struct inode *inode, struct file *file) { - struct tcp_seq_afinfo *afinfo = PDE(inode)->data; + struct tcp_seq_afinfo *afinfo = PDE_DATA(inode); struct tcp_iter_state *s; int err; @@ -2612,7 +2618,7 @@ EXPORT_SYMBOL(tcp_proc_register); void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) { - proc_net_remove(net, afinfo->name); + remove_proc_entry(afinfo->name, net->proc_net); } EXPORT_SYMBOL(tcp_proc_unregister); @@ -2656,7 +2662,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) __u16 srcp = ntohs(inet->inet_sport); int rx_queue; - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS || + icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { @@ -2891,6 +2899,7 @@ EXPORT_SYMBOL(tcp_prot); static int __net_init 
tcp_sk_init(struct net *net) { + net->ipv4.sysctl_tcp_ecn = 2; return 0; } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index b6f3583ddfe8..da14436c1735 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -64,7 +64,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) { struct cg_proto *cg_proto; struct tcp_memcontrol *tcp; - u64 val; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) @@ -72,8 +71,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) tcp = tcp_from_cgproto(cg_proto); percpu_counter_destroy(&tcp->tcp_sockets_allocated); - - val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); } EXPORT_SYMBOL(tcp_destroy_cgroup); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f696d7c2e9fa..f6a005c485a9 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -96,7 +96,8 @@ struct tcpm_hash_bucket { static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) +static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, + bool fastopen_clear) { u32 val; @@ -122,9 +123,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); tm->tcpm_ts = 0; tm->tcpm_ts_stamp = 0; - tm->tcpm_fastopen.mss = 0; - tm->tcpm_fastopen.syn_loss = 0; - tm->tcpm_fastopen.cookie.len = 0; + if (fastopen_clear) { + tm->tcpm_fastopen.mss = 0; + tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.cookie.len = 0; + } } static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, @@ -154,7 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, } tm->tcpm_addr = *addr; - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, true); if (likely(!reclaim)) { tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; @@ -171,7 +174,7 @@ out_unlock: static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) { if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, false); } #define TCP_METRICS_RECLAIM_DEPTH 5 diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f35f2dfb6401..0f0178827259 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -93,15 +93,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_options_received tmp_opt; - const u8 *hash_location; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); @@ -288,6 +288,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + tcptw->tw_ts_offset = tp->tsoffset; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { @@ -386,32 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = 
inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - - /* TCP Cookie Transactions require space for the cookie pair, - * as it differs for each connection. There is no need to - * copy any s_data_payload stored at the original socket. - * Failure will prevent resuming the connection. - * - * Presumed copied, in order of appearance: - * cookie_in_always, cookie_out_never - */ - if (oldcvp != NULL) { - struct tcp_cookie_values *newcvp = - kzalloc(sizeof(*newtp->cookie_values), - GFP_ATOMIC); - - if (newcvp != NULL) { - kref_init(&newcvp->kref); - newcvp->cookie_desired = - oldcvp->cookie_desired; - newtp->cookie_values = newcvp; - } else { - /* Not Yet Implemented */ - newtp->cookie_values = NULL; - } - } /* Now setup tcp_sock */ newtp->pred_flags = 0; @@ -420,8 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_nxt = treq->rcv_isn + 1; newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = - treq->snt_isn + 1 + tcp_s_data_size(oldtp); + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); @@ -438,6 +412,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); + newtp->tlp_high_seq = 0; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -446,10 +421,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - newtp->bytes_acked = 0; - - newtp->frto_counter = 0; - newtp->frto_highmark = 0; if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && !try_module_get(newicsk->icsk_ca_ops->owner)) @@ -458,8 +429,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); - newtp->write_seq = newtp->pushed_seq = - treq->snt_isn + 1 + tcp_s_data_size(oldtp); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; @@ -500,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } + newtp->tsoffset = 0; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ if (newtp->af_specific->md5_lookup(sk, newsk)) @@ -535,7 +506,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool fastopen) { struct tcp_options_received tmp_opt; - const u8 *hash_location; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); @@ -545,7 +515,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -581,8 +551,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * * Note that even if there is new data in the SYN packet * they will be thrown away too. + * + * Reset timer after retransmitting SYNACK, similar to + * the idea of fast retransmit in recovery. 
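The hunk just below rearms the request-sock timer after a successful SYNACK retransmit using a clamped exponential backoff, min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX). As standalone arithmetic (a sketch; the kernel works in jiffies and bounds num_timeout, so the shift cannot overflow):

static unsigned long synack_timeout(unsigned long init_timeout,
				    unsigned int num_timeout,
				    unsigned long max_timeout)
{
	unsigned long t = init_timeout << num_timeout;	/* 1x, 2x, 4x, ... */

	return t < max_timeout ? t : max_timeout;
}

With an initial timeout of 1s and a 120s ceiling, the fifth retransmit waits 32s and later ones saturate at the clamp.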
*/ - inet_rtx_syn_ack(sk, req); + if (!inet_rtx_syn_ack(sk, req)) + req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, + TCP_RTO_MAX) + jiffies; return NULL; } @@ -645,7 +620,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != - tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) + tcp_rsk(req)->snt_isn + 1)) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5d451593ef16..536d40929ba6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,28 +65,24 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ -EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - /* Don't override Nagle indefinitely with F-RTO */ - if (tp->frto_counter == 2) - tp->frto_counter = 3; - tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || tp->early_retrans_delayed) + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { tcp_rearm_rto(sk); + } } /* SND.NXT, if window was not shrunk. @@ -314,7 +310,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } @@ -384,7 +380,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) -#define OPTION_COOKIE_EXTENSION (1 << 4) #define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { @@ -398,36 +393,6 @@ struct tcp_out_options { struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; -/* The sysctl int routines are generic, so check consistency here. - */ -static u8 tcp_cookie_size_check(u8 desired) -{ - int cookie_size; - - if (desired > 0) - /* previously specified */ - return desired; - - cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); - if (cookie_size <= 0) - /* no default specified */ - return 0; - - if (cookie_size <= TCP_COOKIE_MIN) - /* value too small, specify minimum */ - return TCP_COOKIE_MIN; - - if (cookie_size >= TCP_COOKIE_MAX) - /* value too large, specify maximum */ - return TCP_COOKIE_MAX; - - if (cookie_size & 1) - /* 8-bit multiple, illegal, fix it */ - cookie_size++; - - return (u8)cookie_size; -} - /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -446,27 +411,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, { u16 options = opts->options; /* mungable copy */ - /* Having both authentication and cookies for security is redundant, - * and there's certainly not enough room. Instead, the cookie-less - * extension variant is proposed. 
- * - * Consider the pessimal case with authentication. The options - * could look like: - * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 - */ if (unlikely(OPTION_MD5 & options)) { - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - *ptr++ = htonl((TCPOPT_COOKIE << 24) | - (TCPOLEN_COOKIE_BASE << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } else { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } - options &= ~OPTION_COOKIE_EXTENSION; + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; @@ -495,44 +442,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - /* Specification requires after timestamp, so do it now. - * - * Consider the pessimal case without authentication. The options - * could look like: - * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 - */ - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - __u8 *cookie_copy = opts->hash_location; - u8 cookie_size = opts->hash_size; - - /* 8-bit multiple handled in tcp_cookie_size_check() above, - * and elsewhere. - */ - if (0x2 & cookie_size) { - __u8 *p = (__u8 *)ptr; - - /* 16-bit multiple */ - *p++ = TCPOPT_COOKIE; - *p++ = TCPOLEN_COOKIE_BASE + cookie_size; - *p++ = *cookie_copy++; - *p++ = *cookie_copy++; - ptr++; - cookie_size -= 2; - } else { - /* 32-bit multiple */ - *ptr++ = htonl(((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_COOKIE << 8) | - TCPOLEN_COOKIE_BASE) + - cookie_size); - } - - if (cookie_size > 0) { - memcpy(ptr, cookie_copy, cookie_size); - ptr += (cookie_size / 4); - } - } - if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -591,11 +500,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_cookie_values *cvp = tp->cookie_values; unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? - tcp_cookie_size_check(cvp->cookie_desired) : - 0; struct tcp_fastopen_request *fastopen = tp->fastopen_req; #ifdef CONFIG_TCP_MD5SIG @@ -622,7 +527,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -647,52 +552,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, tp->syn_fastopen = 1; } } - /* Note that timestamps are required by the specification. - * - * Odd numbers of bytes are prohibited by the specification, ensuring - * that the cookie is 16-bit aligned, and the resulting cookie pair is - * 32-bit aligned. 
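With the cookie-pair writer deleted, the surviving output path keeps the long-standing aligned layouts; for example the timestamp option always rides behind two NOPs so its 32-bit TSval/TSecr words stay word aligned, the same pattern tcp_v4_send_ack() builds earlier in this patch. A sketch of that emitter:

#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP		1
#define TCPOPT_TIMESTAMP	8
#define TCPOLEN_TIMESTAMP	10

/* Emit NOP,NOP,TIMESTAMP,len then TSval/TSecr: 12 bytes, aligned. */
static uint32_t *write_ts_option(uint32_t *ptr, uint32_t tsval, uint32_t tsecr)
{
	*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
		       (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
	*ptr++ = htonl(tsval);
	*ptr++ = htonl(tsecr);
	return ptr;
}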
- */ - if (*md5 == NULL && - (OPTION_TS & opts->options) && - cookie_size > 0) { - int need = TCPOLEN_COOKIE_BASE + cookie_size; - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - - if (need > remaining) { - /* try shrinking cookie to fit */ - cookie_size -= 2; - need -= 4; - } - } - while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { - cookie_size -= 4; - need -= 4; - } - if (TCP_COOKIE_MIN <= cookie_size) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; - opts->hash_size = cookie_size; - - /* Remember for future incarnations. */ - cvp->cookie_desired = cookie_size; - - if (cvp->cookie_desired != cvp->cookie_pair_size) { - /* Currently use random bytes as a nonce, - * assuming these are completely unpredictable - * by hostile users of the same system. - */ - get_random_bytes(&cvp->cookie_pair[0], - cookie_size); - cvp->cookie_pair_size = cookie_size; - } - remaining -= need; - } - } return MAX_TCP_OPTION_SPACE - remaining; } @@ -702,14 +562,10 @@ static unsigned int tcp_synack_options(struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5, - struct tcp_extend_values *xvp, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? - xvp->cookie_plus : - 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); @@ -757,28 +613,7 @@ static unsigned int tcp_synack_options(struct sock *sk, remaining -= need; } } - /* Similar rationale to tcp_syn_options() applies here, too. - * If the <SYN> options fit, the same options should fit now! - */ - if (*md5 == NULL && - ireq->tstamp_ok && - cookie_plus > TCPOLEN_COOKIE_BASE) { - int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - } - if (need <= remaining) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; - remaining -= need; - } else { - /* There's no error return, so flag it. */ - xvp->cookie_out_never = 1; /* true */ - opts->hash_size = 0; - } - } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -806,7 +641,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when : 0; + opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -953,7 +788,7 @@ void __init tcp_tasklet_init(void) * We cant xmit new skbs from this context, as we might already * hold qdisc lock. 
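Once the cookie branches are gone, tcp_syn_options() and tcp_synack_options() share the same budgeting shape: start from the 40-byte option space, subtract each aligned option that fits, and return the bytes consumed. A condensed sketch; the aligned lengths mirror the kernel's constants, and the option set shown is illustrative:

#include <stdbool.h>

#define MAX_TCP_OPTION_SPACE	 40
#define TCPOLEN_MSS		 4
#define TCPOLEN_TSTAMP_ALIGNED	 12
#define TCPOLEN_WSCALE_ALIGNED	 4
#define TCPOLEN_SACKPERM_ALIGNED 4

static unsigned int syn_option_bytes(bool ts, bool wscale, bool sack_ok)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

	remaining -= TCPOLEN_MSS;		/* MSS always sent on SYN */
	if (ts)
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	if (wscale)
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	if (sack_ok && !ts)	/* SACK-permitted piggybacks on the TS block */
		remaining -= TCPOLEN_SACKPERM_ALIGNED;
	return MAX_TCP_OPTION_SPACE - remaining;
}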
*/ -static void tcp_wfree(struct sk_buff *skb) +void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); @@ -1012,6 +847,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, __net_timestamp(skb); if (likely(clone_it)) { + const struct sk_buff *fclone = skb + 1; + + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && + fclone->fclone == SKB_FCLONE_CLONE)) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else @@ -1298,7 +1140,6 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); - skb->avail_size -= eat; len -= eat; if (!len) return; @@ -1331,7 +1172,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; __pskb_trim_head(skb, len); @@ -1351,8 +1192,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Calculate MSS. Not accounting for SACKs here. */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options. */ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1381,13 +1222,17 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - return mss_now; } +/* Calculate MSS. Not accounting for SACKs here. */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + /* Subtract TCP options size, not including SACKs */ + return __tcp_mtu_to_mss(sk, pmtu) - + (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} + /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { @@ -1629,11 +1474,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (nonagle & TCP_NAGLE_PUSH) return true; - /* Don't use the nagle rule for urgent data (or for the final FIN). - * Nagle can be ignored during F-RTO too (see RFC4138). - */ - if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) @@ -1806,8 +1648,11 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; } - /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies << 1); + /* Ok, it looks like it is advisable to defer. + * Do not rearm the timer if already set to not break TCP ACK clocking. + */ + if (!tp->tso_deferred) + tp->tso_deferred = 1 | (jiffies << 1); return true; @@ -1955,6 +1800,9 @@ static int tcp_mtu_probe(struct sock *sk) * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * + * Send at most one packet when push_one > 0. Temporarily ignore + * cwnd limit to force at most one packet out when push_one == 2. 
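tcp_schedule_loss_probe(), added below, derives the probe timeout (PTO) from the smoothed RTT: twice the RTT, stretched to cover a delayed ACK when only one segment is in flight, floored at 10ms, and never scheduled past the pending RTO. The arithmetic extracted as a sketch, in milliseconds purely for readability (the kernel works in jiffies):

static unsigned int probe_timeout(unsigned int rtt, unsigned int packets_out,
				  unsigned int delack_max,
				  unsigned int time_until_rto)
{
	unsigned int timeout = 2 * rtt;

	if (packets_out == 1 && timeout < rtt + rtt / 2 + delack_max)
		timeout = rtt + rtt / 2 + delack_max;	/* 1.5*RTT + delack */
	if (timeout < 10)
		timeout = 10;
	if (timeout > time_until_rto)	/* RTO fires first: defer to it */
		timeout = time_until_rto;
	return timeout;
}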
+ * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ @@ -1990,8 +1838,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, goto repair; /* Skip network transmission */ cwnd_quota = tcp_cwnd_test(tp, skb); - if (!cwnd_quota) - break; + if (!cwnd_quota) { + if (push_one == 2) + /* Force out a loss probe pkt. */ + cwnd_quota = 1; + else + break; + } if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; @@ -2045,10 +1898,129 @@ repair: if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; + + /* Send one loss probe per tail loss episode. */ + if (push_one != 2) + tcp_schedule_loss_probe(sk); tcp_cwnd_validate(sk); return false; } - return !tp->packets_out && tcp_send_head(sk); + return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); +} + +bool tcp_schedule_loss_probe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout, tlp_time_stamp, rto_time_stamp; + u32 rtt = tp->srtt >> 3; + + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) + return false; + /* No consecutive loss probes. */ + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { + tcp_rearm_rto(sk); + return false; + } + /* Don't do any loss probe on a Fast Open connection before 3WHS + * finishes. + */ + if (sk->sk_state == TCP_SYN_RECV) + return false; + + /* TLP is only scheduled when next timer event is RTO. */ + if (icsk->icsk_pending != ICSK_TIME_RETRANS) + return false; + + /* Schedule a loss probe in 2*RTT for SACK capable connections + * in Open state, that are either limited by cwnd or application. + */ + if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + return false; + + if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && + tcp_send_head(sk)) + return false; + + /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + * for delayed ack when there's one outstanding packet. + */ + timeout = rtt << 1; + if (tp->packets_out == 1) + timeout = max_t(u32, timeout, + (rtt + (rtt >> 1) + TCP_DELACK_MAX)); + timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + + /* If RTO is shorter, just schedule TLP in its place. */ + tlp_time_stamp = tcp_time_stamp + timeout; + rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; + if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { + s32 delta = rto_time_stamp - tcp_time_stamp; + if (delta > 0) + timeout = delta; + } + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, + TCP_RTO_MAX); + return true; +} + +/* When probe timeout (PTO) fires, send a new segment if one exists, else + * retransmit the last segment. + */ +void tcp_send_loss_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int pcount; + int mss = tcp_current_mss(sk); + int err = -1; + + if (tcp_send_head(sk) != NULL) { + err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + goto rearm_timer; + } + + /* At most one outstanding TLP retransmission. */ + if (tp->tlp_high_seq) + goto rearm_timer; + + /* Retransmit last segment. 
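tcp_send_loss_probe(), whose body continues below, is guarded so that only one probe per tail-loss episode can be outstanding: tlp_high_seq is recorded when the probe leaves and cleared when the retransmit timer takes over (tp->tlp_high_seq = 0 in tcp_retransmit_timer()) or the ACK path resolves the episode. A sketch of that bookkeeping, with illustrative names:

#include <stdint.h>
#include <stdbool.h>

struct tlp_state {
	uint32_t tlp_high_seq;	/* 0: no probe episode outstanding */
	uint32_t snd_nxt;
};

static bool tlp_allowed(const struct tlp_state *s)
{
	return s->tlp_high_seq == 0;	/* at most one outstanding TLP */
}

static void tlp_probe_sent(struct tlp_state *s)
{
	s->tlp_high_seq = s->snd_nxt;	/* remembered for loss detection */
}

static void tlp_rto_fired(struct tlp_state *s)
{
	s->tlp_high_seq = 0;		/* RTO supersedes the TLP episode */
}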
*/ + skb = tcp_write_queue_tail(sk); + if (WARN_ON(!skb)) + goto rearm_timer; + + pcount = tcp_skb_pcount(skb); + if (WARN_ON(!pcount)) + goto rearm_timer; + + if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) + goto rearm_timer; + skb = tcp_write_queue_tail(sk); + } + + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + + /* Probe with zero data doesn't trigger fast recovery. */ + if (skb->len > 0) + err = __tcp_retransmit_skb(sk, skb); + + /* Record snd_nxt for loss detection. */ + if (likely(!err)) + tp->tlp_high_seq = tp->snd_nxt; + +rearm_timer: + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); + + if (likely(!err)) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBES); + return; } /* Push out any pending frames which were held back due to @@ -2382,8 +2354,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - /* make sure skb->data is aligned on arches that require it */ - if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) { + /* make sure skb->data is aligned on arches that require it + * and check if ack-trimming & collapsing extended the headroom + * beyond what csum_start can cover. + */ + if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || + skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : @@ -2669,32 +2645,24 @@ int tcp_send_synack(struct sock *sk) * sk: listener socket * dst: dst entry attached to the SYNACK * req: request_sock pointer - * rvp: request_values pointer * * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. 
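The __tcp_retransmit_skb() change above now copies the skb not only when the data pointer is misaligned on strict-alignment machines, but also when ack-trimming and collapsing have grown the headroom beyond what the 16-bit skb->csum_start offset can reach. The combined predicate, as a sketch:

#include <stdbool.h>

/* Copy before retransmit if data is unaligned on a strict-alignment
 * arch, or if headroom exceeds what 16-bit csum_start can cover. */
static bool must_copy_for_retransmit(unsigned long data_addr,
				     unsigned int headroom,
				     bool net_ip_align)
{
	return (net_ip_align && (data_addr & 3)) || headroom >= 0xFFFF;
}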
*/ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp, struct tcp_fastopen_cookie *foc) { struct tcp_out_options opts; - struct tcp_extend_values *xvp = tcp_xv(rvp); struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - const struct tcp_cookie_values *cvp = tp->cookie_values; struct tcphdr *th; struct sk_buff *skb; struct tcp_md5sig_key *md5; int tcp_header_size; int mss; - int s_data_desired = 0; - if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) - s_data_desired = cvp->s_data_desired; - skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, - sk_gfp_atomic(sk, GFP_ATOMIC)); + skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2703,6 +2671,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb_reserve(skb, MAX_TCP_HEADER); skb_dst_set(skb, dst); + security_skb_owned_by(skb, sk); mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) @@ -2736,9 +2705,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, else #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5, xvp, foc) - + sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2756,40 +2724,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPHDR_SYN | TCPHDR_ACK); - if (OPTION_COOKIE_EXTENSION & opts.options) { - if (s_data_desired) { - u8 *buf = skb_put(skb, s_data_desired); - - /* copy data directly from the listening socket. */ - memcpy(buf, cvp->s_data_payload, s_data_desired); - TCP_SKB_CB(skb)->end_seq += s_data_desired; - } - - if (opts.hash_size > 0) { - __u32 workspace[SHA_WORKSPACE_WORDS]; - u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; - u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; - - /* Secret recipe depends on the Timestamp, (future) - * Sequence and Acknowledgment Numbers, Initiator - * Cookie, and others handled by IP variant caller. - */ - *tail-- ^= opts.tsval; - *tail-- ^= tcp_rsk(req)->rcv_isn + 1; - *tail-- ^= TCP_SKB_CB(skb)->seq + 1; - - /* recommended */ - *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); - *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ - - sha_transform((__u32 *)&xvp->cookie_bakery[0], - (char *)mess, - &workspace[0]); - opts.hash_location = - (__u8 *)&xvp->cookie_bakery[0]; - } - } - th->seq = htonl(TCP_SKB_CB(skb)->seq); /* XXX data is queued and acked as is. 
No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); @@ -2930,7 +2864,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) */ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; - space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; syn_data = skb_copy_expand(syn, skb_headroom(syn), space, diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 4526fe68e60e..d4943f67aff2 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -234,7 +234,7 @@ static __init int tcpprobe_init(void) if (!tcp_probe.log) goto err0; - if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops)) + if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops)) goto err0; ret = register_jprobe(&tcp_jprobe); @@ -244,7 +244,7 @@ static __init int tcpprobe_init(void) pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); return 0; err1: - proc_net_remove(&init_net, procname); + remove_proc_entry(procname, init_net.proc_net); err0: kfree(tcp_probe.log); return ret; @@ -253,7 +253,7 @@ module_init(tcpprobe_init); static __exit void tcpprobe_exit(void) { - proc_net_remove(&init_net, procname); + remove_proc_entry(procname, init_net.proc_net); unregister_jprobe(&tcp_jprobe); kfree(tcp_probe.log); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b78aac30c498..4b85e6f636c9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -342,10 +342,6 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - if (tp->early_retrans_delayed) { - tcp_resume_early_retransmit(sk); - return; - } if (tp->fastopen_rsk) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); @@ -360,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk) WARN_ON(tcp_write_queue_empty(sk)); + tp->tlp_high_seq = 0; + if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. 
Our retransmits @@ -418,11 +416,7 @@ void tcp_retransmit_timer(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), mib_idx); } - if (tcp_use_frto(sk)) { - tcp_enter_frto(sk); - } else { - tcp_enter_loss(sk, 0); - } + tcp_enter_loss(sk, 0); if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, @@ -495,13 +489,20 @@ void tcp_write_timer_handler(struct sock *sk) } event = icsk->icsk_pending; - icsk->icsk_pending = 0; switch (event) { + case ICSK_TIME_EARLY_RETRANS: + tcp_resume_early_retransmit(sk); + break; + case ICSK_TIME_LOSS_PROBE: + tcp_send_loss_probe(sk); + break; case ICSK_TIME_RETRANS: + icsk->icsk_pending = 0; tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: + icsk->icsk_pending = 0; tcp_probe_timer(sk); break; } diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 1b91bf48e277..76a1e23259e1 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; - case CA_EVENT_FRTO: + case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1f4d405eafba..0bf5d399a03c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -139,6 +139,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, { struct sock *sk2; struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); sk_nulls_for_each(sk2, node, &hslot->head) if (net_eq(sock_net(sk2), net) && @@ -147,6 +148,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!sk2->sk_reuseport || !sk->sk_reuseport || + !uid_eq(uid, sock_i_uid(sk2))) && (*saddr_comp)(sk, sk2)) { if (bitmap) __set_bit(udp_sk(sk2)->udp_port_hash >> log, @@ -169,6 +172,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, { struct sock *sk2; struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); @@ -179,6 +183,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!sk2->sk_reuseport || !sk->sk_reuseport || + !uid_eq(uid, sock_i_uid(sk2))) && (*saddr_comp)(sk, sk2)) { res = 1; break; @@ -337,26 +343,26 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, !ipv6_only_sock(sk)) { struct inet_sock *inet = inet_sk(sk); - score = (sk->sk_family == PF_INET ? 1 : 0); + score = (sk->sk_family == PF_INET ? 
2 : 1); if (inet->inet_rcv_saddr) { if (inet->inet_rcv_saddr != daddr) return -1; - score += 2; + score += 4; } if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; - score += 2; + score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; - score += 2; + score += 4; } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) return -1; - score += 2; + score += 4; } } return score; @@ -365,7 +371,6 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, /* * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) */ -#define SCORE2_MAX (1 + 2 + 2 + 2) static inline int compute_score2(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif) @@ -380,21 +385,21 @@ static inline int compute_score2(struct sock *sk, struct net *net, if (inet->inet_num != hnum) return -1; - score = (sk->sk_family == PF_INET ? 1 : 0); + score = (sk->sk_family == PF_INET ? 2 : 1); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; - score += 2; + score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; - score += 2; + score += 4; } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) return -1; - score += 2; + score += 4; } } return score; @@ -409,19 +414,29 @@ static struct sock *udp4_lib_lookup2(struct net *net, { struct sock *sk, *result; struct hlist_nulls_node *node; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; begin: result = NULL; - badness = -1; + badness = 0; udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { result = sk; badness = score; - if (score == SCORE2_MAX) - goto exact_match; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = inet_ehashfn(net, daddr, hnum, + saddr, htons(sport)); + matches = 1; + } + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -431,9 +446,7 @@ begin: */ if (get_nulls_value(node) != slot2) goto begin; - if (result) { -exact_match: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, @@ -457,7 +470,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; rcu_read_lock(); if (hslot->count > 10) { @@ -486,13 +500,24 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, } begin: result = NULL; - badness = -1; + badness = 0; sk_nulls_for_each_rcu(sk, node, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { result = sk; badness = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = inet_ehashfn(net, daddr, hnum, + saddr, htons(sport)); + matches = 1; + } + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -877,9 +902,9 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - err = sock_tx_timestamp(sk, &ipc.tx_flags); - if (err) - return 
err; + + sock_tx_timestamp(sk, &ipc.tx_flags); + if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc); if (err) @@ -971,7 +996,7 @@ back_from_confirm: sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); - if (skb && !IS_ERR(skb)) + if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4); goto out; } @@ -1106,6 +1131,8 @@ static unsigned int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); @@ -1261,8 +1288,10 @@ out: csum_copy_err: slow = lock_sock_fast(sk); - if (!skb_kill_datagram(sk, skb, flags)) + if (!skb_kill_datagram(sk, skb, flags)) { + UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + } unlock_sock_fast(sk, slow); if (noblock) @@ -1488,7 +1517,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) - goto drop; + goto csum_error; if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) @@ -1508,6 +1537,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return rc; +csum_error: + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); @@ -1724,6 +1755,7 @@ csum_error: proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); + UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); @@ -1737,9 +1769,16 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); + if (static_key_false(&udp_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } } /* @@ -2061,7 +2100,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v) int udp_seq_open(struct inode *inode, struct file *file) { - struct udp_seq_afinfo *afinfo = PDE(inode)->data; + struct udp_seq_afinfo *afinfo = PDE_DATA(inode); struct udp_iter_state *s; int err; @@ -2097,7 +2136,7 @@ EXPORT_SYMBOL(udp_proc_register); void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) { - proc_net_remove(net, afinfo->name); + remove_proc_entry(afinfo->name, net->proc_net); } EXPORT_SYMBOL(udp_proc_unregister); @@ -2247,31 +2286,91 @@ void __init udp_init(void) int udp4_ufo_send_check(struct sk_buff *skb) { - const struct iphdr *iph; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(*uh))) + if (!pskb_may_pull(skb, sizeof(struct udphdr))) return -EINVAL; - iph = ip_hdr(skb); - uh = udp_hdr(skb); + if (likely(!skb->encapsulation)) { + const struct iphdr *iph; + struct udphdr *uh; - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = 
skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } return 0; } +static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + int mac_len = skb->mac_len; + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + __be16 protocol = skb->protocol; + netdev_features_t enc_features; + int outer_hlen; + + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) + goto out; + + skb->encapsulation = 0; + __skb_pull(skb, tnl_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = htons(ETH_P_TEB); + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) + goto out; + + outer_hlen = skb_tnl_header_len(skb); + skb = segs; + do { + struct udphdr *uh; + int udp_offset = outer_hlen - tnl_hlen; + + skb->mac_len = mac_len; + + skb_push(skb, outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_set_transport_header(skb, udp_offset); + uh = udp_hdr(skb); + uh->len = htons(skb->len - udp_offset); + + /* csum segment if tunnel sets skb with csum. */ + if (unlikely(uh->check)) { + struct iphdr *iph = ip_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + skb->len - udp_offset, + IPPROTO_UDP, 0); + uh->check = csum_fold(skb_checksum(skb, udp_offset, + skb->len - udp_offset, 0)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + } + skb->ip_summed = CHECKSUM_NONE; + skb->protocol = protocol; + } while ((skb = skb->next)); +out: + return segs; +} + struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; - int offset; - __wsum csum; - mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -2280,7 +2379,9 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, /* Packet is from an untrusted source, reset gso_segs. */ int type = skb_shinfo(skb)->gso_type; - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_GRE) || !(type & (SKB_GSO_UDP)))) goto out; @@ -2290,20 +2391,27 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } - /* Do software UFO. Complete and fill in the UDP checksum as HW cannot - * do checksum of UDP packets sent as multiple IP fragments. - */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ - segs = skb_segment(skb, features); + if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) + segs = skb_udp_tunnel_segment(skb, features); + else { + int offset; + __wsum csum; + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. 
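The software-UFO branch that comment introduces must produce the final UDP checksum itself, since hardware cannot checksum a datagram that will leave as multiple IP fragments. A plain-C sketch of the RFC 1071 sum-and-fold step over a byte buffer, standing in for the skb_checksum()/csum_fold() pair:

#include <stdint.h>
#include <stddef.h>

static uint16_t csum_fold32(uint32_t sum)
{
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

static uint16_t udp_software_csum(const uint8_t *data, size_t len,
				  uint32_t pseudo_sum)
{
	uint32_t sum = pseudo_sum;	/* pseudo-header sum, precomputed */
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)((data[i] << 8) | data[i + 1]);
	if (i < len)
		sum += (uint32_t)data[i] << 8;	/* odd trailing byte */
	return csum_fold32(sum);
}

Since a transmitted UDP checksum of zero means "no checksum", a computed zero goes out as 0xFFFF, which is what the CSUM_MANGLED_0 fixup in the tunnel-segment hunk above does.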
+ */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + segs = skb_segment(skb, features); + } out: return segs; } - diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 505b30ad9182..7927db0a9279 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -25,7 +25,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return 0; return inet_sk_diag_fill(sk, NULL, skb, req, - sk_user_ns(NETLINK_CB(cb->skb).ssk), + sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } @@ -64,14 +64,14 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, goto out; err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - 64)), GFP_KERNEL); + rep = nlmsg_new(sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + 64, + GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, req, - sk_user_ns(NETLINK_CB(in_skb).ssk), + sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, nlh); if (err < 0) { diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216dc..1f12c8b45864 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -132,7 +132,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto drop; /* Now we can update and verify the packet length... 
*/ diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index ddee0a099a2c..eb1dd4d643f2 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -103,8 +103,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); - /* DS disclosed */ - top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, + /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */ + if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) + top_iph->tos = 0; + else + top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos; + top_iph->tos = INET_ECN_encapsulate(top_iph->tos, XFRM_MODE_SKB_CB(skb)->tos); flags = x->props.flags; @@ -142,8 +146,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) for_each_input_rcu(rcv_notify_handlers, handler) handler->handler(skb); - if (skb_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_unclone(skb, GFP_ATOMIC); + if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 3be0ac2c1920..9a459be24af7 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -262,21 +262,56 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static struct ctl_table_header *sysctl_hdr; -#endif - -static void __init xfrm4_policy_init(void) +static int __net_init xfrm4_net_init(struct net *net) { - xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = xfrm4_policy_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh; + } + + hdr = register_net_sysctl(net, "net/ipv4", table); + if (!hdr) + goto err_reg; + + net->ipv4.xfrm4_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; } -static void __exit xfrm4_policy_fini(void) +static void __net_exit xfrm4_net_exit(struct net *net) { -#ifdef CONFIG_SYSCTL - if (sysctl_hdr) - unregister_net_sysctl_table(sysctl_hdr); + struct ctl_table *table; + + if (net->ipv4.xfrm4_hdr == NULL) + return; + + table = net->ipv4.xfrm4_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.xfrm4_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations __net_initdata xfrm4_net_ops = { + .init = xfrm4_net_init, + .exit = xfrm4_net_exit, +}; #endif - xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); + +static void __init xfrm4_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); } void __init xfrm4_init(void) @@ -286,8 +321,7 @@ void __init xfrm4_init(void) xfrm4_state_init(); xfrm4_policy_init(); #ifdef CONFIG_SYSCTL - sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4", - xfrm4_policy_table); + register_pernet_subsys(&xfrm4_net_ops); #endif } diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 4f7fe7270e37..11b13ea69db4 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -11,7 +11,7 @@ menuconfig IPV6 You will still be able to do traditional IPv4 networking as well. For general information about IPv6, see - <http://playground.sun.com/pub/ipng/html/ipng-main.html>. + <https://en.wikipedia.org/wiki/IPv6>. For Linux IPv6 development information, see <http://www.linux-ipv6.org>. 
For specific information about IPv6 under Linux, read the HOWTO at <http://www.bieringer.de/linux/IPv6/>. @@ -50,16 +50,15 @@ config IPV6_ROUTER_PREF If unsure, say N. config IPV6_ROUTE_INFO - bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)" - depends on IPV6_ROUTER_PREF && EXPERIMENTAL + bool "IPv6: Route Information (RFC 4191) support" + depends on IPV6_ROUTER_PREF ---help--- This is experimental support of Route Information. If unsure, say N. config IPV6_OPTIMISTIC_DAD - bool "IPv6: Enable RFC 4429 Optimistic DAD (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "IPv6: Enable RFC 4429 Optimistic DAD" ---help--- This is experimental support for optimistic Duplicate Address Detection. It allows for autoconfigured addresses @@ -105,8 +104,7 @@ config INET6_IPCOMP If unsure, say Y. config IPV6_MIP6 - tristate "IPv6: Mobility (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "IPv6: Mobility" select XFRM ---help--- Support for IPv6 Mobility described in RFC 3775. @@ -150,8 +148,7 @@ config INET6_XFRM_MODE_BEET If unsure, say Y. config INET6_XFRM_MODE_ROUTEOPTIMIZATION - tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "IPv6: MIPv6 route optimization mode" select XFRM ---help--- Support for MIPv6 route optimization mode. @@ -159,6 +156,7 @@ config INET6_XFRM_MODE_ROUTEOPTIMIZATION config IPV6_SIT tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)" select INET_TUNNEL + select NET_IP_TUNNEL select IPV6_NDISC_NODETYPE default y ---help--- @@ -171,8 +169,8 @@ config IPV6_SIT Saying M here will produce a module called sit. If unsure, say Y. config IPV6_SIT_6RD - bool "IPv6: IPv6 Rapid Deployment (6RD) (EXPERIMENTAL)" - depends on IPV6_SIT && EXPERIMENTAL + bool "IPv6: IPv6 Rapid Deployment (6RD)" + depends on IPV6_SIT default n ---help--- IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon @@ -204,6 +202,7 @@ config IPV6_TUNNEL config IPV6_GRE tristate "IPv6: GRE tunnel" select IPV6_TUNNEL + select NET_IP_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -219,7 +218,6 @@ config IPV6_GRE config IPV6_MULTIPLE_TABLES bool "IPv6: Multiple Routing Tables" - depends on EXPERIMENTAL select FIB_RULES ---help--- Support multiple routing tables. @@ -239,8 +237,8 @@ config IPV6_SUBTREES If unsure, say N. config IPV6_MROUTE - bool "IPv6: multicast routing (EXPERIMENTAL)" - depends on IPV6 && EXPERIMENTAL + bool "IPv6: multicast routing" + depends on IPV6 ---help--- Experimental support for IPv6 multicast forwarding. If unsure, say N. @@ -260,7 +258,7 @@ config IPV6_MROUTE_MULTIPLE_TABLES If unsure, say N. config IPV6_PIMSM_V2 - bool "IPv6: PIM-SM version 2 support (EXPERIMENTAL)" + bool "IPv6: PIM-SM version 2 support" depends on IPV6_MROUTE ---help--- Support for IPv6 PIM multicast routing protocol PIM-SMv2. 
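Three hunks in this series — xfrm4_input.c and xfrm4_mode_tunnel.c above, and ah6.c further down — replace the open-coded clone check with the skb_unclone() helper. The helper folds the same two steps into one call; roughly (a sketch of the include/linux/skbuff.h definition, debug annotations omitted):

static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
        /* A cloned skb shares its data area with the clone; take a
         * private copy of the head before modifying headers in place.
         * pskb_expand_head() with zero extra head/tail room performs
         * exactly that copy. */
        if (skb_cloned(skb))
                return pskb_expand_head(skb, 0, 0, pri);
        return 0;
}

Besides saving a line at each call site, the helper keeps the return convention uniform: 0 on success, the negative errno from pskb_expand_head() on allocation failure.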
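The xfrm4_policy.c conversion above is an instance of the standard per-netns sysctl pattern: init_net keeps the static table, every other namespace gets a kmemdup()'d copy whose .data pointers are retargeted at per-net storage. A minimal sketch of the shape, with hypothetical names (foo_table, "net/foo", net->foo.hdr, and some_knob stand in for a subsystem's own):

static int __net_init foo_net_init(struct net *net)
{
        struct ctl_table *table = foo_table;
        struct ctl_table_header *hdr;

        if (!net_eq(net, &init_net)) {
                table = kmemdup(table, sizeof(foo_table), GFP_KERNEL);
                if (!table)
                        return -ENOMEM;
                table[0].data = &net->foo.some_knob;    /* per-net storage */
        }

        hdr = register_net_sysctl(net, "net/foo", table);
        if (!hdr) {
                if (!net_eq(net, &init_net))
                        kfree(table);
                return -ENOMEM;
        }
        net->foo.hdr = hdr;
        return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
        struct ctl_table *table = net->foo.hdr->ctl_table_arg;

        unregister_net_sysctl_table(net->foo.hdr);
        if (!net_eq(net, &init_net))
                kfree(table);           /* only the duplicated copies */
}

The exit path recovers the table pointer through ctl_table_arg so that only the duplicated copies are freed, exactly as the xfrm4 hunk does.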
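Most of the net/ipv6 hunks below (addrconf.c, addrlabel.c, ip6_fib.c, and others) are mechanical fallout from the tree-wide hlist iterator change merged this cycle: hlist_for_each_entry() and its _rcu/_safe variants lost the explicit struct hlist_node * cursor argument, which the macros now keep internal. Both forms, taken from the ipv6_chk_addr() hunk:

        struct inet6_ifaddr *ifp;
        struct hlist_node *node;        /* old API: caller-supplied cursor */

        hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst)
                if (ipv6_addr_equal(&ifp->addr, addr))
                        break;

        /* new API: one argument and one local variable fewer */
        hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst)
                if (ipv6_addr_equal(&ifp->addr, addr))
                        break;

The conversion is noise reduction only; iteration order and locking rules are untouched.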
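The addrconf.c hunks below add tokenized interface identifiers (IFLA_INET6_TOKEN): the administrator pins the low 64 bits of every autoconfigured address, while router advertisements keep supplying the /64 prefix for the high 64 bits. The address composition in addrconf_prefix_rcv() reduces to two memcpy()s (fragment simplified from the hunk; the idev locking is elided):

        struct in6_addr addr;

        memcpy(&addr, &pinfo->prefix, 8);       /* RA prefix: upper 64 bits */
        memcpy(addr.s6_addr + 8,
               in6_dev->token.s6_addr + 8, 8);  /* fixed token: lower 64 bits */

Addresses built this way are flagged ifp->tokenized so that a later token change can expire them: inet6_set_iftoken() zeroes their lifetimes and re-sends a router solicitation to pick up fresh prefixes.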
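The inet6_lookup_listener() rework below implements SO_REUSEPORT demultiplexing as streaming reservoir sampling: among listeners with the same best score, the k-th candidate replaces the current pick with probability 1/k, which makes the final choice uniform without a second pass over the chain. The draw is deterministic per flow — phash is seeded from the packet's 4-tuple via inet6_ehashfn() — so a given connection always lands on the same listener. Annotated fragment from the hunk:

        } else if (score == hiscore && reuseport) {
                matches++;
                /* phash is uniform in [0, 2^32); the product test is
                 * true iff phash < 2^32 / matches, i.e. with
                 * probability 1/matches. */
                if (((u64)phash * matches) >> 32 == 0)
                        result = sk;
                phash = next_pseudo_random32(phash);
        }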
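The new ip6_checksum.c below spells out csum_ipv6_magic() as a chain of 32-bit adds, each followed by an explicit end-around carry — the defining operation of the ones'-complement Internet checksum. Each unrolled step is an instance of this helper (a standalone sketch for illustration, not the kernel's code):

#include <stdint.h>

/* Ones'-complement add: any carry out of bit 31 is folded back
 * into bit 0, so the running sum never silently drops carries. */
static uint32_t add1c(uint32_t sum, uint32_t word)
{
        sum += word;
        return sum + (sum < word);      /* (sum < word) is the carry flag */
}

csum_fold() then compresses the 32-bit accumulator into the final 16-bit checksum by the same end-around trick.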
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 4ea244891b58..9af088d2cdaa 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -40,7 +40,7 @@ obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-$(CONFIG_IPV6_GRE) += ip6_gre.o -obj-y += addrconf_core.o exthdrs_core.o +obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1b5d8cb9b123..d1ab6ab29a55 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -70,6 +70,7 @@ #include <net/snmp.h> #include <net/af_ieee802154.h> +#include <net/firewire.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> @@ -110,10 +111,6 @@ static inline u32 cstamp_delta(unsigned long cstamp) return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } -#define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ/50 : 1) -#define ADDRCONF_TIMER_FUZZ (HZ / 4) -#define ADDRCONF_TIMER_FUZZ_MAX (HZ) - #ifdef CONFIG_SYSCTL static void addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); @@ -172,8 +169,6 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, struct net_device *dev); -static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); - static struct ipv6_devconf ipv6_devconf __read_mostly = { .forwarding = 0, .hop_limit = IPV6_DEFAULT_HOPLIMIT, @@ -248,6 +243,9 @@ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; +const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; /* Check if a valid qdisc is available */ static inline bool addrconf_qdisc_ok(const struct net_device *dev) @@ -422,6 +420,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ipv6_regen_rndid((unsigned long) ndev); } #endif + ndev->token = in6addr_any; if (netif_running(dev) && addrconf_qdisc_ok(dev)) ndev->if_flags |= IF_READY; @@ -432,6 +431,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) /* protected by rtnl_lock */ rcu_assign_pointer(dev->ip6_ptr, ndev); + /* Join interface-local all-node multicast group */ + ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes); + /* Join all-node multicast group */ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); @@ -542,8 +544,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, - struct nlmsghdr *nlh, - void *arg) + struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; @@ -603,6 +604,77 @@ errout: return err; } +static int inet6_netconf_dump_devconf(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, s_idx; + struct net_device *dev; + struct inet6_dev *idev; + struct hlist_head *head; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 
0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ + net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + + if (inet6_netconf_fill_devconf(skb, dev->ifindex, + &idev->cnf, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + -1) <= 0) { + rcu_read_unlock(); + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + rcu_read_unlock(); + } + if (h == NETDEV_HASHENTRIES) { + if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } + if (h == NETDEV_HASHENTRIES + 1) { + if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } +done: + cb->args[0] = h; + cb->args[1] = idx; + + return skb->len; +} + #ifdef CONFIG_SYSCTL static void dev_forward_change(struct inet6_dev *idev) { @@ -615,10 +687,15 @@ static void dev_forward_change(struct inet6_dev *idev) if (idev->cnf.forwarding) dev_disable_lro(dev); if (dev->flags & IFF_MULTICAST) { - if (idev->cnf.forwarding) + if (idev->cnf.forwarding) { ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); - else + ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters); + ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters); + } else { ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters); + ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters); + ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters); + } } list_for_each_entry(ifa, &idev->addr_list, if_list) { @@ -799,6 +876,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, ifa->prefix_len = pfxlen; ifa->flags = flags | IFA_F_TENTATIVE; ifa->cstamp = ifa->tstamp = jiffies; + ifa->tokenized = false; ifa->rt = rt; @@ -830,7 +908,7 @@ out2: rcu_read_unlock_bh(); if (likely(err == 0)) - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + inet6addr_notifier_call_chain(NETDEV_UP, ifa); else { kfree(ifa); ifa = ERR_PTR(err); @@ -920,7 +998,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) ipv6_ifa_notify(RTM_DELADDR, ifp); - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp); + inet6addr_notifier_call_chain(NETDEV_DOWN, ifp); /* * Purge or update corresponding prefix @@ -1051,7 +1129,7 @@ retry: ipv6_add_addr(idev, &addr, tmp_plen, ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, addr_flags) : NULL; - if (!ift || IS_ERR(ift)) { + if (IS_ERR_OR_NULL(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); pr_info("%s: retry temporary address regeneration\n", __func__); @@ -1412,11 +1490,10 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { struct inet6_ifaddr *ifp; - struct hlist_node *node; unsigned int hash = inet6_addr_hash(addr); rcu_read_lock_bh(); - hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) { + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr) && @@ -1438,9 +1515,8 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, { unsigned int hash = inet6_addr_hash(addr); struct inet6_ifaddr *ifp; - struct hlist_node *node; - 
hlist_for_each_entry(ifp, node, &inet6_addr_lst[hash], addr_lst) { + hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { @@ -1480,10 +1556,9 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add { struct inet6_ifaddr *ifp, *result = NULL; unsigned int hash = inet6_addr_hash(addr); - struct hlist_node *node; rcu_read_lock_bh(); - hlist_for_each_entry_rcu_bh(ifp, node, &inet6_addr_lst[hash], addr_lst) { + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { @@ -1664,6 +1739,20 @@ static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) return 0; } +static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev) +{ + union fwnet_hwaddr *ha; + + if (dev->addr_len != FWNET_ALEN) + return -1; + + ha = (union fwnet_hwaddr *)dev->dev_addr; + + memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id)); + eui[0] ^= 2; + return 0; +} + static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev) { /* XXX: inherit EUI-64 from other interface -- yoshfuji */ @@ -1728,6 +1817,8 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) return addrconf_ifid_gre(eui, dev); case ARPHRD_IEEE802154: return addrconf_ifid_eui64(eui, dev); + case ARPHRD_IEEE1394: + return addrconf_ifid_ieee1394(eui, dev); } return -1; } @@ -2042,11 +2133,19 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) struct inet6_ifaddr *ifp; struct in6_addr addr; int create = 0, update_lft = 0; + bool tokenized = false; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); - if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && - ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { + + if (!ipv6_addr_any(&in6_dev->token)) { + read_lock_bh(&in6_dev->lock); + memcpy(addr.s6_addr + 8, + in6_dev->token.s6_addr + 8, 8); + read_unlock_bh(&in6_dev->lock); + tokenized = true; + } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { in6_dev_put(in6_dev); return; } @@ -2080,13 +2179,14 @@ ok: addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags); - if (!ifp || IS_ERR(ifp)) { + if (IS_ERR_OR_NULL(ifp)) { in6_dev_put(in6_dev); return; } update_lft = create = 1; ifp->cstamp = jiffies; + ifp->tokenized = tokenized; addrconf_dad_start(ifp); } @@ -2525,6 +2625,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) static void init_loopback(struct net_device *dev) { struct inet6_dev *idev; + struct net_device *sp_dev; + struct inet6_ifaddr *sp_ifa; + struct rt6_info *sp_rt; /* ::1 */ @@ -2536,6 +2639,30 @@ static void init_loopback(struct net_device *dev) } add_addr(idev, &in6addr_loopback, 128, IFA_HOST); + + /* Add routes to other interface's IPv6 addresses */ + for_each_netdev(dev_net(dev), sp_dev) { + if (!strcmp(sp_dev->name, dev->name)) + continue; + + idev = __in6_dev_get(sp_dev); + if (!idev) + continue; + + read_lock_bh(&idev->lock); + list_for_each_entry(sp_ifa, &idev->addr_list, if_list) { + + if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE)) + continue; + + sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, 0); + + /* Failure cases are ignored */ + if (!IS_ERR(sp_rt)) + ip6_ins_rt(sp_rt); + } + read_unlock_bh(&idev->lock); + } } static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) @@ -2569,7 +2696,8 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != 
ARPHRD_FDDI) && (dev->type != ARPHRD_ARCNET) && (dev->type != ARPHRD_INFINIBAND) && - (dev->type != ARPHRD_IEEE802154)) { + (dev->type != ARPHRD_IEEE802154) && + (dev->type != ARPHRD_IEEE1394)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } @@ -2900,11 +3028,10 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { struct hlist_head *h = &inet6_addr_lst[i]; - struct hlist_node *n; spin_lock_bh(&addrconf_hash_lock); restart: - hlist_for_each_entry_rcu(ifa, n, h, addr_lst) { + hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { hlist_del_init_rcu(&ifa->addr_lst); addrconf_del_timer(ifa); @@ -2958,7 +3085,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa); + inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); } in6_ifa_put(ifa); @@ -3211,8 +3338,7 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) } for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { - struct hlist_node *n; - hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], + hlist_for_each_entry_rcu_bh(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; @@ -3237,9 +3363,8 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, { struct if6_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - struct hlist_node *n = &ifa->addr_lst; - hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) { + hlist_for_each_entry_continue_rcu_bh(ifa, addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; state->offset++; @@ -3248,7 +3373,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, while (++state->bucket < IN6_ADDR_HSIZE) { state->offset = 0; - hlist_for_each_entry_rcu_bh(ifa, n, + hlist_for_each_entry_rcu_bh(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; @@ -3318,14 +3443,14 @@ static const struct file_operations if6_fops = { static int __net_init if6_proc_net_init(struct net *net) { - if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops)) + if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops)) return -ENOMEM; return 0; } static void __net_exit if6_proc_net_exit(struct net *net) { - proc_net_remove(net, "if_inet6"); + remove_proc_entry("if_inet6", net->proc_net); } static struct pernet_operations if6_proc_net_ops = { @@ -3350,11 +3475,10 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) { int ret = 0; struct inet6_ifaddr *ifp = NULL; - struct hlist_node *n; unsigned int hash = inet6_addr_hash(addr); rcu_read_lock_bh(); - hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) { + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr) && @@ -3376,7 +3500,6 @@ static void addrconf_verify(unsigned long foo) { unsigned long now, next, next_sec, next_sched; struct inet6_ifaddr *ifp; - struct hlist_node *node; int i; rcu_read_lock_bh(); @@ -3388,7 +3511,7 @@ static void addrconf_verify(unsigned long foo) for (i = 0; i < IN6_ADDR_HSIZE; i++) { restart: - hlist_for_each_entry_rcu_bh(ifp, node, + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) { unsigned long age; @@ -3511,7 +3634,7 @@ static const struct nla_policy 
ifa_ipv6_policy[IFA_MAX+1] = { }; static int -inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; @@ -3577,7 +3700,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, } static int -inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; @@ -3808,6 +3931,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, NLM_F_MULTI); if (err <= 0) break; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } break; } @@ -3859,17 +3983,17 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev; struct inet6_dev *idev; struct hlist_head *head; - struct hlist_node *node; s_h = cb->args[0]; s_idx = idx = cb->args[1]; s_ip_idx = ip_idx = cb->args[2]; rcu_read_lock(); + cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; if (h > s_h || idx > s_idx) @@ -3917,8 +4041,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) return inet6_dump_addr(skb, cb, type); } -static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, - void *arg) +static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct ifaddrmsg *ifm; @@ -4051,7 +4174,8 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(sizeof(struct ifla_cacheinfo)) + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ - + nla_total_size(ICMP6_MIB_MAX * 8); /* IFLA_INET6_ICMP6STATS */ + + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ + + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */ } static inline size_t inet6_if_nlmsg_size(void) @@ -4138,6 +4262,13 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) goto nla_put_failure; snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); + if (nla == NULL) + goto nla_put_failure; + read_lock_bh(&idev->lock); + memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); + read_unlock_bh(&idev->lock); + return 0; nla_put_failure: @@ -4165,6 +4296,80 @@ static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) return 0; } +static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) +{ + struct inet6_ifaddr *ifp; + struct net_device *dev = idev->dev; + bool update_rs = false; + + if (token == NULL) + return -EINVAL; + if (ipv6_addr_any(token)) + return -EINVAL; + if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) + return -EINVAL; + if (!ipv6_accept_ra(idev)) + return -EINVAL; + if (idev->cnf.rtr_solicits <= 0) + return -EINVAL; + + write_lock_bh(&idev->lock); + + BUILD_BUG_ON(sizeof(token->s6_addr) != 16); + memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8); + + write_unlock_bh(&idev->lock); + + if (!idev->dead && (idev->if_flags & IF_READY)) { + struct in6_addr ll_addr; + + ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE | + IFA_F_OPTIMISTIC); + + /* 
If we're not ready, then normal ifup will take care + * of this. Otherwise, we need to request our rs here. + */ + ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters); + update_rs = true; + } + + write_lock_bh(&idev->lock); + + if (update_rs) + idev->if_flags |= IF_RS_SENT; + + /* Well, that's kinda nasty ... */ + list_for_each_entry(ifp, &idev->addr_list, if_list) { + spin_lock(&ifp->lock); + if (ifp->tokenized) { + ifp->valid_lft = 0; + ifp->prefered_lft = 0; + } + spin_unlock(&ifp->lock); + } + + write_unlock_bh(&idev->lock); + return 0; +} + +static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) +{ + int err = -EINVAL; + struct inet6_dev *idev = __in6_dev_get(dev); + struct nlattr *tb[IFLA_INET6_MAX + 1]; + + if (!idev) + return -EAFNOSUPPORT; + + if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) + BUG(); + + if (tb[IFLA_INET6_TOKEN]) + err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN])); + + return err; +} + static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, u32 portid, u32 seq, int event, unsigned int flags) { @@ -4215,7 +4420,6 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct inet6_dev *idev; struct hlist_head *head; - struct hlist_node *node; s_h = cb->args[0]; s_idx = cb->args[1]; @@ -4224,7 +4428,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; idev = __in6_dev_get(dev); @@ -4344,6 +4548,8 @@ errout: static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) { + struct net *net = dev_net(ifp->idev->dev); + inet6_ifa_notify(event ? 
: RTM_NEWADDR, ifp); switch (event) { @@ -4369,6 +4575,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) dst_free(&ifp->rt->dst); break; } + atomic_inc(&net->ipv6.dev_addr_genid); } static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -4787,26 +4994,20 @@ static void addrconf_sysctl_unregister(struct inet6_dev *idev) static int __net_init addrconf_init_net(struct net *net) { - int err; + int err = -ENOMEM; struct ipv6_devconf *all, *dflt; - err = -ENOMEM; - all = &ipv6_devconf; - dflt = &ipv6_devconf_dflt; + all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; - if (!net_eq(net, &init_net)) { - all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); - if (all == NULL) - goto err_alloc_all; + dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; - dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); - if (dflt == NULL) - goto err_alloc_dflt; - } else { - /* these will be inherited by all namespaces */ - dflt->autoconf = ipv6_defaults.autoconf; - dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; - } + /* these will be inherited by all namespaces */ + dflt->autoconf = ipv6_defaults.autoconf; + dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; net->ipv6.devconf_all = all; net->ipv6.devconf_dflt = dflt; @@ -4851,26 +5052,11 @@ static struct pernet_operations addrconf_ops = { .exit = addrconf_exit_net, }; -/* - * Device notifier - */ - -int register_inet6addr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&inet6addr_chain, nb); -} -EXPORT_SYMBOL(register_inet6addr_notifier); - -int unregister_inet6addr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&inet6addr_chain, nb); -} -EXPORT_SYMBOL(unregister_inet6addr_notifier); - static struct rtnl_af_ops inet6_ops = { .family = AF_INET6, .fill_link_af = inet6_fill_link_af, .get_link_af_size = inet6_get_link_af_size, + .set_link_af = inet6_set_link_af, }; /* @@ -4943,7 +5129,7 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr, NULL); __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - NULL, NULL); + inet6_netconf_dump_devconf, NULL); ipv6_addr_label_rtnl_register(); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index d051e5f4bf34..72104562c864 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -78,3 +78,22 @@ int __ipv6_addr_type(const struct in6_addr *addr) } EXPORT_SYMBOL(__ipv6_addr_type); +static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); + +int register_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(register_inet6addr_notifier); + +int unregister_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(unregister_inet6addr_notifier); + +int inet6addr_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&inet6addr_chain, val, v); +} +EXPORT_SYMBOL(inet6addr_notifier_call_chain); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index ff76eecfd622..f083a583a05c 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -173,9 +173,8 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex) { - struct hlist_node *pos; struct ip6addrlbl_entry *p; - 
hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { if (__ip6addrlbl_match(net, p, addr, type, ifindex)) return p; } @@ -261,9 +260,9 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) if (hlist_empty(&ip6addrlbl_table.head)) { hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); } else { - struct hlist_node *pos, *n; + struct hlist_node *n; struct ip6addrlbl_entry *p = NULL; - hlist_for_each_entry_safe(p, pos, n, + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && @@ -319,13 +318,13 @@ static int __ip6addrlbl_del(struct net *net, int ifindex) { struct ip6addrlbl_entry *p = NULL; - struct hlist_node *pos, *n; + struct hlist_node *n; int ret = -ESRCH; ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && @@ -380,11 +379,11 @@ static int __net_init ip6addrlbl_net_init(struct net *net) static void __net_exit ip6addrlbl_net_exit(struct net *net) { struct ip6addrlbl_entry *p = NULL; - struct hlist_node *pos, *n; + struct hlist_node *n; /* Remove all labels belonging to the exiting net */ spin_lock(&ip6addrlbl_table.lock); - hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { if (net_eq(ip6addrlbl_net(p), net)) { hlist_del_rcu(&p->list); ip6addrlbl_put(p); @@ -415,8 +414,7 @@ static const struct nla_policy ifal_policy[IFAL_MAX+1] = { [IFAL_LABEL] = { .len = sizeof(u32), }, }; -static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, - void *arg) +static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifaddrlblmsg *ifal; @@ -437,10 +435,7 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, if (!tb[IFAL_ADDRESS]) return -EINVAL; - pfx = nla_data(tb[IFAL_ADDRESS]); - if (!pfx) - return -EINVAL; if (!tb[IFAL_LABEL]) return -EINVAL; @@ -505,12 +500,11 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; - struct hlist_node *pos; int idx = 0, s_idx = cb->args[0]; int err; rcu_read_lock(); - hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { if (idx >= s_idx && net_eq(ip6addrlbl_net(p), net)) { if ((err = ip6addrlbl_fill(skb, p, @@ -535,8 +529,7 @@ static inline int ip6addrlbl_msgsize(void) + nla_total_size(4); /* IFAL_LABEL */ } -static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, - void *arg) +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh) { struct net *net = sock_net(in_skb->sk); struct ifaddrlblmsg *ifal; @@ -563,10 +556,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, if (!tb[IFAL_ADDRESS]) return -EINVAL; - addr = nla_data(tb[IFAL_ADDRESS]); - if (!addr) - return -EINVAL; rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b043c60429bd..ab5c7ad482cd 100644 --- a/net/ipv6/af_inet6.c +++ 
b/net/ipv6/af_inet6.c @@ -49,7 +49,6 @@ #include <net/udp.h> #include <net/udplite.h> #include <net/tcp.h> -#include <net/ipip.h> #include <net/protocol.h> #include <net/inet_common.h> #include <net/route.h> @@ -323,7 +322,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net_device *dev = NULL; rcu_read_lock(); - if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another one @@ -471,8 +470,8 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_port = inet->inet_sport; } - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = sk->sk_bound_dev_if; + sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, + sk->sk_bound_dev_if); *uaddr_len = sizeof(*sin); return 0; } @@ -811,11 +810,10 @@ static struct pernet_operations inet6_net_ops = { static int __init inet6_init(void) { - struct sk_buff *dummy_skb; struct list_head *r; int err = 0; - BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); /* Register the socket-side information for inet6_create. */ for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 384233188ac1..bb02e176cb70 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -521,8 +521,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 757a810d8f15..5a80f15a9de2 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -47,7 +47,7 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); /* Big ac list lock for all the sockets */ -static DEFINE_RWLOCK(ipv6_sk_ac_lock); +static DEFINE_SPINLOCK(ipv6_sk_ac_lock); /* @@ -128,10 +128,10 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = ipv6_dev_ac_inc(dev, addr); if (!err) { - write_lock_bh(&ipv6_sk_ac_lock); + spin_lock_bh(&ipv6_sk_ac_lock); pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); pac = NULL; } @@ -152,7 +152,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) struct ipv6_ac_socklist *pac, *prev_pac; struct net *net = sock_net(sk); - write_lock_bh(&ipv6_sk_ac_lock); + spin_lock_bh(&ipv6_sk_ac_lock); prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && @@ -161,7 +161,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) prev_pac = pac; } if (!pac) { - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); return -ENOENT; } if (prev_pac) @@ -169,7 +169,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) else np->ipv6_ac_list = pac->acl_next; - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); rcu_read_lock(); dev = dev_get_by_index_rcu(net, pac->acl_ifindex); @@ -192,10 +192,10 @@ void ipv6_sock_ac_close(struct sock *sk) if (!np->ipv6_ac_list) return; - write_lock_bh(&ipv6_sk_ac_lock); + 
spin_lock_bh(&ipv6_sk_ac_lock); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; rcu_read_lock(); @@ -509,7 +509,7 @@ static const struct file_operations ac6_seq_fops = { int __net_init ac6_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "anycast6", S_IRUGO, &ac6_seq_fops)) + if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops)) return -ENOMEM; return 0; @@ -517,7 +517,7 @@ int __net_init ac6_proc_init(struct net *net) void ac6_proc_exit(struct net *net) { - proc_net_remove(net, "anycast6"); + remove_proc_entry("anycast6", net->proc_net); } #endif diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 7a778b9a7b85..4b56cbbc7890 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -30,6 +30,7 @@ #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> +#include <net/dsfield.h> #include <linux/errqueue.h> #include <asm/uaccess.h> @@ -123,7 +124,7 @@ ipv4_connected: goto out; } - if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (sk->sk_bound_dev_if && @@ -354,19 +355,19 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; - sin->sin6_scope_id = 0; if (skb->protocol == htons(ETH_P_IPV6)) { - sin->sin6_addr = - *(struct in6_addr *)(nh + serr->addr_offset); + const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), + struct ipv6hdr, daddr); + sin->sin6_addr = ip6h->daddr; if (np->sndflow) - sin->sin6_flowinfo = - (*(__be32 *)(nh + serr->addr_offset - 24) & - IPV6_FLOWINFO_MASK); - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = IP6CB(skb)->iif; + sin->sin6_flowinfo = ip6_flowinfo(ip6h); + sin->sin6_scope_id = + ipv6_iface_scope_id(&sin->sin6_addr, + IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), &sin->sin6_addr); + sin->sin6_scope_id = 0; } } @@ -376,18 +377,19 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; - sin->sin6_scope_id = 0; if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = IP6CB(skb)->iif; + sin->sin6_scope_id = + ipv6_iface_scope_id(&sin->sin6_addr, + IP6CB(skb)->iif); } else { struct inet_sock *inet = inet_sk(sk); ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); + sin->sin6_scope_id = 0; if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } @@ -489,13 +491,14 @@ int ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, } if (np->rxopt.bits.rxtclass) { - int tclass = ipv6_tclass(ipv6_hdr(skb)); + int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } - if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) { - __be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK; - put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + if (np->rxopt.bits.rxflow) { + __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); + if (flowinfo) + put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ @@ -591,7 +594,9 @@ int 
ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; sin6.sin6_flowinfo = 0; - sin6.sin6_scope_id = 0; + sin6.sin6_scope_id = + ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, + opt->iif); put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 473f628f9f20..07a7d65a7cb6 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -553,7 +553,8 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { - IP6CB(skb)->ra = optoff; + IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; + memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); return true; } LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index fff5bdd8b680..b4ff0a42b8c7 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -124,15 +124,6 @@ static __inline__ void icmpv6_xmit_unlock(struct sock *sk) } /* - * Slightly more convenient version of icmpv6_send. - */ -void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) -{ - icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); - kfree_skb(skb); -} - -/* * Figure out, may we reply to this packet with icmp error. * * We do not reply, if: @@ -332,7 +323,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *sk * anycast. */ if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); } @@ -381,7 +372,7 @@ relookup_failed: /* * Send an ICMP message in response to a packet in error */ -void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { struct net *net = dev_net(skb->dev); struct inet6_dev *idev = NULL; @@ -406,7 +397,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) - * Rule (e.1) is enforced by not using icmpv6_send + * Rule (e.1) is enforced by not using icmp6_send * in any code that processes icmp errors. */ addr_type = ipv6_addr_type(&hdr->daddr); @@ -434,7 +425,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) * Source addr check */ - if (addr_type & IPV6_ADDR_LINKLOCAL) + if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; /* @@ -444,7 +435,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: addr_any/mcast source\n"); return; } @@ -452,7 +443,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: no reply to icmp error\n"); return; } @@ -529,7 +520,14 @@ out_dst_release: out: icmpv6_xmit_unlock(sk); } -EXPORT_SYMBOL(icmpv6_send); + +/* Slightly more convenient version of icmp6_send. 
+ */ +void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos); + kfree_skb(skb); +} static void icmpv6_echo_reply(struct sk_buff *skb) { @@ -701,7 +699,7 @@ static int icmpv6_rcv(struct sk_buff *skb) if (__skb_checksum_complete(skb)) { LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%pI6 > %pI6]\n", saddr, daddr); - goto discard_it; + goto csum_error; } } @@ -787,6 +785,8 @@ static int icmpv6_rcv(struct sk_buff *skb) kfree_skb(skb); return 0; +csum_error: + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: @@ -885,8 +885,14 @@ int __init icmpv6_init(void) err = -EAGAIN; if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) goto fail; + + err = inet6_register_icmp_sender(icmp6_send); + if (err) + goto sender_reg_err; return 0; +sender_reg_err: + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); fail: pr_err("Failed to register ICMP6 protocol\n"); unregister_pernet_subsys(&icmpv6_sk_ops); @@ -895,6 +901,7 @@ fail: void icmpv6_cleanup(void) { + inet6_unregister_icmp_sender(icmp6_send); unregister_pernet_subsys(&icmpv6_sk_ops); inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 30647857a375..e4311cbc8b4e 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -31,25 +31,37 @@ int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) { const struct sock *sk2; - const struct hlist_node *node; + int reuse = sk->sk_reuse; + int reuseport = sk->sk_reuseport; + kuid_t uid = sock_i_uid((struct sock *)sk); /* We must walk the whole port owner list in this case. -DaveM */ /* * See comment in inet_csk_bind_conflict about sock lookup * vs net namespaces issues. 
*/ - sk_for_each_bound(sk2, node, &tb->owners) { + sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if ((!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + (!reuseport || !sk2->sk_reuseport || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, + sock_i_uid((struct sock *)sk2))))) { + if (ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + if (!relax && reuse && sk2->sk_reuse && + sk2->sk_state != TCP_LISTEN && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } } - return node != NULL; + return sk2 != NULL; } EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); @@ -161,10 +173,8 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin6->sin6_port = inet_sk(sk)->inet_dport; /* We do not store received flowlabel for TCP */ sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + sk->sk_bound_dev_if); } EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index dea17fd28e50..32b4a1675d82 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -158,25 +158,38 @@ static inline int compute_score(struct sock *sk, struct net *net, } struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, + struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { struct sock *sk; const struct hlist_nulls_node *node; struct sock *result; - int score, hiscore; + int score, hiscore, matches = 0, reuseport = 0; + u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; rcu_read_lock(); begin: result = NULL; - hiscore = -1; + hiscore = 0; sk_nulls_for_each(sk, node, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { hiscore = score; result = sk; + reuseport = sk->sk_reuseport; + if (reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == hiscore && reuseport) { + matches++; + if (((u64)phash * matches) >> 32 == 0) + result = sk; + phash = next_pseudo_random32(phash); } } /* diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c new file mode 100644 index 000000000000..72d198b8e4d2 --- /dev/null +++ b/net/ipv6/ip6_checksum.c @@ -0,0 +1,97 @@ +#include <net/ip.h> +#include <net/udp.h> +#include <net/udplite.h> +#include <asm/checksum.h> + +#ifndef _HAVE_ARCH_IPV6_CSUM +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, + __wsum csum) +{ + + int carry; + __u32 ulen; + __u32 uproto; + __u32 sum = (__force u32)csum; + + sum += (__force u32)saddr->s6_addr32[0]; + carry = (sum < (__force u32)saddr->s6_addr32[0]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[1]; + carry = (sum < (__force u32)saddr->s6_addr32[1]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[2]; + carry = (sum < (__force u32)saddr->s6_addr32[2]); + sum += carry; + + sum += (__force 
u32)saddr->s6_addr32[3]; + carry = (sum < (__force u32)saddr->s6_addr32[3]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[0]; + carry = (sum < (__force u32)daddr->s6_addr32[0]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[1]; + carry = (sum < (__force u32)daddr->s6_addr32[1]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[2]; + carry = (sum < (__force u32)daddr->s6_addr32[2]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[3]; + carry = (sum < (__force u32)daddr->s6_addr32[3]); + sum += carry; + + ulen = (__force u32)htonl((__u32) len); + sum += ulen; + carry = (sum < ulen); + sum += carry; + + uproto = (__force u32)htonl(proto); + sum += uproto; + carry = (sum < uproto); + sum += carry; + + return csum_fold((__force __wsum)sum); +} +EXPORT_SYMBOL(csum_ipv6_magic); +#endif + +int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) +{ + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + if (uh->check == 0) { + /* RFC 2460 section 8.1 says that we SHOULD log + this error. Well, it is reasonable. + */ + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); + return 1; + } + if (skb->ip_summed == CHECKSUM_COMPLETE && + !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->len, proto, skb->csum)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (!skb_csum_unnecessary(skb)) + skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len, proto, 0)); + + return 0; +} +EXPORT_SYMBOL(udp6_csum_init); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 710cafd2e1a9..192dd1a0e188 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -224,7 +224,6 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; - struct hlist_node *node; unsigned int h; if (id == 0) @@ -232,7 +231,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; @@ -363,7 +362,6 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct rt6_rtnl_dump_arg arg; struct fib6_walker_t *w; struct fib6_table *tb; - struct hlist_node *node; struct hlist_head *head; int res = 0; @@ -398,7 +396,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); @@ -1520,14 +1518,13 @@ void fib6_clean_all_ro(struct net *net, int (*func)(struct rt6_info *, void *arg int prune, void *arg) { struct fib6_table *table; - struct hlist_node *node; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + hlist_for_each_entry_rcu(table, head, tb6_hlist) { read_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, prune, arg); @@ -1540,14 +1537,13 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), int 
prune, void *arg) { struct fib6_table *table; - struct hlist_node *node; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, prune, arg); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index d6de4b447250..46e88433ec7d 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -51,25 +51,38 @@ #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); -static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; +static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); /* FL hash table lock: it protects only of GC */ -static DEFINE_RWLOCK(ip6_fl_lock); +static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ -static DEFINE_RWLOCK(ip6_sk_fl_lock); +static DEFINE_SPINLOCK(ip6_sk_fl_lock); +#define for_each_fl_rcu(hash, fl) \ + for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ + fl != NULL; \ + fl = rcu_dereference_bh(fl->next)) +#define for_each_fl_continue_rcu(fl) \ + for (fl = rcu_dereference_bh(fl->next); \ + fl != NULL; \ + fl = rcu_dereference_bh(fl->next)) + +#define for_each_sk_fl_rcu(np, sfl) \ + for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \ + sfl != NULL; \ + sfl = rcu_dereference_bh(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { + for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } @@ -80,11 +93,11 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - read_lock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); fl = __fl_lookup(net, label); - if (fl) - atomic_inc(&fl->users); - read_unlock_bh(&ip6_fl_lock); + if (fl && !atomic_inc_not_zero(&fl->users)) + fl = NULL; + rcu_read_unlock_bh(); return fl; } @@ -96,13 +109,13 @@ static void fl_free(struct ip6_flowlabel *fl) put_pid(fl->owner.pid); release_net(fl->fl_net); kfree(fl->opt); + kfree_rcu(fl, rcu); } - kfree(fl); } static void fl_release(struct ip6_flowlabel *fl) { - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { @@ -119,7 +132,7 @@ static void fl_release(struct ip6_flowlabel *fl) time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(unsigned long dummy) @@ -128,12 +141,15 @@ static void ip6_fl_gc(unsigned long dummy) unsigned long now = jiffies; unsigned long sched = 0; - write_lock(&ip6_fl_lock); + spin_lock(&ip6_fl_lock); for (i=0; i<=FL_HASH_MASK; i++) { - struct ip6_flowlabel *fl, **flp; + struct ip6_flowlabel *fl; + struct ip6_flowlabel __rcu **flp; + flp = &fl_ht[i]; - while ((fl=*flp) != NULL) { + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) @@ -156,18 +172,21 @@ static void ip6_fl_gc(unsigned long dummy) if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } - write_unlock(&ip6_fl_lock); + spin_unlock(&ip6_fl_lock); } static void __net_exit 
ip6_fl_purge(struct net *net) { int i; - write_lock(&ip6_fl_lock); + spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { - struct ip6_flowlabel *fl, **flp; + struct ip6_flowlabel *fl; + struct ip6_flowlabel __rcu **flp; + flp = &fl_ht[i]; - while ((fl = *flp) != NULL) { + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; @@ -178,7 +197,7 @@ static void __net_exit ip6_fl_purge(struct net *net) flp = &fl->next; } } - write_unlock(&ip6_fl_lock); + spin_unlock(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, @@ -188,7 +207,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->label = label & IPV6_FLOWLABEL_MASK; - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; @@ -210,16 +229,16 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl != NULL) { atomic_inc(&lfl->users); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; - fl_ht[FL_HASH(fl->label)] = fl; + rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return NULL; } @@ -234,17 +253,17 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) label &= IPV6_FLOWLABEL_MASK; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label) { fl->lastuse = jiffies; atomic_inc(&fl->users); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return fl; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return NULL; } @@ -255,11 +274,21 @@ void fl6_free_socklist(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; - while ((sfl = np->ipv6_fl_list) != NULL) { + if (!rcu_access_pointer(np->ipv6_fl_list)) + return; + + spin_lock_bh(&ip6_sk_fl_lock); + while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; + spin_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); - kfree(sfl); + kfree_rcu(sfl, rcu); + + spin_lock_bh(&ip6_sk_fl_lock); } + spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ @@ -424,7 +453,7 @@ static int mem_check(struct sock *sk) if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + for_each_sk_fl_rcu(np, sfl) count++; if (room <= 0 || @@ -467,11 +496,11 @@ static bool ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { - write_lock_bh(&ip6_sk_fl_lock); + spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = np->ipv6_fl_list; - np->ipv6_fl_list = sfl; - write_unlock_bh(&ip6_sk_fl_lock); + rcu_assign_pointer(np->ipv6_fl_list, sfl); + spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) @@ -481,7 +510,8 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) struct ipv6_pinfo *np = inet6_sk(sk); struct in6_flowlabel_req freq; struct ipv6_fl_socklist *sfl1=NULL; - struct ipv6_fl_socklist *sfl, **sflp; + struct ipv6_fl_socklist *sfl; + 
struct ipv6_fl_socklist __rcu **sflp; struct ip6_flowlabel *fl, *fl1 = NULL; @@ -493,31 +523,33 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) switch (freq.flr_action) { case IPV6_FL_A_PUT: - write_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + spin_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; + (sfl = rcu_dereference(*sflp))!=NULL; + sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = sfl->next; - write_unlock_bh(&ip6_sk_fl_lock); + *sflp = rcu_dereference(sfl->next); + spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); - kfree(sfl); + kfree_rcu(sfl, rcu); return 0; } } - write_unlock_bh(&ip6_sk_fl_lock); + spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; case IPV6_FL_A_RENEW: - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return err; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); if (freq.flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { @@ -541,11 +573,11 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) if (freq.flr_label) { err = -EEXIST; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_flags&IPV6_FL_F_EXCL) { - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); goto done; } fl1 = sfl->fl; @@ -553,7 +585,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) break; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); if (fl1 == NULL) fl1 = fl_lookup(net, freq.flr_label); @@ -641,13 +673,13 @@ static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { - fl = fl_ht[state->bucket]; - - while (fl && !net_eq(fl->fl_net, net)) - fl = fl->next; - if (fl) - break; + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } } + fl = NULL; +out: return fl; } @@ -656,18 +688,22 @@ static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flo struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); - fl = fl->next; + for_each_fl_continue_rcu(fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } + try_again: - while (fl && !net_eq(fl->fl_net, net)) - fl = fl->next; - - while (!fl) { - if (++state->bucket <= FL_HASH_MASK) { - fl = fl_ht[state->bucket]; - goto try_again; - } else - break; + if (++state->bucket <= FL_HASH_MASK) { + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } + goto try_again; } + fl = NULL; + +out: return fl; } @@ -681,9 +717,9 @@ static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(ip6_fl_lock) + __acquires(RCU) { - read_lock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); return *pos ? 
ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -700,9 +736,9 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip6fl_seq_stop(struct seq_file *seq, void *v) - __releases(ip6_fl_lock) + __releases(RCU) { - read_unlock_bh(&ip6_fl_lock); + rcu_read_unlock_bh(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) @@ -775,15 +811,15 @@ static const struct file_operations ip6fl_seq_fops = { static int __net_init ip6_flowlabel_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "ip6_flowlabel", - S_IRUGO, &ip6fl_seq_fops)) + if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net, + &ip6fl_seq_fops)) return -ENOMEM; return 0; } static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { - proc_net_remove(net, "ip6_flowlabel"); + remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 131dd097736d..d3ddd8400354 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -38,6 +38,7 @@ #include <net/sock.h> #include <net/ip.h> +#include <net/ip_tunnels.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/addrconf.h> @@ -110,46 +111,6 @@ static u32 HASH_ADDR(const struct in6_addr *addr) #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] -static struct rtnl_link_stats64 *ip6gre_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } - - tot->multicast = dev->stats.multicast; - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - return tot; -} - /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, @@ -667,7 +628,6 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, struct net_device_stats *stats = &tunnel->dev->stats; int err = -1; u8 proto; - int pkt_len; struct sk_buff *new_skb; if (dev->type == ARPHRD_ETHER) @@ -772,9 +732,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, * Push down and install the IP header. 
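* (ip6_flow_hdr() below now fills in the version, traffic class and flow label words in a single step, replacing the removed open-coded htonl(0x60000000) sequence.)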
*/ ipv6h = ipv6_hdr(skb); - *(__be32 *)ipv6h = fl6->flowlabel | htonl(0x60000000); - dsfield = INET_ECN_encapsulate(0, dsfield); - ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -803,23 +761,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } } - nf_reset(skb); - pkt_len = skb->len; - err = ip6_local_out(skb); - - if (net_xmit_eval(err) == 0) { - struct pcpu_tstats *tstats = this_cpu_ptr(tunnel->dev->tstats); - - tstats->tx_bytes += pkt_len; - tstats->tx_packets++; - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } - + ip6tunnel_xmit(skb, dev); if (ndst) ip6_tnl_dst_store(tunnel, ndst); - return 0; tx_err_link_failure: stats->tx_carrier_errors++; @@ -1240,7 +1184,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); __be16 *p = (__be16 *)(ipv6h+1); - *(__be32 *)ipv6h = t->fl.u.ip6.flowlabel | htonl(0x60000000); + ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; @@ -1273,7 +1217,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_do_ioctl = ip6gre_tunnel_ioctl, .ndo_change_mtu = ip6gre_tunnel_change_mtu, - .ndo_get_stats64 = ip6gre_get_stats64, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; static void ip6gre_dev_free(struct net_device *dev) @@ -1522,7 +1466,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6gre_tunnel_change_mtu, - .ndo_get_stats64 = ip6gre_get_stats64, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; static void ip6gre_tap_setup(struct net_device *dev) diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c new file mode 100644 index 000000000000..4578e23834f7 --- /dev/null +++ b/net/ipv6/ip6_icmp.c @@ -0,0 +1,47 @@ +#include <linux/export.h> +#include <linux/icmpv6.h> +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/spinlock.h> + +#include <net/ipv6.h> + +#if IS_ENABLED(CONFIG_IPV6) + +static ip6_icmp_send_t __rcu *ip6_icmp_send; + +int inet6_register_icmp_sender(ip6_icmp_send_t *fn) +{ + return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? + 0 : -EBUSY; +} +EXPORT_SYMBOL(inet6_register_icmp_sender); + +int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) +{ + int ret; + + ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? + 0 : -EINVAL; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(inet6_unregister_icmp_sender); + +void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +{ + ip6_icmp_send_t *send; + + rcu_read_lock(); + send = rcu_dereference(ip6_icmp_send); + + if (!send) + goto out; + send(skb, type, code, info); +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(icmpv6_send); +#endif diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index a52d864d562b..2bab2aa59745 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -118,6 +118,27 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt ipv6_addr_loopback(&hdr->daddr)) goto err; + /* RFC4291 Errata ID: 3480 + * Interface-Local scope spans only a single interface on a + * node and is useful only for loopback transmission of + * multicast. 
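+ * (The scope nibble of the destination address is 1 for interface-local; it is tested below with IPV6_ADDR_MC_SCOPE().)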
Packets with interface-local scope received + * from another node must be discarded. + */ + if (!(skb->pkt_type == PACKET_LOOPBACK || + dev->flags & IFF_LOOPBACK) && + ipv6_addr_is_multicast(&hdr->daddr) && + IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) + goto err; + + /* RFC4291 2.7 + * Nodes must not originate a packet to a multicast address whose scope + * field contains the reserved value 0; if such a packet is received, it + * must be silently dropped. + */ + if (ipv6_addr_is_multicast(&hdr->daddr) && + IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0) + goto err; + /* * RFC4291 2.7 * Multicast addresses must not be used as source addresses in IPv6 @@ -212,7 +233,7 @@ resubmit: if (ipv6_addr_is_multicast(&hdr->daddr) && !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, &hdr->saddr) && - !ipv6_is_mld(skb, nexthdr)) + !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && @@ -232,9 +253,11 @@ resubmit: icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, nhoff); } - } else + kfree_skb(skb); + } else { IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); + consume_skb(skb); + } } rcu_read_unlock(); return 0; @@ -270,7 +293,8 @@ int ip6_mc_input(struct sk_buff *skb) * IPv6 multicast router mode is now supported ;) */ if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && - !(ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) && + !(ipv6_addr_type(&hdr->daddr) & + (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { /* * Okay, we try to forward - split and duplicate @@ -280,10 +304,8 @@ int ip6_mc_input(struct sk_buff *skb) struct inet6_skb_parm *opt = IP6CB(skb); /* Check for MLD */ - if (unlikely(opt->ra)) { + if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { /* Check if this is a mld message */ - u8 *ptr = skb_network_header(skb) + opt->ra; - struct icmp6hdr *icmp6; u8 nexthdr = hdr->nexthdr; __be16 frag_off; int offset; @@ -291,7 +313,7 @@ int ip6_mc_input(struct sk_buff *skb) /* Check if the value of Router Alert * is for MLD (0x0000). 
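* The hop-by-hop option parser now caches the Router Alert value in IP6CB(skb)->ra and sets IP6SKB_ROUTERALERT, so the raw option bytes no longer have to be re-read from the header here.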
*/ - if ((ptr[2] | ptr[3]) == 0) { + if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) { deliver = false; if (!ipv6_ext_hdr(nexthdr)) { @@ -303,24 +325,10 @@ int ip6_mc_input(struct sk_buff *skb) if (offset < 0) goto out; - if (nexthdr != IPPROTO_ICMPV6) - goto out; - - if (!pskb_may_pull(skb, (skb_network_header(skb) + - offset + 1 - skb->data))) + if (!ipv6_is_mld(skb, nexthdr, offset)) goto out; - icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); - - switch (icmp6->icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - deliver = true; - break; - } - goto out; + deliver = true; } /* unknown RA - process it normally */ } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index f26f0da7f095..71b766ee821d 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -92,13 +92,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u8 *prevhdr; int offset = 0; - if (!(features & NETIF_F_V6_CSUM)) - features &= ~NETIF_F_SG; - if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_UDP_TUNNEL | SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0c7c03d50dc0..d2eedf192330 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,8 +56,6 @@ #include <net/checksum.h> #include <linux/mroute6.h> -int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); - int __ip6_local_out(struct sk_buff *skb) { int len; @@ -88,7 +86,8 @@ static int ip6_finish_output2(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct neighbour *neigh; - struct rt6_info *rt; + struct in6_addr *nexthop; + int ret; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -121,12 +120,26 @@ static int ip6_finish_output2(struct sk_buff *skb) IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, skb->len); + + if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= + IPV6_ADDR_SCOPE_NODELOCAL && + !(dev->flags & IFF_LOOPBACK)) { + kfree_skb(skb); + return 0; + } } - rt = (struct rt6_info *) dst; - neigh = rt->n; - if (neigh) - return dst_neigh_output(dst, neigh, skb); + rcu_read_lock_bh(); + nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); + neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); + if (!IS_ERR(neigh)) { + ret = dst_neigh_output(dst, neigh, skb); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -216,7 +229,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); - *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel; + ip6_flow_hdr(hdr, tclass, fl6->flowlabel); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -236,9 +249,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, dst->dev, dst_output); } - net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n"); skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ipv6_local_error(sk, EMSGSIZE, fl6, mtu); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; @@ -246,39 +258,6 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, EXPORT_SYMBOL(ip6_xmit); -/* - * To avoid extra problems ND 
packets are send through this - * routine. It's code duplication but I really want to avoid - * extra checks since ipv6_build_header is used by TCP (which - * is for us performance critical) - */ - -int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, - const struct in6_addr *saddr, const struct in6_addr *daddr, - int proto, int len) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6hdr *hdr; - - skb->protocol = htons(ETH_P_IPV6); - skb->dev = dev; - - skb_reset_network_header(skb); - skb_put(skb, sizeof(struct ipv6hdr)); - hdr = ipv6_hdr(skb); - - *(__be32*)hdr = htonl(0x60000000); - - hdr->payload_len = htons(len); - hdr->nexthdr = proto; - hdr->hop_limit = np->hop_limit; - - hdr->saddr = *saddr; - hdr->daddr = *daddr; - - return 0; -} - static int ip6_call_ra_chain(struct sk_buff *skb, int sel) { struct ip6_ra_chain *ra; @@ -913,8 +892,12 @@ static int ip6_dst_lookup_tail(struct sock *sk, * dst entry of the nexthop router */ rt = (struct rt6_info *) *dst; - n = rt->n; - if (n && !(n->nud_state & NUD_VALID)) { + rcu_read_lock_bh(); + n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); + err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; + rcu_read_unlock_bh(); + + if (err) { struct inet6_ifaddr *ifp; struct flowi6 fl_gw6; int redirect; @@ -1241,11 +1224,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, } /* For UDP, check if TX timestamp is enabled */ - if (sk->sk_type == SOCK_DGRAM) { - err = sock_tx_timestamp(sk, &tx_flags); - if (err) - goto error; - } + if (sk->sk_type == SOCK_DGRAM) + sock_tx_timestamp(sk, &tx_flags); /* * Let's try using as much space as possible. @@ -1548,9 +1528,7 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - *(__be32*)hdr = fl6->flowlabel | - htonl(0x60000000 | ((int)np->cork.tclass << 20)); - + ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a14f28b280f5..1e55866cead7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -47,6 +47,7 @@ #include <net/icmp.h> #include <net/ip.h> +#include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> @@ -955,7 +956,6 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, unsigned int max_headroom = sizeof(struct ipv6hdr); u8 proto; int err = -1; - int pkt_len; if (!fl6->flowi6_mark) dst = ip6_tnl_dst_check(t); @@ -1030,26 +1030,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - *(__be32*)ipv6h = fl6->flowlabel | htonl(0x60000000); - dsfield = INET_ECN_encapsulate(0, dsfield); - ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; - nf_reset(skb); - pkt_len = skb->len; - err = ip6_local_out(skb); - - if (net_xmit_eval(err) == 0) { - struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats); - - tstats->tx_bytes += pkt_len; - tstats->tx_packets++; - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } + ip6tunnel_xmit(skb, dev); if (ndst) ip6_tnl_dst_store(t, ndst); return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8fd154e5f079..241fb8ad9fcf 100644 --- a/net/ipv6/ip6mr.c +++ 
b/net/ipv6/ip6mr.c @@ -842,9 +842,9 @@ static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); @@ -1017,6 +1017,50 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, return NULL; } +/* Look for a (*,*,oif) entry */ +static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, + mifi_t mifi) +{ + int line = MFC6_HASH(&in6addr_any, &in6addr_any); + struct mfc6_cache *c; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp) && + (c->mfc_un.res.ttls[mifi] < 255)) + return c; + + return NULL; +} + +/* Look for a (*,G) entry */ +static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt, + struct in6_addr *mcastgrp, + mifi_t mifi) +{ + int line = MFC6_HASH(mcastgrp, &in6addr_any); + struct mfc6_cache *c, *proxy; + + if (ipv6_addr_any(mcastgrp)) + goto skip; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) { + if (c->mfc_un.res.ttls[mifi] < 255) + return c; + + /* It's ok if the mifi is part of the static tree */ + proxy = ip6mr_cache_find_any_parent(mrt, + c->mf6c_parent); + if (proxy && proxy->mfc_un.res.ttls[mifi] < 255) + return c; + } + +skip: + return ip6mr_cache_find_any_parent(mrt, mifi); +} + /* * Allocate a multicast cache entry */ @@ -1056,13 +1100,13 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); - if (__ip6mr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { + if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE; + ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else @@ -1247,7 +1291,8 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) * MFC6 cache manipulation by user space */ -static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc) +static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, + int parent) { int line; struct mfc6_cache *c, *next; @@ -1256,7 +1301,9 @@ static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc) list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) { if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) { + ipv6_addr_equal(&c->mf6c_mcastgrp, + &mfc->mf6cc_mcastgrp.sin6_addr) && + (parent == -1 || parent == c->mf6c_parent)) { write_lock_bh(&mrt_lock); list_del(&c->list); write_unlock_bh(&mrt_lock); @@ -1312,9 +1359,9 @@ static int __net_init 
ip6mr_net_init(struct net *net) #ifdef CONFIG_PROC_FS err = -ENOMEM; - if (!proc_net_fops_create(net, "ip6_mr_vif", 0, &ip6mr_vif_fops)) + if (!proc_create("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_fops)) goto proc_vif_fail; - if (!proc_net_fops_create(net, "ip6_mr_cache", 0, &ip6mr_mfc_fops)) + if (!proc_create("ip6_mr_cache", 0, net->proc_net, &ip6mr_mfc_fops)) goto proc_cache_fail; #endif @@ -1322,7 +1369,7 @@ static int __net_init ip6mr_net_init(struct net *net) #ifdef CONFIG_PROC_FS proc_cache_fail: - proc_net_remove(net, "ip6_mr_vif"); + remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: ip6mr_rules_exit(net); #endif @@ -1333,8 +1380,8 @@ fail: static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "ip6_mr_cache"); - proc_net_remove(net, "ip6_mr_vif"); + remove_proc_entry("ip6_mr_cache", net->proc_net); + remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_rules_exit(net); } @@ -1391,7 +1438,7 @@ void ip6_mr_cleanup(void) } static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, - struct mf6cctl *mfc, int mrtsock) + struct mf6cctl *mfc, int mrtsock, int parent) { bool found = false; int line; @@ -1413,7 +1460,9 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) { + ipv6_addr_equal(&c->mf6c_mcastgrp, + &mfc->mf6cc_mcastgrp.sin6_addr) && + (parent == -1 || parent == mfc->mf6cc_parent)) { found = true; break; } @@ -1430,7 +1479,8 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, return 0; } - if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) + if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && + !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); @@ -1596,7 +1646,7 @@ struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) { - int ret; + int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; @@ -1653,15 +1703,21 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns */ case MRT6_ADD_MFC: case MRT6_DEL_MFC: + parent = -1; + case MRT6_ADD_MFC_PROXY: + case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_user(&mfc, optval, sizeof(mfc))) return -EFAULT; + if (parent == 0) + parent = mfc.mf6cc_parent; rtnl_lock(); - if (optname == MRT6_DEL_MFC) - ret = ip6mr_mfc_delete(mrt, &mfc); + if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) + ret = ip6mr_mfc_delete(mrt, &mfc, parent); else - ret = ip6mr_mfc_add(net, mrt, &mfc, sk == mrt->mroute6_sk); + ret = ip6mr_mfc_add(net, mrt, &mfc, + sk == mrt->mroute6_sk, parent); rtnl_unlock(); return ret; @@ -2018,19 +2074,29 @@ static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, { int psend = -1; int vif, ct; + int true_vifi = ip6mr_find_vif(mrt, skb->dev); vif = cache->mf6c_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { + struct mfc6_cache *cache_proxy; + + /* For an (*,G) entry, we only check that the incoming + * interface is part of the static tree. 
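+ * (It does so by looking the interface up in a (*,*) proxy entry, i.e. one whose TTL for that interface is below 255.)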
+ */ + cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); + if (cache_proxy && + cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + goto forward; + } + /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif6_table[vif].dev != skb->dev) { - int true_vifi; - cache->mfc_un.res.wrong_if++; - true_vifi = ip6mr_find_vif(mrt, skb->dev); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2048,14 +2114,32 @@ static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, goto dont_forward; } +forward: mrt->vif6_table[vif].pkt_in++; mrt->vif6_table[vif].bytes_in += skb->len; /* * Forward the frame */ + if (ipv6_addr_any(&cache->mf6c_origin) && + ipv6_addr_any(&cache->mf6c_mcastgrp)) { + if (true_vifi >= 0 && + true_vifi != cache->mf6c_parent && + ipv6_hdr(skb)->hop_limit > + cache->mfc_un.res.ttls[cache->mf6c_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = cache->mf6c_parent; + goto last_forward; + } + goto dont_forward; + } for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { - if (ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { + /* For (*,G) entry, don't forward to the incoming interface */ + if ((!ipv6_addr_any(&cache->mf6c_origin) || ct != true_vifi) && + ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) @@ -2064,6 +2148,7 @@ static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, psend = ct; } } +last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, cache, psend); return 0; @@ -2099,6 +2184,14 @@ int ip6_mr_input(struct sk_buff *skb) read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + if (cache == NULL) { + int vif = ip6mr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, + &ipv6_hdr(skb)->daddr, + vif); + } /* * No usable cache entry @@ -2186,6 +2279,13 @@ int ip6mr_get_route(struct net *net, read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); + if (!cache && skb->dev) { + int vif = ip6mr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, + vif); + } if (!cache) { struct sk_buff *skb2; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 28dfa5f3801f..bfa6cc36ef2a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -376,8 +376,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i=0; i<psl->sl_count; i++) { - rv = memcmp(&psl->sl_addr[i], source, - sizeof(struct in6_addr)); + rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; } @@ -427,12 +426,10 @@ int ip6_mc_source(int add, int omode, struct sock *sk, } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; i<psl->sl_count; i++) { - rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr)); - if (rv == 0) - break; + rv = !ipv6_addr_equal(&psl->sl_addr[i], source); + if (rv == 0) /* There is an error in the address. 
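+ * (The source address is already present in the filter list, so a duplicate insert is refused.)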
*/ + goto done; } - if (rv == 0) /* address already there is an error */ - goto done; for (j=psl->sl_count-1; j>=i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; @@ -664,6 +661,10 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + return; + spin_lock_bh(&mc->mca_lock); if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; @@ -690,6 +691,10 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + return; + spin_lock_bh(&mc->mca_lock); if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; @@ -935,33 +940,6 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) } /* - * identify MLD packets for MLD filter exceptions - */ -bool ipv6_is_mld(struct sk_buff *skb, int nexthdr) -{ - struct icmp6hdr *pic; - - if (nexthdr != IPPROTO_ICMPV6) - return false; - - if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) - return false; - - pic = icmp6_hdr(skb); - - switch (pic->icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - return true; - default: - break; - } - return false; -} - -/* * check if the interface/address pair is valid */ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, @@ -1340,6 +1318,31 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } +static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6hdr *hdr; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); + + ip6_flow_hdr(hdr, 0, 0); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = inet6_sk(sk)->hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; +} + static struct sk_buff *mld_newpack(struct net_device *dev, int size) { struct net *net = dev_net(dev); @@ -1375,7 +1378,7 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) } else saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); @@ -1418,7 +1421,7 @@ static void mld_sendpack(struct sk_buff *skb) icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); - dst = icmp6_dst_alloc(skb->dev, NULL, &fl6); + dst = icmp6_dst_alloc(skb->dev, &fl6); err = 0; if (IS_ERR(dst)) { @@ -1767,7 +1770,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) } else saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); + ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); @@ -1786,7 +1789,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); - dst = icmp6_dst_alloc(skb->dev, NULL, &fl6); + dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err_out; @@ 
-2596,10 +2599,10 @@ static int __net_init igmp6_proc_init(struct net *net) int err; err = -ENOMEM; - if (!proc_net_fops_create(net, "igmp6", S_IRUGO, &igmp6_mc_seq_fops)) + if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops)) goto out; - if (!proc_net_fops_create(net, "mcfilter6", S_IRUGO, - &igmp6_mcf_seq_fops)) + if (!proc_create("mcfilter6", S_IRUGO, net->proc_net, + &igmp6_mcf_seq_fops)) goto out_proc_net_igmp6; err = 0; @@ -2607,14 +2610,14 @@ out: return err; out_proc_net_igmp6: - proc_net_remove(net, "igmp6"); + remove_proc_entry("igmp6", net->proc_net); goto out; } static void __net_exit igmp6_proc_exit(struct net *net) { - proc_net_remove(net, "mcfilter6"); - proc_net_remove(net, "igmp6"); + remove_proc_entry("mcfilter6", net->proc_net); + remove_proc_entry("igmp6", net->proc_net); } #else static inline int igmp6_proc_init(struct net *net) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 6574175795df..2712ab22a174 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -143,16 +143,12 @@ struct neigh_table nd_tbl = { .gc_thresh3 = 1024, }; -static inline int ndisc_opt_addr_space(struct net_device *dev) +static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) { - return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); -} - -static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, - unsigned short addr_type) -{ - int pad = ndisc_addr_option_pad(addr_type); - int space = NDISC_OPT_SPACE(data_len + pad); + int pad = ndisc_addr_option_pad(skb->dev->type); + int data_len = skb->dev->addr_len; + int space = ndisc_opt_addr_space(skb->dev); + u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; @@ -166,7 +162,6 @@ static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, opt += data_len; if ((space -= data_len) > 0) memset(opt, 0, space); - return opt + space; } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, @@ -370,91 +365,88 @@ static void pndisc_destructor(struct pneigh_entry *n) ipv6_dev_mc_dec(dev, &maddr); } -static struct sk_buff *ndisc_build_skb(struct net_device *dev, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - struct icmp6hdr *icmp6h, - const struct in6_addr *target, - int llinfo) +static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, + int len) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.ndisc_sk; - struct sk_buff *skb; - struct icmp6hdr *hdr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int len; + struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; + struct sk_buff *skb; int err; - u8 *opt; - - if (!dev->addr_len) - llinfo = 0; - - len = sizeof(struct icmp6hdr) + (target ? 
sizeof(*target) : 0); - if (llinfo) - len += ndisc_opt_addr_space(dev); skb = sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + hlen + tlen), + hlen + sizeof(struct ipv6hdr) + len + tlen, 1, &err); if (!skb) { - ND_PRINTK(0, err, "ND: %s failed to allocate an skb, err=%d\n", + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n", __func__, err); return NULL; } - skb_reserve(skb, hlen); - ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; - skb->transport_header = skb->tail; - skb_put(skb, len); + skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); + skb_reset_transport_header(skb); - hdr = (struct icmp6hdr *)skb_transport_header(skb); - memcpy(hdr, icmp6h, sizeof(*hdr)); + return skb; +} - opt = skb_transport_header(skb) + sizeof(struct icmp6hdr); - if (target) { - *(struct in6_addr *)opt = *target; - opt += sizeof(*target); - } +static void ip6_nd_hdr(struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int hop_limit, int len) +{ + struct ipv6hdr *hdr; - if (llinfo) - ndisc_fill_addr_option(opt, llinfo, dev->dev_addr, - dev->addr_len, dev->type); + skb_push(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); - hdr->icmp6_cksum = csum_ipv6_magic(saddr, daddr, len, - IPPROTO_ICMPV6, - csum_partial(hdr, - len, 0)); + ip6_flow_hdr(hdr, 0, 0); - return skb; + hdr->payload_len = htons(len); + hdr->nexthdr = IPPROTO_ICMPV6; + hdr->hop_limit = hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; } -static void ndisc_send_skb(struct sk_buff *skb, struct net_device *dev, - struct neighbour *neigh, +static void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, - const struct in6_addr *saddr, - struct icmp6hdr *icmp6h) + const struct in6_addr *saddr) { - struct flowi6 fl6; - struct dst_entry *dst; - struct net *net = dev_net(dev); + struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(skb->dev); struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; int err; + struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; - icmpv6_flow_init(sk, &fl6, type, saddr, daddr, dev->ifindex); - dst = icmp6_dst_alloc(dev, neigh, &fl6); - if (IS_ERR(dst)) { - kfree_skb(skb); - return; + if (!dst) { + struct sock *sk = net->ipv6.ndisc_sk; + struct flowi6 fl6; + + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, &fl6); + if (IS_ERR(dst)) { + kfree_skb(skb); + return; + } + + skb_dst_set(skb, dst); } - skb_dst_set(skb, dst); + icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, + csum_partial(icmp6h, + skb->len, 0)); + + ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len); rcu_read_lock(); idev = __in6_dev_get(dst->dev); @@ -470,36 +462,17 @@ static void ndisc_send_skb(struct sk_buff *skb, struct net_device *dev, rcu_read_unlock(); } -/* - * Send a Neighbour Discover packet - */ -static void __ndisc_send(struct net_device *dev, - struct neighbour *neigh, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - struct icmp6hdr *icmp6h, const struct in6_addr *target, - int llinfo) -{ - struct sk_buff *skb; - - skb = ndisc_build_skb(dev, daddr, saddr, icmp6h, target, llinfo); - if (!skb) - return; - - ndisc_send_skb(skb, dev, neigh, daddr, saddr, icmp6h); -} - static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, - int 
router, int solicited, int override, int inc_opt) + bool router, bool solicited, bool override, bool inc_opt) { + struct sk_buff *skb; struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; const struct in6_addr *src_addr; - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, - }; + struct nd_msg *msg; + int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); @@ -517,13 +490,32 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, src_addr = &tmpaddr; } - icmp6h.icmp6_router = router; - icmp6h.icmp6_solicited = solicited; - icmp6h.icmp6_override = override; + if (!dev->addr_len) + inc_opt = 0; + if (inc_opt) + optlen += ndisc_opt_addr_space(dev); - __ndisc_send(dev, neigh, daddr, src_addr, - &icmp6h, solicited_addr, - inc_opt ? ND_OPT_TARGET_LL_ADDR : 0); + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct nd_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct nd_msg) { + .icmph = { + .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, + .icmp6_router = router, + .icmp6_solicited = solicited, + .icmp6_override = override, + }, + .target = *solicited_addr, + }; + + if (inc_opt) + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, + dev->dev_addr); + + + ndisc_send_skb(skb, daddr, src_addr); } static void ndisc_send_unsol_na(struct net_device *dev) @@ -551,10 +543,11 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct sk_buff *skb; struct in6_addr addr_buf; - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, - }; + int inc_opt = dev->addr_len; + int optlen = 0; + struct nd_msg *msg; if (saddr == NULL) { if (ipv6_get_lladdr(dev, &addr_buf, @@ -563,18 +556,37 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, saddr = &addr_buf; } - __ndisc_send(dev, neigh, daddr, saddr, - &icmp6h, solicit, - !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0); + if (ipv6_addr_any(saddr)) + inc_opt = 0; + if (inc_opt) + optlen += ndisc_opt_addr_space(dev); + + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct nd_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct nd_msg) { + .icmph = { + .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, + }, + .target = *solicit, + }; + + if (inc_opt) + ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, + dev->dev_addr); + + ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr) { - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_ROUTER_SOLICITATION, - }; + struct sk_buff *skb; + struct rs_msg *msg; int send_sllao = dev->addr_len; + int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* @@ -598,9 +610,25 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, } } #endif - __ndisc_send(dev, NULL, daddr, saddr, - &icmp6h, NULL, - send_sllao ? 
ND_OPT_SOURCE_LL_ADDR : 0); + if (send_sllao) + optlen += ndisc_opt_addr_space(dev); + + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct rs_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct rs_msg) { + .icmph = { + .icmp6_type = NDISC_ROUTER_SOLICITATION, + }, + }; + + if (send_sllao) + ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, + dev->dev_addr); + + ndisc_send_skb(skb, daddr, saddr); } @@ -676,6 +704,11 @@ static void ndisc_recv_ns(struct sk_buff *skb) bool inc; int is_router = -1; + if (skb->len < sizeof(struct nd_msg)) { + ND_PRINTK(2, warn, "NS: packet too short\n"); + return; + } + if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); return; @@ -685,11 +718,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ - if (dad && - !(daddr->s6_addr32[0] == htonl(0xff020000) && - daddr->s6_addr32[1] == htonl(0x00000000) && - daddr->s6_addr32[2] == htonl(0x00000001) && - daddr->s6_addr [12] == 0xff )) { + if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return; } @@ -780,11 +809,11 @@ static void ndisc_recv_ns(struct sk_buff *skb) } if (is_router < 0) - is_router = !!idev->cnf.forwarding; + is_router = idev->cnf.forwarding; if (dad) { ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, - is_router, 0, (ifp != NULL), 1); + !!is_router, false, (ifp != NULL), true); goto out; } @@ -805,8 +834,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) NEIGH_UPDATE_F_OVERRIDE); if (neigh || !dev->header_ops) { ndisc_send_na(dev, neigh, saddr, &msg->target, - is_router, - 1, (ifp != NULL && inc), inc); + !!is_router, + true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); } @@ -1350,25 +1379,34 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } +static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, + struct sk_buff *orig_skb, + int rd_len) +{ + u8 *opt = skb_put(skb, rd_len); + + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; + + memcpy(opt, ipv6_hdr(orig_skb), rd_len - 8); +} + void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; - int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + int optlen = 0; struct inet_peer *peer; struct sk_buff *buff; - struct icmp6hdr *icmph; + struct rd_msg *msg; struct in6_addr saddr_buf; - struct in6_addr *addrp; struct rt6_info *rt; struct dst_entry *dst; - struct inet6_dev *idev; struct flowi6 fl6; - u8 *opt; - int hlen, tlen; int rd_len; - int err; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; bool ret; @@ -1424,7 +1462,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; - len += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev); } else read_unlock_bh(&neigh->lock); @@ -1432,80 +1470,40 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } rd_len = min_t(unsigned int, - IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8); + IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, + skb->len + 8); rd_len &= ~0x7; - len += rd_len; - - hlen = LL_RESERVED_SPACE(dev); - tlen = dev->needed_tailroom; - buff = 
sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + hlen + tlen), - 1, &err); - if (buff == NULL) { - ND_PRINTK(0, err, - "Redirect: %s failed to allocate an skb, err=%d\n", - __func__, err); - goto release; - } - - skb_reserve(buff, hlen); - ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, - IPPROTO_ICMPV6, len); - - skb_set_transport_header(buff, skb_tail_pointer(buff) - buff->data); - skb_put(buff, len); - icmph = icmp6_hdr(buff); - - memset(icmph, 0, sizeof(struct icmp6hdr)); - icmph->icmp6_type = NDISC_REDIRECT; + optlen += rd_len; - /* - * copy target and destination addresses - */ - - addrp = (struct in6_addr *)(icmph + 1); - *addrp = *target; - addrp++; - *addrp = ipv6_hdr(skb)->daddr; + buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!buff) + goto release; - opt = (u8*) (addrp + 1); + msg = (struct rd_msg *)skb_put(buff, sizeof(*msg)); + *msg = (struct rd_msg) { + .icmph = { + .icmp6_type = NDISC_REDIRECT, + }, + .target = *target, + .dest = ipv6_hdr(skb)->daddr, + }; /* * include target_address option */ if (ha) - opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha, - dev->addr_len, dev->type); + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha); /* * build redirect option and copy skb over to the new packet. */ - memset(opt, 0, 8); - *(opt++) = ND_OPT_REDIRECT_HDR; - *(opt++) = (rd_len >> 3); - opt += 6; - - memcpy(opt, ipv6_hdr(skb), rd_len - 8); - - icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &ipv6_hdr(skb)->saddr, - len, IPPROTO_ICMPV6, - csum_partial(icmph, len, 0)); + if (rd_len) + ndisc_fill_redirect_hdr_option(buff, skb, rd_len); skb_dst_set(buff, dst); - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, - dst_output); - if (!err) { - ICMP6MSGOUT_INC_STATS(net, idev, NDISC_REDIRECT); - ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - } - - rcu_read_unlock(); + ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); return; release: @@ -1522,7 +1520,7 @@ int ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; - if (!pskb_may_pull(skb, skb->len)) + if (skb_linearize(skb)) return 0; msg = (struct nd_msg *)skb_transport_header(skb); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 429089cb073d..72836f40b730 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -1,3 +1,9 @@ +/* + * IPv6 specific functions of netfilter core + * + * Rusty Russell (C) 2000 -- This code is GPL. + * Patrick McHardy (C) 2006-2012 + */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/ipv6.h> @@ -29,7 +35,7 @@ int ip6_route_me_harder(struct sk_buff *skb) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); dst_release(dst); - return -EINVAL; + return dst->error; } /* Drop old route. 
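* (skb_dst_drop()/skb_dst_set() then install the fresh lookup result; note the function now reports real errnos, dst->error, PTR_ERR() or -ENOMEM, instead of a bare -EINVAL or -1.)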
*/ @@ -43,7 +49,7 @@ int ip6_route_me_harder(struct sk_buff *skb) skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0); if (IS_ERR(dst)) - return -1; + return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif @@ -53,7 +59,7 @@ int ip6_route_me_harder(struct sk_buff *skb) if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) - return -1; + return -ENOMEM; return 0; } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index c72532a60d88..4433ab40e7de 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -105,7 +105,7 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED + depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 125a90d6a795..44400c216dc6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -3,6 +3,7 @@ * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -284,6 +285,7 @@ static void trace_packet(const struct sk_buff *skb, const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; + struct net *net = dev_net(in ? in : out); table_base = private->entries[smp_processor_id()]; root = get_entry(table_base, private->hook_entry[hook]); @@ -296,7 +298,7 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo, + nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } @@ -1098,7 +1100,7 @@ static int get_info(struct net *net, void __user *user, #endif t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), "ip6table_%s", name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1157,7 +1159,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, } t = xt_find_table_lock(net, AF_INET6, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) @@ -1197,7 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), "ip6table_%s", name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } @@ -1355,7 +1357,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, } t = xt_find_table_lock(net, AF_INET6, name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? 
PTR_ERR(t) : -ENOENT; goto free; } @@ -1939,7 +1941,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; duprintf("t->private->number = %u\n", private->number); diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c index 83acc1405a18..590f767db5d4 100644 --- a/net/ipv6/netfilter/ip6t_NPT.c +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -18,9 +18,8 @@ static int ip6t_npt_checkentry(const struct xt_tgchk_param *par) { struct ip6t_npt_tginfo *npt = par->targinfo; - __wsum src_sum = 0, dst_sum = 0; struct in6_addr pfx; - unsigned int i; + __wsum src_sum, dst_sum; if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64) return -EINVAL; @@ -33,12 +32,8 @@ static int ip6t_npt_checkentry(const struct xt_tgchk_param *par) if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6)) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(npt->src_pfx.in6.s6_addr16); i++) { - src_sum = csum_add(src_sum, - (__force __wsum)npt->src_pfx.in6.s6_addr16[i]); - dst_sum = csum_add(dst_sum, - (__force __wsum)npt->dst_pfx.in6.s6_addr16[i]); - } + src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0); + dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0); npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum)); return 0; @@ -57,7 +52,7 @@ static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt, if (pfx_len - i >= 32) mask = 0; else - mask = htonl(~((1 << (pfx_len - i)) - 1)); + mask = htonl((1 << (i - pfx_len + 32)) - 1); idx = i / 32; addr->s6_addr32[idx] &= mask; @@ -114,6 +109,7 @@ ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) static struct xt_target ip6t_npt_target_reg[] __read_mostly = { { .name = "SNPT", + .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .checkentry = ip6t_npt_checkentry, @@ -124,6 +120,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = { }, { .name = "DNPT", + .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .checkentry = ip6t_npt_checkentry, diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 029623dbd411..70f9abc0efe9 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -7,6 +7,8 @@ * Authors: * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> * + * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net> + * * Based on net/ipv4/netfilter/ipt_REJECT.c * * This program is free software; you can redistribute it and/or @@ -126,7 +128,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) skb_put(nskb, sizeof(struct ipv6hdr)); skb_reset_network_header(nskb); ip6h = ipv6_hdr(nskb); - *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); + ip6_flow_hdr(ip6h, tclass, 0); ip6h->hop_limit = ip6_dst_hoplimit(dst); ip6h->nexthdr = IPPROTO_TCP; ip6h->saddr = oip6h->daddr; diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index 5060d54199ab..e0983f3648a6 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -71,6 +71,12 @@ static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, return ret; } +static bool rpfilter_is_local(const struct sk_buff *skb) +{ + const struct rt6_info *rt = (const void *) skb_dst(skb); + return rt && (rt->rt6i_flags & RTF_LOCAL); +} + 
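A minimal sketch of how the new helper is meant to be used, mirroring the change in rpfilter_mt() below; the wrapper name rpf_bypass is illustrative only and not part of the patch:

static bool rpf_bypass(const struct sk_buff *skb, bool invert)
{
	/* A packet whose input route carries RTF_LOCAL has no
	 * meaningful reverse path no matter which device it arrived
	 * on, so the match is forced to pass (XORed with --invert).
	 * This also covers the loopback traffic that the old
	 * par->in->flags & IFF_LOOPBACK test caught.
	 */
	if (rpfilter_is_local(skb))
		return true ^ invert;
	return false;	/* fall through to the reverse FIB lookup */
}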
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rpfilter_info *info = par->matchinfo; @@ -78,7 +84,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) struct ipv6hdr *iph; bool invert = info->flags & XT_RPFILTER_INVERT; - if (par->in->flags & IFF_LOOPBACK) + if (rpfilter_is_local(skb)) return true ^ invert; iph = ipv6_hdr(skb); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 7431121b87de..e075399d8b72 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> +#include <net/ipv6.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -37,7 +38,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) struct in6_addr saddr, daddr; u_int8_t hop_limit; u_int32_t flowlabel, mark; - + int err; #if 0 /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || @@ -60,12 +61,15 @@ ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) dev_net(out)->ipv6.ip6table_mangle); if (ret != NF_DROP && ret != NF_STOLEN && - (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || - memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || + !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index e0e788d25b14..6383f90efda8 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -179,6 +179,7 @@ nf_nat_ipv6_out(unsigned int hooknum, #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; + int err; #endif unsigned int ret; @@ -197,9 +198,11 @@ nf_nat_ipv6_out(unsigned int hooknum, &ct->tuplehash[!dir].tuple.dst.u3) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) - if (nf_xfrm_me_harder(skb, AF_INET6) < 0) - ret = NF_DROP; + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } } #endif return ret; @@ -215,6 +218,7 @@ nf_nat_ipv6_local_fn(unsigned int hooknum, const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; + int err; /* root is playing with raw sockets. 
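The ip6table_mangle change above also tightens error handling: instead of collapsing any ip6_route_me_harder() failure into a bare NF_DROP, the negative errno is folded into the verdict via NF_DROP_ERR, and re-routing still happens only when a field that affects routing actually changed. A toy model of that snapshot-compare-reroute pattern, with stand-in types rather than the kernel's:

#include <stdio.h>
#include <string.h>

/* Stand-in for the fields ip6t_mangle_out() snapshots; not kernel types. */
struct toy_hdr {
        unsigned char saddr[16], daddr[16];
        unsigned int hop_limit, mark, flowlabel;
};

static int reroute(struct toy_hdr *h)   /* 0 on success, -errno on failure */
{
        (void)h;
        return 0;
}

static int mangle_out(struct toy_hdr *h, int (*hook)(struct toy_hdr *))
{
        struct toy_hdr before = *h;      /* snapshot before the hook runs */
        int verdict = hook(h);           /* 0 stands in for NF_ACCEPT */

        if (verdict == 0 && memcmp(&before, h, sizeof(before)) != 0) {
                int err = reroute(h);    /* cached route is now stale */
                if (err < 0)
                        verdict = err;   /* like NF_DROP_ERR: errno survives */
        }
        return verdict;
}

static int bump_hoplimit(struct toy_hdr *h)
{
        h->hop_limit = 64;
        return 0;
}

int main(void)
{
        struct toy_hdr h;

        memset(&h, 0, sizeof(h));
        printf("verdict %d\n", mangle_out(&h, bump_hoplimit));
        return 0;
}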
*/ if (skb->len < sizeof(struct ipv6hdr)) @@ -227,16 +231,19 @@ nf_nat_ipv6_local_fn(unsigned int hooknum, if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - if (ip6_route_me_harder(skb)) - ret = NF_DROP; + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (nf_xfrm_me_harder(skb, AF_INET6)) - ret = NF_DROP; + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } #endif } return ret; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 137e245860ab..97bcf2bae857 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -104,7 +104,6 @@ static unsigned int ipv6_helper(unsigned int hooknum, const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; enum ip_conntrack_info ctinfo; - unsigned int ret; __be16 frag_off; int protoff; u8 nexthdr; @@ -130,12 +129,7 @@ static unsigned int ipv6_helper(unsigned int hooknum, return NF_ACCEPT; } - ret = helper->help(skb, protoff, ct, ctinfo); - if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { - nf_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, NULL, - "nf_ct_%s: dropping packet", helper->name); - } - return ret; + return helper->help(skb, protoff, ct, ctinfo); } static unsigned int ipv6_confirm(unsigned int hooknum, @@ -336,12 +330,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) sizeof(sin6.sin6_addr)); nf_ct_put(ct); - - if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6.sin6_scope_id = sk->sk_bound_dev_if; - else - sin6.sin6_scope_id = 0; - + sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, + sk->sk_bound_dev_if); return copy_to_user(user, &sin6, sizeof(sin6)) ? 
-EFAULT : 0; } @@ -421,54 +411,43 @@ static int ipv6_net_init(struct net *net) { int ret = 0; - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_tcp6); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp6); if (ret < 0) { - printk(KERN_ERR "nf_conntrack_l4proto_tcp6: protocol register failed\n"); + pr_err("nf_conntrack_tcp6: pernet registration failed\n"); goto out; } - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_udp6); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp6); if (ret < 0) { - printk(KERN_ERR "nf_conntrack_l4proto_udp6: protocol register failed\n"); + pr_err("nf_conntrack_udp6: pernet registration failed\n"); goto cleanup_tcp6; } - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_icmpv6); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmpv6); if (ret < 0) { - printk(KERN_ERR "nf_conntrack_l4proto_icmp6: protocol register failed\n"); + pr_err("nf_conntrack_icmp6: pernet registration failed\n"); goto cleanup_udp6; } - ret = nf_conntrack_l3proto_register(net, - &nf_conntrack_l3proto_ipv6); + ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); if (ret < 0) { - printk(KERN_ERR "nf_conntrack_l3proto_ipv6: protocol register failed\n"); + pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); goto cleanup_icmpv6; } return 0; cleanup_icmpv6: - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_icmpv6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6); cleanup_udp6: - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_udp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6); cleanup_tcp6: - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_tcp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6); out: return ret; } static void ipv6_net_exit(struct net *net) { - nf_conntrack_l3proto_unregister(net, - &nf_conntrack_l3proto_ipv6); - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_icmpv6); - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_udp6); - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_tcp6); + nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6); } static struct pernet_operations ipv6_net_ops = { @@ -491,19 +470,52 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) ret = register_pernet_subsys(&ipv6_net_ops); if (ret < 0) - goto cleanup_pernet; + goto cleanup_sockopt; + ret = nf_register_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); if (ret < 0) { pr_err("nf_conntrack_ipv6: can't register pre-routing defrag " "hook.\n"); - goto cleanup_ipv6; + goto cleanup_pernet; + } + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register tcp6 proto.\n"); + goto cleanup_hooks; + } + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register udp6 proto.\n"); + goto cleanup_tcp6; + } + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmpv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register icmpv6 proto.\n"); + goto cleanup_udp6; + } + + ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n"); + 
goto cleanup_icmpv6; } return ret; - cleanup_ipv6: - unregister_pernet_subsys(&ipv6_net_ops); + cleanup_icmpv6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); + cleanup_udp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6); + cleanup_tcp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + cleanup_hooks: + nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); cleanup_pernet: + unregister_pernet_subsys(&ipv6_net_ops); + cleanup_sockopt: nf_unregister_sockopt(&so_getorigdst6); return ret; } @@ -511,6 +523,10 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) static void __exit nf_conntrack_l3proto_ipv6_fini(void) { synchronize_net(); + nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); unregister_pernet_subsys(&ipv6_net_ops); nf_unregister_sockopt(&so_getorigdst6); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 24df3dde0076..b3807c5cb888 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -131,7 +131,8 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL, + NULL, NULL, "nf_ct_icmpv6: invalid new with type %d ", type + 128); return false; @@ -203,7 +204,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: short packet "); return -NF_ACCEPT; } @@ -211,7 +212,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: ICMPv6 checksum failed "); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3dacecc99065..dffdc1a389c5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -14,6 +14,8 @@ * 2 of the License, or (at your option) any later version. 
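The init path above now registers the conntrack protocols itself, and the unwind order is the exact reverse of the registration order, one goto label per acquired resource. A compact sketch of that ladder idiom, with illustrative resource names:

#include <stdio.h>

static int reg(const char *what)    { printf("register %s\n", what); return 0; }
static void unreg(const char *what) { printf("unregister %s\n", what); }

static int init(void)
{
        int ret;

        ret = reg("sockopt");
        if (ret < 0)
                goto out;
        ret = reg("pernet ops");
        if (ret < 0)
                goto cleanup_sockopt;
        ret = reg("hooks");
        if (ret < 0)
                goto cleanup_pernet;
        ret = reg("l4 protos");
        if (ret < 0)
                goto cleanup_hooks;
        return 0;

cleanup_hooks:
        unreg("hooks");
cleanup_pernet:
        unreg("pernet ops");
cleanup_sockopt:
        unreg("sockopt");
out:
        return ret;
}

int main(void)
{
        return init();
}

The module exit path then repeats the same reverse order unconditionally, which is why the fini hunk above mirrors the cleanup labels.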
*/ +#define pr_fmt(fmt) "IPv6-nf: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> @@ -39,6 +41,7 @@ #include <net/rawv6.h> #include <net/ndisc.h> #include <net/addrconf.h> +#include <net/inet_ecn.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <linux/sysctl.h> #include <linux/netfilter.h> @@ -97,9 +100,9 @@ static int nf_ct_frag6_sysctl_register(struct net *net) if (table == NULL) goto err_alloc; - table[0].data = &net->ipv6.frags.high_thresh; - table[1].data = &net->ipv6.frags.low_thresh; - table[2].data = &net->ipv6.frags.timeout; + table[0].data = &net->nf_frag.frags.timeout; + table[1].data = &net->nf_frag.frags.low_thresh; + table[2].data = &net->nf_frag.frags.high_thresh; } hdr = register_net_sysctl(net, "net/netfilter", table); @@ -136,6 +139,11 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) } #endif +static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) +{ + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); +} + static unsigned int nf_hashfn(struct inet_frag_queue *q) { const struct frag_queue *nq; @@ -164,7 +172,7 @@ static void nf_ct_frag6_expire(unsigned long data) /* Creation primitives. */ static inline struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, struct in6_addr *src, - struct in6_addr *dst) + struct in6_addr *dst, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -174,19 +182,18 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.user = user; arg.src = src; arg.dst = dst; + arg.ecn = ecn; read_lock_bh(&nf_frags.lock); hash = inet6_hash_frag(id, src, dst, nf_frags.rnd); q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); local_bh_enable(); - if (q == NULL) - goto oom; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct frag_queue, q); - -oom: - return NULL; } @@ -196,6 +203,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev, *next; unsigned int payload_len; int offset, end; + u8 ecn; if (fq->q.last_in & INET_FRAG_COMPLETE) { pr_debug("Already completed\n"); @@ -213,6 +221,8 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, return -1; } + ecn = ip6_frag_ecn(ipv6_hdr(skb)); + if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, @@ -317,9 +327,10 @@ found: } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; + fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - atomic_add(skb->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -328,9 +339,8 @@ found: fq->nhoffset = nhoff; fq->q.last_in |= INET_FRAG_FIRST_IN; } - write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); - write_unlock(&nf_frags.lock); + + inet_frag_lru_move(&fq->q); return 0; discard_fq: @@ -353,12 +363,17 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) { struct sk_buff *fp, *op, *head = fq->q.fragments; int payload_len; + u8 ecn; inet_frag_kill(&fq->q, &nf_frags); WARN_ON(head == NULL); WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); + ecn = ip_frag_ecn_table[fq->ecn]; + if (unlikely(ecn == 0xff)) + goto out_fail; + /* Unfragmented part is taken from the first segment. 
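The new ip6_frag_ecn() helper encodes a fragment's 2-bit ECN codepoint as a one-hot bit, so fq->ecn |= ecn accumulates the set of codepoints seen across all fragments; at reassembly the set is mapped to a final codepoint, or rejected (0xff) for invalid mixes such as Not-ECT together with CE. A small sketch of the encoding, assuming the standard RFC 3168 codepoint values; the kernel's ip_frag_ecn_table holds the real RFC 6040 mapping:

#include <stdio.h>
#include <stdint.h>

/* RFC 3168 codepoints held in the low two bits of the traffic class */
enum { ECN_NOT_ECT = 0, ECN_ECT_1 = 1, ECN_ECT_0 = 2, ECN_CE = 3 };

static uint8_t frag_ecn_bit(uint8_t dsfield)
{
        return 1 << (dsfield & 0x03);   /* 0x03 == INET_ECN_MASK */
}

int main(void)
{
        uint8_t acc = 0;

        acc |= frag_ecn_bit(ECN_ECT_0); /* first fragment */
        acc |= frag_ecn_bit(ECN_CE);    /* second fragment, marked en route */
        printf("codepoint set: 0x%02x\n", acc); /* 0x0c: { ECT(0), CE } */
        /* a set mixing Not-ECT with CE maps to 0xff; reassembly fails */
        return 0;
}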
*/ payload_len = ((head->data - skb_network_header(head)) - sizeof(struct ipv6hdr) + fq->q.len - @@ -369,7 +384,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) } /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + if (skb_unclone(head, GFP_ATOMIC)) { pr_debug("skb is cloned but can't expand head"); goto out_oom; } @@ -398,7 +413,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -422,13 +437,14 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; } - atomic_sub(head->truesize, &fq->q.net->mem); + sub_frag_mem_limit(&fq->q, head->truesize); head->local_df = 1; head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ @@ -573,7 +589,8 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false); local_bh_enable(); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr); + fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, + ip6_frag_ecn(hdr)); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 745a32042950..f3c1ff4357ff 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -90,6 +90,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -99,6 +100,7 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), + SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -129,6 +131,7 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -139,6 +142,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -247,7 +251,7 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) static int snmp6_dev_seq_open(struct inode *inode, struct file *file) { - return single_open(file, snmp6_dev_seq_show, PDE(inode)->data); + return single_open(file, snmp6_dev_seq_show, PDE_DATA(inode)); } static const struct file_operations snmp6_dev_seq_fops = { @@ -287,19 +291,18 @@ int snmp6_unregister_dev(struct inet6_dev *idev) 
return -ENOENT; if (!idev->stats.proc_dir_entry) return -EINVAL; - remove_proc_entry(idev->stats.proc_dir_entry->name, - net->mib.proc_net_devsnmp6); + proc_remove(idev->stats.proc_dir_entry); idev->stats.proc_dir_entry = NULL; return 0; } static int __net_init ipv6_proc_init_net(struct net *net) { - if (!proc_net_fops_create(net, "sockstat6", S_IRUGO, - &sockstat6_seq_fops)) + if (!proc_create("sockstat6", S_IRUGO, net->proc_net, + &sockstat6_seq_fops)) return -ENOMEM; - if (!proc_net_fops_create(net, "snmp6", S_IRUGO, &snmp6_seq_fops)) + if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops)) goto proc_snmp6_fail; net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); @@ -308,17 +311,17 @@ static int __net_init ipv6_proc_init_net(struct net *net) return 0; proc_dev_snmp6_fail: - proc_net_remove(net, "snmp6"); + remove_proc_entry("snmp6", net->proc_net); proc_snmp6_fail: - proc_net_remove(net, "sockstat6"); + remove_proc_entry("sockstat6", net->proc_net); return -ENOMEM; } static void __net_exit ipv6_proc_exit_net(struct net *net) { - proc_net_remove(net, "sockstat6"); - proc_net_remove(net, "dev_snmp6"); - proc_net_remove(net, "snmp6"); + remove_proc_entry("sockstat6", net->proc_net); + remove_proc_entry("dev_snmp6", net->proc_net); + remove_proc_entry("snmp6", net->proc_net); } static struct pernet_operations ipv6_proc_ops = { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 70fa81449997..eedff8ccded5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -71,10 +71,9 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, const struct in6_addr *rmt_addr, int dif) { - struct hlist_node *node; bool is_multicast = ipv6_addr_is_multicast(loc_addr); - sk_for_each_from(sk, node) + sk_for_each_from(sk) if (inet_sk(sk)->inet_num == num) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -264,7 +263,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; - if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another @@ -499,9 +498,8 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, sin6->sin6_port = 0; sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = IP6CB(skb)->iif; + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); } sock_recv_ts_and_drops(msg, sk, skb); @@ -803,7 +801,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && - ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) @@ -1292,7 +1290,7 @@ static const struct file_operations raw6_seq_fops = { static int __net_init raw6_init_net(struct net *net) { - if (!proc_net_fops_create(net, "raw6", S_IRUGO, &raw6_seq_fops)) + if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops)) return -ENOMEM; return 0; @@ -1300,7 +1298,7 @@ static int __net_init raw6_init_net(struct net *net) static void __net_exit raw6_exit_net(struct net *net) { - proc_net_remove(net, "raw6"); + remove_proc_entry("raw6", net->proc_net); } static struct pernet_operations 
raw6_net_ops = { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e5253ec9e0fc..790d9f4b8b0b 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -26,6 +26,9 @@ * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ + +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> @@ -55,6 +58,7 @@ #include <net/ndisc.h> #include <net/addrconf.h> #include <net/inet_frag.h> +#include <net/inet_ecn.h> struct ip6frag_skb_cb { @@ -64,6 +68,10 @@ struct ip6frag_skb_cb #define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) +static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) +{ + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); +} static struct inet_frags ip6_frags; @@ -79,20 +87,8 @@ unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, { u32 c; - c = jhash_3words((__force u32)saddr->s6_addr32[0], - (__force u32)saddr->s6_addr32[1], - (__force u32)saddr->s6_addr32[2], - rnd); - - c = jhash_3words((__force u32)saddr->s6_addr32[3], - (__force u32)daddr->s6_addr32[0], - (__force u32)daddr->s6_addr32[1], - c); - - c = jhash_3words((__force u32)daddr->s6_addr32[2], - (__force u32)daddr->s6_addr32[3], - (__force u32)id, - c); + c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, rnd); return c & (INETFRAGS_HASHSZ - 1); } @@ -128,6 +124,7 @@ void ip6_frag_init(struct inet_frag_queue *q, void *a) fq->user = arg->user; fq->saddr = *arg->src; fq->daddr = *arg->dst; + fq->ecn = arg->ecn; } EXPORT_SYMBOL(ip6_frag_init); @@ -182,7 +179,8 @@ static void ip6_frag_expire(unsigned long data) } static __inline__ struct frag_queue * -fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6_addr *dst) +fq_find(struct net *net, __be32 id, const struct in6_addr *src, + const struct in6_addr *dst, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -192,14 +190,16 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6 arg.user = IP6_DEFRAG_LOCAL_DELIVER; arg.src = src; arg.dst = dst; + arg.ecn = ecn; read_lock(&ip6_frags.lock); hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); - if (q == NULL) + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; - + } return container_of(q, struct frag_queue, q); } @@ -210,6 +210,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct net_device *dev; int offset, end; struct net *net = dev_net(skb_dst(skb)->dev); + u8 ecn; if (fq->q.last_in & INET_FRAG_COMPLETE) goto err; @@ -227,6 +228,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, return -1; } + ecn = ip6_frag_ecn(ipv6_hdr(skb)); + if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, @@ -327,7 +330,8 @@ found: } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &fq->q.net->mem); + fq->ecn |= ecn; + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. 
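inet6_hash_frag() above shrinks from three chained jhash_3words() calls to one by first XOR-folding each 128-bit address to 32 bits, which is all ipv6_addr_hash() does. A userspace sketch of the folding; mix3() is only a placeholder for jhash_3words(), and the 64-slot mask mirrors the power-of-two table sizing:

#include <stdio.h>
#include <stdint.h>

struct addr128 { uint32_t w[4]; };

static uint32_t addr_fold(const struct addr128 *a)
{
        /* XOR-fold, as ipv6_addr_hash() does */
        return a->w[0] ^ a->w[1] ^ a->w[2] ^ a->w[3];
}

static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
        /* placeholder mixer standing in for jhash_3words() */
        uint32_t h = seed;
        h = (h ^ a) * 0x9e3779b1u;
        h = (h ^ b) * 0x9e3779b1u;
        h = (h ^ c) * 0x9e3779b1u;
        return h ^ (h >> 16);
}

int main(void)
{
        struct addr128 src = { { 0x20010db8u, 0, 0, 1 } };
        struct addr128 dst = { { 0x20010db8u, 0, 0, 2 } };
        uint32_t id = 0xabcdu, rnd = 0x5eedu;
        uint32_t h = mix3(addr_fold(&src), addr_fold(&dst), id, rnd);

        printf("bucket %u\n", h & (64 - 1));    /* power-of-two table */
        return 0;
}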
@@ -338,12 +342,18 @@ found: } if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len) - return ip6_frag_reasm(fq, prev, dev); + fq->q.meat == fq->q.len) { + int res; + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + res = ip6_frag_reasm(fq, prev, dev); + skb->_skb_refdst = orefdst; + return res; + } - write_lock(&ip6_frags.lock); - list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); - write_unlock(&ip6_frags.lock); + skb_dst_drop(skb); + inet_frag_lru_move(&fq->q); return -1; discard_fq: @@ -372,9 +382,14 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, int payload_len; unsigned int nhoff; int sum_truesize; + u8 ecn; inet_frag_kill(&fq->q, &ip6_frags); + ecn = ip_frag_ecn_table[fq->ecn]; + if (unlikely(ecn == 0xff)) + goto out_fail; + /* Make the one we just received the head. */ if (prev) { head = prev->next; @@ -406,7 +421,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; /* If the first fragment is fragmented itself, we split @@ -429,7 +444,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -467,12 +482,13 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } fp = next; } - atomic_sub(sum_truesize, &fq->q.net->mem); + sub_frag_mem_limit(&fq->q, sum_truesize); head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); IP6CB(head)->nhoff = nhoff; /* Yes, and fold redundant checksum back. 
8) */ @@ -536,7 +552,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb) IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS, evicted); - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr); + fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, + ip6_frag_ecn(hdr)); if (fq != NULL) { int ret; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 363d8b7772e8..ad0aa6b0b86a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -145,25 +145,12 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, struct neighbour *n; daddr = choose_neigh_daddr(rt, skb, daddr); - n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr); + n = __ipv6_neigh_lookup(dst->dev, daddr); if (n) return n; return neigh_create(&nd_tbl, daddr, dst->dev); } -static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev) -{ - struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway); - if (!n) { - n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev); - if (IS_ERR(n)) - return PTR_ERR(n); - } - rt->n = n; - - return 0; -} - static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .protocol = cpu_to_be16(ETH_P_IPV6), @@ -300,9 +287,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; - - if (rt->n) - neigh_release(rt->n); + struct dst_entry *from = dst->from; if (!(rt->dst.flags & DST_HOST)) dst_destroy_metrics_generic(dst); @@ -312,8 +297,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from) - dst_release(dst->from); + dst->from = NULL; + dst_release(from); if (rt6_has_peer(rt)) { struct inet_peer *peer = rt6_peer_ptr(rt); @@ -354,11 +339,6 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, in6_dev_put(idev); } } - if (rt->n && rt->n->dev == dev) { - rt->n->dev = loopback_dev; - dev_hold(loopback_dev); - dev_put(dev); - } } } @@ -388,15 +368,8 @@ static int rt6_info_hash_nhsfn(unsigned int candidate_count, { unsigned int val = fl6->flowi6_proto; - val ^= (__force u32)fl6->daddr.s6_addr32[0]; - val ^= (__force u32)fl6->daddr.s6_addr32[1]; - val ^= (__force u32)fl6->daddr.s6_addr32[2]; - val ^= (__force u32)fl6->daddr.s6_addr32[3]; - - val ^= (__force u32)fl6->saddr.s6_addr32[0]; - val ^= (__force u32)fl6->saddr.s6_addr32[1]; - val ^= (__force u32)fl6->saddr.s6_addr32[2]; - val ^= (__force u32)fl6->saddr.s6_addr32[3]; + val ^= ipv6_addr_hash(&fl6->daddr); + val ^= ipv6_addr_hash(&fl6->saddr); /* Work only if this not encapsulated */ switch (fl6->flowi6_proto) { @@ -505,24 +478,34 @@ static void rt6_probe(struct rt6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ - neigh = rt ? 
rt->n : NULL; - if (!neigh || (neigh->nud_state & NUD_VALID)) + if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) return; - read_lock_bh(&neigh->lock); - if (!(neigh->nud_state & NUD_VALID) && + rcu_read_lock_bh(); + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) { + write_lock(&neigh->lock); + if (neigh->nud_state & NUD_VALID) + goto out; + } + + if (!neigh || time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { struct in6_addr mcaddr; struct in6_addr *target; - neigh->updated = jiffies; - read_unlock_bh(&neigh->lock); + if (neigh) { + neigh->updated = jiffies; + write_unlock(&neigh->lock); + } - target = (struct in6_addr *)&neigh->primary_key; + target = (struct in6_addr *)&rt->rt6i_gateway; addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL); } else { - read_unlock_bh(&neigh->lock); +out: + write_unlock(&neigh->lock); } + rcu_read_unlock_bh(); } #else static inline void rt6_probe(struct rt6_info *rt) @@ -549,20 +532,24 @@ static inline bool rt6_check_neigh(struct rt6_info *rt) struct neighbour *neigh; bool ret = false; - neigh = rt->n; if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) - ret = true; - else if (neigh) { - read_lock_bh(&neigh->lock); + return true; + + rcu_read_lock_bh(); + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) { + read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) ret = true; #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(neigh->nud_state & NUD_FAILED)) ret = true; #endif - read_unlock_bh(&neigh->lock); + read_unlock(&neigh->lock); } + rcu_read_unlock_bh(); + return ret; } @@ -838,8 +825,6 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, rt = ip6_rt_copy(ort, daddr); if (rt) { - int attempts = !in_softirq(); - if (!(rt->rt6i_flags & RTF_GATEWAY)) { if (ort->rt6i_dst.plen != 128 && ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) @@ -855,32 +840,6 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, rt->rt6i_src.plen = 128; } #endif - - retry: - if (rt6_bind_neighbour(rt, rt->dst.dev)) { - struct net *net = dev_net(rt->dst.dev); - int saved_rt_min_interval = - net->ipv6.sysctl.ip6_rt_gc_min_interval; - int saved_rt_elasticity = - net->ipv6.sysctl.ip6_rt_gc_elasticity; - - if (attempts-- > 0) { - net->ipv6.sysctl.ip6_rt_gc_elasticity = 1; - net->ipv6.sysctl.ip6_rt_gc_min_interval = 0; - - ip6_dst_gc(&net->ipv6.ip6_dst_ops); - - net->ipv6.sysctl.ip6_rt_gc_elasticity = - saved_rt_elasticity; - net->ipv6.sysctl.ip6_rt_gc_min_interval = - saved_rt_min_interval; - goto retry; - } - - net_warn_ratelimited("Neighbour table overflow\n"); - dst_free(&rt->dst); - return NULL; - } } return rt; @@ -891,10 +850,8 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, { struct rt6_info *rt = ip6_rt_copy(ort, daddr); - if (rt) { + if (rt) rt->rt6i_flags |= RTF_CACHE; - rt->n = neigh_clone(ort->n); - } return rt; } @@ -928,7 +885,7 @@ restart: dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - if (!rt->n && !(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_LOCAL))) + if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); else if (!(rt->dst.flags & DST_HOST)) nrt = rt6_alloc_clone(rt, &fl6->daddr); @@ -994,7 +951,7 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, - .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, + .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, 
.flowi6_proto = iph->nexthdr, }; @@ -1054,7 +1011,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags; - rt6_clean_expires(rt); rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -1159,7 +1115,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, fl6.flowi6_flags = 0; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; - fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; + fl6.flowlabel = ip6_flowinfo(iph); dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -1187,7 +1143,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) fl6.flowi6_flags = 0; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; - fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; + fl6.flowlabel = ip6_flowinfo(iph); dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -1247,7 +1203,6 @@ static struct dst_entry *icmp6_dst_gc_list; static DEFINE_SPINLOCK(icmp6_dst_lock); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, - struct neighbour *neigh, struct flowi6 *fl6) { struct dst_entry *dst; @@ -1265,20 +1220,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, goto out; } - if (neigh) - neigh_hold(neigh); - else { - neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr); - if (IS_ERR(neigh)) { - in6_dev_put(idev); - dst_free(&rt->dst); - return ERR_CAST(neigh); - } - } - rt->dst.flags |= DST_HOST; rt->dst.output = ip6_output; - rt->n = neigh; atomic_set(&rt->dst.__refcnt, 1); rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; @@ -1587,12 +1530,6 @@ int ip6_route_add(struct fib6_config *cfg) } else rt->rt6i_prefsrc.plen = 0; - if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { - err = rt6_bind_neighbour(rt, dev); - if (err) - goto out; - } - rt->rt6i_flags = cfg->fc_flags; install_route: @@ -1705,37 +1642,32 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu struct net *net = dev_net(skb->dev); struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; - const struct in6_addr *target; struct ndisc_options ndopts; - const struct in6_addr *dest; - struct neighbour *old_neigh; struct inet6_dev *in6_dev; struct neighbour *neigh; - struct icmp6hdr *icmph; + struct rd_msg *msg; int optlen, on_link; u8 *lladdr; optlen = skb->tail - skb->transport_header; - optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + optlen -= sizeof(*msg); if (optlen < 0) { net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); return; } - icmph = icmp6_hdr(skb); - target = (const struct in6_addr *) (icmph + 1); - dest = target + 1; + msg = (struct rd_msg *)icmp6_hdr(skb); - if (ipv6_addr_is_multicast(dest)) { + if (ipv6_addr_is_multicast(&msg->dest)) { net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); return; } on_link = 0; - if (ipv6_addr_equal(dest, target)) { + if (ipv6_addr_equal(&msg->dest, &msg->target)) { on_link = 1; - } else if (ipv6_addr_type(target) != + } else if (ipv6_addr_type(&msg->target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); return; @@ -1752,7 +1684,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * first-hop router for the specified ICMP Destination Address. 
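The redirect rework above drops the error-prone pointer arithmetic (icmph + 1 for the target, target + 1 for the destination) in favour of a struct rd_msg overlay on the ICMPv6 redirect message. A userspace analogue of that layout, approximating the kernel's definition in include/net/ndisc.h:

#include <netinet/icmp6.h>      /* struct icmp6_hdr */
#include <netinet/in.h>         /* struct in6_addr */
#include <stdint.h>
#include <stdio.h>

struct rd_msg_sketch {
        struct icmp6_hdr icmph;         /* type 137, code 0, cksum, reserved */
        struct in6_addr  target;        /* better first hop */
        struct in6_addr  dest;          /* destination being redirected */
        uint8_t          opt[];         /* ND options, e.g. target lladdr */
};

int main(void)
{
        /* optlen in the hunk above is the skb length past this fixed part */
        printf("fixed part: %zu bytes\n", sizeof(struct rd_msg_sketch));
        return 0;
}

With the overlay, the on-link test is simply ipv6_addr_equal(&msg->dest, &msg->target), and ndisc_parse_options() starts at msg->opt.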
*/ - if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { + if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } @@ -1779,15 +1711,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu */ dst_confirm(&rt->dst); - neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); if (!neigh) return; - /* Duplicate redirect: silently ignore. */ - old_neigh = rt->n; - if (neigh == old_neigh) - goto out; - /* * We have finally decided to accept it. */ @@ -1799,7 +1726,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)) ); - nrt = ip6_rt_copy(rt, dest); + nrt = ip6_rt_copy(rt, &msg->dest); if (!nrt) goto out; @@ -1808,16 +1735,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_flags &= ~RTF_GATEWAY; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - nrt->n = neigh_clone(neigh); if (ip6_ins_rt(nrt)) goto out; netevent.old = &rt->dst; - netevent.old_neigh = old_neigh; netevent.new = &nrt->dst; - netevent.new_neigh = neigh; - netevent.daddr = dest; + netevent.daddr = &msg->dest; + netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); if (rt->rt6i_flags & RTF_CACHE) { @@ -1859,8 +1784,6 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) == (RTF_DEFAULT | RTF_ADDRCONF)) rt6_set_from(rt, ort); - else - rt6_clean_expires(rt); rt->rt6i_metric = 0; #ifdef CONFIG_IPV6_SUBTREES @@ -1992,7 +1915,8 @@ void rt6_purge_dflt_routers(struct net *net) restart: read_lock_bh(&table->tb6_lock); for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && + (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); ip6_del_rt(rt); @@ -2123,7 +2047,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { struct net *net = dev_net(idev->dev); struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL); - int err; if (!rt) { net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n"); @@ -2142,11 +2065,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_flags |= RTF_ANYCAST; else rt->rt6i_flags |= RTF_LOCAL; - err = rt6_bind_neighbour(rt, rt->dst.dev); - if (err) { - dst_free(&rt->dst); - return ERR_PTR(err); - } rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; @@ -2437,7 +2355,7 @@ beginning: return last_err; } -static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) { struct fib6_config cfg; int err; @@ -2452,7 +2370,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a return ip6_route_del(&cfg); } -static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh) { struct fib6_config cfg; int err; @@ -2492,7 +2410,6 @@ static int rt6_fill_node(struct net *net, struct nlmsghdr *nlh; long expires; u32 table; - struct neighbour *n; if (prefix) { /* user wants prefix routes only */ if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { @@ -2605,9 +2522,8 @@ static int rt6_fill_node(struct net *net, if 
(rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) goto nla_put_failure; - n = rt->n; - if (n) { - if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) + if (rt->rt6i_flags & RTF_GATEWAY) { + if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0) goto nla_put_failure; } @@ -2646,7 +2562,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) prefix, 0, NLM_F_MULTI); } -static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; @@ -2802,7 +2718,6 @@ struct rt6_proc_arg static int rt6_info_route(struct rt6_info *rt, void *p_arg) { struct seq_file *m = p_arg; - struct neighbour *n; seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); @@ -2811,9 +2726,8 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg) #else seq_puts(m, "00000000000000000000000000000000 00 "); #endif - n = rt->n; - if (n) { - seq_printf(m, "%pi6", n->primary_key); + if (rt->rt6i_flags & RTF_GATEWAY) { + seq_printf(m, "%pi6", &rt->rt6i_gateway); } else { seq_puts(m, "00000000000000000000000000000000"); } @@ -3080,8 +2994,8 @@ static void __net_exit ip6_route_net_exit(struct net *net) static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops); - proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); + proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); + proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); #endif return 0; } @@ -3089,8 +3003,8 @@ static int __net_init ip6_route_net_init_late(struct net *net) static void __net_exit ip6_route_net_exit_late(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "ipv6_route"); - proc_net_remove(net, "rt6_stats"); + remove_proc_entry("ipv6_route", net->proc_net); + remove_proc_entry("rt6_stats", net->proc_net); #endif } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index cfba99b2c2a4..335363478bbf 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -49,7 +49,7 @@ #include <net/ip.h> #include <net/udp.h> #include <net/icmp.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/dsfield.h> @@ -72,6 +72,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); static void ipip6_dev_free(struct net_device *dev); +static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, + __be32 *v4dst); static struct rtnl_link_ops sit_link_ops __read_mostly; static int sit_net_id __read_mostly; @@ -85,41 +87,6 @@ struct sit_net { struct net_device *fb_tunnel_dev; }; -static struct rtnl_link_stats64 *ipip6_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -{ - int i; - - for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - 
} - - tot->rx_errors = dev->stats.rx_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - return tot; -} - /* * Must be invoked with rcu_read_lock */ @@ -590,17 +557,21 @@ out: return err; } +static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, + const struct in6_addr *v6addr) +{ + __be32 v4embed = 0; + if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed) + return true; + return false; +} + static int ipip6_rcv(struct sk_buff *skb) { - const struct iphdr *iph; + const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; int err; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto out; - - iph = ip_hdr(skb); - tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { @@ -613,10 +584,19 @@ static int ipip6_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); skb->pkt_type = PACKET_HOST; - if ((tunnel->dev->priv_flags & IFF_ISATAP) && - !isatap_chksrc(skb, iph, tunnel)) { - tunnel->dev->stats.rx_errors++; - goto out; + if (tunnel->dev->priv_flags & IFF_ISATAP) { + if (!isatap_chksrc(skb, iph, tunnel)) { + tunnel->dev->stats.rx_errors++; + goto out; + } + } else { + if (is_spoofed_6rd(tunnel, iph->saddr, + &ipv6_hdr(skb)->saddr) || + is_spoofed_6rd(tunnel, iph->daddr, + &ipv6_hdr(skb)->daddr)) { + tunnel->dev->stats.rx_errors++; + goto out; + } } __skb_tunnel_rx(skb, tunnel->dev); @@ -650,14 +630,12 @@ out: } /* - * Returns the embedded IPv4 address if the IPv6 address - * comes from 6rd / 6to4 (RFC 3056) addr space. + * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function + * stores the embedded IPv4 address in v4dst and returns true. */ -static inline -__be32 try_6rd(const struct in6_addr *v6dst, struct ip_tunnel *tunnel) +static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, + __be32 *v4dst) { - __be32 dst = 0; - #ifdef CONFIG_IPV6_SIT_6RD if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, tunnel->ip6rd.prefixlen)) { @@ -676,14 +654,24 @@ __be32 try_6rd(const struct in6_addr *v6dst, struct ip_tunnel *tunnel) d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> (32 - pbi1); - dst = tunnel->ip6rd.relay_prefix | htonl(d); + *v4dst = tunnel->ip6rd.relay_prefix | htonl(d); + return true; } #else if (v6dst->s6_addr16[0] == htons(0x2002)) { /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... 
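check_6rd() here centralises the embedded-IPv4 extraction so the receive path can reject spoofed tunnel packets: if the inner IPv6 address embeds an IPv4 address (a configured 6rd prefix, or 2002::/16 for 6to4 as in the branch continued just below), it must match the outer IPv4 header. A userspace sketch of the 6to4 branch only; names here are illustrative:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <arpa/inet.h>

/* 6to4: in 2002::/16 the IPv4 address occupies bytes 2..5. */
static bool check_6to4(const struct in6_addr *v6, uint32_t *v4 /* net order */)
{
        if (v6->s6_addr[0] != 0x20 || v6->s6_addr[1] != 0x02)
                return false;
        memcpy(v4, &v6->s6_addr[2], 4);
        return true;
}

static bool is_spoofed(const struct in6_addr *v6, uint32_t outer_v4)
{
        uint32_t embedded;

        return check_6to4(v6, &embedded) && embedded != outer_v4;
}

int main(void)
{
        struct in6_addr a;
        uint32_t outer;

        inet_pton(AF_INET6, "2002:c000:201::1", &a);    /* embeds 192.0.2.1 */
        inet_pton(AF_INET, "192.0.2.1", &outer);
        printf("spoofed: %d\n", is_spoofed(&a, outer)); /* 0 */
        return 0;
}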
*/ - memcpy(&dst, &v6dst->s6_addr16[1], 4); + memcpy(v4dst, &v6dst->s6_addr16[1], 4); + return true; } #endif + return false; +} + +static inline __be32 try_6rd(struct ip_tunnel *tunnel, + const struct in6_addr *v6dst) +{ + __be32 dst = 0; + check_6rd(tunnel, v6dst, &dst); return dst; } @@ -744,7 +732,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (!dst) - dst = try_6rd(&iph6->daddr, tunnel); + dst = try_6rd(tunnel, &iph6->daddr); if (!dst) { struct neighbour *neigh = NULL; @@ -876,6 +864,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if ((iph->ttl = tiph->ttl) == 0) iph->ttl = iph6->hop_limit; + skb->ip_summed = CHECKSUM_NONE; + ip_select_ident(iph, skb_dst(skb), NULL); iptunnel_xmit(skb, dev); return NETDEV_TX_OK; @@ -1177,7 +1167,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_start_xmit = ipip6_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_change_mtu = ipip6_tunnel_change_mtu, - .ndo_get_stats64= ipip6_get_stats64, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; static void ipip6_dev_free(struct net_device *dev) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 40161977f7cf..d5dda20bd717 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -149,7 +149,6 @@ static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; - const u8 *hash_location; struct inet_request_sock *ireq; struct inet6_request_sock *ireq6; struct tcp_request_sock *treq; @@ -177,9 +176,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) goto out; ret = NULL; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4f43537197ef..71167069b394 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -386,9 +386,17 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (dst) dst->ops->redirect(dst, sk, skb); + goto out; } if (type == ICMPV6_PKT_TOOBIG) { + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). 
+ */ + if (sk->sk_state == TCP_LISTEN) + goto out; + tp->mtu_info = ntohl(info); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); @@ -454,7 +462,6 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6, struct request_sock *req, - struct request_values *rvp, u16 queue_mapping) { struct inet6_request_sock *treq = inet6_rsk(req); @@ -466,7 +473,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, rvp, NULL); + skb = tcp_make_synack(sk, dst, req, NULL); if (skb) { __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); @@ -481,13 +488,12 @@ done: return err; } -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, - struct request_values *rvp) +static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) { struct flowi6 fl6; int res; - res = tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0); + res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0); if (!res) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); return res; @@ -713,7 +719,8 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { #endif static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - u32 ts, struct tcp_md5sig_key *key, int rst, u8 tclass) + u32 tsval, u32 tsecr, + struct tcp_md5sig_key *key, int rst, u8 tclass) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -725,7 +732,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, struct dst_entry *dst; __be32 *topt; - if (ts) + if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG if (key) @@ -755,11 +762,11 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, topt = (__be32 *)(t1 + 1); - if (ts) { + if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *topt++ = htonl(tcp_time_stamp); - *topt++ = htonl(ts); + *topt++ = htonl(tsval); + *topt++ = htonl(tsecr); } #ifdef CONFIG_TCP_MD5SIG @@ -835,7 +842,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. 
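Splitting the old ts parameter into tsval/tsecr above lets ACKs carry a per-connection timestamp (tcp_time_stamp + tw_ts_offset for timewait ACKs) while still echoing the peer's value. The 12-byte aligned timestamp option itself is unchanged; a sketch of how that word-aligned encoding is built (RFC 1323 option kind 8, length 10):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

static void write_tsopt(uint32_t *topt, uint32_t tsval, uint32_t tsecr)
{
        *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                        (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
        *topt++ = htonl(tsval);         /* sender's clock */
        *topt   = htonl(tsecr);         /* echo of the peer's TSval */
}

int main(void)
{
        uint32_t opt[3];

        write_tsopt(opt, 123456u, 654321u);
        printf("%08x %08x %08x\n",
               ntohl(opt[0]), ntohl(opt[1]), ntohl(opt[2]));
        return 0;       /* first word: 0101080a */
}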
*/ sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, &ipv6h->daddr, + &tcp_hashinfo, &ipv6h->saddr, + th->source, &ipv6h->daddr, ntohs(th->source), inet6_iif(skb)); if (!sk1) return; @@ -859,7 +867,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - tcp_v6_send_response(skb, seq, ack_seq, 0, 0, key, 1, 0); + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, 0); #ifdef CONFIG_TCP_MD5SIG release_sk1: @@ -870,10 +878,11 @@ release_sk1: #endif } -static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 tsval, u32 tsecr, struct tcp_md5sig_key *key, u8 tclass) { - tcp_v6_send_response(skb, seq, ack, win, ts, key, 0, tclass); + tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, key, 0, tclass); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -883,6 +892,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw), tw->tw_tclass); @@ -892,7 +902,8 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent, + tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, + req->rcv_wnd, tcp_time_stamp, req->ts_recent, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0); } @@ -935,9 +946,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; - const u8 *hash_location; struct request_sock *req; struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); @@ -975,50 +984,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); - - if (tmp_opt.cookie_plus > 0 && - tmp_opt.saw_tstamp && - !tp->rx_opt.cookie_out_never && - (sysctl_tcp_cookie_size > 0 || - (tp->cookie_values != NULL && - tp->cookie_values->cookie_desired > 0))) { - u8 *c; - u32 *d; - u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; - int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; - - if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) - goto drop_and_free; - - /* Secret recipe starts with IP addresses */ - d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0]; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0]; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - - /* plus variable length Initiator Cookie */ - c = (u8 *)mess; - while (l-- > 0) - *c++ ^= *hash_location++; - - want_cookie = false; /* not our kind of cookie */ - tmp_ext.cookie_out_never = 0; /* false */ - tmp_ext.cookie_plus = tmp_opt.cookie_plus; - } else if (!tp->rx_opt.cookie_in_always) { - /* redundant indications, but ensure initialization. 
*/ - tmp_ext.cookie_out_never = 1; /* true */ - tmp_ext.cookie_plus = 0; - } else { - goto drop_and_free; - } - tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1030,7 +996,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) treq->rmt_addr = ipv6_hdr(skb)->saddr; treq->loc_addr = ipv6_hdr(skb)->daddr; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb); + TCP_ECN_create_request(req, skb, sock_net(sk)); treq->iif = sk->sk_bound_dev_if; @@ -1096,7 +1062,6 @@ have_isn: goto drop_and_release; if (tcp_v6_send_synack(sk, dst, &fl6, req, - (struct request_values *)&tmp_ext, skb_get_queue_mapping(skb)) || want_cookie) goto drop_and_free; @@ -1167,7 +1132,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - newnp->rcv_tclass = ipv6_tclass(ipv6_hdr(skb)); + newnp->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count @@ -1247,7 +1212,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - newnp->rcv_tclass = ipv6_tclass(ipv6_hdr(skb)); + newnp->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); /* Clone native IPv6 options from listening socket (if any) @@ -1440,6 +1405,7 @@ discard: kfree_skb(skb); return 0; csum_err: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1460,7 +1426,7 @@ ipv6_pktoptions: if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; if (np->rxopt.bits.rxtclass) - np->rcv_tclass = ipv6_tclass(ipv6_hdr(skb)); + np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); if (ipv6_opt_accepted(sk, opt_skb)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); @@ -1501,7 +1467,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) goto discard_it; if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb)) - goto bad_packet; + goto csum_error; th = tcp_hdr(skb); hdr = ipv6_hdr(skb); @@ -1565,6 +1531,8 @@ no_tcp_socket: goto discard_it; if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +csum_error: + TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { @@ -1572,11 +1540,6 @@ bad_packet: } discard_it: - - /* - * Discard frame - */ - kfree_skb(skb); return 0; @@ -1590,10 +1553,13 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + if (skb->len < (th->doff<<2)) { inet_twsk_put(inet_twsk(sk)); - goto discard_it; + goto bad_packet; + } + if (tcp_checksum_complete(skb)) { + inet_twsk_put(inet_twsk(sk)); + goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { @@ -1602,6 +1568,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); if (sk2 != NULL) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index fb083295ff0b..d4defdd44937 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -45,6 +45,7 @@ #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> 
+#include <net/inet6_hashtables.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -203,7 +204,8 @@ static struct sock *udp6_lib_lookup2(struct net *net, { struct sock *sk, *result; struct hlist_nulls_node *node; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; begin: result = NULL; @@ -214,8 +216,18 @@ begin: if (score > badness) { result = sk; badness = score; - if (score == SCORE2_MAX) + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } else if (score == SCORE2_MAX) goto exact_match; + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -249,7 +261,8 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; rcu_read_lock(); if (hslot->count > 10) { @@ -284,6 +297,17 @@ begin: if (score > badness) { result = sk; badness = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -426,15 +450,16 @@ try_again: sin6->sin6_family = AF_INET6; sin6->sin6_port = udp_hdr(skb)->source; sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (is_udp4) + if (is_udp4) { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin6->sin6_addr); - else { + sin6->sin6_scope_id = 0; + } else { sin6->sin6_addr = ipv6_hdr(skb)->saddr; - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = IP6CB(skb)->iif; + sin6->sin6_scope_id = + ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); } } @@ -458,12 +483,17 @@ out: csum_copy_err: slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) { - if (is_udp4) + if (is_udp4) { + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - else + } else { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); UDP6_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + } } unlock_sock_fast(sk, slow); @@ -612,7 +642,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rcu_access_pointer(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) - goto drop; + goto csum_error; } if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) @@ -631,6 +661,8 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) bh_unlock_sock(sk); return rc; +csum_error: + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); @@ -752,40 +784,6 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, return 0; } -static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, - int proto) -{ - int err; - - UDP_SKB_CB(skb)->partial_cov = 0; - UDP_SKB_CB(skb)->cscov = skb->len; - - if (proto == IPPROTO_UDPLITE) { - err = udplite_checksum_init(skb, uh); - if (err) - return err; - } - - if (uh->check == 0) { - /* RFC 2460 section 8.1 says that we SHOULD log - this error. Well, it is reasonable. 
- */ - LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); - return 1; - } - if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, - skb->len, proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (!skb_csum_unnecessary(skb)) - skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len, proto, 0)); - - return 0; -} - int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { @@ -826,7 +824,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, } if (udp6_csum_init(skb, uh, proto)) - goto discard; + goto csum_error; /* * Multicast receive code @@ -859,7 +857,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto discard; if (udp_lib_checksum_complete(skb)) - goto discard; + goto csum_error; UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); @@ -876,7 +874,9 @@ short_packet: skb->len, daddr, ntohs(uh->dest)); - + goto discard; +csum_error: + UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); @@ -1128,7 +1128,7 @@ do_udp_sendmsg: if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && - ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) @@ -1295,10 +1295,18 @@ do_confirm: void udpv6_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); lock_sock(sk); udp_v6_flush_pending_frames(sk); release_sock(sk); + if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + inet6_destroy_sock(sk); } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 0c8934a317c2..3bb3a891a424 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -21,6 +21,10 @@ static int udp6_ufo_send_check(struct sk_buff *skb) const struct ipv6hdr *ipv6h; struct udphdr *uh; + /* UDP Tunnel offload on ipv6 is not yet supported. */ + if (skb->encapsulation) + return -EINVAL; + if (!pskb_may_pull(skb, sizeof(*uh))) return -EINVAL; @@ -56,7 +60,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, /* Packet is from an untrusted source, reset gso_segs. 
*/ int type = skb_shinfo(skb)->gso_type; - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + if (unlikely(type & ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_GRE) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 9f2095b19ad0..4770d515c2c8 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -49,8 +49,11 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) sizeof(top_iph->flow_lbl)); top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); - dsfield = XFRM_MODE_SKB_CB(skb)->tos; - dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) + dsfield = 0; + else + dsfield = XFRM_MODE_SKB_CB(skb)->tos; + dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos); if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); @@ -69,8 +72,8 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; - if (skb_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_unclone(skb, GFP_ATOMIC); + if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index c9844135c9ca..4ef7bdb65440 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -110,7 +110,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ - xdst->u.rt6.n = neigh_clone(rt->n); xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->u.rt6.rt6i_metric = rt->rt6i_metric; @@ -321,7 +320,51 @@ static struct ctl_table xfrm6_policy_table[] = { { } }; -static struct ctl_table_header *sysctl_hdr; +static int __net_init xfrm6_net_init(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = xfrm6_policy_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; + } + + hdr = register_net_sysctl(net, "net/ipv6", table); + if (!hdr) + goto err_reg; + + net->ipv6.sysctl.xfrm6_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit xfrm6_net_exit(struct net *net) +{ + struct ctl_table *table; + + if (net->ipv6.sysctl.xfrm6_hdr == NULL) + return; + + table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations xfrm6_net_ops = { + .init = xfrm6_net_init, + .exit = xfrm6_net_exit, +}; #endif int __init xfrm6_init(void) @@ -340,8 +383,7 @@ int __init xfrm6_init(void) goto out_policy; #ifdef CONFIG_SYSCTL - sysctl_hdr = register_net_sysctl(&init_net, "net/ipv6", - xfrm6_policy_table); + register_pernet_subsys(&xfrm6_net_ops); #endif out: return ret; @@ -353,8 +395,7 @@ out_policy: void xfrm6_fini(void) { #ifdef CONFIG_SYSCTL - if (sysctl_hdr) - unregister_net_sysctl_table(sysctl_hdr); + unregister_pernet_subsys(&xfrm6_net_ops); #endif xfrm6_policy_fini(); xfrm6_state_fini(); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index ee5a7065aacc..de2bcfaaf759 100644 --- 
a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -72,7 +72,7 @@ static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *ad { unsigned int h; - h = (__force u32)(addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]); + h = ipv6_addr_hash((const struct in6_addr *)addr); h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; @@ -89,12 +89,11 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; - struct hlist_node *pos; - hlist_for_each_entry_rcu(x6spi, pos, + hlist_for_each_entry_rcu(x6spi, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) + if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } @@ -120,9 +119,8 @@ static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; int index = xfrm6_tunnel_spi_hash_byspi(spi); - struct hlist_node *pos; - hlist_for_each_entry(x6spi, pos, + hlist_for_each_entry(x6spi, &xfrm6_tn->spi_byspi[index], list_byspi) { if (x6spi->spi == spi) @@ -203,15 +201,15 @@ static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; - struct hlist_node *pos, *n; + struct hlist_node *n; spin_lock_bh(&xfrm6_tunnel_spi_lock); - hlist_for_each_entry_safe(x6spi, pos, n, + hlist_for_each_entry_safe(x6spi, n, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + if (xfrm6_addr_equal(&x6spi->addr, saddr)) { if (atomic_dec_and_test(&x6spi->refcnt)) { hlist_del_rcu(&x6spi->list_byaddr); hlist_del_rcu(&x6spi->list_byspi); diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index dfd6faaf0ea7..f547a47d381c 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -228,9 +228,8 @@ static struct sock *__ipxitf_find_socket(struct ipx_interface *intrfc, __be16 port) { struct sock *s; - struct hlist_node *node; - sk_for_each(s, node, &intrfc->if_sklist) + sk_for_each(s, &intrfc->if_sklist) if (ipx_sk(s)->port == port) goto found; s = NULL; @@ -259,12 +258,11 @@ static struct sock *ipxitf_find_internal_socket(struct ipx_interface *intrfc, __be16 port) { struct sock *s; - struct hlist_node *node; ipxitf_hold(intrfc); spin_lock_bh(&intrfc->if_sklist_lock); - sk_for_each(s, node, &intrfc->if_sklist) { + sk_for_each(s, &intrfc->if_sklist) { struct ipx_sock *ipxs = ipx_sk(s); if (ipxs->port == port && @@ -282,14 +280,14 @@ found: static void __ipxitf_down(struct ipx_interface *intrfc) { struct sock *s; - struct hlist_node *node, *t; + struct hlist_node *t; /* Delete all routes associated with this interface */ ipxrtr_del_routes(intrfc); spin_lock_bh(&intrfc->if_sklist_lock); /* error sockets */ - sk_for_each_safe(s, node, t, &intrfc->if_sklist) { + sk_for_each_safe(s, t, &intrfc->if_sklist) { struct ipx_sock *ipxs = ipx_sk(s); s->sk_err = ENOLINK; @@ -385,12 +383,11 @@ static int ipxitf_demux_socket(struct ipx_interface *intrfc, int is_broadcast = !memcmp(ipx->ipx_dest.node, ipx_broadcast_node, IPX_NODE_LEN); struct sock *s; - struct hlist_node *node; int rc; spin_lock_bh(&intrfc->if_sklist_lock); - sk_for_each(s, node, &intrfc->if_sklist) { + sk_for_each(s, &intrfc->if_sklist) { struct ipx_sock *ipxs = ipx_sk(s); if (ipxs->port == ipx->ipx_dest.sock && @@ 
-446,12 +443,11 @@ static struct sock *ncp_connection_hack(struct ipx_interface *intrfc, connection = (((int) *(ncphdr + 9)) << 8) | (int) *(ncphdr + 8); if (connection) { - struct hlist_node *node; /* Now we have to look for a special NCP connection handling * socket. Only these sockets have ipx_ncp_conn != 0, set by * SIOCIPXNCPCONN. */ spin_lock_bh(&intrfc->if_sklist_lock); - sk_for_each(sk, node, &intrfc->if_sklist) + sk_for_each(sk, &intrfc->if_sklist) if (ipx_sk(sk)->ipx_ncp_conn == connection) { sock_hold(sk); goto found; diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c index 02ff7f2f60d4..65e8833a2510 100644 --- a/net/ipx/ipx_proc.c +++ b/net/ipx/ipx_proc.c @@ -103,19 +103,18 @@ out: static __inline__ struct sock *ipx_get_socket_idx(loff_t pos) { struct sock *s = NULL; - struct hlist_node *node; struct ipx_interface *i; list_for_each_entry(i, &ipx_interfaces, node) { spin_lock_bh(&i->if_sklist_lock); - sk_for_each(s, node, &i->if_sklist) { + sk_for_each(s, &i->if_sklist) { if (!pos) break; --pos; } spin_unlock_bh(&i->if_sklist_lock); if (!pos) { - if (node) + if (s) goto found; break; } diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index b833677d83d6..0578d4fa00a9 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -305,8 +305,7 @@ static void irda_connect_response(struct irda_sock *self) IRDA_DEBUG(2, "%s()\n", __func__); - skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, - GFP_ATOMIC); + skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, GFP_KERNEL); if (skb == NULL) { IRDA_DEBUG(0, "%s() Unable to allocate sk_buff!\n", __func__); @@ -1120,7 +1119,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, } /* Allocate networking socket */ - sk = sk_alloc(net, PF_IRDA, GFP_ATOMIC, &irda_proto); + sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto); if (sk == NULL) return -ENOMEM; @@ -1386,6 +1385,8 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, IRDA_DEBUG(4, "%s()\n", __func__); + msg->msg_namelen = 0; + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &err); if (!skb) @@ -2567,8 +2568,7 @@ bed: err); /* If watchdog is still activated, kill it! */ - if(timer_pending(&(self->watchdog))) - del_timer(&(self->watchdog)); + del_timer(&(self->watchdog)); IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__); @@ -2584,8 +2584,10 @@ bed: NULL, NULL, NULL); /* Check if the we got some results */ - if (!self->cachedaddr) - return -EAGAIN; /* Didn't find any devices */ + if (!self->cachedaddr) { + err = -EAGAIN; /* Didn't find any devices */ + goto out; + } daddr = self->cachedaddr; /* Cleanup */ self->cachedaddr = 0; diff --git a/net/irda/ircomm/Kconfig b/net/irda/ircomm/Kconfig index 2d4c6b4a78d6..19492c1707b7 100644 --- a/net/irda/ircomm/Kconfig +++ b/net/irda/ircomm/Kconfig @@ -1,6 +1,6 @@ config IRCOMM tristate "IrCOMM protocol" - depends on IRDA + depends on IRDA && TTY help Say Y here if you want to build support for the IrCOMM protocol. 
To compile it as modules, choose M here: the modules will be diff --git a/net/irda/ircomm/ircomm_core.c b/net/irda/ircomm/ircomm_core.c index 52079f19bbbe..b797daac063c 100644 --- a/net/irda/ircomm/ircomm_core.c +++ b/net/irda/ircomm/ircomm_core.c @@ -117,7 +117,7 @@ struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line) IRDA_ASSERT(ircomm != NULL, return NULL;); - self = kzalloc(sizeof(struct ircomm_cb), GFP_ATOMIC); + self = kzalloc(sizeof(struct ircomm_cb), GFP_KERNEL); if (self == NULL) return NULL; diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index a68c88cdec6e..41ac7938268b 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -280,7 +280,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, struct tty_port *port = &self->port; DECLARE_WAITQUEUE(wait, current); int retval; - int do_clocal = 0, extra_count = 0; + int do_clocal = 0; unsigned long flags; IRDA_DEBUG(2, "%s()\n", __func__ ); @@ -289,8 +289,15 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, * If non-blocking mode is set, or the port is not enabled, * then make the check up front and then exit. */ - if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){ - /* nonblock mode is set or port is not enabled */ + if (test_bit(TTY_IO_ERROR, &tty->flags)) { + port->flags |= ASYNC_NORMAL_ACTIVE; + return 0; + } + + if (filp->f_flags & O_NONBLOCK) { + /* nonblock mode is set */ + if (tty->termios.c_cflag & CBAUD) + tty_port_raise_dtr_rts(port); port->flags |= ASYNC_NORMAL_ACTIVE; IRDA_DEBUG(1, "%s(), O_NONBLOCK requested!\n", __func__ ); return 0; @@ -315,18 +322,16 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, __FILE__, __LINE__, tty->driver->name, port->count); spin_lock_irqsave(&port->lock, flags); - if (!tty_hung_up_p(filp)) { - extra_count = 1; + if (!tty_hung_up_p(filp)) port->count--; - } - spin_unlock_irqrestore(&port->lock, flags); port->blocked_open++; + spin_unlock_irqrestore(&port->lock, flags); while (1) { - if (tty->termios.c_cflag & CBAUD) + if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags)) tty_port_raise_dtr_rts(port); - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); if (tty_hung_up_p(filp) || !test_bit(ASYNCB_INITIALIZED, &port->flags)) { @@ -361,13 +366,11 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, __set_current_state(TASK_RUNNING); remove_wait_queue(&port->open_wait, &wait); - if (extra_count) { - /* ++ is not atomic, so this should be protected - Jean II */ - spin_lock_irqsave(&port->lock, flags); + spin_lock_irqsave(&port->lock, flags); + if (!tty_hung_up_p(filp)) port->count++; - spin_unlock_irqrestore(&port->lock, flags); - } port->blocked_open--; + spin_unlock_irqrestore(&port->lock, flags); IRDA_DEBUG(1, "%s(%d):block_til_ready after blocking on %s open_count=%d\n", __FILE__, __LINE__, tty->driver->name, port->count); @@ -452,7 +455,7 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp) self->line, self->port.count); /* Not really used by us, but lets do it anyway */ - tty->low_latency = (self->port.flags & ASYNC_LOW_LATENCY) ? 1 : 0; + self->port.low_latency = (self->port.flags & ASYNC_LOW_LATENCY) ? 
1 : 0; /* * If the port is the middle of closing, bail out now @@ -1136,14 +1139,14 @@ static int ircomm_tty_data_indication(void *instance, void *sap, ircomm_tty_send_initial_parameters(self); ircomm_tty_link_established(self); } + tty_kref_put(tty); /* * Use flip buffer functions since the code may be called from interrupt * context */ - tty_insert_flip_string(tty, skb->data, skb->len); - tty_flip_buffer_push(tty); - tty_kref_put(tty); + tty_insert_flip_string(&self->port, skb->data, skb->len); + tty_flip_buffer_push(&self->port); /* No need to kfree_skb - see ircomm_ttp_data_indication() */ diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c index edab393e0c82..a2a508f5f268 100644 --- a/net/irda/ircomm/ircomm_tty_attach.c +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -997,12 +997,8 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, self->settings.dce = IRCOMM_DELTA_CD; ircomm_tty_check_modem_status(self); } else { - struct tty_struct *tty = tty_port_tty_get(&self->port); IRDA_DEBUG(0, "%s(), hanging up!\n", __func__ ); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&self->port, false); } break; default: diff --git a/net/irda/iriap.c b/net/irda/iriap.c index e71e85ba2bf1..e1b37f5a2691 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -303,7 +303,8 @@ static void iriap_disconnect_indication(void *instance, void *sap, { struct iriap_cb *self; - IRDA_DEBUG(4, "%s(), reason=%s\n", __func__, irlmp_reasons[reason]); + IRDA_DEBUG(4, "%s(), reason=%s [%d]\n", __func__, + irlmp_reason_str(reason), reason); self = instance; @@ -495,8 +496,11 @@ static void iriap_getvaluebyclass_confirm(struct iriap_cb *self, /* case CS_ISO_8859_9: */ /* case CS_UNICODE: */ default: - IRDA_DEBUG(0, "%s(), charset %s, not supported\n", - __func__, ias_charset_types[charset]); + IRDA_DEBUG(0, "%s(), charset [%d] %s, not supported\n", + __func__, charset, + charset < ARRAY_SIZE(ias_charset_types) ? + ias_charset_types[charset] : + "(unknown)"); /* Aborting, close connection! 
*/ iriap_disconnect_request(self); diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index 6115a44c0a24..98ad6ec4bd3c 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -58,7 +58,7 @@ int sysctl_discovery_slots = 6; /* 6 slots by default */ int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; char sysctl_devname[65]; -const char *irlmp_reasons[] = { +static const char *irlmp_reasons[] = { "ERROR, NOT USED", "LM_USER_REQUEST", "LM_LAP_DISCONNECT", @@ -66,8 +66,15 @@ const char *irlmp_reasons[] = { "LM_LAP_RESET", "LM_INIT_DISCONNECT", "ERROR, NOT USED", + "UNKNOWN", }; +const char *irlmp_reason_str(LM_REASON reason) +{ + reason = min_t(size_t, reason, ARRAY_SIZE(irlmp_reasons) - 1); + return irlmp_reasons[reason]; +} + /* * Function irlmp_init (void) * @@ -747,7 +754,8 @@ void irlmp_disconnect_indication(struct lsap_cb *self, LM_REASON reason, { struct lsap_cb *lsap; - IRDA_DEBUG(1, "%s(), reason=%s\n", __func__, irlmp_reasons[reason]); + IRDA_DEBUG(1, "%s(), reason=%s [%d]\n", __func__, + irlmp_reason_str(reason), reason); IRDA_ASSERT(self != NULL, return;); IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 2bb2beb6a373..3c83a1e5ab03 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -214,8 +214,7 @@ irnet_get_discovery_log(irnet_socket * ap) * After reading : discoveries = NULL ; disco_index = Y ; disco_number = -1 */ static inline int -irnet_read_discovery_log(irnet_socket * ap, - char * event) +irnet_read_discovery_log(irnet_socket *ap, char *event, int buf_size) { int done_event = 0; @@ -237,12 +236,13 @@ irnet_read_discovery_log(irnet_socket * ap, if(ap->disco_index < ap->disco_number) { /* Write an event */ - sprintf(event, "Found %08x (%s) behind %08x {hints %02X-%02X}\n", - ap->discoveries[ap->disco_index].daddr, - ap->discoveries[ap->disco_index].info, - ap->discoveries[ap->disco_index].saddr, - ap->discoveries[ap->disco_index].hints[0], - ap->discoveries[ap->disco_index].hints[1]); + snprintf(event, buf_size, + "Found %08x (%s) behind %08x {hints %02X-%02X}\n", + ap->discoveries[ap->disco_index].daddr, + ap->discoveries[ap->disco_index].info, + ap->discoveries[ap->disco_index].saddr, + ap->discoveries[ap->disco_index].hints[0], + ap->discoveries[ap->disco_index].hints[1]); DEBUG(CTRL_INFO, "Writing discovery %d : %s\n", ap->disco_index, ap->discoveries[ap->disco_index].info); @@ -282,27 +282,24 @@ irnet_ctrl_read(irnet_socket * ap, size_t count) { DECLARE_WAITQUEUE(wait, current); - char event[64]; /* Max event is 61 char */ + char event[75]; ssize_t ret = 0; DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); - /* Check if we can write an event out in one go */ - DABORT(count < sizeof(event), -EOVERFLOW, CTRL_ERROR, "Buffer to small.\n"); - #ifdef INITIAL_DISCOVERY /* Check if we have read the log */ - if(irnet_read_discovery_log(ap, event)) + if (irnet_read_discovery_log(ap, event, sizeof(event))) { - /* We have an event !!! 
Copy it to the user */ - if(copy_to_user(buf, event, strlen(event))) + count = min(strlen(event), count); + if (copy_to_user(buf, event, count)) { DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); return -EFAULT; } DEXIT(CTRL_TRACE, "\n"); - return strlen(event); + return count; } #endif /* INITIAL_DISCOVERY */ @@ -339,79 +336,81 @@ irnet_ctrl_read(irnet_socket * ap, switch(irnet_events.log[ap->event_index].event) { case IRNET_DISCOVER: - sprintf(event, "Discovered %08x (%s) behind %08x {hints %02X-%02X}\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].saddr, - irnet_events.log[ap->event_index].hints.byte[0], - irnet_events.log[ap->event_index].hints.byte[1]); + snprintf(event, sizeof(event), + "Discovered %08x (%s) behind %08x {hints %02X-%02X}\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr, + irnet_events.log[ap->event_index].hints.byte[0], + irnet_events.log[ap->event_index].hints.byte[1]); break; case IRNET_EXPIRE: - sprintf(event, "Expired %08x (%s) behind %08x {hints %02X-%02X}\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].saddr, - irnet_events.log[ap->event_index].hints.byte[0], - irnet_events.log[ap->event_index].hints.byte[1]); + snprintf(event, sizeof(event), + "Expired %08x (%s) behind %08x {hints %02X-%02X}\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr, + irnet_events.log[ap->event_index].hints.byte[0], + irnet_events.log[ap->event_index].hints.byte[1]); break; case IRNET_CONNECT_TO: - sprintf(event, "Connected to %08x (%s) on ppp%d\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].unit); + snprintf(event, sizeof(event), "Connected to %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); break; case IRNET_CONNECT_FROM: - sprintf(event, "Connection from %08x (%s) on ppp%d\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].unit); + snprintf(event, sizeof(event), "Connection from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); break; case IRNET_REQUEST_FROM: - sprintf(event, "Request from %08x (%s) behind %08x\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].saddr); + snprintf(event, sizeof(event), "Request from %08x (%s) behind %08x\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr); break; case IRNET_NOANSWER_FROM: - sprintf(event, "No-answer from %08x (%s) on ppp%d\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].unit); + snprintf(event, sizeof(event), "No-answer from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); break; case IRNET_BLOCKED_LINK: - sprintf(event, "Blocked link with %08x (%s) on ppp%d\n", - irnet_events.log[ap->event_index].daddr, - 
irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].unit); + snprintf(event, sizeof(event), "Blocked link with %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); break; case IRNET_DISCONNECT_FROM: - sprintf(event, "Disconnection from %08x (%s) on ppp%d\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name, - irnet_events.log[ap->event_index].unit); + snprintf(event, sizeof(event), "Disconnection from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); break; case IRNET_DISCONNECT_TO: - sprintf(event, "Disconnected to %08x (%s)\n", - irnet_events.log[ap->event_index].daddr, - irnet_events.log[ap->event_index].name); + snprintf(event, sizeof(event), "Disconnected to %08x (%s)\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name); break; default: - sprintf(event, "Bug\n"); + snprintf(event, sizeof(event), "Bug\n"); } /* Increment our event index */ ap->event_index = (ap->event_index + 1) % IRNET_MAX_EVENTS; DEBUG(CTRL_INFO, "Event is :%s", event); - /* Copy it to the user */ - if(copy_to_user(buf, event, strlen(event))) + count = min(strlen(event), count); + if (copy_to_user(buf, event, count)) { DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); return -EFAULT; } DEXIT(CTRL_TRACE, "\n"); - return strlen(event); + return count; } /*------------------------------------------------------------------*/ diff --git a/net/irda/timer.c b/net/irda/timer.c index 1d552b3946fc..0c4c115a5cab 100644 --- a/net/irda/timer.c +++ b/net/irda/timer.c @@ -57,7 +57,7 @@ void irlap_start_query_timer(struct irlap_cb *self, int S, int s) * Basically, we multiply the number of remaining slots by our * slot time, plus add some extra time to properly receive the last * discovery packet (which is longer due to extra discovery info), - * to avoid messing with for incomming connections requests and + * to avoid messing with for incoming connections requests and * to accommodate devices that perform discovery slower than us. 
* Jean II */ timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cd6f7a991d80..ae691651b721 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -49,12 +49,6 @@ static const u8 iprm_shutdown[8] = #define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class)) -/* macros to set/get socket control buffer at correct offset */ -#define CB_TAG(skb) ((skb)->cb) /* iucv message tag */ -#define CB_TAG_LEN (sizeof(((struct iucv_message *) 0)->tag)) -#define CB_TRGCLS(skb) ((skb)->cb + CB_TAG_LEN) /* iucv msg target class */ -#define CB_TRGCLS_LEN (TRGCLS_SIZE) - #define __iucv_sock_wait(sk, condition, timeo, ret) \ do { \ DEFINE_WAIT(__wait); \ @@ -156,14 +150,13 @@ static int afiucv_pm_freeze(struct device *dev) { struct iucv_sock *iucv; struct sock *sk; - struct hlist_node *node; int err = 0; #ifdef CONFIG_PM_DEBUG printk(KERN_WARNING "afiucv_pm_freeze\n"); #endif read_lock(&iucv_sk_list.lock); - sk_for_each(sk, node, &iucv_sk_list.head) { + sk_for_each(sk, &iucv_sk_list.head) { iucv = iucv_sk(sk); switch (sk->sk_state) { case IUCV_DISCONN: @@ -194,13 +187,12 @@ static int afiucv_pm_freeze(struct device *dev) static int afiucv_pm_restore_thaw(struct device *dev) { struct sock *sk; - struct hlist_node *node; #ifdef CONFIG_PM_DEBUG printk(KERN_WARNING "afiucv_pm_restore_thaw\n"); #endif read_lock(&iucv_sk_list.lock); - sk_for_each(sk, node, &iucv_sk_list.head) { + sk_for_each(sk, &iucv_sk_list.head) { switch (sk->sk_state) { case IUCV_CONNECTED: sk->sk_err = EPIPE; @@ -390,9 +382,8 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, static struct sock *__iucv_get_sock_by_name(char *nm) { struct sock *sk; - struct hlist_node *node; - sk_for_each(sk, node, &iucv_sk_list.head) + sk_for_each(sk, &iucv_sk_list.head) if (!memcmp(&iucv_sk(sk)->src_name, nm, 8)) return sk; @@ -1144,7 +1135,7 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, /* increment and save iucv message tag for msg_completion cbk */ txmsg.tag = iucv->send_tag++; - memcpy(CB_TAG(skb), &txmsg.tag, CB_TAG_LEN); + IUCV_SKB_CB(skb)->tag = txmsg.tag; if (iucv->transport == AF_IUCV_TRANS_HIPER) { atomic_inc(&iucv->msg_sent); @@ -1227,7 +1218,7 @@ static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len) return -ENOMEM; /* copy target class to control buffer of new skb */ - memcpy(CB_TRGCLS(nskb), CB_TRGCLS(skb), CB_TRGCLS_LEN); + IUCV_SKB_CB(nskb)->class = IUCV_SKB_CB(skb)->class; /* copy data fragment */ memcpy(nskb->data, skb->data + copied, size); @@ -1259,7 +1250,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, /* store msg target class in the second 4 bytes of skb ctrl buffer */ /* Note: the first 4 bytes are reserved for msg tag */ - memcpy(CB_TRGCLS(skb), &msg->class, CB_TRGCLS_LEN); + IUCV_SKB_CB(skb)->class = msg->class; /* check for special IPRM messages (e.g. 
iucv_sock_shutdown) */ if ((msg->flags & IUCV_IPRMDATA) && len > 7) { @@ -1295,6 +1286,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, } } + IUCV_SKB_CB(skb)->offset = 0; if (sock_queue_rcv_skb(sk, skb)) skb_queue_head(&iucv_sk(sk)->backlog_skb_q, skb); } @@ -1330,6 +1322,9 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, unsigned int copied, rlen; struct sk_buff *skb, *rskb, *cskb; int err = 0; + u32 offset; + + msg->msg_namelen = 0; if ((sk->sk_state == IUCV_DISCONN) && skb_queue_empty(&iucv->backlog_skb_q) && @@ -1349,13 +1344,14 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, return err; } - rlen = skb->len; /* real length of skb */ + offset = IUCV_SKB_CB(skb)->offset; + rlen = skb->len - offset; /* real length of skb */ copied = min_t(unsigned int, rlen, len); if (!rlen) sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN; cskb = skb; - if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) { + if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; @@ -1373,7 +1369,8 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, * get the trgcls from the control buffer of the skb due to * fragmentation of original iucv message. */ err = put_cmsg(msg, SOL_IUCV, SCM_IUCV_TRGCLS, - CB_TRGCLS_LEN, CB_TRGCLS(skb)); + sizeof(IUCV_SKB_CB(skb)->class), + (void *)&IUCV_SKB_CB(skb)->class); if (err) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); @@ -1385,9 +1382,8 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, /* SOCK_STREAM: re-queue skb if it contains unreceived data */ if (sk->sk_type == SOCK_STREAM) { - skb_pull(skb, copied); - if (skb->len) { - skb_queue_head(&sk->sk_receive_queue, skb); + if (copied < rlen) { + IUCV_SKB_CB(skb)->offset = offset + copied; goto done; } } @@ -1406,6 +1402,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, spin_lock_bh(&iucv->message_q.lock); rskb = skb_dequeue(&iucv->backlog_skb_q); while (rskb) { + IUCV_SKB_CB(rskb)->offset = 0; if (sock_queue_rcv_skb(sk, rskb)) { skb_queue_head(&iucv->backlog_skb_q, rskb); @@ -1464,7 +1461,8 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, return iucv_accept_poll(sk); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= POLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? 
POLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; @@ -1678,7 +1676,6 @@ static int iucv_callback_connreq(struct iucv_path *path, unsigned char user_data[16]; unsigned char nuser_data[16]; unsigned char src_name[8]; - struct hlist_node *node; struct sock *sk, *nsk; struct iucv_sock *iucv, *niucv; int err; @@ -1689,7 +1686,7 @@ static int iucv_callback_connreq(struct iucv_path *path, read_lock(&iucv_sk_list.lock); iucv = NULL; sk = NULL; - sk_for_each(sk, node, &iucv_sk_list.head) + sk_for_each(sk, &iucv_sk_list.head) if (sk->sk_state == IUCV_LISTEN && !memcmp(&iucv_sk(sk)->src_name, src_name, 8)) { /* @@ -1834,7 +1831,7 @@ static void iucv_callback_txdone(struct iucv_path *path, spin_lock_irqsave(&list->lock, flags); while (list_skb != (struct sk_buff *)list) { - if (!memcmp(&msg->tag, CB_TAG(list_skb), CB_TAG_LEN)) { + if (msg->tag != IUCV_SKB_CB(list_skb)->tag) { this = list_skb; break; } @@ -2095,6 +2092,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) skb_pull(skb, sizeof(struct af_iucv_trans_hdr)); skb_reset_transport_header(skb); skb_reset_network_header(skb); + IUCV_SKB_CB(skb)->offset = 0; spin_lock(&iucv->message_q.lock); if (skb_queue_empty(&iucv->backlog_skb_q)) { if (sock_queue_rcv_skb(sk, skb)) { @@ -2115,7 +2113,6 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct hlist_node *node; struct sock *sk; struct iucv_sock *iucv; struct af_iucv_trans_hdr *trans_hdr; @@ -2132,7 +2129,7 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, iucv = NULL; sk = NULL; read_lock(&iucv_sk_list.lock); - sk_for_each(sk, node, &iucv_sk_list.head) { + sk_for_each(sk, &iucv_sk_list.head) { if (trans_hdr->flags == AF_IUCV_FLAG_SYN) { if ((!memcmp(&iucv_sk(sk)->src_name, trans_hdr->destAppName, 8)) && @@ -2200,8 +2197,7 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, /* fall through and receive zero length data */ case 0: /* plain data frame */ - memcpy(CB_TRGCLS(skb), &trans_hdr->iucv_hdr.class, - CB_TRGCLS_LEN); + IUCV_SKB_CB(skb)->class = trans_hdr->iucv_hdr.class; err = afiucv_hs_callback_rx(sk, skb); break; default: @@ -2225,10 +2221,9 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, struct sk_buff *list_skb; struct sk_buff *nskb; unsigned long flags; - struct hlist_node *node; read_lock_irqsave(&iucv_sk_list.lock, flags); - sk_for_each(sk, node, &iucv_sk_list.head) + sk_for_each(sk, &iucv_sk_list.head) if (sk == isk) { iucv = iucv_sk(sk); break; @@ -2299,14 +2294,13 @@ static int afiucv_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = (struct net_device *)ptr; - struct hlist_node *node; struct sock *sk; struct iucv_sock *iucv; switch (event) { case NETDEV_REBOOT: case NETDEV_GOING_DOWN: - sk_for_each(sk, node, &iucv_sk_list.head) { + sk_for_each(sk, &iucv_sk_list.head) { iucv = iucv_sk(sk); if ((iucv->hs_dev == event_dev) && (sk->sk_state == IUCV_CONNECTED)) { diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index df082508362d..4fe76ff214c2 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -831,8 +831,11 @@ static int iucv_reboot_event(struct notifier_block *this, { int i; + if (cpumask_empty(&iucv_irq_cpumask)) + return NOTIFY_DONE; + get_online_cpus(); - on_each_cpu(iucv_block_cpu, NULL, 1); + on_each_cpu_mask(&iucv_irq_cpumask, iucv_block_cpu, NULL, 1); preempt_disable(); for 
(i = 0; i < iucv_max_pathid; i++) { if (iucv_path_table[i]) diff --git a/net/key/af_key.c b/net/key/af_key.c index 5b426a646544..5b1e5af25713 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -203,7 +203,6 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, } if (*skb2 != NULL) { if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_orphan(*skb2); skb_set_owner_r(*skb2, sk); skb_queue_tail(&sk->sk_receive_queue, *skb2); sk->sk_data_ready(sk, (*skb2)->len); @@ -226,7 +225,6 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; - struct hlist_node *node; struct sk_buff *skb2 = NULL; int err = -ESRCH; @@ -237,7 +235,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, return -ENOMEM; rcu_read_lock(); - sk_for_each_rcu(sk, node, &net_pfkey->table) { + sk_for_each_rcu(sk, &net_pfkey->table) { struct pfkey_sock *pfk = pfkey_sk(sk); int err2; @@ -762,7 +760,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, } /* identity & sensitivity */ - if (xfrm_addr_cmp(&x->sel.saddr, &x->props.saddr, x->props.family)) + if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) size += sizeof(struct sadb_address) + sockaddr_size; if (add_keys) { @@ -816,18 +814,21 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, sa->sadb_sa_auth = 0; if (x->aalg) { struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); - sa->sadb_sa_auth = a ? a->desc.sadb_alg_id : 0; + sa->sadb_sa_auth = (a && a->pfkey_supported) ? + a->desc.sadb_alg_id : 0; } sa->sadb_sa_encrypt = 0; BUG_ON(x->ealg && x->calg); if (x->ealg) { struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); - sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0; + sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? + a->desc.sadb_alg_id : 0; } /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ if (x->calg) { struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); - sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0; + sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? 
+ a->desc.sadb_alg_id : 0; } sa->sadb_sa_flags = 0; @@ -909,8 +910,8 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, if (!addr->sadb_address_prefixlen) BUG(); - if (xfrm_addr_cmp(&x->sel.saddr, &x->props.saddr, - x->props.family)) { + if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, + x->props.family)) { addr = (struct sadb_address*) skb_put(skb, sizeof(struct sadb_address)+sockaddr_size); addr->sadb_address_len = @@ -1138,7 +1139,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, if (sa->sadb_sa_auth) { int keysize = 0; struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); - if (!a) { + if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } @@ -1160,7 +1161,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, if (sa->sadb_sa_encrypt) { if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); - if (!a) { + if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } @@ -1172,7 +1173,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, } else { int keysize = 0; struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); - if (!a) { + if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } @@ -1321,7 +1322,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ if (hdr->sadb_msg_seq) { x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); - if (x && xfrm_addr_cmp(&x->id.daddr, xdaddr, family)) { + if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; } @@ -1578,13 +1579,13 @@ static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, struct sadb_msg *hdr; int len, auth_len, enc_len, i; - auth_len = xfrm_count_auth_supported(); + auth_len = xfrm_count_pfkey_auth_supported(); if (auth_len) { auth_len *= sizeof(struct sadb_alg); auth_len += sizeof(struct sadb_supported); } - enc_len = xfrm_count_enc_supported(); + enc_len = xfrm_count_pfkey_enc_supported(); if (enc_len) { enc_len *= sizeof(struct sadb_alg); enc_len += sizeof(struct sadb_supported); @@ -1615,6 +1616,8 @@ static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; + if (!aalg->pfkey_supported) + continue; if (aalg->available) *ap++ = aalg->desc; } @@ -1634,6 +1637,8 @@ static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; + if (!ealg->pfkey_supported) + continue; if (ealg->available) *ap++ = ealg->desc; } @@ -2196,7 +2201,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_ XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->priority = pol->sadb_x_policy_priority; - sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); if (!xp->family) { err = -EINVAL; @@ -2209,7 +2214,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_ if (xp->selector.sport) xp->selector.sport_mask = htons(0xffff); - sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1], + sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); xp->selector.prefixlen_d = sa->sadb_address_prefixlen; @@ -2310,7 +2315,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa memset(&sel, 0, sizeof(sel)); - sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + sa = 
ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); @@ -2318,7 +2323,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa if (sel.sport) sel.sport_mask = htons(0xffff); - sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1], + sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); @@ -2688,6 +2693,7 @@ static int key_notify_policy_flush(const struct km_event *c) hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; + hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; @@ -2825,6 +2831,8 @@ static int count_ah_combs(const struct xfrm_tmpl *t) const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; + if (!aalg->pfkey_supported) + continue; if (aalg_tmpl_set(t, aalg) && aalg->available) sz += sizeof(struct sadb_comb); } @@ -2840,6 +2848,9 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg) break; + if (!ealg->pfkey_supported) + continue; + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; @@ -2848,6 +2859,9 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg) break; + if (!aalg->pfkey_supported) + continue; + if (aalg_tmpl_set(t, aalg) && aalg->available) sz += sizeof(struct sadb_comb); } @@ -2871,6 +2885,9 @@ static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) if (!aalg) break; + if (!aalg->pfkey_supported) + continue; + if (aalg_tmpl_set(t, aalg) && aalg->available) { struct sadb_comb *c; c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); @@ -2903,6 +2920,9 @@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) if (!ealg) break; + if (!ealg->pfkey_supported) + continue; + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; @@ -2911,6 +2931,8 @@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; + if (!aalg->pfkey_supported) + continue; if (!(aalg_tmpl_set(t, aalg) && aalg->available)) continue; c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); @@ -3718,7 +3740,7 @@ static int __net_init pfkey_init_proc(struct net *net) { struct proc_dir_entry *e; - e = proc_net_fops_create(net, "pfkey", 0, &pfkey_proc_ops); + e = proc_create("pfkey", 0, net->proc_net, &pfkey_proc_ops); if (e == NULL) return -ENOMEM; @@ -3727,7 +3749,7 @@ static int __net_init pfkey_init_proc(struct net *net) static void __net_exit pfkey_exit_proc(struct net *net) { - proc_net_remove(net, "pfkey"); + remove_proc_entry("pfkey", net->proc_net); } #else static inline int pfkey_init_proc(struct net *net) diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig index 147a8fd47a17..adb9843dd7cf 100644 --- a/net/l2tp/Kconfig +++ b/net/l2tp/Kconfig @@ -46,8 +46,8 @@ config L2TP_DEBUGFS will be called l2tp_debugfs. 
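Stepping back to the af_key.c hunks above: they gate every walk of the xfrm algorithm tables on the new pfkey_supported flag, so algorithms that have no PF_KEY v2 representation are neither advertised in SADB supported-algorithm messages nor accepted from userspace. The following is a minimal illustrative sketch of that iterate-and-skip idiom, not code from the patch: xfrm_aalg_get_byidx() and the pfkey_supported flag are real (the flag is introduced by this series), while sadb_put_alg() is a hypothetical stand-in for the skb_put()-based message building done in compose_sadb_supported().

#include <linux/skbuff.h>
#include <linux/pfkeyv2.h>
#include <net/xfrm.h>

/* Hypothetical helper standing in for the real skb_put()-based append. */
static void sadb_put_alg(struct sk_buff *skb, struct sadb_alg *desc);

static void example_dump_auth_algs(struct sk_buff *skb)
{
	struct xfrm_algo_desc *aalg;
	unsigned int i;

	for (i = 0; ; i++) {
		aalg = xfrm_aalg_get_byidx(i);
		if (!aalg)
			break;			/* end of the xfrm table */
		if (!aalg->pfkey_supported)
			continue;		/* not expressible over PF_KEY v2 */
		if (aalg->available)
			sadb_put_alg(skb, &aalg->desc);
	}
}

The same skip appears in the registration, combs-counting and combs-dumping paths above, so PF_KEY and netlink can share one algorithm table without PF_KEY ever seeing entries it cannot name.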
config L2TP_V3 - bool "L2TPv3 support (EXPERIMENTAL)" - depends on EXPERIMENTAL && L2TP + bool "L2TPv3 support" + depends on L2TP help Layer Two Tunneling Protocol Version 3 diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 2ac884d0e89b..6984c3a353cd 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -101,6 +101,7 @@ struct l2tp_skb_cb { static atomic_t l2tp_tunnel_count; static atomic_t l2tp_session_count; +static struct workqueue_struct *l2tp_wq; /* per-net private data for this module */ static unsigned int l2tp_net_id; @@ -113,7 +114,6 @@ struct l2tp_net { static void l2tp_session_set_header_len(struct l2tp_session *session, int version); static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); -static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); static inline struct l2tp_net *l2tp_pernet(struct net *net) { @@ -122,7 +122,6 @@ static inline struct l2tp_net *l2tp_pernet(struct net *net) return net_generic(net, l2tp_net_id); } - /* Tunnel reference counts. Incremented per session that is added to * the tunnel. */ @@ -192,6 +191,7 @@ struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel) } else { /* Socket is owned by kernelspace */ sk = tunnel->sock; + sock_hold(sk); } out: @@ -210,6 +210,7 @@ void l2tp_tunnel_sock_put(struct sock *sk) } sock_put(sk); } + sock_put(sk); } EXPORT_SYMBOL_GPL(l2tp_tunnel_sock_put); @@ -221,10 +222,9 @@ static struct l2tp_session *l2tp_session_find_2(struct net *net, u32 session_id) struct hlist_head *session_list = l2tp_session_id_hash_2(pn, session_id); struct l2tp_session *session; - struct hlist_node *walk; rcu_read_lock_bh(); - hlist_for_each_entry_rcu(session, walk, session_list, global_hlist) { + hlist_for_each_entry_rcu(session, session_list, global_hlist) { if (session->session_id == session_id) { rcu_read_unlock_bh(); return session; @@ -253,7 +253,6 @@ struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunn { struct hlist_head *session_list; struct l2tp_session *session; - struct hlist_node *walk; /* In L2TPv3, session_ids are unique over all tunnels and we * sometimes need to look them up before we know the @@ -264,7 +263,7 @@ struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunn session_list = l2tp_session_id_hash(tunnel, session_id); read_lock_bh(&tunnel->hlist_lock); - hlist_for_each_entry(session, walk, session_list, hlist) { + hlist_for_each_entry(session, session_list, hlist) { if (session->session_id == session_id) { read_unlock_bh(&tunnel->hlist_lock); return session; @@ -279,13 +278,12 @@ EXPORT_SYMBOL_GPL(l2tp_session_find); struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth) { int hash; - struct hlist_node *walk; struct l2tp_session *session; int count = 0; read_lock_bh(&tunnel->hlist_lock); for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { - hlist_for_each_entry(session, walk, &tunnel->session_hlist[hash], hlist) { + hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) { if (++count > nth) { read_unlock_bh(&tunnel->hlist_lock); return session; @@ -306,12 +304,11 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname) { struct l2tp_net *pn = l2tp_pernet(net); int hash; - struct hlist_node *walk; struct l2tp_session *session; rcu_read_lock_bh(); for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) { - hlist_for_each_entry_rcu(session, walk, &pn->l2tp_session_hlist[hash], global_hlist) { + hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) { if 
(!strcmp(session->ifname, ifname)) { rcu_read_unlock_bh(); return session; @@ -377,10 +374,8 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk struct sk_buff *skbp; struct sk_buff *tmp; u32 ns = L2TP_SKB_CB(skb)->ns; - struct l2tp_stats *sstats; spin_lock_bh(&session->reorder_q.lock); - sstats = &session->stats; skb_queue_walk_safe(&session->reorder_q, skbp, tmp) { if (L2TP_SKB_CB(skbp)->ns > ns) { __skb_queue_before(&session->reorder_q, skbp, skb); @@ -388,9 +383,7 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n", session->name, ns, L2TP_SKB_CB(skbp)->ns, skb_queue_len(&session->reorder_q)); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_oos_packets++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_oos_packets); goto out; } } @@ -407,23 +400,16 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff * { struct l2tp_tunnel *tunnel = session->tunnel; int length = L2TP_SKB_CB(skb)->length; - struct l2tp_stats *tstats, *sstats; /* We're about to requeue the skb, so return resources * to its current owner (a socket receive buffer). */ skb_orphan(skb); - tstats = &tunnel->stats; - u64_stats_update_begin(&tstats->syncp); - sstats = &session->stats; - u64_stats_update_begin(&sstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += length; - sstats->rx_packets++; - sstats->rx_bytes += length; - u64_stats_update_end(&tstats->syncp); - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&tunnel->stats.rx_packets); + atomic_long_add(length, &tunnel->stats.rx_bytes); + atomic_long_inc(&session->stats.rx_packets); + atomic_long_add(length, &session->stats.rx_bytes); if (L2TP_SKB_CB(skb)->has_seq) { /* Bump our Nr */ @@ -454,7 +440,6 @@ static void l2tp_recv_dequeue(struct l2tp_session *session) { struct sk_buff *skb; struct sk_buff *tmp; - struct l2tp_stats *sstats; /* If the pkt at the head of the queue has the nr that we * expect to send up next, dequeue it and any other @@ -462,13 +447,10 @@ static void l2tp_recv_dequeue(struct l2tp_session *session) */ start: spin_lock_bh(&session->reorder_q.lock); - sstats = &session->stats; skb_queue_walk_safe(&session->reorder_q, skb, tmp) { if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) { - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - sstats->rx_errors++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); + atomic_long_inc(&session->stats.rx_errors); l2tp_dbg(session, L2TP_MSG_SEQ, "%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n", session->name, L2TP_SKB_CB(skb)->ns, @@ -627,7 +609,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, struct l2tp_tunnel *tunnel = session->tunnel; int offset; u32 ns, nr; - struct l2tp_stats *sstats = &session->stats; /* The ref count is increased since we now hold a pointer to * the session. Take care to decrement the refcnt when exiting @@ -644,9 +625,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, "%s: cookie mismatch (%u/%u). 
Discarding.\n", tunnel->name, tunnel->tunnel_id, session->session_id); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_cookie_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_cookie_discards); goto discard; } ptr += session->peer_cookie_len; @@ -715,9 +694,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, l2tp_warn(session, L2TP_MSG_SEQ, "%s: recv data has no seq numbers when required. Discarding.\n", session->name); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } @@ -736,9 +713,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, l2tp_warn(session, L2TP_MSG_SEQ, "%s: recv data has no seq numbers when required. Discarding.\n", session->name); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } } @@ -792,9 +767,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * packets */ if (L2TP_SKB_CB(skb)->ns != session->nr) { - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); l2tp_dbg(session, L2TP_MSG_SEQ, "%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n", session->name, L2TP_SKB_CB(skb)->ns, @@ -820,9 +793,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, return; discard: - u64_stats_update_begin(&sstats->syncp); - sstats->rx_errors++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); if (session->deref) @@ -832,6 +803,23 @@ discard: } EXPORT_SYMBOL(l2tp_recv_common); +/* Drop skbs from the session's reorder_q + */ +int l2tp_session_queue_purge(struct l2tp_session *session) +{ + struct sk_buff *skb = NULL; + BUG_ON(!session); + BUG_ON(session->magic != L2TP_SESSION_MAGIC); + while ((skb = skb_dequeue(&session->reorder_q))) { + atomic_long_inc(&session->stats.rx_errors); + kfree_skb(skb); + if (session->deref) + (*session->deref)(session); + } + return 0; +} +EXPORT_SYMBOL_GPL(l2tp_session_queue_purge); + /* Internal UDP receive frame. Do the real work of receiving an L2TP data frame * here. The skb is not on a list when we get here. * Returns 0 if the packet was a data packet and was successfully passed on. 
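The l2tp_core.c hunks above and below convert the tunnel and session counters from u64_stats (a u64 plus a syncp seqcount) to plain atomic_long_t operations; the old counters were bumped from more than one context, which the u64_stats per-writer serialization scheme does not tolerate. A minimal, self-contained sketch of the replacement idiom follows; struct example_stats and example_rx_account() are illustrative names, with fields mirroring l2tp_stats as used in this file.

#include <linux/atomic.h>
#include <linux/types.h>

struct example_stats {
	atomic_long_t rx_packets;
	atomic_long_t rx_bytes;
	atomic_long_t rx_errors;
};

static void example_rx_account(struct example_stats *stats,
			       unsigned int len, bool good)
{
	if (!good) {
		/* One atomic RMW; safe from any context, no syncp needed. */
		atomic_long_inc(&stats->rx_errors);
		return;
	}
	atomic_long_inc(&stats->rx_packets);
	atomic_long_add(len, &stats->rx_bytes);
}

The visible trade-off is counter width: atomic_long_t is 32 bits on 32-bit architectures, whereas the u64_stats scheme kept full 64-bit counters; in exchange, the counters can be incremented concurrently from process and softirq context without any per-writer serialization.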
@@ -847,7 +835,6 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, u32 tunnel_id, session_id; u16 version; int length; - struct l2tp_stats *tstats; if (tunnel->sock && l2tp_verify_udp_checksum(tunnel->sock, skb)) goto discard_bad_csum; @@ -936,10 +923,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, discard_bad_csum: LIMIT_NETDEBUG("%s: UDP: bad checksum\n", tunnel->name); UDP_INC_STATS_USER(tunnel->l2tp_net, UDP_MIB_INERRORS, 0); - tstats = &tunnel->stats; - u64_stats_update_begin(&tstats->syncp); - tstats->rx_errors++; - u64_stats_update_end(&tstats->syncp); + atomic_long_inc(&tunnel->stats.rx_errors); kfree_skb(skb); return 0; @@ -1066,7 +1050,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, struct l2tp_tunnel *tunnel = session->tunnel; unsigned int len = skb->len; int error; - struct l2tp_stats *tstats, *sstats; /* Debug */ if (session->send_seq) @@ -1095,21 +1078,15 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, error = ip_queue_xmit(skb, fl); /* Update stats */ - tstats = &tunnel->stats; - u64_stats_update_begin(&tstats->syncp); - sstats = &session->stats; - u64_stats_update_begin(&sstats->syncp); if (error >= 0) { - tstats->tx_packets++; - tstats->tx_bytes += len; - sstats->tx_packets++; - sstats->tx_bytes += len; + atomic_long_inc(&tunnel->stats.tx_packets); + atomic_long_add(len, &tunnel->stats.tx_bytes); + atomic_long_inc(&session->stats.tx_packets); + atomic_long_add(len, &session->stats.tx_bytes); } else { - tstats->tx_errors++; - sstats->tx_errors++; + atomic_long_inc(&tunnel->stats.tx_errors); + atomic_long_inc(&session->stats.tx_errors); } - u64_stats_update_end(&tstats->syncp); - u64_stats_update_end(&sstats->syncp); return 0; } @@ -1271,6 +1248,7 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); static void l2tp_tunnel_destruct(struct sock *sk) { struct l2tp_tunnel *tunnel; + struct l2tp_net *pn; tunnel = sk->sk_user_data; if (tunnel == NULL) @@ -1278,38 +1256,44 @@ static void l2tp_tunnel_destruct(struct sock *sk) l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name); - /* Close all sessions */ - l2tp_tunnel_closeall(tunnel); + /* Disable udp encapsulation */ switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: /* No longer an encapsulation socket. See net/ipv4/udp.c */ (udp_sk(sk))->encap_type = 0; (udp_sk(sk))->encap_rcv = NULL; + (udp_sk(sk))->encap_destroy = NULL; break; case L2TP_ENCAPTYPE_IP: break; } /* Remove hooks into tunnel socket */ - tunnel->sock = NULL; sk->sk_destruct = tunnel->old_sk_destruct; sk->sk_user_data = NULL; + tunnel->sock = NULL; - /* Call the original destructor */ - if (sk->sk_destruct) - (*sk->sk_destruct)(sk); + /* Remove the tunnel struct from the tunnel list */ + pn = l2tp_pernet(tunnel->l2tp_net); + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_del_rcu(&tunnel->list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + atomic_dec(&l2tp_tunnel_count); - /* We're finished with the socket */ + l2tp_tunnel_closeall(tunnel); l2tp_tunnel_dec_refcount(tunnel); + /* Call the original destructor */ + if (sk->sk_destruct) + (*sk->sk_destruct)(sk); end: return; } /* When the tunnel is closed, all the attached sessions need to go too. 
*/ -static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { int hash; struct hlist_node *walk; @@ -1332,25 +1316,13 @@ again: hlist_del_init(&session->hlist); - /* Since we should hold the sock lock while - * doing any unbinding, we need to release the - * lock we're holding before taking that lock. - * Hold a reference to the sock so it doesn't - * disappear as we're jumping between locks. - */ if (session->ref != NULL) (*session->ref)(session); write_unlock_bh(&tunnel->hlist_lock); - if (tunnel->version != L2TP_HDR_VER_2) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - - spin_lock_bh(&pn->l2tp_session_hlist_lock); - hlist_del_init_rcu(&session->global_hlist); - spin_unlock_bh(&pn->l2tp_session_hlist_lock); - synchronize_rcu(); - } + __l2tp_session_unhash(session); + l2tp_session_queue_purge(session); if (session->session_close != NULL) (*session->session_close)(session); @@ -1358,6 +1330,8 @@ again: if (session->deref != NULL) (*session->deref)(session); + l2tp_session_dec_refcount(session); + write_lock_bh(&tunnel->hlist_lock); /* Now restart from the beginning of this hash @@ -1370,54 +1344,96 @@ again: } write_unlock_bh(&tunnel->hlist_lock); } +EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); + +/* Tunnel socket destroy hook for UDP encapsulation */ +static void l2tp_udp_encap_destroy(struct sock *sk) +{ + struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); + if (tunnel) { + l2tp_tunnel_closeall(tunnel); + sock_put(sk); + } +} /* Really kill the tunnel. * Come here only when all sessions have been cleared from the tunnel. */ static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - BUG_ON(atomic_read(&tunnel->ref_count) != 0); BUG_ON(tunnel->sock != NULL); - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: free...\n", tunnel->name); - - /* Remove from tunnel list */ - spin_lock_bh(&pn->l2tp_tunnel_list_lock); - list_del_rcu(&tunnel->list); kfree_rcu(tunnel, rcu); - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); +} - atomic_dec(&l2tp_tunnel_count); +/* Workqueue tunnel deletion function */ +static void l2tp_tunnel_del_work(struct work_struct *work) +{ + struct l2tp_tunnel *tunnel = NULL; + struct socket *sock = NULL; + struct sock *sk = NULL; + + tunnel = container_of(work, struct l2tp_tunnel, del_work); + sk = l2tp_tunnel_sock_lookup(tunnel); + if (!sk) + return; + + sock = sk->sk_socket; + + /* If the tunnel socket was created by userspace, then go through the + * inet layer to shut the socket down, and let userspace close it. + * Otherwise, if we created the socket directly within the kernel, use + * the sk API to release it here. + * In either case the tunnel resources are freed in the socket + * destructor when the tunnel socket goes away. + */ + if (tunnel->fd >= 0) { + if (sock) + inet_shutdown(sock, 2); + } else { + if (sock) + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sk); + } + + l2tp_tunnel_sock_put(sk); } /* Create a socket for the tunnel, if one isn't set up by * userspace. This is used for static tunnels where there is no * managing L2TP daemon. + * + * Since we don't want these sockets to keep a namespace alive by + * themselves, we drop the socket's namespace refcount after creation. + * These sockets are freed when the namespace exits using the pernet + * exit hook. 
*/ -static int l2tp_tunnel_sock_create(u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct socket **sockp) +static int l2tp_tunnel_sock_create(struct net *net, + u32 tunnel_id, + u32 peer_tunnel_id, + struct l2tp_tunnel_cfg *cfg, + struct socket **sockp) { int err = -EINVAL; - struct sockaddr_in udp_addr; + struct socket *sock = NULL; + struct sockaddr_in udp_addr = {0}; + struct sockaddr_l2tpip ip_addr = {0}; #if IS_ENABLED(CONFIG_IPV6) - struct sockaddr_in6 udp6_addr; - struct sockaddr_l2tpip6 ip6_addr; + struct sockaddr_in6 udp6_addr = {0}; + struct sockaddr_l2tpip6 ip6_addr = {0}; #endif - struct sockaddr_l2tpip ip_addr; - struct socket *sock = NULL; switch (cfg->encap) { case L2TP_ENCAPTYPE_UDP: #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { - err = sock_create(AF_INET6, SOCK_DGRAM, 0, sockp); + err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto out; - sock = *sockp; + sk_change_net(sock->sk, net); - memset(&udp6_addr, 0, sizeof(udp6_addr)); udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); @@ -1439,13 +1455,12 @@ static int l2tp_tunnel_sock_create(u32 tunnel_id, u32 peer_tunnel_id, struct l2t } else #endif { - err = sock_create(AF_INET, SOCK_DGRAM, 0, sockp); + err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto out; - sock = *sockp; + sk_change_net(sock->sk, net); - memset(&udp_addr, 0, sizeof(udp_addr)); udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = htons(cfg->local_udp_port); @@ -1472,14 +1487,13 @@ static int l2tp_tunnel_sock_create(u32 tunnel_id, u32 peer_tunnel_id, struct l2t case L2TP_ENCAPTYPE_IP: #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { - err = sock_create(AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, - sockp); + err = sock_create_kern(AF_INET6, SOCK_DGRAM, + IPPROTO_L2TP, &sock); if (err < 0) goto out; - sock = *sockp; + sk_change_net(sock->sk, net); - memset(&ip6_addr, 0, sizeof(ip6_addr)); ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); @@ -1501,14 +1515,13 @@ static int l2tp_tunnel_sock_create(u32 tunnel_id, u32 peer_tunnel_id, struct l2t } else #endif { - err = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_L2TP, - sockp); + err = sock_create_kern(AF_INET, SOCK_DGRAM, + IPPROTO_L2TP, &sock); if (err < 0) goto out; - sock = *sockp; + sk_change_net(sock->sk, net); - memset(&ip_addr, 0, sizeof(ip_addr)); ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; @@ -1532,8 +1545,10 @@ static int l2tp_tunnel_sock_create(u32 tunnel_id, u32 peer_tunnel_id, struct l2t } out: + *sockp = sock; if ((err < 0) && sock) { - sock_release(sock); + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); *sockp = NULL; } @@ -1556,15 +1571,23 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 * kernel socket. 
*/ if (fd < 0) { - err = l2tp_tunnel_sock_create(tunnel_id, peer_tunnel_id, cfg, &sock); + err = l2tp_tunnel_sock_create(net, tunnel_id, peer_tunnel_id, + cfg, &sock); if (err < 0) goto err; } else { - err = -EBADF; sock = sockfd_lookup(fd, &err); if (!sock) { - pr_err("tunl %hu: sockfd_lookup(fd=%d) returned %d\n", + pr_err("tunl %u: sockfd_lookup(fd=%d) returned %d\n", tunnel_id, fd, err); + err = -EBADF; + goto err; + } + + /* Reject namespace mismatches */ + if (!net_eq(sock_net(sock->sk), net)) { + pr_err("tunl %u: netns mismatch\n", tunnel_id); + err = -EINVAL; goto err; } } @@ -1630,6 +1653,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP; udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv; + udp_sk(sk)->encap_destroy = l2tp_udp_encap_destroy; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) udpv6_encap_enable(); @@ -1651,6 +1675,9 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 sk->sk_allocation = GFP_ATOMIC; + /* Init delete workqueue struct */ + INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work); + /* Add tunnel to our list */ INIT_LIST_HEAD(&tunnel->list); atomic_inc(&l2tp_tunnel_count); @@ -1682,33 +1709,8 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); */ int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - int err = -EBADF; - struct socket *sock = NULL; - struct sock *sk = NULL; - - sk = l2tp_tunnel_sock_lookup(tunnel); - if (!sk) - goto out; - - sock = sk->sk_socket; - BUG_ON(!sock); - - /* Force the tunnel socket to close. This will eventually - * cause the tunnel to be deleted via the normal socket close - * mechanisms when userspace closes the tunnel socket. - */ - err = inet_shutdown(sock, 2); - - /* If the tunnel's socket was created by the kernel, - * close the socket here since the socket was not - * created by userspace. - */ - if (sock->file == NULL) - err = inet_release(sock); - - l2tp_tunnel_sock_put(sk); -out: - return err; + l2tp_tunnel_closeall(tunnel); + return (false == queue_work(l2tp_wq, &tunnel->del_work)); } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); @@ -1716,62 +1718,71 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); */ void l2tp_session_free(struct l2tp_session *session) { - struct l2tp_tunnel *tunnel; + struct l2tp_tunnel *tunnel = session->tunnel; BUG_ON(atomic_read(&session->ref_count) != 0); - tunnel = session->tunnel; - if (tunnel != NULL) { + if (tunnel) { BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); + if (session->session_id != 0) + atomic_dec(&l2tp_session_count); + sock_put(tunnel->sock); + session->tunnel = NULL; + l2tp_tunnel_dec_refcount(tunnel); + } + + kfree(session); - /* Delete the session from the hash */ + return; +} +EXPORT_SYMBOL_GPL(l2tp_session_free); + +/* Remove an l2tp session from l2tp_core's hash lists. + * Provides a tidyup interface for pseudowire code which can't just route all + * shutdown via. l2tp_session_delete and a pseudowire-specific session_close + * callback. 
+ */ +void __l2tp_session_unhash(struct l2tp_session *session) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + + /* Remove the session from core hashes */ + if (tunnel) { + /* Remove from the per-tunnel hash */ write_lock_bh(&tunnel->hlist_lock); hlist_del_init(&session->hlist); write_unlock_bh(&tunnel->hlist_lock); - /* Unlink from the global hash if not L2TPv2 */ + /* For L2TPv3 we have a per-net hash: remove from there, too */ if (tunnel->version != L2TP_HDR_VER_2) { struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - spin_lock_bh(&pn->l2tp_session_hlist_lock); hlist_del_init_rcu(&session->global_hlist); spin_unlock_bh(&pn->l2tp_session_hlist_lock); synchronize_rcu(); } - - if (session->session_id != 0) - atomic_dec(&l2tp_session_count); - - sock_put(tunnel->sock); - - /* This will delete the tunnel context if this - * is the last session on the tunnel. - */ - session->tunnel = NULL; - l2tp_tunnel_dec_refcount(tunnel); } - - kfree(session); - - return; } -EXPORT_SYMBOL_GPL(l2tp_session_free); +EXPORT_SYMBOL_GPL(__l2tp_session_unhash); /* This function is used by the netlink SESSION_DELETE command and by pseudowire modules. */ int l2tp_session_delete(struct l2tp_session *session) { + if (session->ref) + (*session->ref)(session); + __l2tp_session_unhash(session); + l2tp_session_queue_purge(session); if (session->session_close != NULL) (*session->session_close)(session); - + if (session->deref) + (*session->deref)(session); l2tp_session_dec_refcount(session); - return 0; } EXPORT_SYMBOL_GPL(l2tp_session_delete); - /* We come here whenever a session's send_seq, cookie_len or * l2specific_len parameters are set. */ @@ -1892,8 +1903,21 @@ static __net_init int l2tp_init_net(struct net *net) return 0; } +static __net_exit void l2tp_exit_net(struct net *net) +{ + struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_tunnel *tunnel = NULL; + + rcu_read_lock_bh(); + list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { + (void)l2tp_tunnel_delete(tunnel); + } + rcu_read_unlock_bh(); +} + static struct pernet_operations l2tp_net_ops = { .init = l2tp_init_net, + .exit = l2tp_exit_net, .id = &l2tp_net_id, .size = sizeof(struct l2tp_net), }; @@ -1906,6 +1930,13 @@ static int __init l2tp_init(void) if (rc) goto out; + l2tp_wq = alloc_workqueue("l2tp", WQ_NON_REENTRANT | WQ_UNBOUND, 0); + if (!l2tp_wq) { + pr_err("alloc_workqueue failed\n"); + rc = -ENOMEM; + goto out; + } + pr_info("L2TP core driver, %s\n", L2TP_DRV_VERSION); out: @@ -1915,6 +1946,10 @@ out: static void __exit l2tp_exit(void) { unregister_pernet_device(&l2tp_net_ops); + if (l2tp_wq) { + destroy_workqueue(l2tp_wq); + l2tp_wq = NULL; + } } module_init(l2tp_init); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index e62204cad4fe..485a490fd990 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -36,16 +36,15 @@ enum { struct sk_buff; struct l2tp_stats { - u64 tx_packets; - u64 tx_bytes; - u64 tx_errors; - u64 rx_packets; - u64 rx_bytes; - u64 rx_seq_discards; - u64 rx_oos_packets; - u64 rx_errors; - u64 rx_cookie_discards; - struct u64_stats_sync syncp; + atomic_long_t tx_packets; + atomic_long_t tx_bytes; + atomic_long_t tx_errors; + atomic_long_t rx_packets; + atomic_long_t rx_bytes; + atomic_long_t rx_seq_discards; + atomic_long_t rx_oos_packets; + atomic_long_t rx_errors; + atomic_long_t rx_cookie_discards; }; struct l2tp_tunnel; @@ -191,6 +190,8 @@ struct l2tp_tunnel { int fd; /* Parent fd, if tunnel socket * was created by userspace */ + struct work_struct del_work; + uint8_t priv[0]; /* private 
data */ }; @@ -238,11 +239,14 @@ extern struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id); extern struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth); extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); +extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); +extern void __l2tp_session_unhash(struct l2tp_session *session); extern int l2tp_session_delete(struct l2tp_session *session); extern void l2tp_session_free(struct l2tp_session *session); extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); +extern int l2tp_session_queue_purge(struct l2tp_session *session); extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index c3813bc84552..072d7202e182 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -146,14 +146,14 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) tunnel->sock ? atomic_read(&tunnel->sock->sk_refcnt) : 0, atomic_read(&tunnel->ref_count)); - seq_printf(m, " %08x rx %llu/%llu/%llu rx %llu/%llu/%llu\n", + seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n", tunnel->debug, - (unsigned long long)tunnel->stats.tx_packets, - (unsigned long long)tunnel->stats.tx_bytes, - (unsigned long long)tunnel->stats.tx_errors, - (unsigned long long)tunnel->stats.rx_packets, - (unsigned long long)tunnel->stats.rx_bytes, - (unsigned long long)tunnel->stats.rx_errors); + atomic_long_read(&tunnel->stats.tx_packets), + atomic_long_read(&tunnel->stats.tx_bytes), + atomic_long_read(&tunnel->stats.tx_errors), + atomic_long_read(&tunnel->stats.rx_packets), + atomic_long_read(&tunnel->stats.rx_bytes), + atomic_long_read(&tunnel->stats.rx_errors)); if (tunnel->show != NULL) tunnel->show(m, tunnel); @@ -203,14 +203,14 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) seq_printf(m, "\n"); } - seq_printf(m, " %hu/%hu tx %llu/%llu/%llu rx %llu/%llu/%llu\n", + seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n", session->nr, session->ns, - (unsigned long long)session->stats.tx_packets, - (unsigned long long)session->stats.tx_bytes, - (unsigned long long)session->stats.tx_errors, - (unsigned long long)session->stats.rx_packets, - (unsigned long long)session->stats.rx_bytes, - (unsigned long long)session->stats.rx_errors); + atomic_long_read(&session->stats.tx_packets), + atomic_long_read(&session->stats.tx_bytes), + atomic_long_read(&session->stats.tx_errors), + atomic_long_read(&session->stats.rx_packets), + atomic_long_read(&session->stats.rx_bytes), + atomic_long_read(&session->stats.rx_errors)); if (session->show != NULL) session->show(m, session); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 61d8b75d2686..571db8dd2292 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -49,10 +49,9 @@ static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) { - struct 
hlist_node *node; struct sock *sk; - sk_for_each_bound(sk, node, &l2tp_ip_bind_table) { + sk_for_each_bound(sk, &l2tp_ip_bind_table) { struct inet_sock *inet = inet_sk(sk); struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); @@ -115,6 +114,7 @@ static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, in */ static int l2tp_ip_recv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); struct sock *sk; u32 session_id; u32 tunnel_id; @@ -142,7 +142,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_find(&init_net, NULL, session_id); + session = l2tp_session_find(net, NULL, session_id); if (session == NULL) goto discard; @@ -173,14 +173,14 @@ pass_up: goto discard; tunnel_id = ntohl(*(__be32 *) &skb->data[4]); - tunnel = l2tp_tunnel_find(&init_net, tunnel_id); + tunnel = l2tp_tunnel_find(net, tunnel_id); if (tunnel != NULL) sk = tunnel->sock; else { struct iphdr *iph = (struct iphdr *) skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); - sk = __l2tp_ip_bind_lookup(&init_net, iph->daddr, 0, tunnel_id); + sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id); read_unlock_bh(&l2tp_ip_lock); } @@ -228,10 +228,16 @@ static void l2tp_ip_close(struct sock *sk, long timeout) static void l2tp_ip_destroy_sock(struct sock *sk) { struct sk_buff *skb; + struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); + if (tunnel) { + l2tp_tunnel_closeall(tunnel); + sock_put(sk); + } + sk_refcnt_debug_dec(sk); } @@ -239,6 +245,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr; + struct net *net = sock_net(sk); int ret; int chk_addr_ret; @@ -251,7 +258,8 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) ret = -EADDRINUSE; read_lock_bh(&l2tp_ip_lock); - if (__l2tp_ip_bind_lookup(&init_net, addr->l2tp_addr.s_addr, sk->sk_bound_dev_if, addr->l2tp_conn_id)) + if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, + sk->sk_bound_dev_if, addr->l2tp_conn_id)) goto out_in_use; read_unlock_bh(&l2tp_ip_lock); @@ -260,7 +268,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) goto out; - chk_addr_ret = inet_addr_type(&init_net, addr->l2tp_addr.s_addr); + chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -369,7 +377,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) return 0; drop: - IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return -1; } @@ -605,6 +613,7 @@ static struct inet_protosw l2tp_ip_protosw = { static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, + .netns_ok = 1, }; static int __init l2tp_ip_init(void) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 8ee4a86ae996..b8a6039314e8 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -60,10 +60,9 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net, struct in6_addr *laddr, int dif, u32 tunnel_id) { - struct hlist_node *node; struct sock *sk; - sk_for_each_bound(sk, node, &l2tp_ip6_bind_table) { + 
sk_for_each_bound(sk, &l2tp_ip6_bind_table) { struct in6_addr *addr = inet6_rcv_saddr(sk); struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); @@ -242,10 +241,17 @@ static void l2tp_ip6_close(struct sock *sk, long timeout) static void l2tp_ip6_destroy_sock(struct sock *sk) { + struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); + lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); + if (tunnel) { + l2tp_tunnel_closeall(tunnel); + sock_put(sk); + } + inet6_destroy_sock(sk); } @@ -684,6 +690,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, lsa->l2tp_addr = ipv6_hdr(skb)->saddr; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; + lsa->l2tp_conn_id = 0; if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = IP6CB(skb)->iif; } diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index bbba3a19e944..0825ff26e113 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -37,6 +37,7 @@ static struct genl_family l2tp_nl_family = { .version = L2TP_GENL_VERSION, .hdrsize = 0, .maxattr = L2TP_ATTR_MAX, + .netnsok = true, }; /* Accessed under genl lock */ @@ -245,8 +246,6 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *np = NULL; #endif - struct l2tp_stats stats; - unsigned int start; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, L2TP_CMD_TUNNEL_GET); @@ -264,28 +263,22 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla if (nest == NULL) goto nla_put_failure; - do { - start = u64_stats_fetch_begin(&tunnel->stats.syncp); - stats.tx_packets = tunnel->stats.tx_packets; - stats.tx_bytes = tunnel->stats.tx_bytes; - stats.tx_errors = tunnel->stats.tx_errors; - stats.rx_packets = tunnel->stats.rx_packets; - stats.rx_bytes = tunnel->stats.rx_bytes; - stats.rx_errors = tunnel->stats.rx_errors; - stats.rx_seq_discards = tunnel->stats.rx_seq_discards; - stats.rx_oos_packets = tunnel->stats.rx_oos_packets; - } while (u64_stats_fetch_retry(&tunnel->stats.syncp, start)); - - if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, stats.tx_packets) || - nla_put_u64(skb, L2TP_ATTR_TX_BYTES, stats.tx_bytes) || - nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, stats.tx_errors) || - nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, stats.rx_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_BYTES, stats.rx_bytes) || + if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, + atomic_long_read(&tunnel->stats.tx_packets)) || + nla_put_u64(skb, L2TP_ATTR_TX_BYTES, + atomic_long_read(&tunnel->stats.tx_bytes)) || + nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, + atomic_long_read(&tunnel->stats.tx_errors)) || + nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, + atomic_long_read(&tunnel->stats.rx_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_BYTES, + atomic_long_read(&tunnel->stats.rx_bytes)) || nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, - stats.rx_seq_discards) || + atomic_long_read(&tunnel->stats.rx_seq_discards)) || nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS, - stats.rx_oos_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, stats.rx_errors)) + atomic_long_read(&tunnel->stats.rx_oos_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, + atomic_long_read(&tunnel->stats.rx_errors))) goto nla_put_failure; nla_nest_end(skb, nest); @@ -611,8 +604,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl struct nlattr *nest; struct l2tp_tunnel *tunnel = session->tunnel; struct sock *sk = NULL; - struct l2tp_stats stats; - unsigned int start; sk = tunnel->sock; @@ -655,28 
+646,22 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl if (nest == NULL) goto nla_put_failure; - do { - start = u64_stats_fetch_begin(&session->stats.syncp); - stats.tx_packets = session->stats.tx_packets; - stats.tx_bytes = session->stats.tx_bytes; - stats.tx_errors = session->stats.tx_errors; - stats.rx_packets = session->stats.rx_packets; - stats.rx_bytes = session->stats.rx_bytes; - stats.rx_errors = session->stats.rx_errors; - stats.rx_seq_discards = session->stats.rx_seq_discards; - stats.rx_oos_packets = session->stats.rx_oos_packets; - } while (u64_stats_fetch_retry(&session->stats.syncp, start)); - - if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, stats.tx_packets) || - nla_put_u64(skb, L2TP_ATTR_TX_BYTES, stats.tx_bytes) || - nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, stats.tx_errors) || - nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, stats.rx_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_BYTES, stats.rx_bytes) || + if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, + atomic_long_read(&session->stats.tx_packets)) || + nla_put_u64(skb, L2TP_ATTR_TX_BYTES, + atomic_long_read(&session->stats.tx_bytes)) || + nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, + atomic_long_read(&session->stats.tx_errors)) || + nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, + atomic_long_read(&session->stats.rx_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_BYTES, + atomic_long_read(&session->stats.rx_bytes)) || nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, - stats.rx_seq_discards) || + atomic_long_read(&session->stats.rx_seq_discards)) || nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS, - stats.rx_oos_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, stats.rx_errors)) + atomic_long_read(&session->stats.rx_oos_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, + atomic_long_read(&session->stats.rx_errors))) goto nla_put_failure; nla_nest_end(skb, nest); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 716605c241f4..637a341c1e2d 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -97,6 +97,7 @@ #include <net/ip.h> #include <net/udp.h> #include <net/xfrm.h> +#include <net/inet_common.h> #include <asm/byteorder.h> #include <linux/atomic.h> @@ -259,7 +260,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int session->name); /* Not bound. Nothing we can do, so discard. 
*/ - session->stats.rx_errors++; + atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } @@ -355,6 +356,7 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh l2tp_xmit_skb(session, skb, session->hdr_len); sock_put(ps->tunnel_sock); + sock_put(sk); return error; @@ -446,34 +448,16 @@ static void pppol2tp_session_close(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); struct sock *sk = ps->sock; - struct sk_buff *skb; + struct socket *sock = sk->sk_socket; BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (session->session_id == 0) - goto out; - - if (sk != NULL) { - lock_sock(sk); - - if (sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { - pppox_unbind_sock(sk); - sk->sk_state = PPPOX_DEAD; - sk->sk_state_change(sk); - } - - /* Purge any queued data */ - skb_queue_purge(&sk->sk_receive_queue); - skb_queue_purge(&sk->sk_write_queue); - while ((skb = skb_dequeue(&session->reorder_q))) { - kfree_skb(skb); - sock_put(sk); - } - release_sock(sk); + if (sock) { + inet_shutdown(sock, 2); + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } - -out: return; } @@ -482,19 +466,12 @@ out: */ static void pppol2tp_session_destruct(struct sock *sk) { - struct l2tp_session *session; - - if (sk->sk_user_data != NULL) { - session = sk->sk_user_data; - if (session == NULL) - goto out; - + struct l2tp_session *session = sk->sk_user_data; + if (session) { sk->sk_user_data = NULL; BUG_ON(session->magic != L2TP_SESSION_MAGIC); l2tp_session_dec_refcount(session); } - -out: return; } @@ -524,16 +501,13 @@ static int pppol2tp_release(struct socket *sock) session = pppol2tp_sock_to_session(sk); /* Purge any queued data */ - skb_queue_purge(&sk->sk_receive_queue); - skb_queue_purge(&sk->sk_write_queue); if (session != NULL) { - struct sk_buff *skb; - while ((skb = skb_dequeue(&session->reorder_q))) { - kfree_skb(skb); - sock_put(sk); - } + __l2tp_session_unhash(session); + l2tp_session_queue_purge(session); sock_put(sk); } + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); release_sock(sk); @@ -879,18 +853,6 @@ out: return error; } -/* Called when deleting sessions via the netlink interface. - */ -static int pppol2tp_session_delete(struct l2tp_session *session) -{ - struct pppol2tp_session *ps = l2tp_session_priv(session); - - if (ps->sock == NULL) - l2tp_session_dec_refcount(session); - - return 0; -} - #endif /* CONFIG_L2TP_V3 */ /* getname() support. @@ -1024,14 +986,14 @@ end: static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, struct l2tp_stats *stats) { - dest->tx_packets = stats->tx_packets; - dest->tx_bytes = stats->tx_bytes; - dest->tx_errors = stats->tx_errors; - dest->rx_packets = stats->rx_packets; - dest->rx_bytes = stats->rx_bytes; - dest->rx_seq_discards = stats->rx_seq_discards; - dest->rx_oos_packets = stats->rx_oos_packets; - dest->rx_errors = stats->rx_errors; + dest->tx_packets = atomic_long_read(&stats->tx_packets); + dest->tx_bytes = atomic_long_read(&stats->tx_bytes); + dest->tx_errors = atomic_long_read(&stats->tx_errors); + dest->rx_packets = atomic_long_read(&stats->rx_packets); + dest->rx_bytes = atomic_long_read(&stats->rx_bytes); + dest->rx_seq_discards = atomic_long_read(&stats->rx_seq_discards); + dest->rx_oos_packets = atomic_long_read(&stats->rx_oos_packets); + dest->rx_errors = atomic_long_read(&stats->rx_errors); } /* Session ioctl helper. 
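The pppol2tp hunks above lean on the reworked core teardown: pppol2tp_release() now unhashes and purges the session before draining the socket queues, and the netlink .session_delete op points straight at the generic l2tp_session_delete(). Read as a sequence, the ordering that l2tp_session_delete() enforces looks like this; a commented restatement of code already shown in the core diff, assuming the ref/deref callbacks declared in l2tp_core.h, not additional kernel code:

#include "l2tp_core.h"	/* struct l2tp_session, as included by l2tp_ppp.c */

static void example_session_teardown(struct l2tp_session *session)
{
	/* Pin the session across teardown if the pseudowire asks for it */
	if (session->ref)
		(*session->ref)(session);

	/* 1. Unlink from the tunnel hash and, for L2TPv3, from the
	 *    per-net hash, so no new lookups can find the session.
	 */
	__l2tp_session_unhash(session);

	/* 2. Drop anything still sitting in reorder_q; the purge also
	 *    runs session->deref once per queued skb.
	 */
	l2tp_session_queue_purge(session);

	/* 3. Only now let the pseudowire shut down its socket state */
	if (session->session_close)
		(*session->session_close)(session);

	if (session->deref)
		(*session->deref)(session);	/* undo the pin above */

	/* 4. Drop the original lookup/creation reference; the session
	 *    may be freed here via l2tp_session_free().
	 */
	l2tp_session_dec_refcount(session);
}

Doing the unhash and purge in the core, ahead of the pseudowire's session_close callback, is what allows the private pppol2tp_session_delete() wrapper to be deleted above.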
@@ -1665,14 +1627,14 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) tunnel->name, (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N', atomic_read(&tunnel->ref_count) - 1); - seq_printf(m, " %08x %llu/%llu/%llu %llu/%llu/%llu\n", + seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", tunnel->debug, - (unsigned long long)tunnel->stats.tx_packets, - (unsigned long long)tunnel->stats.tx_bytes, - (unsigned long long)tunnel->stats.tx_errors, - (unsigned long long)tunnel->stats.rx_packets, - (unsigned long long)tunnel->stats.rx_bytes, - (unsigned long long)tunnel->stats.rx_errors); + atomic_long_read(&tunnel->stats.tx_packets), + atomic_long_read(&tunnel->stats.tx_bytes), + atomic_long_read(&tunnel->stats.tx_errors), + atomic_long_read(&tunnel->stats.rx_packets), + atomic_long_read(&tunnel->stats.rx_bytes), + atomic_long_read(&tunnel->stats.rx_errors)); } static void pppol2tp_seq_session_show(struct seq_file *m, void *v) @@ -1707,14 +1669,14 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->lns_mode ? "LNS" : "LAC", session->debug, jiffies_to_msecs(session->reorder_timeout)); - seq_printf(m, " %hu/%hu %llu/%llu/%llu %llu/%llu/%llu\n", + seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, - (unsigned long long)session->stats.tx_packets, - (unsigned long long)session->stats.tx_bytes, - (unsigned long long)session->stats.tx_errors, - (unsigned long long)session->stats.rx_packets, - (unsigned long long)session->stats.rx_bytes, - (unsigned long long)session->stats.rx_errors); + atomic_long_read(&session->stats.tx_packets), + atomic_long_read(&session->stats.tx_bytes), + atomic_long_read(&session->stats.tx_errors), + atomic_long_read(&session->stats.rx_packets), + atomic_long_read(&session->stats.rx_bytes), + atomic_long_read(&session->stats.rx_errors)); if (po) seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); @@ -1783,7 +1745,8 @@ static __net_init int pppol2tp_init_net(struct net *net) struct proc_dir_entry *pde; int err = 0; - pde = proc_net_fops_create(net, "pppol2tp", S_IRUGO, &pppol2tp_proc_fops); + pde = proc_create("pppol2tp", S_IRUGO, net->proc_net, + &pppol2tp_proc_fops); if (!pde) { err = -ENOMEM; goto out; @@ -1795,7 +1758,7 @@ out: static __net_exit void pppol2tp_exit_net(struct net *net) { - proc_net_remove(net, "pppol2tp"); + remove_proc_entry("pppol2tp", net->proc_net); } static struct pernet_operations pppol2tp_net_ops = { @@ -1837,7 +1800,7 @@ static const struct pppox_proto pppol2tp_proto = { static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = { .session_create = pppol2tp_session_create, - .session_delete = pppol2tp_session_delete, + .session_delete = l2tp_session_delete, }; #endif /* CONFIG_L2TP_V3 */ diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig index f0b5efb31a00..6481839b76c9 100644 --- a/net/lapb/Kconfig +++ b/net/lapb/Kconfig @@ -3,8 +3,7 @@ # config LAPB - tristate "LAPB Data Link Driver (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "LAPB Data Link Driver" ---help--- Link Access Procedure, Balanced (LAPB) is the data link layer (i.e. the lower) part of the X.25 protocol. 
It offers a reliable diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 88709882c464..48aaa89253e0 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -720,6 +720,8 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, int target; /* Read at least this many bytes */ long timeo; + msg->msg_namelen = 0; + lock_sock(sk); copied = -ENOTCONN; if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 7c5073badc73..78be45cda5c1 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -393,12 +393,11 @@ static void llc_sap_mcast(struct llc_sap *sap, { int i = 0, count = 256 / sizeof(struct sock *); struct sock *sk, *stack[count]; - struct hlist_node *node; struct llc_sock *llc; struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); spin_lock_bh(&sap->sk_lock); - hlist_for_each_entry(llc, node, dev_hb, dev_hash_node) { + hlist_for_each_entry(llc, dev_hb, dev_hash_node) { sk = &llc->sk; diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index b4ecf267a34b..62535fe9f570 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -81,7 +81,7 @@ comment "Some wireless drivers require a rate control algorithm" config MAC80211_MESH bool "Enable mac80211 mesh networking (pre-802.11s) support" - depends on MAC80211 && EXPERIMENTAL + depends on MAC80211 ---help--- This options enables support of Draft 802.11s mesh networking. The implementation is based on Draft 2.08 of the Mesh Networking @@ -258,6 +258,17 @@ config MAC80211_MESH_SYNC_DEBUG Do not select this option. +config MAC80211_MESH_PS_DEBUG + bool "Verbose mesh powersave debugging" + depends on MAC80211_DEBUG_MENU + depends on MAC80211_MESH + ---help--- + Selecting this option causes mac80211 to print out very verbose mesh + powersave debugging messages (when mac80211 is taking part in a + mesh network). + + Do not select this option. 
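The one-argument-shorter sk_for_each_bound() and hlist_for_each_entry() calls in the l2tp_ip, l2tp_ip6 and llc_sap hunks all stem from the same tree-wide cleanup that dropped the node parameter from hlist iterators. A self-contained sketch with a hypothetical item struct; the macro arity shown is the post-cleanup one, so the old form appears only in the comment:

#include <linux/list.h>

struct item {
	int value;
	struct hlist_node link;	/* chains item into an hlist */
};

static int sum_items(struct hlist_head *head)
{
	struct item *it;
	int sum = 0;

	/* The older iterator needed an explicit cursor variable:
	 *
	 *	struct hlist_node *pos;
	 *	hlist_for_each_entry(it, pos, head, link)
	 *
	 * The reworked macro derives the entry pointer directly from
	 * the list linkage, so the cursor, and its declaration in every
	 * caller (the deleted "struct hlist_node *node" lines above),
	 * simply goes away.
	 */
	hlist_for_each_entry(it, head, link)
		sum += it->value;

	return sum;
}

sk_for_each_bound() is a thin wrapper around the same macro (iterating over sk_bind_node), which is why it loses its node argument in lockstep.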
+ config MAC80211_TDLS_DEBUG bool "Verbose TDLS debugging" depends on MAC80211_DEBUG_MENU diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 4911202334d9..9d7d840aac6d 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -39,7 +39,8 @@ mac80211-$(CONFIG_MAC80211_MESH) += \ mesh_pathtbl.o \ mesh_plink.o \ mesh_hwmp.o \ - mesh_sync.o + mesh_sync.o \ + mesh_ps.o mac80211-$(CONFIG_PM) += pm.o diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 808338a1bce5..31bf2586fb84 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -83,8 +83,8 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP, &sta->sta, tid, NULL, 0)) sdata_info(sta->sdata, - "HW problem - can not stop rx aggregation for tid %d\n", - tid); + "HW problem - can not stop rx aggregation for %pM tid %d\n", + sta->sta.addr, tid); /* check if this is a self generated aggregation halt */ if (initiator == WLAN_BACK_RECIPIENT && tx) @@ -159,7 +159,8 @@ static void sta_rx_agg_session_timer_expired(unsigned long data) } rcu_read_unlock(); - ht_dbg(sta->sdata, "rx session timer expired on tid %d\n", (u16)*ptid); + ht_dbg(sta->sdata, "RX session timer expired on %pM tid %d\n", + sta->sta.addr, (u16)*ptid); set_bit(*ptid, sta->ampdu_mlme.tid_rx_timer_expired); ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work); @@ -247,7 +248,9 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, status = WLAN_STATUS_REQUEST_DECLINED; if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { - ht_dbg(sta->sdata, "Suspend in progress - Denying ADDBA request\n"); + ht_dbg(sta->sdata, + "Suspend in progress - Denying ADDBA request (%pM tid %d)\n", + sta->sta.addr, tid); goto end_no_lock; } @@ -317,7 +320,8 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START, &sta->sta, tid, &start_seq_num, 0); - ht_dbg(sta->sdata, "Rx A-MPDU request on tid %d result %d\n", tid, ret); + ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n", + sta->sta.addr, tid, ret); if (ret) { kfree(tid_agg_rx->reorder_buf); kfree(tid_agg_rx->reorder_time); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index eb9df22418f0..13b7683de5a4 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -149,16 +149,133 @@ void ieee80211_assign_tid_tx(struct sta_info *sta, int tid, rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], tid_tx); } +static inline int ieee80211_ac_from_tid(int tid) +{ + return ieee802_1d_to_ac[tid & 7]; +} + +/* + * When multiple aggregation sessions on multiple stations + * are being created/destroyed simultaneously, we need to + * refcount the global queue stop caused by that in order + * to not get into a situation where one of the aggregation + * setup or teardown re-enables queues before the other is + * ready to handle that. + * + * These two functions take care of this issue by keeping + * a global "agg_queue_stop" refcount. 
+ */ +static void __acquires(agg_queue) +ieee80211_stop_queue_agg(struct ieee80211_sub_if_data *sdata, int tid) +{ + int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)]; + + if (atomic_inc_return(&sdata->local->agg_queue_stop[queue]) == 1) + ieee80211_stop_queue_by_reason( + &sdata->local->hw, queue, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + __acquire(agg_queue); +} + +static void __releases(agg_queue) +ieee80211_wake_queue_agg(struct ieee80211_sub_if_data *sdata, int tid) +{ + int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)]; + + if (atomic_dec_return(&sdata->local->agg_queue_stop[queue]) == 0) + ieee80211_wake_queue_by_reason( + &sdata->local->hw, queue, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + __release(agg_queue); +} + +/* + * splice packets from the STA's pending to the local pending, + * requires a call to ieee80211_agg_splice_finish later + */ +static void __acquires(agg_queue) +ieee80211_agg_splice_packets(struct ieee80211_sub_if_data *sdata, + struct tid_ampdu_tx *tid_tx, u16 tid) +{ + struct ieee80211_local *local = sdata->local; + int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)]; + unsigned long flags; + + ieee80211_stop_queue_agg(sdata, tid); + + if (WARN(!tid_tx, + "TID %d gone but expected when splicing aggregates from the pending queue\n", + tid)) + return; + + if (!skb_queue_empty(&tid_tx->pending)) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + /* copy over remaining packets */ + skb_queue_splice_tail_init(&tid_tx->pending, + &local->pending[queue]); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + } +} + +static void __releases(agg_queue) +ieee80211_agg_splice_finish(struct ieee80211_sub_if_data *sdata, u16 tid) +{ + ieee80211_wake_queue_agg(sdata, tid); +} + +static void ieee80211_remove_tid_tx(struct sta_info *sta, int tid) +{ + struct tid_ampdu_tx *tid_tx; + + lockdep_assert_held(&sta->ampdu_mlme.mtx); + lockdep_assert_held(&sta->lock); + + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + + /* + * When we get here, the TX path will not be lockless any more wrt. + * aggregation, since the OPERATIONAL bit has long been cleared. + * Thus it will block on getting the lock, if it occurs. So if we + * stop the queue now, we will not get any more packets, and any + * that might be being processed will wait for us here, thereby + * guaranteeing that no packets go to the tid_tx pending queue any + * more. 
+ */ + + ieee80211_agg_splice_packets(sta->sdata, tid_tx, tid); + + /* future packets must not find the tid_tx struct any more */ + ieee80211_assign_tid_tx(sta, tid, NULL); + + ieee80211_agg_splice_finish(sta->sdata, tid); + + kfree_rcu(tid_tx, rcu_head); +} + int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator, - bool tx) + enum ieee80211_agg_stop_reason reason) { struct ieee80211_local *local = sta->local; struct tid_ampdu_tx *tid_tx; + enum ieee80211_ampdu_mlme_action action; int ret; lockdep_assert_held(&sta->ampdu_mlme.mtx); + switch (reason) { + case AGG_STOP_DECLINED: + case AGG_STOP_LOCAL_REQUEST: + case AGG_STOP_PEER_REQUEST: + action = IEEE80211_AMPDU_TX_STOP_CONT; + break; + case AGG_STOP_DESTROY_STA: + action = IEEE80211_AMPDU_TX_STOP_FLUSH; + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + spin_lock_bh(&sta->lock); tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -167,10 +284,19 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, return -ENOENT; } - /* if we're already stopping ignore any new requests to stop */ + /* + * if we're already stopping ignore any new requests to stop + * unless we're destroying it in which case notify the driver + */ if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { spin_unlock_bh(&sta->lock); - return -EALREADY; + if (reason != AGG_STOP_DESTROY_STA) + return -EALREADY; + ret = drv_ampdu_action(local, sta->sdata, + IEEE80211_AMPDU_TX_STOP_FLUSH_CONT, + &sta->sta, tid, NULL, 0); + WARN_ON_ONCE(ret); + return 0; } if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) { @@ -212,11 +338,12 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, */ synchronize_net(); - tid_tx->stop_initiator = initiator; - tid_tx->tx_stop = tx; + tid_tx->stop_initiator = reason == AGG_STOP_PEER_REQUEST ? + WLAN_BACK_RECIPIENT : + WLAN_BACK_INITIATOR; + tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST; - ret = drv_ampdu_action(local, sta->sdata, - IEEE80211_AMPDU_TX_STOP, + ret = drv_ampdu_action(local, sta->sdata, action, &sta->sta, tid, NULL, 0); /* HW shall not deny going back to legacy */ @@ -227,7 +354,17 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, */ } - return ret; + /* + * In the case of AGG_STOP_DESTROY_STA, the driver won't + * necessarily call ieee80211_stop_tx_ba_cb(), so this may + * seem like we can leave the tid_tx data pending forever. + * This is true, in a way, but "forever" is only until the + * station struct is actually destroyed. In the meantime, + * leaving it around ensures that we don't transmit packets + * to the driver on this TID which might confuse it. 
+ */ + + return 0; } /* @@ -253,91 +390,18 @@ static void sta_addba_resp_timer_expired(unsigned long data) test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { rcu_read_unlock(); ht_dbg(sta->sdata, - "timer expired on tid %d but we are not (or no longer) expecting addBA response there\n", - tid); + "timer expired on %pM tid %d but we are not (or no longer) expecting addBA response there\n", + sta->sta.addr, tid); return; } - ht_dbg(sta->sdata, "addBA response timer expired on tid %d\n", tid); + ht_dbg(sta->sdata, "addBA response timer expired on %pM tid %d\n", + sta->sta.addr, tid); ieee80211_stop_tx_ba_session(&sta->sta, tid); rcu_read_unlock(); } -static inline int ieee80211_ac_from_tid(int tid) -{ - return ieee802_1d_to_ac[tid & 7]; -} - -/* - * When multiple aggregation sessions on multiple stations - * are being created/destroyed simultaneously, we need to - * refcount the global queue stop caused by that in order - * to not get into a situation where one of the aggregation - * setup or teardown re-enables queues before the other is - * ready to handle that. - * - * These two functions take care of this issue by keeping - * a global "agg_queue_stop" refcount. - */ -static void __acquires(agg_queue) -ieee80211_stop_queue_agg(struct ieee80211_sub_if_data *sdata, int tid) -{ - int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)]; - - if (atomic_inc_return(&sdata->local->agg_queue_stop[queue]) == 1) - ieee80211_stop_queue_by_reason( - &sdata->local->hw, queue, - IEEE80211_QUEUE_STOP_REASON_AGGREGATION); - __acquire(agg_queue); -} - -static void __releases(agg_queue) -ieee80211_wake_queue_agg(struct ieee80211_sub_if_data *sdata, int tid) -{ - int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)]; - - if (atomic_dec_return(&sdata->local->agg_queue_stop[queue]) == 0) - ieee80211_wake_queue_by_reason( - &sdata->local->hw, queue, - IEEE80211_QUEUE_STOP_REASON_AGGREGATION); - __release(agg_queue); -} - -/* - * splice packets from the STA's pending to the local pending, - * requires a call to ieee80211_agg_splice_finish later - */ -static void __acquires(agg_queue) -ieee80211_agg_splice_packets(struct ieee80211_sub_if_data *sdata, - struct tid_ampdu_tx *tid_tx, u16 tid) -{ - struct ieee80211_local *local = sdata->local; - int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)]; - unsigned long flags; - - ieee80211_stop_queue_agg(sdata, tid); - - if (WARN(!tid_tx, - "TID %d gone but expected when splicing aggregates from the pending queue\n", - tid)) - return; - - if (!skb_queue_empty(&tid_tx->pending)) { - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - /* copy over remaining packets */ - skb_queue_splice_tail_init(&tid_tx->pending, - &local->pending[queue]); - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - } -} - -static void __releases(agg_queue) -ieee80211_agg_splice_finish(struct ieee80211_sub_if_data *sdata, u16 tid) -{ - ieee80211_wake_queue_agg(sdata, tid); -} - void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) { struct tid_ampdu_tx *tid_tx; @@ -369,7 +433,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) &sta->sta, tid, &start_seq_num, 0); if (ret) { ht_dbg(sdata, - "BA request denied - HW unavailable for tid %d\n", tid); + "BA request denied - HW unavailable for %pM tid %d\n", + sta->sta.addr, tid); spin_lock_bh(&sta->lock); ieee80211_agg_splice_packets(sdata, tid_tx, tid); ieee80211_assign_tid_tx(sta, tid, NULL); @@ -382,7 +447,8 @@ void ieee80211_tx_ba_session_handle_start(struct 
sta_info *sta, int tid) /* activate the timer for the recipient's addBA response */ mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); - ht_dbg(sdata, "activated addBA response timer on tid %d\n", tid); + ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", + sta->sta.addr, tid); spin_lock_bh(&sta->lock); sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; @@ -429,7 +495,8 @@ static void sta_tx_agg_session_timer_expired(unsigned long data) rcu_read_unlock(); - ht_dbg(sta->sdata, "tx session timer expired on tid %d\n", (u16)*ptid); + ht_dbg(sta->sdata, "tx session timer expired on %pM tid %d\n", + sta->sta.addr, (u16)*ptid); ieee80211_stop_tx_ba_session(&sta->sta, *ptid); } @@ -465,7 +532,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { ht_dbg(sdata, - "BA sessions blocked - Denying BA session request\n"); + "BA sessions blocked - Denying BA session request %pM tid %d\n", + sta->sta.addr, tid); return -EINVAL; } @@ -506,8 +574,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, time_before(jiffies, sta->ampdu_mlme.last_addba_req_time[tid] + HT_AGG_RETRIES_PERIOD)) { ht_dbg(sdata, - "BA request denied - waiting a grace period after %d failed requests on tid %u\n", - sta->ampdu_mlme.addba_req_num[tid], tid); + "BA request denied - waiting a grace period after %d failed requests on %pM tid %u\n", + sta->ampdu_mlme.addba_req_num[tid], sta->sta.addr, tid); ret = -EBUSY; goto err_unlock_sta; } @@ -516,8 +584,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, /* check if the TID is not in aggregation flow already */ if (tid_tx || sta->ampdu_mlme.tid_start_tx[tid]) { ht_dbg(sdata, - "BA request denied - session is not idle on tid %u\n", - tid); + "BA request denied - session is not idle on %pM tid %u\n", + sta->sta.addr, tid); ret = -EAGAIN; goto err_unlock_sta; } @@ -572,7 +640,8 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local, tid_tx = rcu_dereference_protected_tid_tx(sta, tid); - ht_dbg(sta->sdata, "Aggregation is on for tid %d\n", tid); + ht_dbg(sta->sdata, "Aggregation is on for %pM tid %d\n", + sta->sta.addr, tid); drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_TX_OPERATIONAL, @@ -660,14 +729,13 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe); int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator, - bool tx) + enum ieee80211_agg_stop_reason reason) { int ret; mutex_lock(&sta->ampdu_mlme.mtx); - ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator, tx); + ret = ___ieee80211_stop_tx_ba_session(sta, tid, reason); mutex_unlock(&sta->ampdu_mlme.mtx); @@ -743,7 +811,9 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) tid_tx = rcu_dereference_protected_tid_tx(sta, tid); if (!tid_tx || !test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { - ht_dbg(sdata, "unexpected callback to A-MPDU stop\n"); + ht_dbg(sdata, + "unexpected callback to A-MPDU stop for %pM tid %d\n", + sta->sta.addr, tid); goto unlock_sta; } @@ -751,24 +821,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) ieee80211_send_delba(sta->sdata, ra, tid, WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); - /* - * When we get here, the TX path will not be lockless any more wrt. - * aggregation, since the OPERATIONAL bit has long been cleared. 
- * Thus it will block on getting the lock, if it occurs. So if we - * stop the queue now, we will not get any more packets, and any - * that might be being processed will wait for us here, thereby - * guaranteeing that no packets go to the tid_tx pending queue any - * more. - */ - - ieee80211_agg_splice_packets(sta->sdata, tid_tx, tid); - - /* future packets must not find the tid_tx struct any more */ - ieee80211_assign_tid_tx(sta, tid, NULL); - - ieee80211_agg_splice_finish(sta->sdata, tid); - - kfree_rcu(tid_tx, rcu_head); + ieee80211_remove_tid_tx(sta, tid); unlock_sta: spin_unlock_bh(&sta->lock); @@ -819,13 +872,15 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, goto out; if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) { - ht_dbg(sta->sdata, "wrong addBA response token, tid %d\n", tid); + ht_dbg(sta->sdata, "wrong addBA response token, %pM tid %d\n", + sta->sta.addr, tid); goto out; } del_timer_sync(&tid_tx->addba_resp_timer); - ht_dbg(sta->sdata, "switched off addBA timer for tid %d\n", tid); + ht_dbg(sta->sdata, "switched off addBA timer for %pM tid %d\n", + sta->sta.addr, tid); /* * addba_resp_timer may have fired before we got here, and @@ -835,8 +890,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, if (test_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state) || test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { ht_dbg(sta->sdata, - "got addBA resp for tid %d but we already gave up\n", - tid); + "got addBA resp for %pM tid %d but we already gave up\n", + sta->sta.addr, tid); goto out; } @@ -868,8 +923,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, } } else { - ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR, - false); + ___ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_DECLINED); } out: diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0479c64aa83c..1a89c80e6407 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -175,7 +175,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, * add it to the device after the station. 
*/ if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) { - ieee80211_key_free(sdata->local, key); + ieee80211_key_free_unused(key); err = -ENOENT; goto out_unlock; } @@ -214,8 +214,6 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, } err = ieee80211_key_link(key, sdata, sta); - if (err) - ieee80211_key_free(sdata->local, key); out_unlock: mutex_unlock(&sdata->local->sta_mtx); @@ -254,7 +252,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, goto out_unlock; } - __ieee80211_key_free(key); + ieee80211_key_free(key, true); ret = 0; out_unlock: @@ -445,12 +443,14 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct timespec uptime; + u64 packets = 0; + int ac; sinfo->generation = sdata->local->sta_generation; sinfo->filled = STATION_INFO_INACTIVE_TIME | - STATION_INFO_RX_BYTES | - STATION_INFO_TX_BYTES | + STATION_INFO_RX_BYTES64 | + STATION_INFO_TX_BYTES64 | STATION_INFO_RX_PACKETS | STATION_INFO_TX_PACKETS | STATION_INFO_TX_RETRIES | @@ -467,10 +467,14 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->connected_time = uptime.tv_sec - sta->last_connected; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); + sinfo->tx_bytes = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + sinfo->tx_bytes += sta->tx_bytes[ac]; + packets += sta->tx_packets[ac]; + } + sinfo->tx_packets = packets; sinfo->rx_bytes = sta->rx_bytes; - sinfo->tx_bytes = sta->tx_bytes; sinfo->rx_packets = sta->rx_packets; - sinfo->tx_packets = sta->tx_packets; sinfo->tx_retries = sta->tx_retry_count; sinfo->tx_failed = sta->tx_retry_failed; sinfo->rx_dropped_misc = sta->rx_dropped; @@ -492,7 +496,10 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) #ifdef CONFIG_MAC80211_MESH sinfo->filled |= STATION_INFO_LLID | STATION_INFO_PLID | - STATION_INFO_PLINK_STATE; + STATION_INFO_PLINK_STATE | + STATION_INFO_LOCAL_PM | + STATION_INFO_PEER_PM | + STATION_INFO_NONPEER_PM; sinfo->llid = le16_to_cpu(sta->llid); sinfo->plid = le16_to_cpu(sta->plid); @@ -501,6 +508,9 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->filled |= STATION_INFO_T_OFFSET; sinfo->t_offset = sta->t_offset; } + sinfo->local_pm = sta->local_pm; + sinfo->peer_pm = sta->peer_pm; + sinfo->nonpeer_pm = sta->nonpeer_pm; #endif } @@ -520,6 +530,7 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); @@ -531,6 +542,8 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); + if (test_sta_flag(sta, WLAN_STA_ASSOC)) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); } @@ -589,8 +602,8 @@ static void ieee80211_get_et_stats(struct wiphy *wiphy, data[i++] += sta->rx_fragments; \ data[i++] += sta->rx_dropped; \ \ - data[i++] += sta->tx_packets; \ - data[i++] += sta->tx_bytes; \ + data[i++] += sinfo.tx_packets; \ + data[i++] += 
sinfo.tx_bytes; \ data[i++] += sta->tx_fragments; \ data[i++] += sta->tx_filtered_count; \ data[i++] += sta->tx_retry_failed; \ @@ -612,13 +625,14 @@ static void ieee80211_get_et_stats(struct wiphy *wiphy, if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; + sinfo.filled = 0; + sta_set_sinfo(sta, &sinfo); + i = 0; ADD_STA_STATS(sta); data[i++] = sta->sta_state; - sinfo.filled = 0; - sta_set_sinfo(sta, &sinfo); if (sinfo.filled & STATION_INFO_TX_BITRATE) data[i] = 100000 * @@ -791,8 +805,7 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, IEEE80211_CHANCTX_EXCLUSIVE); } } else if (local->open_count == local->monitors) { - local->_oper_channel = chandef->chan; - local->_oper_channel_type = cfg80211_get_chandef_type(chandef); + local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } @@ -919,11 +932,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, /* TODO: make hostapd tell us what it wants */ sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = sdata->local->rx_chains; + sdata->radar_required = params->radar_required; err = ieee80211_vif_use_channel(sdata, &params->chandef, IEEE80211_CHANCTX_SHARED); if (err) return err; + ieee80211_vif_copy_chanctx_to_vlans(sdata, false); /* * Apply control port protocol, this allows us to @@ -940,6 +955,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.beacon_int = params->beacon_interval; sdata->vif.bss_conf.dtim_period = params->dtim_period; + sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) @@ -948,8 +964,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.hidden_ssid = (params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE); - sdata->vif.bss_conf.p2p_ctwindow = params->p2p_ctwindow; - sdata->vif.bss_conf.p2p_oppps = params->p2p_opp_ps; + memset(&sdata->vif.bss_conf.p2p_noa_attr, 0, + sizeof(sdata->vif.bss_conf.p2p_noa_attr)); + sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow = + params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; + if (params->p2p_opp_ps) + sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= + IEEE80211_P2P_OPPPS_ENABLE_BIT; err = ieee80211_assign_beacon(sdata, &params->beacon); if (err < 0) @@ -1020,8 +1041,20 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree_rcu(old_probe_resp, rcu_head); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) - sta_info_flush(local, vlan); - sta_info_flush(local, sdata); + sta_info_flush_defer(vlan); + sta_info_flush_defer(sdata); + synchronize_net(); + rcu_barrier(); + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { + sta_info_flush_cleanup(vlan); + ieee80211_free_keys(vlan); + } + sta_info_flush_cleanup(sdata); + ieee80211_free_keys(sdata); + + sdata->vif.bss_conf.enable_beacon = false; + sdata->vif.bss_conf.ssid_len = 0; + clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); drv_stop_ap(sdata->local, sdata); @@ -1030,6 +1063,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf); skb_queue_purge(&sdata->u.ap.ps.bc_buf); + ieee80211_vif_copy_chanctx_to_vlans(sdata, true); ieee80211_vif_release_channel(sdata); return 0; @@ -1079,6 +1113,58 @@ static void ieee80211_send_layer2_update(struct sta_info *sta) netif_rx_ni(skb); } +static int 
sta_apply_auth_flags(struct ieee80211_local *local, + struct sta_info *sta, + u32 mask, u32 set) +{ + int ret; + + if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) && + set & BIT(NL80211_STA_FLAG_AUTHENTICATED) && + !test_sta_flag(sta, WLAN_STA_AUTH)) { + ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); + if (ret) + return ret; + } + + if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && + set & BIT(NL80211_STA_FLAG_ASSOCIATED) && + !test_sta_flag(sta, WLAN_STA_ASSOC)) { + ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); + if (ret) + return ret; + } + + if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { + if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) + ret = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); + else if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); + else + ret = 0; + if (ret) + return ret; + } + + if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && + !(set & BIT(NL80211_STA_FLAG_ASSOCIATED)) && + test_sta_flag(sta, WLAN_STA_ASSOC)) { + ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); + if (ret) + return ret; + } + + if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) && + !(set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) && + test_sta_flag(sta, WLAN_STA_AUTH)) { + ret = sta_info_move_state(sta, IEEE80211_STA_NONE); + if (ret) + return ret; + } + + return 0; +} + static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) @@ -1096,52 +1182,32 @@ static int sta_apply_parameters(struct ieee80211_local *local, mask = params->sta_flags_mask; set = params->sta_flags_set; - /* - * In mesh mode, we can clear AUTHENTICATED flag but must - * also make ASSOCIATED follow appropriately for the driver - * API. See also below, after AUTHORIZED changes. - */ - if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) { - /* cfg80211 should not allow this in non-mesh modes */ - if (WARN_ON(!ieee80211_vif_is_mesh(&sdata->vif))) - return -EINVAL; - - if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED) && - !test_sta_flag(sta, WLAN_STA_AUTH)) { - ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); - if (ret) - return ret; - ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); - if (ret) - return ret; - } - } - - if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { - if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) - ret = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); - else if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) - ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); - if (ret) - return ret; - } - - if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) { - /* cfg80211 should not allow this in non-mesh modes */ - if (WARN_ON(!ieee80211_vif_is_mesh(&sdata->vif))) - return -EINVAL; - - if (!(set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) && - test_sta_flag(sta, WLAN_STA_AUTH)) { - ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); - if (ret) - return ret; - ret = sta_info_move_state(sta, IEEE80211_STA_NONE); - if (ret) - return ret; + if (ieee80211_vif_is_mesh(&sdata->vif)) { + /* + * In mesh mode, ASSOCIATED isn't part of the nl80211 + * API but must follow AUTHENTICATED for driver state. 
+ */ + if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) + mask |= BIT(NL80211_STA_FLAG_ASSOCIATED); + if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) + set |= BIT(NL80211_STA_FLAG_ASSOCIATED); + } else if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + /* + * TDLS -- everything follows authorized, but + * only becoming authorized is possible, not + * going back + */ + if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) { + set |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED); + mask |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED); } } + ret = sta_apply_auth_flags(local, sta, mask, set); + if (ret) + return ret; if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) { if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) @@ -1187,10 +1253,11 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta->sta.aid = params->aid; /* - * FIXME: updating the following information is racy when this - * function is called from ieee80211_change_station(). - * However, all this information should be static so - * maybe we should just reject attemps to change it. + * Some of the following updates would be racy if called on an + * existing station, via ieee80211_change_station(). However, + * all such changes are rejected by cfg80211 except for updates + * changing the supported rates on an existing but not yet used + * TDLS peer. */ if (params->listen_interval >= 0) @@ -1211,36 +1278,66 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->ht_capa) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - params->ht_capa, - &sta->sta.ht_cap); + params->ht_capa, sta); if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - params->vht_capa, - &sta->sta.vht_cap); + params->vht_capa, sta); if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH - if (sdata->u.mesh.security & IEEE80211_MESH_SEC_SECURED) + u32 changed = 0; + + if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { switch (params->plink_state) { - case NL80211_PLINK_LISTEN: case NL80211_PLINK_ESTAB: + if (sta->plink_state != NL80211_PLINK_ESTAB) + changed = mesh_plink_inc_estab_count( + sdata); + sta->plink_state = params->plink_state; + + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + sdata->u.mesh.mshcfg.power_mode); + break; + case NL80211_PLINK_LISTEN: case NL80211_PLINK_BLOCKED: + case NL80211_PLINK_OPN_SNT: + case NL80211_PLINK_OPN_RCVD: + case NL80211_PLINK_CNF_RCVD: + case NL80211_PLINK_HOLDING: + if (sta->plink_state == NL80211_PLINK_ESTAB) + changed = mesh_plink_dec_estab_count( + sdata); sta->plink_state = params->plink_state; + + ieee80211_mps_sta_status_update(sta); + changed |= + ieee80211_mps_local_status_update(sdata); break; default: /* nothing */ break; } - else - switch (params->plink_action) { - case PLINK_ACTION_OPEN: - mesh_plink_open(sta); - break; - case PLINK_ACTION_BLOCK: - mesh_plink_block(sta); - break; - } + } + + switch (params->plink_action) { + case NL80211_PLINK_ACTION_NO_ACTION: + /* nothing */ + break; + case NL80211_PLINK_ACTION_OPEN: + changed |= mesh_plink_open(sta); + break; + case NL80211_PLINK_ACTION_BLOCK: + changed |= mesh_plink_block(sta); + break; + } + + if (params->local_pm) + changed |= + ieee80211_mps_set_sta_local_pm(sta, + params->local_pm); + ieee80211_bss_info_change_notify(sdata, changed); #endif } @@ -1275,8 +1372,14 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (!sta) return -ENOMEM; - 
sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); - sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); + /* + * defaults -- if userspace wants something else we'll + * change it accordingly in sta_apply_parameters() + */ + if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) { + sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); + sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); + } err = sta_apply_parameters(local, sta, params); if (err) { @@ -1285,8 +1388,8 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, } /* - * for TDLS, rate control should be initialized only when supported - * rates are known. + * for TDLS, rate control should be initialized only when + * rates are known and station is marked authorized */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) rate_control_rate_init(sta); @@ -1311,7 +1414,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, u8 *mac) { - struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1319,53 +1421,72 @@ static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, if (mac) return sta_info_destroy_addr_bss(sdata, mac); - sta_info_flush(local, sdata); + sta_info_flush(sdata); return 0; } static int ieee80211_change_station(struct wiphy *wiphy, - struct net_device *dev, - u8 *mac, + struct net_device *dev, u8 *mac, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *vlansdata; + enum cfg80211_station_type statype; int err; mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mac); if (!sta) { - mutex_unlock(&local->sta_mtx); - return -ENOENT; + err = -ENOENT; + goto out_err; } - /* in station mode, supported rates are only valid with TDLS */ - if (sdata->vif.type == NL80211_IFTYPE_STATION && - params->supported_rates && - !test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { - mutex_unlock(&local->sta_mtx); - return -EINVAL; + switch (sdata->vif.type) { + case NL80211_IFTYPE_MESH_POINT: + if (sdata->u.mesh.user_mpm) + statype = CFG80211_STA_MESH_PEER_USER; + else + statype = CFG80211_STA_MESH_PEER_KERNEL; + break; + case NL80211_IFTYPE_ADHOC: + statype = CFG80211_STA_IBSS; + break; + case NL80211_IFTYPE_STATION: + if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + statype = CFG80211_STA_AP_STA; + break; + } + if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + statype = CFG80211_STA_TDLS_PEER_ACTIVE; + else + statype = CFG80211_STA_TDLS_PEER_SETUP; + break; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + statype = CFG80211_STA_AP_CLIENT; + break; + default: + err = -EOPNOTSUPP; + goto out_err; } + err = cfg80211_check_station_change(wiphy, params, statype); + if (err) + goto out_err; + if (params->vlan && params->vlan != sta->sdata->dev) { bool prev_4addr = false; bool new_4addr = false; vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); - if (vlansdata->vif.type != NL80211_IFTYPE_AP_VLAN && - vlansdata->vif.type != NL80211_IFTYPE_AP) { - mutex_unlock(&local->sta_mtx); - return -EINVAL; - } - if (params->vlan->ieee80211_ptr->use_4addr) { if (vlansdata->u.vlan.sta) { - mutex_unlock(&local->sta_mtx); - return -EBUSY; + err = -EBUSY; + goto out_err; } rcu_assign_pointer(vlansdata->u.vlan.sta, sta); @@ -1392,12 +1513,12 @@ static int ieee80211_change_station(struct 
wiphy *wiphy, } err = sta_apply_parameters(local, sta, params); - if (err) { - mutex_unlock(&local->sta_mtx); - return err; - } + if (err) + goto out_err; - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && params->supported_rates) + /* When peer becomes authorized, init rate control as well */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + test_sta_flag(sta, WLAN_STA_AUTHORIZED)) rate_control_rate_init(sta); mutex_unlock(&local->sta_mtx); @@ -1407,7 +1528,11 @@ static int ieee80211_change_station(struct wiphy *wiphy, ieee80211_recalc_ps(local, -1); ieee80211_recalc_ps_vif(sdata); } + return 0; +out_err: + mutex_unlock(&local->sta_mtx); + return err; } #ifdef CONFIG_MAC80211_MESH @@ -1417,7 +1542,6 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; struct sta_info *sta; - int err; sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1428,17 +1552,12 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, return -ENOENT; } - err = mesh_path_add(dst, sdata); - if (err) { + mpath = mesh_path_add(sdata, dst); + if (IS_ERR(mpath)) { rcu_read_unlock(); - return err; + return PTR_ERR(mpath); } - mpath = mesh_path_lookup(dst, sdata); - if (!mpath) { - rcu_read_unlock(); - return -ENXIO; - } mesh_path_fix_nexthop(mpath, sta); rcu_read_unlock(); @@ -1446,12 +1565,12 @@ static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev, - u8 *dst) + u8 *dst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (dst) - return mesh_path_del(dst, sdata); + return mesh_path_del(sdata, dst); mesh_path_flush_by_iface(sdata); return 0; @@ -1475,7 +1594,7 @@ static int ieee80211_change_mpath(struct wiphy *wiphy, return -ENOENT; } - mpath = mesh_path_lookup(dst, sdata); + mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; @@ -1539,7 +1658,7 @@ static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); - mpath = mesh_path_lookup(dst, sdata); + mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; @@ -1560,7 +1679,7 @@ static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev, sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); - mpath = mesh_path_lookup_by_idx(idx, sdata); + mpath = mesh_path_lookup_by_idx(sdata, idx); if (!mpath) { rcu_read_unlock(); return -ENOENT; @@ -1615,6 +1734,7 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, ifmsh->mesh_sp_id = setup->sync_method; ifmsh->mesh_pp_id = setup->path_sel_proto; ifmsh->mesh_pm_id = setup->path_metric; + ifmsh->user_mpm = setup->user_mpm; ifmsh->security = IEEE80211_MESH_SEC_NONE; if (setup->is_authenticated) ifmsh->security |= IEEE80211_MESH_SEC_AUTHED; @@ -1625,6 +1745,9 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, memcpy(sdata->vif.bss_conf.mcast_rate, setup->mcast_rate, sizeof(setup->mcast_rate)); + sdata->vif.bss_conf.beacon_int = setup->beacon_interval; + sdata->vif.bss_conf.dtim_period = setup->dtim_period; + return 0; } @@ -1655,8 +1778,11 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, conf->dot11MeshTTL = nconf->dot11MeshTTL; if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask)) conf->element_ttl = nconf->element_ttl; - if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask)) + if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, 
mask)) { + if (ifmsh->user_mpm) + return -EBUSY; conf->auto_open_plinks = nconf->auto_open_plinks; + } if (_chg_mesh_attr(NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, mask)) conf->dot11MeshNbrOffsetMaxNeighbor = nconf->dot11MeshNbrOffsetMaxNeighbor; @@ -1723,6 +1849,14 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, mask)) conf->dot11MeshHWMPconfirmationInterval = nconf->dot11MeshHWMPconfirmationInterval; + if (_chg_mesh_attr(NL80211_MESHCONF_POWER_MODE, mask)) { + conf->power_mode = nconf->power_mode; + ieee80211_mps_local_status_update(sdata); + } + if (_chg_mesh_attr(NL80211_MESHCONF_AWAKE_WINDOW, mask)) + conf->dot11MeshAwakeWindowDuration = + nconf->dot11MeshAwakeWindowDuration; + ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } @@ -1748,9 +1882,7 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, if (err) return err; - ieee80211_start_mesh(sdata); - - return 0; + return ieee80211_start_mesh(sdata); } static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev) @@ -1829,12 +1961,20 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (params->p2p_ctwindow >= 0) { - sdata->vif.bss_conf.p2p_ctwindow = params->p2p_ctwindow; + sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &= + ~IEEE80211_P2P_OPPPS_CTWINDOW_MASK; + sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= + params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; changed |= BSS_CHANGED_P2P_PS; } - if (params->p2p_opp_ps >= 0) { - sdata->vif.bss_conf.p2p_oppps = params->p2p_opp_ps; + if (params->p2p_opp_ps > 0) { + sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= + IEEE80211_P2P_OPPPS_ENABLE_BIT; + changed |= BSS_CHANGED_P2P_PS; + } else if (params->p2p_opp_ps == 0) { + sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &= + ~IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } @@ -2208,7 +2348,8 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - if (sdata->vif.type != NL80211_IFTYPE_STATION) + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) @@ -2277,9 +2418,22 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, } for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + struct ieee80211_supported_band *sband = wiphy->bands[i]; + int j; + sdata->rc_rateidx_mask[i] = mask->control[i].legacy; memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].mcs, sizeof(mask->control[i].mcs)); + + sdata->rc_has_mcs_mask[i] = false; + if (!sband) + continue; + + for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) + if (~sdata->rc_rateidx_mcs_mask[i][j]) { + sdata->rc_has_mcs_mask[i] = true; + break; + } } return 0; @@ -2289,7 +2443,8 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, unsigned int duration, u64 *cookie, - struct sk_buff *txskb) + struct sk_buff *txskb, + enum ieee80211_roc_type type) { struct ieee80211_roc_work *roc, *tmp; bool queued = false; @@ -2308,13 +2463,15 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, roc->duration = duration; roc->req_duration = duration; roc->frame = txskb; + roc->type = type; roc->mgmt_tx_cookie = (unsigned long)txskb; roc->sdata = sdata; 
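The ROC (remain-on-channel) changes above add a type field to each request; as the next hunk shows, a new request that is coalesced into a pending one contributes the maximum of the two durations and the higher-priority type. A minimal user-space sketch of that merging rule; struct roc_work and the ROC_TYPE_* names here are stand-ins for the kernel's struct ieee80211_roc_work and IEEE80211_ROC_TYPE_*, and only their ordering matters:

#include <stdio.h>

/* Stand-in ROC types; a larger value means higher priority,
 * matching the tmp->type = max(tmp->type, roc->type) rule below. */
enum roc_type { ROC_TYPE_NORMAL = 0, ROC_TYPE_MGMT_TX = 1 };

struct roc_work {
	unsigned int duration;	/* ms */
	enum roc_type type;
};

/* Merge a new, not-yet-started request into a pending one: the
 * pending request absorbs it as a dependent, keeping the longer
 * duration and the higher-priority type. */
static void roc_coalesce(struct roc_work *pending, const struct roc_work *req)
{
	if (req->duration > pending->duration)
		pending->duration = req->duration;
	if (req->type > pending->type)
		pending->type = req->type;
}

int main(void)
{
	struct roc_work pending = { .duration = 100, .type = ROC_TYPE_NORMAL };
	struct roc_work mgmt_tx = { .duration = 30, .type = ROC_TYPE_MGMT_TX };

	roc_coalesce(&pending, &mgmt_tx);
	/* Prints "duration=100 type=1": long enough for both requests,
	 * scheduled with the more urgent type. */
	printf("duration=%u type=%d\n", pending.duration, pending.type);
	return 0;
}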
INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); /* if there's one pending or we're scanning, queue this one */ - if (!list_empty(&local->roc_list) || local->scanning) + if (!list_empty(&local->roc_list) || + local->scanning || local->radar_detect_enabled) goto out_check_combine; /* if not HW assist, just queue & schedule work */ @@ -2337,7 +2494,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (!duration) duration = 10; - ret = drv_remain_on_channel(local, sdata, channel, duration); + ret = drv_remain_on_channel(local, sdata, channel, duration, type); if (ret) { kfree(roc); return ret; @@ -2356,10 +2513,13 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, * * If it hasn't started yet, just increase the duration * and add the new one to the list of dependents. + * If the type of the new ROC has higher priority, modify the + * type of the previous one to match that of the new one. */ if (!tmp->started) { list_add_tail(&roc->list, &tmp->dependents); tmp->duration = max(tmp->duration, roc->duration); + tmp->type = max(tmp->type, roc->type); queued = true; break; } @@ -2371,16 +2531,18 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, /* * In the offloaded ROC case, if it hasn't begun, add * this new one to the dependent list to be handled - * when the the master one begins. If it has begun, + * when the master one begins. If it has begun, * check that there's still a minimum time left and * if so, start this one, transmitting the frame, but - * add it to the list directly after this one with a + * add it to the list directly after this one with * a reduced time so we'll ask the driver to execute * it right after finishing the previous one, in the * hope that it'll also be executed right afterwards, * effectively extending the old one. * If there's no minimum time left, just add it to the * normal list. + * TODO: the ROC type is ignored here, assuming that it + * is better to immediately use the current ROC. 
*/ if (!tmp->hw_begun) { list_add_tail(&roc->list, &tmp->dependents); @@ -2474,7 +2636,8 @@ static int ieee80211_remain_on_channel(struct wiphy *wiphy, mutex_lock(&local->mtx); ret = ieee80211_start_roc_work(local, sdata, chan, - duration, cookie, NULL); + duration, cookie, NULL, + IEEE80211_ROC_TYPE_NORMAL); mutex_unlock(&local->mtx); return ret; @@ -2499,7 +2662,7 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, list_del(&dep->list); mutex_unlock(&local->mtx); - ieee80211_roc_notify_destroy(dep); + ieee80211_roc_notify_destroy(dep, true); return 0; } @@ -2539,7 +2702,7 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, ieee80211_start_next_roc(local); mutex_unlock(&local->mtx); - ieee80211_roc_notify_destroy(found); + ieee80211_roc_notify_destroy(found, true); } else { /* work may be pending so use it all the time */ found->abort = true; @@ -2549,6 +2712,8 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, /* work will clean up etc */ flush_delayed_work(&found->work); + WARN_ON(!found->to_be_freed); + kfree(found); } return 0; @@ -2564,6 +2729,37 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, return ieee80211_cancel_roc(local, cookie, false); } +static int ieee80211_start_radar_detection(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + unsigned long timeout; + int err; + + if (!list_empty(&local->roc_list) || local->scanning) + return -EBUSY; + + /* whatever, but channel contexts should not complain about that one */ + sdata->smps_mode = IEEE80211_SMPS_OFF; + sdata->needed_rx_chains = local->rx_chains; + sdata->radar_required = true; + + mutex_lock(&local->iflist_mtx); + err = ieee80211_vif_use_channel(sdata, chandef, + IEEE80211_CHANCTX_SHARED); + mutex_unlock(&local->iflist_mtx); + if (err) + return err; + + timeout = msecs_to_jiffies(IEEE80211_DFS_MIN_CAC_TIME_MS); + ieee80211_queue_delayed_work(&sdata->local->hw, + &sdata->dfs_cac_timer_work, timeout); + + return 0; +} + static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, bool offchan, unsigned int wait, const u8 *buf, size_t len, @@ -2668,14 +2864,16 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, goto out_unlock; } - IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN; + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | + IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; /* This will handle all kinds of coalescing and immediate TX */ ret = ieee80211_start_roc_work(local, sdata, chan, - wait, cookie, skb); + wait, cookie, skb, + IEEE80211_ROC_TYPE_MGMT_TX); if (ret) kfree_skb(skb); out_unlock: @@ -3170,6 +3368,7 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_chanctx_conf *chanctx_conf; int ret = -ENODATA; @@ -3178,6 +3377,14 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy, if (chanctx_conf) { *chandef = chanctx_conf->def; ret = 0; + } else if (local->open_count > 0 && + local->open_count == local->monitors && + sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (local->use_chanctx) + *chandef = 
local->monitor_chandef; + else + *chandef = local->_oper_chandef; + ret = 0; } rcu_read_unlock(); @@ -3268,4 +3475,5 @@ struct cfg80211_ops mac80211_config_ops = { .get_et_stats = ieee80211_get_et_stats, .get_et_strings = ieee80211_get_et_strings, .get_channel = ieee80211_cfg_get_channel, + .start_radar_detection = ieee80211_start_radar_detection, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 80e55527504b..03e8d2e3270e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -9,7 +9,7 @@ #include "ieee80211_i.h" #include "driver-ops.h" -static void ieee80211_change_chandef(struct ieee80211_local *local, +static void ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *chandef) { @@ -22,7 +22,7 @@ static void ieee80211_change_chandef(struct ieee80211_local *local, drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_WIDTH); if (!local->use_chanctx) { - local->_oper_channel_type = cfg80211_get_chandef_type(chandef); + local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } } @@ -49,7 +49,7 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (!compat) continue; - ieee80211_change_chandef(local, ctx, compat); + ieee80211_change_chanctx(local, ctx, compat); return ctx; } @@ -57,12 +57,29 @@ ieee80211_find_chanctx(struct ieee80211_local *local, return NULL; } +static bool ieee80211_is_radar_required(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (sdata->radar_required) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; +} + static struct ieee80211_chanctx * ieee80211_new_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; + u32 changed; int err; lockdep_assert_held(&local->chanctx_mtx); @@ -75,34 +92,61 @@ ieee80211_new_chanctx(struct ieee80211_local *local, ctx->conf.rx_chains_static = 1; ctx->conf.rx_chains_dynamic = 1; ctx->mode = mode; + ctx->conf.radar_enabled = ieee80211_is_radar_required(local); + if (!local->use_chanctx) + local->hw.conf.radar_enabled = ctx->conf.radar_enabled; + + /* acquire mutex to prevent idle from changing */ + mutex_lock(&local->mtx); + /* turn idle off *before* setting channel -- some drivers need that */ + changed = ieee80211_idle_off(local); + if (changed) + ieee80211_hw_config(local, changed); if (!local->use_chanctx) { - local->_oper_channel_type = - cfg80211_get_chandef_type(chandef); - local->_oper_channel = chandef->chan; + local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } else { err = drv_add_chanctx(local, ctx); if (err) { kfree(ctx); - return ERR_PTR(err); + ctx = ERR_PTR(err); + + ieee80211_recalc_idle(local); + goto out; } } + /* and keep the mutex held until the new chanctx is on the list */ list_add_rcu(&ctx->list, &local->chanctx_list); + out: + mutex_unlock(&local->mtx); + return ctx; } static void ieee80211_free_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { + bool check_single_channel = false; lockdep_assert_held(&local->chanctx_mtx); WARN_ON_ONCE(ctx->refcount != 0); if (!local->use_chanctx) { - local->_oper_channel_type = NL80211_CHAN_NO_HT; + struct cfg80211_chan_def *chandef = &local->_oper_chandef; + chandef->width = NL80211_CHAN_WIDTH_20_NOHT; + chandef->center_freq1 = chandef->chan->center_freq; + chandef->center_freq2 = 0; + + /* NOTE: Disabling radar 
is only valid here for + * single channel context. To be sure, check it ... + */ + if (local->hw.conf.radar_enabled) + check_single_channel = true; + local->hw.conf.radar_enabled = false; + ieee80211_hw_config(local, 0); } else { drv_remove_chanctx(local, ctx); @@ -110,6 +154,13 @@ static void ieee80211_free_chanctx(struct ieee80211_local *local, list_del_rcu(&ctx->list); kfree_rcu(ctx, rcu_head); + + /* throw a warning if this wasn't the only channel context. */ + WARN_ON(check_single_channel && !list_empty(&local->chanctx_list)); + + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); } static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata, @@ -128,6 +179,11 @@ static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata, ctx->refcount++; ieee80211_recalc_txpower(sdata); + sdata->vif.bss_conf.idle = false; + + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_MONITOR) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); return 0; } @@ -162,7 +218,7 @@ static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, if (WARN_ON_ONCE(!compat)) return; - ieee80211_change_chandef(local, ctx, compat); + ieee80211_change_chanctx(local, ctx, compat); } static void ieee80211_unassign_vif_chanctx(struct ieee80211_sub_if_data *sdata, @@ -175,11 +231,18 @@ static void ieee80211_unassign_vif_chanctx(struct ieee80211_sub_if_data *sdata, ctx->refcount--; rcu_assign_pointer(sdata->vif.chanctx_conf, NULL); + sdata->vif.bss_conf.idle = true; + + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_MONITOR) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + drv_unassign_vif_chanctx(local, sdata, ctx); if (ctx->refcount > 0) { ieee80211_recalc_chanctx_chantype(sdata->local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); + ieee80211_recalc_radar_chanctx(local, ctx); } } @@ -198,20 +261,34 @@ static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) ctx = container_of(conf, struct ieee80211_chanctx, conf); - if (sdata->vif.type == NL80211_IFTYPE_AP) { - struct ieee80211_sub_if_data *vlan; - - /* for the VLAN list */ - ASSERT_RTNL(); - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) - rcu_assign_pointer(vlan->vif.chanctx_conf, NULL); - } - ieee80211_unassign_vif_chanctx(sdata, ctx); if (ctx->refcount == 0) ieee80211_free_chanctx(local, ctx); } +void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *chanctx) +{ + bool radar_enabled; + + lockdep_assert_held(&local->chanctx_mtx); + + radar_enabled = ieee80211_is_radar_required(local); + + if (radar_enabled == chanctx->conf.radar_enabled) + return; + + chanctx->conf.radar_enabled = radar_enabled; + local->radar_detect_enabled = chanctx->conf.radar_enabled; + + if (!local->use_chanctx) { + local->hw.conf.radar_enabled = chanctx->conf.radar_enabled; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); + } + + drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR); +} + void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { @@ -326,16 +403,57 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, goto out; } - if (sdata->vif.type == NL80211_IFTYPE_AP) { - struct ieee80211_sub_if_data *vlan; + ieee80211_recalc_smps_chanctx(local, ctx); + ieee80211_recalc_radar_chanctx(local, ctx); + out: + mutex_unlock(&local->chanctx_mtx); + return ret; 
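The new ieee80211_recalc_radar_chanctx() above derives the channel context's radar_enabled flag from whether any interface still requires radar detection, and only notifies the driver when the aggregate actually changes. A self-contained model of that aggregation, simplified to one context and a plain array instead of the RCU-protected interface list:

#include <stdbool.h>
#include <stdio.h>

struct vif { bool radar_required; };
struct chanctx { bool radar_enabled; };

/* Mirrors ieee80211_is_radar_required(): true if any interface on
 * the shared channel still needs radar detection. */
static bool any_radar_required(const struct vif *vifs, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (vifs[i].radar_required)
			return true;
	return false;
}

/* Mirrors the recalc: act only when the aggregated flag flips,
 * then push the new state to the driver (printf stands in here). */
static void recalc_radar(struct chanctx *ctx, const struct vif *vifs, int n)
{
	bool radar = any_radar_required(vifs, n);

	if (radar == ctx->radar_enabled)
		return;
	ctx->radar_enabled = radar;
	printf("chanctx radar_enabled -> %d\n", radar);
}

int main(void)
{
	struct vif vifs[2] = { { .radar_required = true },
			       { .radar_required = false } };
	struct chanctx ctx = { .radar_enabled = false };

	recalc_radar(&ctx, vifs, 2);	/* flips on: one vif needs it */
	vifs[0].radar_required = false;
	recalc_radar(&ctx, vifs, 2);	/* flips off: nobody needs it */
	return 0;
}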
+} - /* for the VLAN list */ - ASSERT_RTNL(); - list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) - rcu_assign_pointer(vlan->vif.chanctx_conf, &ctx->conf); +int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + u32 *changed) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; + int ret; + + if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, + IEEE80211_CHAN_DISABLED)) + return -EINVAL; + + mutex_lock(&local->chanctx_mtx); + if (cfg80211_chandef_identical(chandef, &sdata->vif.bss_conf.chandef)) { + ret = 0; + goto out; } - ieee80211_recalc_smps_chanctx(local, ctx); + if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT) { + ret = -EINVAL; + goto out; + } + + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) { + ret = -EINVAL; + goto out; + } + + ctx = container_of(conf, struct ieee80211_chanctx, conf); + if (!cfg80211_chandef_compatible(&conf->def, chandef)) { + ret = -EINVAL; + goto out; + } + + sdata->vif.bss_conf.chandef = *chandef; + + ieee80211_recalc_chanctx_chantype(local, ctx); + + *changed |= BSS_CHANGED_BANDWIDTH; + ret = 0; out: mutex_unlock(&local->chanctx_mtx); return ret; @@ -369,6 +487,40 @@ void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->chanctx_mtx); } +void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, + bool clear) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *vlan; + struct ieee80211_chanctx_conf *conf; + + ASSERT_RTNL(); + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP)) + return; + + mutex_lock(&local->chanctx_mtx); + + /* + * Check that conf exists, even when clearing this function + * must be called with the AP's channel context still there + * as it would otherwise cause VLANs to have an invalid + * channel context pointer for a while, possibly pointing + * to a channel context that has already been freed. + */ + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + WARN_ON(!conf); + + if (clear) + conf = NULL; + + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + rcu_assign_pointer(vlan->vif.chanctx_conf, conf); + + mutex_unlock(&local->chanctx_mtx); +} + void ieee80211_iter_chan_contexts_atomic( struct ieee80211_hw *hw, void (*iter)(struct ieee80211_hw *hw, @@ -381,7 +533,8 @@ void ieee80211_iter_chan_contexts_atomic( rcu_read_lock(); list_for_each_entry_rcu(ctx, &local->chanctx_list, list) - iter(hw, &ctx->conf, iter_data); + if (ctx->driver_present) + iter(hw, &ctx->conf, iter_data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_atomic); diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h index 8f383a576016..4ccc5ed6237d 100644 --- a/net/mac80211/debug.h +++ b/net/mac80211/debug.h @@ -44,6 +44,12 @@ #define MAC80211_MESH_SYNC_DEBUG 0 #endif +#ifdef CONFIG_MAC80211_MESH_PS_DEBUG +#define MAC80211_MESH_PS_DEBUG 1 +#else +#define MAC80211_MESH_PS_DEBUG 0 +#endif + #ifdef CONFIG_MAC80211_TDLS_DEBUG #define MAC80211_TDLS_DEBUG 1 #else @@ -151,6 +157,10 @@ do { \ _sdata_dbg(MAC80211_MESH_SYNC_DEBUG, \ sdata, fmt, ##__VA_ARGS__) +#define mps_dbg(sdata, fmt, ...) \ + _sdata_dbg(MAC80211_MESH_PS_DEBUG, \ + sdata, fmt, ##__VA_ARGS__) + #define tdls_dbg(sdata, fmt, ...) 
\ _sdata_dbg(MAC80211_TDLS_DEBUG, \ sdata, fmt, ##__VA_ARGS__) diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 466f4b45dd94..b0e32d628114 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -121,8 +121,8 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, sf += snprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n"); if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) sf += snprintf(buf + sf, mxln - sf, "SIGNAL_DBM\n"); - if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) - sf += snprintf(buf + sf, mxln - sf, "NEED_DTIM_PERIOD\n"); + if (local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC) + sf += snprintf(buf + sf, mxln - sf, "NEED_DTIM_BEFORE_ASSOC\n"); if (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT) sf += snprintf(buf + sf, mxln - sf, "SPECTRUM_MGMT\n"); if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) @@ -151,8 +151,6 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, sf += snprintf(buf + sf, mxln - sf, "AP_LINK_PS\n"); if (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW) sf += snprintf(buf + sf, mxln - sf, "TX_AMPDU_SETUP_IN_HW\n"); - if (local->hw.flags & IEEE80211_HW_SCAN_WHILE_IDLE) - sf += snprintf(buf + sf, mxln - sf, "SCAN_WHILE_IDLE\n"); rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index c3a3082b72e5..1521cabad3d6 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -295,7 +295,7 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) char buf[50]; struct ieee80211_key *key; - if (!sdata->debugfs.dir) + if (!sdata->vif.debugfs_dir) return; lockdep_assert_held(&sdata->local->key_mtx); @@ -311,7 +311,7 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_unicast_key = debugfs_create_symlink("default_unicast_key", - sdata->debugfs.dir, buf); + sdata->vif.debugfs_dir, buf); } if (sdata->debugfs.default_multicast_key) { @@ -325,7 +325,7 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_multicast_key = debugfs_create_symlink("default_multicast_key", - sdata->debugfs.dir, buf); + sdata->vif.debugfs_dir, buf); } } @@ -334,7 +334,7 @@ void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata) char buf[50]; struct ieee80211_key *key; - if (!sdata->debugfs.dir) + if (!sdata->vif.debugfs_dir) return; key = key_mtx_dereference(sdata->local, @@ -343,7 +343,7 @@ void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata) sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_mgmt_key = debugfs_create_symlink("default_mgmt_key", - sdata->debugfs.dir, buf); + sdata->vif.debugfs_dir, buf); } else ieee80211_debugfs_key_remove_mgmt_default(sdata); } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index cbde5cc49a40..14abcf44f974 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -124,6 +124,15 @@ static ssize_t ieee80211_if_fmt_##name( \ return scnprintf(buf, buflen, "%d\n", sdata->field / 16); \ } +#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field) \ +static ssize_t ieee80211_if_fmt_##name( \ + const struct ieee80211_sub_if_data *sdata, \ + char *buf, int buflen) \ +{ \ + return scnprintf(buf, buflen, "%d\n", \ + jiffies_to_msecs(sdata->field)); \ +} + 
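The IEEE80211_IF_FMT_JIFFIES_TO_MS helper just added generates a debugfs formatter that converts a jiffies-valued field to milliseconds before printing; it is used below for the new beacon_timeout file. A rough user-space expansion of what the macro produces for that field; the flattened struct, the HZ value, and the jiffies_to_msecs() approximation (j * 1000 / HZ) are assumptions for illustration only:

#include <stdio.h>

#define HZ 250	/* assumed tick rate for this demo */

static unsigned int jiffies_to_msecs(unsigned long j)
{
	return (unsigned int)(j * 1000 / HZ);	/* simplified conversion */
}

/* Roughly what IEEE80211_IF_FMT_JIFFIES_TO_MS(beacon_timeout,
 * u.mgd.beacon_timeout) expands to: a formatter that prints the
 * jiffies field in milliseconds (kernel uses scnprintf). */
struct sub_if_data { unsigned long beacon_timeout; };

static int ieee80211_if_fmt_beacon_timeout(const struct sub_if_data *sdata,
					   char *buf, int buflen)
{
	return snprintf(buf, buflen, "%d\n",
			jiffies_to_msecs(sdata->beacon_timeout));
}

int main(void)
{
	struct sub_if_data sdata = { .beacon_timeout = 500 };	/* ticks */
	char buf[32];

	ieee80211_if_fmt_beacon_timeout(&sdata, buf, sizeof(buf));
	printf("%s", buf);	/* 2000 ms at HZ=250 */
	return 0;
}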
#define __IEEE80211_IF_FILE(name, _write) \ static ssize_t ieee80211_if_read_##name(struct file *file, \ char __user *userbuf, \ @@ -197,6 +206,7 @@ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); IEEE80211_IF_FILE(last_beacon, u.mgd.last_beacon_signal, DEC); IEEE80211_IF_FILE(ave_beacon, u.mgd.ave_beacon_signal, DEC_DIV_16); +IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS); static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode) @@ -515,10 +525,13 @@ IEEE80211_IF_FILE(dot11MeshHWMProotInterval, u.mesh.mshcfg.dot11MeshHWMProotInterval, DEC); IEEE80211_IF_FILE(dot11MeshHWMPconfirmationInterval, u.mesh.mshcfg.dot11MeshHWMPconfirmationInterval, DEC); +IEEE80211_IF_FILE(power_mode, u.mesh.mshcfg.power_mode, DEC); +IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, + u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ - debugfs_create_file(#name, mode, sdata->debugfs.dir, \ + debugfs_create_file(#name, mode, sdata->vif.debugfs_dir, \ sdata, &name##_ops); #define DEBUGFS_ADD(name) DEBUGFS_ADD_MODE(name, 0400) @@ -539,6 +552,7 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(aid); DEBUGFS_ADD(last_beacon); DEBUGFS_ADD(ave_beacon); + DEBUGFS_ADD(beacon_timeout); DEBUGFS_ADD_MODE(smps, 0600); DEBUGFS_ADD_MODE(tkip_mic_test, 0200); DEBUGFS_ADD_MODE(uapsd_queues, 0600); @@ -574,7 +588,7 @@ static void add_mesh_files(struct ieee80211_sub_if_data *sdata) static void add_mesh_stats(struct ieee80211_sub_if_data *sdata) { struct dentry *dir = debugfs_create_dir("mesh_stats", - sdata->debugfs.dir); + sdata->vif.debugfs_dir); #define MESHSTATS_ADD(name)\ debugfs_create_file(#name, 0400, dir, sdata, &name##_ops); @@ -591,7 +605,7 @@ static void add_mesh_stats(struct ieee80211_sub_if_data *sdata) static void add_mesh_config(struct ieee80211_sub_if_data *sdata) { struct dentry *dir = debugfs_create_dir("mesh_config", - sdata->debugfs.dir); + sdata->vif.debugfs_dir); #define MESHPARAMS_ADD(name) \ debugfs_create_file(#name, 0600, dir, sdata, &name##_ops); @@ -620,13 +634,15 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(dot11MeshHWMPactivePathToRootTimeout); MESHPARAMS_ADD(dot11MeshHWMProotInterval); MESHPARAMS_ADD(dot11MeshHWMPconfirmationInterval); + MESHPARAMS_ADD(power_mode); + MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); #undef MESHPARAMS_ADD } #endif static void add_files(struct ieee80211_sub_if_data *sdata) { - if (!sdata->debugfs.dir) + if (!sdata->vif.debugfs_dir) return; DEBUGFS_ADD(flags); @@ -668,21 +684,21 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata) char buf[10+IFNAMSIZ]; sprintf(buf, "netdev:%s", sdata->name); - sdata->debugfs.dir = debugfs_create_dir(buf, + sdata->vif.debugfs_dir = debugfs_create_dir(buf, sdata->local->hw.wiphy->debugfsdir); - if (sdata->debugfs.dir) + if (sdata->vif.debugfs_dir) sdata->debugfs.subdir_stations = debugfs_create_dir("stations", - sdata->debugfs.dir); + sdata->vif.debugfs_dir); add_files(sdata); } void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata) { - if (!sdata->debugfs.dir) + if (!sdata->vif.debugfs_dir) return; - debugfs_remove_recursive(sdata->debugfs.dir); - sdata->debugfs.dir = NULL; + debugfs_remove_recursive(sdata->vif.debugfs_dir); + sdata->vif.debugfs_dir = NULL; } void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) @@ -690,7 +706,7 @@ void 
ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) struct dentry *dir; char buf[10 + IFNAMSIZ]; - dir = sdata->debugfs.dir; + dir = sdata->vif.debugfs_dir; if (!dir) return; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 6fb1168b9f16..44e201d60a13 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -54,6 +54,7 @@ STA_FILE(aid, sta.aid, D); STA_FILE(dev, sdata->name, S); STA_FILE(last_signal, last_signal, D); STA_FILE(last_ack_signal, last_ack_signal, D); +STA_FILE(beacon_loss_count, beacon_loss_count, D); static ssize_t sta_flags_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -65,7 +66,7 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, test_sta_flag(sta, WLAN_STA_##flg) ? #flg "\n" : "" int res = scnprintf(buf, sizeof(buf), - "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", TEST(AUTH), TEST(ASSOC), TEST(PS_STA), TEST(PS_DRIVER), TEST(AUTHORIZED), TEST(SHORT_PREAMBLE), @@ -74,7 +75,8 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, TEST(UAPSD), TEST(SP), TEST(TDLS_PEER), TEST(TDLS_PEER_AUTH), TEST(4ADDR_EVENT), TEST(INSERTED), TEST(RATE_CONTROL), - TEST(TOFFSET_KNOWN)); + TEST(TOFFSET_KNOWN), TEST(MPSP_OWNER), + TEST(MPSP_RECIPIENT)); #undef TEST return simple_read_from_buffer(userbuf, count, ppos, buf, res); } @@ -324,6 +326,36 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, } STA_OPS(ht_capa); +static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[128], *p = buf; + struct sta_info *sta = file->private_data; + struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap; + + p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n", + vhtc->vht_supported ? 
"" : "not "); + if (vhtc->vht_supported) { + p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.8x\n", vhtc->cap); + + p += scnprintf(p, sizeof(buf)+buf-p, "RX MCS: %.4x\n", + le16_to_cpu(vhtc->vht_mcs.rx_mcs_map)); + if (vhtc->vht_mcs.rx_highest) + p += scnprintf(p, sizeof(buf)+buf-p, + "MCS RX highest: %d Mbps\n", + le16_to_cpu(vhtc->vht_mcs.rx_highest)); + p += scnprintf(p, sizeof(buf)+buf-p, "TX MCS: %.4x\n", + le16_to_cpu(vhtc->vht_mcs.tx_mcs_map)); + if (vhtc->vht_mcs.tx_highest) + p += scnprintf(p, sizeof(buf)+buf-p, + "MCS TX highest: %d Mbps\n", + le16_to_cpu(vhtc->vht_mcs.tx_highest)); + } + + return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); +} +STA_OPS(vht_capa); + static ssize_t sta_current_tx_rate_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -403,7 +435,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(agg_status); DEBUGFS_ADD(dev); DEBUGFS_ADD(last_signal); + DEBUGFS_ADD(beacon_loss_count); DEBUGFS_ADD(ht_capa); + DEBUGFS_ADD(vht_capa); DEBUGFS_ADD(last_ack_signal); DEBUGFS_ADD(current_tx_rate); DEBUGFS_ADD(last_rx_rate); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 698dc7e6f309..169664c122e2 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -207,6 +207,17 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, { might_sleep(); + if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON | + BSS_CHANGED_BEACON_ENABLED) && + sdata->vif.type != NL80211_IFTYPE_AP && + sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT)) + return; + + if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || + sdata->vif.type == NL80211_IFTYPE_MONITOR)) + return; + check_sdata_in_driver(sdata); trace_drv_bss_info_changed(local, sdata, info, changed); @@ -230,6 +241,22 @@ static inline u64 drv_prepare_multicast(struct ieee80211_local *local, return ret; } +static inline void drv_set_multicast_list(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct netdev_hw_addr_list *mc_list) +{ + bool allmulti = sdata->flags & IEEE80211_SDATA_ALLMULTI; + + trace_drv_set_multicast_list(local, sdata, mc_list->count); + + check_sdata_in_driver(sdata); + + if (local->ops->set_multicast_list) + local->ops->set_multicast_list(&local->hw, &sdata->vif, + allmulti, mc_list); + trace_drv_return_void(local); +} + static inline void drv_configure_filter(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, @@ -561,7 +588,8 @@ static inline void drv_sta_rc_update(struct ieee80211_local *local, check_sdata_in_driver(sdata); WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && - sdata->vif.type != NL80211_IFTYPE_ADHOC); + (sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); trace_drv_sta_rc_update(local, sdata, sta, changed); if (local->ops->sta_rc_update) @@ -692,13 +720,14 @@ static inline void drv_rfkill_poll(struct ieee80211_local *local) local->ops->rfkill_poll(&local->hw); } -static inline void drv_flush(struct ieee80211_local *local, bool drop) +static inline void drv_flush(struct ieee80211_local *local, + u32 queues, bool drop) { might_sleep(); - trace_drv_flush(local, drop); + trace_drv_flush(local, queues, drop); if (local->ops->flush) - local->ops->flush(&local->hw, drop); + local->ops->flush(&local->hw, queues, drop); trace_drv_return_void(local); } @@ -738,15 +767,16 @@ static inline int drv_get_antenna(struct ieee80211_local *local, static 
inline int drv_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, - unsigned int duration) + unsigned int duration, + enum ieee80211_roc_type type) { int ret; might_sleep(); - trace_drv_remain_on_channel(local, sdata, chan, duration); + trace_drv_remain_on_channel(local, sdata, chan, duration, type); ret = local->ops->remain_on_channel(&local->hw, &sdata->vif, - chan, duration); + chan, duration, type); trace_drv_return_int(local, ret); return ret; @@ -837,11 +867,12 @@ static inline void drv_set_rekey_data(struct ieee80211_local *local, } static inline void drv_rssi_callback(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, const enum ieee80211_rssi_event event) { - trace_drv_rssi_callback(local, event); + trace_drv_rssi_callback(local, sdata, event); if (local->ops->rssi_callback) - local->ops->rssi_callback(&local->hw, event); + local->ops->rssi_callback(&local->hw, &sdata->vif, event); trace_drv_return_void(local); } @@ -913,6 +944,8 @@ static inline int drv_add_chanctx(struct ieee80211_local *local, if (local->ops->add_chanctx) ret = local->ops->add_chanctx(&local->hw, &ctx->conf); trace_drv_return_int(local, ret); + if (!ret) + ctx->driver_present = true; return ret; } @@ -924,6 +957,7 @@ static inline void drv_remove_chanctx(struct ieee80211_local *local, if (local->ops->remove_chanctx) local->ops->remove_chanctx(&local->hw, &ctx->conf); trace_drv_return_void(local); + ctx->driver_present = false; } static inline void drv_change_chanctx(struct ieee80211_local *local, @@ -931,8 +965,10 @@ static inline void drv_change_chanctx(struct ieee80211_local *local, u32 changed) { trace_drv_change_chanctx(local, ctx, changed); - if (local->ops->change_chanctx) + if (local->ops->change_chanctx) { + WARN_ON_ONCE(!ctx->driver_present); local->ops->change_chanctx(&local->hw, &ctx->conf, changed); + } trace_drv_return_void(local); } @@ -945,10 +981,12 @@ static inline int drv_assign_vif_chanctx(struct ieee80211_local *local, check_sdata_in_driver(sdata); trace_drv_assign_vif_chanctx(local, sdata, ctx); - if (local->ops->assign_vif_chanctx) + if (local->ops->assign_vif_chanctx) { + WARN_ON_ONCE(!ctx->driver_present); ret = local->ops->assign_vif_chanctx(&local->hw, &sdata->vif, &ctx->conf); + } trace_drv_return_int(local, ret); return ret; @@ -961,10 +999,12 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, check_sdata_in_driver(sdata); trace_drv_unassign_vif_chanctx(local, sdata, ctx); - if (local->ops->unassign_vif_chanctx) + if (local->ops->unassign_vif_chanctx) { + WARN_ON_ONCE(!ctx->driver_present); local->ops->unassign_vif_chanctx(&local->hw, &sdata->vif, &ctx->conf); + } trace_drv_return_void(local); } @@ -1003,4 +1043,32 @@ static inline void drv_restart_complete(struct ieee80211_local *local) trace_drv_return_void(local); } +static inline void +drv_set_default_unicast_key(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + int key_idx) +{ + check_sdata_in_driver(sdata); + + WARN_ON_ONCE(key_idx < -1 || key_idx > 3); + + trace_drv_set_default_unicast_key(local, sdata, key_idx); + if (local->ops->set_default_unicast_key) + local->ops->set_default_unicast_key(&local->hw, &sdata->vif, + key_idx); + trace_drv_return_void(local); +} + +#if IS_ENABLED(CONFIG_IPV6) +static inline void drv_ipv6_addr_change(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct inet6_dev *idev) +{ + trace_drv_ipv6_addr_change(local, sdata); + if 
(local->ops->ipv6_addr_change) + local->ops->ipv6_addr_change(&local->hw, &sdata->vif, idev); + trace_drv_return_void(local); +} +#endif + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index a71d891794a4..af8cee06e4f3 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -37,12 +37,8 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, u8 *smask = (u8 *)(&sdata->u.mgd.ht_capa_mask.mcs.rx_mask); int i; - if (sdata->vif.type != NL80211_IFTYPE_STATION) { - /* AP interfaces call this code when adding new stations, - * so just silently ignore non station interfaces. - */ + if (!ht_cap->ht_supported) return; - } /* NOTE: If you add more over-rides here, update register_hw * ht_capa_mod_msk logic in main.c as well. @@ -62,6 +58,9 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, __check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_SUP_WIDTH_20_40); __check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_SGI_40); + /* Allow user to disable SGI-20 (SGI-40 is handled above) */ + __check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_SGI_20); + /* Allow user to disable the max-AMSDU bit. */ __check_htcap_disable(sdata, ht_cap, IEEE80211_HT_CAP_MAX_AMSDU); @@ -86,22 +85,36 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, } -void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, +bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, - struct ieee80211_ht_cap *ht_cap_ie, - struct ieee80211_sta_ht_cap *ht_cap) + const struct ieee80211_ht_cap *ht_cap_ie, + struct sta_info *sta) { + struct ieee80211_sta_ht_cap ht_cap, own_cap; u8 ampdu_info, tx_mcs_set_cap; int i, max_tx_streams; + bool changed; + enum ieee80211_sta_rx_bandwidth bw; + enum ieee80211_smps_mode smps_mode; - BUG_ON(!ht_cap); - - memset(ht_cap, 0, sizeof(*ht_cap)); + memset(&ht_cap, 0, sizeof(ht_cap)); if (!ht_cap_ie || !sband->ht_cap.ht_supported) - return; + goto apply; - ht_cap->ht_supported = true; + ht_cap.ht_supported = true; + + own_cap = sband->ht_cap; + + /* + * If user has specified capability over-rides, take care + * of that if the station we're setting up is the AP that + * we advertised a restricted capability set to. Override + * our own capabilities and then use those below. + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + ieee80211_apply_htcap_overrides(sdata, &own_cap); /* * The bits listed in this expression should be @@ -109,38 +122,38 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, * advertises more then we can't use those thus * we mask them out. */ - ht_cap->cap = le16_to_cpu(ht_cap_ie->cap_info) & - (sband->ht_cap.cap | - ~(IEEE80211_HT_CAP_LDPC_CODING | - IEEE80211_HT_CAP_SUP_WIDTH_20_40 | - IEEE80211_HT_CAP_GRN_FLD | - IEEE80211_HT_CAP_SGI_20 | - IEEE80211_HT_CAP_SGI_40 | - IEEE80211_HT_CAP_DSSSCCK40)); + ht_cap.cap = le16_to_cpu(ht_cap_ie->cap_info) & + (own_cap.cap | ~(IEEE80211_HT_CAP_LDPC_CODING | + IEEE80211_HT_CAP_SUP_WIDTH_20_40 | + IEEE80211_HT_CAP_GRN_FLD | + IEEE80211_HT_CAP_SGI_20 | + IEEE80211_HT_CAP_SGI_40 | + IEEE80211_HT_CAP_DSSSCCK40)); + /* * The STBC bits are asymmetric -- if we don't have * TX then mask out the peer's RX and vice versa. 
*/ - if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_TX_STBC)) - ht_cap->cap &= ~IEEE80211_HT_CAP_RX_STBC; - if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_RX_STBC)) - ht_cap->cap &= ~IEEE80211_HT_CAP_TX_STBC; + if (!(own_cap.cap & IEEE80211_HT_CAP_TX_STBC)) + ht_cap.cap &= ~IEEE80211_HT_CAP_RX_STBC; + if (!(own_cap.cap & IEEE80211_HT_CAP_RX_STBC)) + ht_cap.cap &= ~IEEE80211_HT_CAP_TX_STBC; ampdu_info = ht_cap_ie->ampdu_params_info; - ht_cap->ampdu_factor = + ht_cap.ampdu_factor = ampdu_info & IEEE80211_HT_AMPDU_PARM_FACTOR; - ht_cap->ampdu_density = + ht_cap.ampdu_density = (ampdu_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> 2; /* own MCS TX capabilities */ - tx_mcs_set_cap = sband->ht_cap.mcs.tx_params; + tx_mcs_set_cap = own_cap.mcs.tx_params; /* Copy peer MCS TX capabilities, the driver might need them. */ - ht_cap->mcs.tx_params = ht_cap_ie->mcs.tx_params; + ht_cap.mcs.tx_params = ht_cap_ie->mcs.tx_params; /* can we TX with MCS rates? */ if (!(tx_mcs_set_cap & IEEE80211_HT_MCS_TX_DEFINED)) - return; + goto apply; /* Counting from 0, therefore +1 */ if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_RX_DIFF) @@ -158,37 +171,84 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, * - remainder are multiple spatial streams using unequal modulation */ for (i = 0; i < max_tx_streams; i++) - ht_cap->mcs.rx_mask[i] = - sband->ht_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; + ht_cap.mcs.rx_mask[i] = + own_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION) for (i = IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE; i < IEEE80211_HT_MCS_MASK_LEN; i++) - ht_cap->mcs.rx_mask[i] = - sband->ht_cap.mcs.rx_mask[i] & + ht_cap.mcs.rx_mask[i] = + own_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; /* handle MCS rate 32 too */ - if (sband->ht_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1) - ht_cap->mcs.rx_mask[32/8] |= 1; + if (own_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1) + ht_cap.mcs.rx_mask[32/8] |= 1; - /* - * If user has specified capability over-rides, take care - * of that here. - */ - ieee80211_apply_htcap_overrides(sdata, ht_cap); + apply: + changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap)); + + memcpy(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap)); + + switch (sdata->vif.bss_conf.chandef.width) { + default: + WARN_ON_ONCE(1); + /* fall through */ + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + bw = IEEE80211_STA_RX_BW_20; + break; + case NL80211_CHAN_WIDTH_40: + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + bw = ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; + break; + } + + if (bw != sta->sta.bandwidth) + changed = true; + sta->sta.bandwidth = bw; + + sta->cur_max_bandwidth = + ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? 
+ IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; + + switch ((ht_cap.cap & IEEE80211_HT_CAP_SM_PS) + >> IEEE80211_HT_CAP_SM_PS_SHIFT) { + case WLAN_HT_CAP_SM_PS_INVALID: + case WLAN_HT_CAP_SM_PS_STATIC: + smps_mode = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_CAP_SM_PS_DYNAMIC: + smps_mode = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_CAP_SM_PS_DISABLED: + smps_mode = IEEE80211_SMPS_OFF; + break; + } + + if (smps_mode != sta->sta.smps_mode) + changed = true; + sta->sta.smps_mode = smps_mode; + + return changed; } -void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx) +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, + enum ieee80211_agg_stop_reason reason) { int i; cancel_work_sync(&sta->ampdu_mlme.work); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR, tx); + __ieee80211_stop_tx_ba_session(sta, i, reason); __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_LEAVE_QBSS, tx); + WLAN_REASON_QSTA_LEAVE_QBSS, + reason != AGG_STOP_DESTROY_STA && + reason != AGG_STOP_PEER_REQUEST); } } @@ -245,8 +305,7 @@ void ieee80211_ba_session_work(struct work_struct *work) if (tid_tx && test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)) ___ieee80211_stop_tx_ba_session(sta, tid, - WLAN_BACK_INITIATOR, - true); + AGG_STOP_LOCAL_REQUEST); } mutex_unlock(&sta->ampdu_mlme.mtx); } @@ -314,8 +373,7 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0, true); else - __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, - true); + __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_PEER_REQUEST); } int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, @@ -387,6 +445,9 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF)) smps_mode = IEEE80211_SMPS_AUTOMATIC; + if (sdata->u.mgd.driver_smps_mode == smps_mode) + return; + sdata->u.mgd.driver_smps_mode = smps_mode; ieee80211_queue_work(&sdata->local->hw, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 6b7644e818d8..170f9a7fa319 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -44,7 +44,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; int rates, i; - struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos; struct ieee80211_supported_band *sband; @@ -52,31 +51,34 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, u32 bss_change; u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; struct cfg80211_chan_def chandef; + struct beacon_data *presp; + int frame_len; lockdep_assert_held(&ifibss->mtx); /* Reset own TSF to allow time synchronization work. 
*/ drv_reset_tsf(local, sdata); - skb = ifibss->skb; - RCU_INIT_POINTER(ifibss->presp, NULL); - synchronize_rcu(); - skb->data = skb->head; - skb->len = 0; - skb_reset_tail_pointer(skb); - skb_reserve(skb, sdata->local->hw.extra_tx_headroom); - if (!ether_addr_equal(ifibss->bssid, bssid)) - sta_info_flush(sdata->local, sdata); + sta_info_flush(sdata); /* if merging, indicate to driver that we leave the old IBSS */ if (sdata->vif.bss_conf.ibss_joined) { sdata->vif.bss_conf.ibss_joined = false; sdata->vif.bss_conf.ibss_creator = false; + sdata->vif.bss_conf.enable_beacon = false; netif_carrier_off(sdata->dev); - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IBSS); + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_IBSS | + BSS_CHANGED_BEACON_ENABLED); } + presp = rcu_dereference_protected(ifibss->presp, + lockdep_is_held(&ifibss->mtx)); + rcu_assign_pointer(ifibss->presp, NULL); + if (presp) + kfree_rcu(presp, rcu_head); + sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; cfg80211_chandef_create(&chandef, chan, ifibss->channel_type); @@ -98,19 +100,24 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[chan->band]; - /* build supported rates array */ - pos = supp_rates; - for (i = 0; i < sband->n_bitrates; i++) { - int rate = sband->bitrates[i].bitrate; - u8 basic = 0; - if (basic_rates & BIT(i)) - basic = 0x80; - *pos++ = basic | (u8) (rate / 5); - } - /* Build IBSS probe response */ - mgmt = (void *) skb_put(skb, 24 + sizeof(mgmt->u.beacon)); - memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon)); + frame_len = sizeof(struct ieee80211_hdr_3addr) + + 12 /* struct ieee80211_mgmt.u.beacon */ + + 2 + IEEE80211_MAX_SSID_LEN /* max SSID */ + + 2 + 8 /* max Supported Rates */ + + 3 /* max DS params */ + + 4 /* IBSS params */ + + 2 + (IEEE80211_MAX_SUPP_RATES - 8) + + 2 + sizeof(struct ieee80211_ht_cap) + + 2 + sizeof(struct ieee80211_ht_operation) + + ifibss->ie_len; + presp = kzalloc(sizeof(*presp) + frame_len, GFP_KERNEL); + if (!presp) + return; + + presp->head = (void *)(presp + 1); + + mgmt = (void *) presp->head; mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP); eth_broadcast_addr(mgmt->da); @@ -120,27 +127,30 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, mgmt->u.beacon.timestamp = cpu_to_le64(tsf); mgmt->u.beacon.capab_info = cpu_to_le16(capability); - pos = skb_put(skb, 2 + ifibss->ssid_len); + pos = (u8 *)mgmt + offsetof(struct ieee80211_mgmt, u.beacon.variable); + *pos++ = WLAN_EID_SSID; *pos++ = ifibss->ssid_len; memcpy(pos, ifibss->ssid, ifibss->ssid_len); + pos += ifibss->ssid_len; - rates = sband->n_bitrates; - if (rates > 8) - rates = 8; - pos = skb_put(skb, 2 + rates); + rates = min_t(int, 8, sband->n_bitrates); *pos++ = WLAN_EID_SUPP_RATES; *pos++ = rates; - memcpy(pos, supp_rates, rates); + for (i = 0; i < rates; i++) { + int rate = sband->bitrates[i].bitrate; + u8 basic = 0; + if (basic_rates & BIT(i)) + basic = 0x80; + *pos++ = basic | (u8) (rate / 5); + } if (sband->band == IEEE80211_BAND_2GHZ) { - pos = skb_put(skb, 2 + 1); *pos++ = WLAN_EID_DS_PARAMS; *pos++ = 1; *pos++ = ieee80211_frequency_to_channel(chan->center_freq); } - pos = skb_put(skb, 2 + 2); *pos++ = WLAN_EID_IBSS_PARAMS; *pos++ = 2; /* FIX: set ATIM window based on scan results */ @@ -148,23 +158,25 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, *pos++ = 0; if (sband->n_bitrates > 8) { - rates = sband->n_bitrates - 8; - pos = 
skb_put(skb, 2 + rates); *pos++ = WLAN_EID_EXT_SUPP_RATES; - *pos++ = rates; - memcpy(pos, &supp_rates[8], rates); + *pos++ = sband->n_bitrates - 8; + for (i = 8; i < sband->n_bitrates; i++) { + int rate = sband->bitrates[i].bitrate; + u8 basic = 0; + if (basic_rates & BIT(i)) + basic = 0x80; + *pos++ = basic | (u8) (rate / 5); + } } - if (ifibss->ie_len) - memcpy(skb_put(skb, ifibss->ie_len), - ifibss->ie, ifibss->ie_len); + if (ifibss->ie_len) { + memcpy(pos, ifibss->ie, ifibss->ie_len); + pos += ifibss->ie_len; + } /* add HT capability and information IEs */ if (chandef.width != NL80211_CHAN_WIDTH_20_NOHT && sband->ht_cap.ht_supported) { - pos = skb_put(skb, 4 + - sizeof(struct ieee80211_ht_cap) + - sizeof(struct ieee80211_ht_operation)); pos = ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap); /* @@ -177,7 +189,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } if (local->hw.queues >= IEEE80211_NUM_ACS) { - pos = skb_put(skb, 9); *pos++ = WLAN_EID_VENDOR_SPECIFIC; *pos++ = 7; /* len */ *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */ @@ -189,10 +200,17 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, *pos++ = 0; /* U-APSD no in use */ } - rcu_assign_pointer(ifibss->presp, skb); + presp->head_len = pos - presp->head; + if (WARN_ON(presp->head_len > frame_len)) + return; + + rcu_assign_pointer(ifibss->presp, presp); + sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.beacon_int = beacon_int; sdata->vif.bss_conf.basic_rates = basic_rates; + sdata->vif.bss_conf.ssid_len = ifibss->ssid_len; + memcpy(sdata->vif.bss_conf.ssid, ifibss->ssid, ifibss->ssid_len); bss_change = BSS_CHANGED_BEACON_INT; bss_change |= ieee80211_reset_erp_info(sdata); bss_change |= BSS_CHANGED_BSSID; @@ -201,6 +219,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, bss_change |= BSS_CHANGED_BASIC_RATES; bss_change |= BSS_CHANGED_HT; bss_change |= BSS_CHANGED_IBSS; + bss_change |= BSS_CHANGED_SSID; /* * In 5 GHz/802.11a, we can always use short slot time. 
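Note: the hunks above stop pre-building a supp_rates[] scratch array and instead write the Supported Rates / Extended Supported Rates elements straight into the new beacon_data buffer. The on-air encoding is unchanged: each octet is the rate in 500 kbps units (mac80211 bitrates are given in 100 kbps units, hence the divide by 5), with bit 7 set for rates in the basic rate set. A minimal stand-alone sketch of that encoding, with illustrative names (build_supp_rates_ie is not a mac80211 function):

#include <stdint.h>
#include <stddef.h>

#define WLAN_EID_SUPP_RATES 1

/* Encode up to eight rates as a Supported Rates element; bitrates[]
 * is in 100 kbps units, basic_rates is a bitmap over bitrates[]. */
static uint8_t *build_supp_rates_ie(uint8_t *pos, const int *bitrates,
				    size_t n_bitrates, uint32_t basic_rates)
{
	size_t i, rates = n_bitrates > 8 ? 8 : n_bitrates;

	*pos++ = WLAN_EID_SUPP_RATES;
	*pos++ = (uint8_t)rates;
	for (i = 0; i < rates; i++) {
		uint8_t basic = (basic_rates & (1U << i)) ? 0x80 : 0;
		*pos++ = basic | (uint8_t)(bitrates[i] / 5);
	}
	return pos;	/* caller continues writing the next element here */
}
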
@@ -226,8 +245,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); bss = cfg80211_inform_bss_frame(local->hw.wiphy, chan, - mgmt, skb->len, 0, GFP_KERNEL); - cfg80211_put_bss(bss); + mgmt, presp->head_len, 0, GFP_KERNEL); + cfg80211_put_bss(local->hw.wiphy, bss); netif_carrier_on(sdata->dev); cfg80211_ibss_joined(sdata->dev, ifibss->bssid, GFP_KERNEL); } @@ -241,6 +260,8 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, u32 basic_rates; int i, j; u16 beacon_int = cbss->beacon_interval; + const struct cfg80211_bss_ies *ies; + u64 tsf; lockdep_assert_held(&sdata->u.ibss.mtx); @@ -264,13 +285,17 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } } + rcu_read_lock(); + ies = rcu_dereference(cbss->ies); + tsf = ies->tsf; + rcu_read_unlock(); + __ieee80211_sta_join_ibss(sdata, cbss->bssid, beacon_int, cbss->channel, basic_rates, cbss->capability, - cbss->tsf, - false); + tsf, false); } static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta, @@ -301,7 +326,7 @@ static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta, "TX Auth SA=%pM DA=%pM BSSID=%pM (auth_transaction=1)\n", sdata->vif.addr, addr, sdata->u.ibss.bssid); ieee80211_send_auth(sdata, 1, WLAN_AUTH_OPEN, 0, NULL, 0, - addr, sdata->u.ibss.bssid, NULL, 0, 0); + addr, sdata->u.ibss.bssid, NULL, 0, 0, 0); } return sta; } @@ -421,15 +446,13 @@ static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, * has actually implemented this. */ ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, 0, NULL, 0, - mgmt->sa, sdata->u.ibss.bssid, NULL, 0, 0); + mgmt->sa, sdata->u.ibss.bssid, NULL, 0, 0, 0); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, - size_t len, + struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, - struct ieee802_11_elems *elems, - bool beacon) + struct ieee802_11_elems *elems) { struct ieee80211_local *local = sdata->local; int freq; @@ -443,7 +466,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; bool rates_updated = false; - if (elems->ds_params && elems->ds_params_len == 1) + if (elems->ds_params) freq = ieee80211_channel_to_frequency(elems->ds_params[0], band); else @@ -491,33 +514,26 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.channel_type != NL80211_CHAN_NO_HT) { /* we both use HT */ - struct ieee80211_sta_ht_cap sta_ht_cap_new; + struct ieee80211_ht_cap htcap_ie; struct cfg80211_chan_def chandef; ieee80211_ht_oper_to_chandef(channel, elems->ht_operation, &chandef); - ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - elems->ht_cap_elem, - &sta_ht_cap_new); + memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie)); /* * fall back to HT20 if we don't use or use * the other extension channel */ - if (chandef.width != NL80211_CHAN_WIDTH_40 || - cfg80211_get_chandef_type(&chandef) != + if (cfg80211_get_chandef_type(&chandef) != sdata->u.ibss.channel_type) - sta_ht_cap_new.cap &= - ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; - - if (memcmp(&sta->sta.ht_cap, &sta_ht_cap_new, - sizeof(sta_ht_cap_new))) { - memcpy(&sta->sta.ht_cap, &sta_ht_cap_new, - sizeof(sta_ht_cap_new)); - rates_updated = true; - } + htcap_ie.cap_info &= + cpu_to_le16(~IEEE80211_HT_CAP_SUP_WIDTH_20_40); + + 
rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap( + sdata, sband, &htcap_ie, sta); } if (sta && rates_updated) { @@ -530,14 +546,14 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, } bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, - channel, beacon); + channel); if (!bss) return; cbss = container_of((void *)bss, struct cfg80211_bss, priv); - /* was just updated in ieee80211_bss_info_update */ - beacon_timestamp = cbss->tsf; + /* same for beacon and probe response */ + beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp); /* check if we need to merge IBSS */ @@ -824,8 +840,7 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; int tx_last_beacon, len = req->len; struct sk_buff *skb; - struct ieee80211_mgmt *resp; - struct sk_buff *presp; + struct beacon_data *presp; u8 *pos, *end; lockdep_assert_held(&ifibss->mtx); @@ -866,51 +881,42 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, } /* Reply with ProbeResp */ - skb = skb_copy(presp, GFP_KERNEL); + skb = dev_alloc_skb(local->tx_headroom + presp->head_len); if (!skb) return; - resp = (struct ieee80211_mgmt *) skb->data; - memcpy(resp->da, mgmt->sa, ETH_ALEN); - ibss_dbg(sdata, "Sending ProbeResp to %pM\n", resp->da); + skb_reserve(skb, local->tx_headroom); + memcpy(skb_put(skb, presp->head_len), presp->head, presp->head_len); + + memcpy(((struct ieee80211_mgmt *) skb->data)->da, mgmt->sa, ETH_ALEN); + ibss_dbg(sdata, "Sending ProbeResp to %pM\n", mgmt->sa); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } -static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, - size_t len, - struct ieee80211_rx_status *rx_status) +static +void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status) { size_t baselen; struct ieee802_11_elems elems; + BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) != + offsetof(typeof(mgmt->u.beacon), variable)); + + /* + * either beacon or probe_resp but the variable field is at the + * same offset + */ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; if (baselen > len) return; ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, - &elems); + false, &elems); - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false); -} - -static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, - size_t len, - struct ieee80211_rx_status *rx_status) -{ - size_t baselen; - struct ieee802_11_elems elems; - - /* Process beacon from the current BSS */ - baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; - if (baselen > len) - return; - - ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); - - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true); + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); } void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, @@ -934,12 +940,9 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, ieee80211_rx_mgmt_probe_req(sdata, skb); break; case IEEE80211_STYPE_PROBE_RESP: - ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, - rx_status); - break; case IEEE80211_STYPE_BEACON: - ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, - rx_status); + ieee80211_rx_mgmt_probe_beacon(sdata, mgmt, skb->len, 
+ rx_status); break; case IEEE80211_STYPE_AUTH: ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len); @@ -1001,36 +1004,9 @@ static void ieee80211_ibss_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - struct ieee80211_local *local = sdata->local; - - if (local->quiescing) { - ifibss->timer_running = true; - return; - } - - ieee80211_queue_work(&local->hw, &sdata->work); -} - -#ifdef CONFIG_PM -void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - - if (del_timer_sync(&ifibss->timer)) - ifibss->timer_running = true; -} - -void ieee80211_ibss_restart(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - if (ifibss->timer_running) { - add_timer(&ifibss->timer); - ifibss->timer_running = false; - } + ieee80211_queue_work(&sdata->local->hw, &sdata->work); } -#endif void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) { @@ -1063,23 +1039,8 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params) { - struct sk_buff *skb; u32 changed = 0; - skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + - sizeof(struct ieee80211_hdr_3addr) + - 12 /* struct ieee80211_mgmt.u.beacon */ + - 2 + IEEE80211_MAX_SSID_LEN /* max SSID */ + - 2 + 8 /* max Supported Rates */ + - 3 /* max DS params */ + - 4 /* IBSS params */ + - 2 + (IEEE80211_MAX_SUPP_RATES - 8) + - 2 + sizeof(struct ieee80211_ht_cap) + - 2 + sizeof(struct ieee80211_ht_operation) + - params->ie_len); - if (!skb) - return -ENOMEM; - mutex_lock(&sdata->u.ibss.mtx); if (params->bssid) { @@ -1108,7 +1069,6 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.ie_len = params->ie_len; } - sdata->u.ibss.skb = skb; sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; sdata->u.ibss.ibss_join_req = jiffies; @@ -1117,10 +1077,6 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, mutex_unlock(&sdata->u.ibss.mtx); - mutex_lock(&sdata->local->mtx); - ieee80211_recalc_idle(sdata->local); - mutex_unlock(&sdata->local->mtx); - /* * 802.11n-2009 9.13.3.1: In an IBSS, the HT Protection field is * reserved, but an HT STA shall protect HT transmissions as though @@ -1148,13 +1104,13 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) { - struct sk_buff *skb; struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct cfg80211_bss *cbss; u16 capability; int active_ibss; struct sta_info *sta; + struct beacon_data *presp; mutex_lock(&sdata->u.ibss.mtx); @@ -1174,7 +1130,7 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) if (cbss) { cfg80211_unlink_bss(local->hw.wiphy, cbss); - cfg80211_put_bss(cbss); + cfg80211_put_bss(local->hw.wiphy, cbss); } } @@ -1182,7 +1138,7 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) memset(ifibss->bssid, 0, ETH_ALEN); ifibss->ssid_len = 0; - sta_info_flush(sdata->local, sdata); + sta_info_flush(sdata); spin_lock_bh(&ifibss->incomplete_lock); while (!list_empty(&ifibss->incomplete_stations)) { @@ -1200,15 +1156,18 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) /* remove beacon */ kfree(sdata->u.ibss.ie); - skb = rcu_dereference_protected(sdata->u.ibss.presp, - lockdep_is_held(&sdata->u.ibss.mtx)); + 
presp = rcu_dereference_protected(ifibss->presp, + lockdep_is_held(&sdata->u.ibss.mtx)); RCU_INIT_POINTER(sdata->u.ibss.presp, NULL); sdata->vif.bss_conf.ibss_joined = false; sdata->vif.bss_conf.ibss_creator = false; + sdata->vif.bss_conf.enable_beacon = false; + sdata->vif.bss_conf.ssid_len = 0; + clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_IBSS); synchronize_rcu(); - kfree_skb(skb); + kfree(presp); skb_queue_purge(&sdata->skb_queue); @@ -1216,9 +1175,5 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) mutex_unlock(&sdata->u.ibss.mtx); - mutex_lock(&local->mtx); - ieee80211_recalc_idle(sdata->local); - mutex_unlock(&local->mtx); - return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2ed065c09562..158e6eb188d3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,23 +86,11 @@ struct ieee80211_fragment_entry { struct ieee80211_bss { - /* don't want to look up all the time */ - size_t ssid_len; - u8 ssid[IEEE80211_MAX_SSID_LEN]; - - u32 device_ts; + u32 device_ts_beacon, device_ts_presp; bool wmm_used; bool uapsd_supported; - unsigned long last_probe_resp; - -#ifdef CONFIG_MAC80211_MESH - u8 *mesh_id; - size_t mesh_id_len; - u8 *mesh_cfg; -#endif - #define IEEE80211_MAX_SUPP_RATES 32 u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; size_t supp_rates_len; @@ -153,31 +141,6 @@ enum ieee80211_bss_valid_data_flags { IEEE80211_BSS_VALID_ERP = BIT(3) }; -static inline u8 *bss_mesh_cfg(struct ieee80211_bss *bss) -{ -#ifdef CONFIG_MAC80211_MESH - return bss->mesh_cfg; -#endif - return NULL; -} - -static inline u8 *bss_mesh_id(struct ieee80211_bss *bss) -{ -#ifdef CONFIG_MAC80211_MESH - return bss->mesh_id; -#endif - return NULL; -} - -static inline u8 bss_mesh_id_len(struct ieee80211_bss *bss) -{ -#ifdef CONFIG_MAC80211_MESH - return bss->mesh_id_len; -#endif - return 0; -} - - typedef unsigned __bitwise__ ieee80211_tx_result; #define TX_CONTINUE ((__force ieee80211_tx_result) 0u) #define TX_DROP ((__force ieee80211_tx_result) 1u) @@ -193,6 +156,7 @@ struct ieee80211_tx_data { struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_key *key; + struct ieee80211_tx_rate rate; unsigned int flags; }; @@ -346,12 +310,14 @@ struct ieee80211_roc_work { struct ieee80211_channel *chan; bool started, abort, hw_begun, notified; + bool to_be_freed; unsigned long hw_start_time; u32 duration, req_duration; struct sk_buff *frame; u64 cookie, mgmt_tx_cookie; + enum ieee80211_roc_type type; }; /* flags used in struct ieee80211_if_managed.flags */ @@ -380,6 +346,7 @@ struct ieee80211_mgd_auth_data { u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; bool done; + bool timeout_started; u16 sae_trans, sae_status; size_t data_len; @@ -399,12 +366,14 @@ struct ieee80211_mgd_assoc_data { u8 ssid_len; u8 supp_rates_len; bool wmm, uapsd; - bool have_beacon; - bool sent_assoc; + bool have_beacon, need_beacon; bool synced; + bool timeout_started; u8 ap_ht_param; + struct ieee80211_vht_cap ap_vht_cap; + size_t ie_len; u8 ie[]; }; @@ -423,6 +392,7 @@ struct ieee80211_if_managed { unsigned long probe_timeout; int probe_send_count; bool nullfunc_failed; + bool connection_loss; struct mutex mtx; struct cfg80211_bss *associated; @@ -433,7 +403,6 @@ struct ieee80211_if_managed { u16 aid; - unsigned long timers_running; /* used for quiesce/restart */ bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off 
powersave */ u8 dtim_period; @@ -447,6 +416,10 @@ struct ieee80211_if_managed { bool beacon_crc_valid; u32 beacon_crc; + bool status_acked; + bool status_received; + __le16 status_fc; + enum { IEEE80211_MFP_DISABLED, IEEE80211_MFP_OPTIONAL, @@ -471,7 +444,7 @@ struct ieee80211_if_managed { u8 use_4addr; - u8 p2p_noa_index; + s16 p2p_noa_index; /* Signal strength from the last Beacon frame in the current BSS. */ int last_beacon_signal; @@ -508,6 +481,8 @@ struct ieee80211_if_managed { struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ + struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */ + struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */ }; struct ieee80211_if_ibss { @@ -519,8 +494,6 @@ struct ieee80211_if_ibss { u32 basic_rates; - bool timer_running; - bool fixed_bssid; bool fixed_channel; bool privacy; @@ -537,8 +510,7 @@ struct ieee80211_if_ibss { unsigned long ibss_join_req; /* probe response/beacon for IBSS */ - struct sk_buff __rcu *presp; - struct sk_buff *skb; + struct beacon_data __rcu *presp; spinlock_t incomplete_lock; struct list_head incomplete_stations; @@ -572,8 +544,6 @@ struct ieee80211_if_mesh { struct timer_list mesh_path_timer; struct timer_list mesh_path_root_timer; - unsigned long timers_running; - unsigned long wrkq_flags; u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN]; @@ -609,6 +579,9 @@ struct ieee80211_if_mesh { u32 mesh_seqnum; bool accepting_plinks; int num_gates; + struct beacon_data __rcu *beacon; + /* just protects beacon updates for now */ + struct mutex mtx; const u8 *ie; u8 ie_len; enum { @@ -616,11 +589,17 @@ struct ieee80211_if_mesh { IEEE80211_MESH_SEC_AUTHED = 0x1, IEEE80211_MESH_SEC_SECURED = 0x2, } security; + bool user_mpm; /* Extensible Synchronization Framework */ const struct ieee80211_mesh_sync_ops *sync_ops; s64 sync_offset_clockdrift_max; spinlock_t sync_offset_lock; bool adjusting_tbtt; + /* mesh power save */ + enum nl80211_mesh_power_mode nonpeer_pm; + int ps_peers_light_sleep; + int ps_peers_deep_sleep; + struct ps_data ps; }; #ifdef CONFIG_MAC80211_MESH @@ -659,10 +638,13 @@ enum ieee80211_sub_if_data_flags { * change handling while the interface is up * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel * mode, so queues are stopped + * @SDATA_STATE_OFFCHANNEL_BEACON_STOPPED: Beaconing was stopped due + * to offchannel, reset when offchannel returns */ enum ieee80211_sdata_state_bits { SDATA_STATE_RUNNING, SDATA_STATE_OFFCHANNEL, + SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, }; /** @@ -685,6 +667,7 @@ struct ieee80211_chanctx { enum ieee80211_chanctx_mode mode; int refcount; + bool driver_present; struct ieee80211_chanctx_conf conf; }; @@ -699,6 +682,8 @@ struct ieee80211_sub_if_data { /* count for keys needing tailroom space allocation */ int crypto_tx_tailroom_needed_cnt; + int crypto_tx_tailroom_pending_dec; + struct delayed_work dec_tailroom_needed_wk; struct net_device *dev; struct ieee80211_local *local; @@ -711,9 +696,6 @@ struct ieee80211_sub_if_data { char name[IFNAMSIZ]; - /* to detect idle changes */ - bool old_idle; - /* Fragment table for host-based reassembly */ struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX]; unsigned int fragment_next; @@ -741,14 +723,15 @@ struct ieee80211_sub_if_data { struct work_struct work; struct sk_buff_head skb_queue; - bool arp_filter_state; - u8 needed_rx_chains; enum ieee80211_smps_mode smps_mode; int user_power_level; /* in dBm */ int ap_power_level; /* in 
dBm */ + bool radar_required; + struct delayed_work dfs_cac_timer_work; + /* * AP this belongs to: self in AP mode and * corresponding AP in VLAN mode, NULL for @@ -758,6 +741,8 @@ struct ieee80211_sub_if_data { /* bitmap of allowed (non-MCS) rate indexes for rate control */ u32 rc_rateidx_mask[IEEE80211_NUM_BANDS]; + + bool rc_has_mcs_mask[IEEE80211_NUM_BANDS]; u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN]; union { @@ -776,13 +761,13 @@ struct ieee80211_sub_if_data { #ifdef CONFIG_MAC80211_DEBUGFS struct { - struct dentry *dir; struct dentry *subdir_stations; struct dentry *default_unicast_key; struct dentry *default_multicast_key; struct dentry *default_mgmt_key; } debugfs; #endif + /* must be last, dynamically sized area in this! */ struct ieee80211_vif vif; }; @@ -817,11 +802,6 @@ enum sdata_queue_type { enum { IEEE80211_RX_MSG = 1, IEEE80211_TX_STATUS_MSG = 2, - IEEE80211_EOSP_MSG = 3, -}; - -struct skb_eosp_msg_data { - u8 sta[ETH_ALEN], iface[ETH_ALEN]; }; enum queue_stop_reason { @@ -831,6 +811,8 @@ enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_AGGREGATION, IEEE80211_QUEUE_STOP_REASON_SUSPEND, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, + IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, + IEEE80211_QUEUE_STOP_REASON_FLUSH, }; #ifdef CONFIG_MAC80211_LEDS @@ -963,6 +945,10 @@ struct ieee80211_local { /* wowlan is enabled -- don't reconfig on resume */ bool wowlan; + /* DFS/radar detection is enabled */ + bool radar_detect_enabled; + struct work_struct radar_detected_work; + /* number of RX chains the hardware has */ u8 rx_chains; @@ -977,14 +963,7 @@ struct ieee80211_local { struct sk_buff_head skb_queue; struct sk_buff_head skb_queue_unreliable; - /* - * Internal FIFO queue which is shared between multiple rx path - * stages. Its main task is to provide a serialization mechanism, - * so all rx handlers can enjoy having exclusive access to their - * private data structures. - */ - struct sk_buff_head rx_skb_queue; - bool running_rx_handler; /* protected by rx_skb_queue.lock */ + spinlock_t rx_path_lock; /* Station data */ /* @@ -1044,10 +1023,9 @@ struct ieee80211_local { enum mac80211_scan_state next_scan_state; struct delayed_work scan_work; struct ieee80211_sub_if_data __rcu *scan_sdata; - struct ieee80211_channel *csa_channel; + struct cfg80211_chan_def csa_chandef; /* For backward compatibility only -- do not use */ - struct ieee80211_channel *_oper_channel; - enum nl80211_channel_type _oper_channel_type; + struct cfg80211_chan_def _oper_chandef; /* Temporary remain-on-channel for off-channel operations */ struct ieee80211_channel *tmp_channel; @@ -1118,14 +1096,13 @@ struct ieee80211_local { struct timer_list dynamic_ps_timer; struct notifier_block network_latency_notifier; struct notifier_block ifa_notifier; + struct notifier_block ifa6_notifier; /* * The dynamic ps timeout configured from user space via WEXT - * this will override whatever chosen by mac80211 internally. 
*/ int dynamic_ps_forced_timeout; - int dynamic_ps_user_timeout; - bool disable_dynamic_ps; int user_power_level; /* in dBm, for all interfaces */ @@ -1153,11 +1130,6 @@ struct ieee80211_local { struct ieee80211_sub_if_data __rcu *p2p_sdata; - /* dummy netdev for use w/ NAPI */ - struct net_device napi_dev; - - struct napi_struct napi; - /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct cfg80211_chan_def monitor_chandef; @@ -1183,53 +1155,47 @@ struct ieee80211_ra_tid { /* Parsed Information Elements */ struct ieee802_11_elems { - u8 *ie_start; + const u8 *ie_start; size_t total_len; /* pointers to IEs */ - u8 *ssid; - u8 *supp_rates; - u8 *fh_params; - u8 *ds_params; - u8 *cf_params; - struct ieee80211_tim_ie *tim; - u8 *ibss_params; - u8 *challenge; - u8 *wpa; - u8 *rsn; - u8 *erp_info; - u8 *ext_supp_rates; - u8 *wmm_info; - u8 *wmm_param; - struct ieee80211_ht_cap *ht_cap_elem; - struct ieee80211_ht_operation *ht_operation; - struct ieee80211_vht_cap *vht_cap_elem; - struct ieee80211_vht_operation *vht_operation; - struct ieee80211_meshconf_ie *mesh_config; - u8 *mesh_id; - u8 *peering; - u8 *preq; - u8 *prep; - u8 *perr; - struct ieee80211_rann_ie *rann; - struct ieee80211_channel_sw_ie *ch_switch_ie; - u8 *country_elem; - u8 *pwr_constr_elem; - u8 *quiet_elem; /* first quite element */ - u8 *timeout_int; + const u8 *ssid; + const u8 *supp_rates; + const u8 *ds_params; + const struct ieee80211_tim_ie *tim; + const u8 *challenge; + const u8 *rsn; + const u8 *erp_info; + const u8 *ext_supp_rates; + const u8 *wmm_info; + const u8 *wmm_param; + const struct ieee80211_ht_cap *ht_cap_elem; + const struct ieee80211_ht_operation *ht_operation; + const struct ieee80211_vht_cap *vht_cap_elem; + const struct ieee80211_vht_operation *vht_operation; + const struct ieee80211_meshconf_ie *mesh_config; + const u8 *mesh_id; + const u8 *peering; + const __le16 *awake_window; + const u8 *preq; + const u8 *prep; + const u8 *perr; + const struct ieee80211_rann_ie *rann; + const struct ieee80211_channel_sw_ie *ch_switch_ie; + const struct ieee80211_ext_chansw_ie *ext_chansw_ie; + const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; + const u8 *country_elem; + const u8 *pwr_constr_elem; + const struct ieee80211_timeout_interval_ie *timeout_int; + const u8 *opmode_notif; + const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; /* length of them, respectively */ u8 ssid_len; u8 supp_rates_len; - u8 fh_params_len; - u8 ds_params_len; - u8 cf_params_len; u8 tim_len; - u8 ibss_params_len; u8 challenge_len; - u8 wpa_len; u8 rsn_len; - u8 erp_info_len; u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; @@ -1239,9 +1205,6 @@ struct ieee802_11_elems { u8 prep_len; u8 perr_len; u8 country_elem_len; - u8 quiet_elem_len; - u8 num_of_quiet_elem; /* can be more the one */ - u8 timeout_int_len; /* whether a parse error occurred while retrieving these elements */ bool parse_error; @@ -1296,18 +1259,14 @@ void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata); int ieee80211_max_network_latency(struct notifier_block *nb, unsigned long data, void *dummy); int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata); -void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, - struct ieee80211_channel_sw_ie *sw_elem, - struct ieee80211_bss *bss, - u64 timestamp); -void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata); -void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct 
ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); +void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, + __le16 fc, bool acked); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); @@ -1317,8 +1276,6 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params); int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata); -void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata); -void ieee80211_ibss_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata); void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); @@ -1346,8 +1303,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, size_t len, struct ieee802_11_elems *elems, - struct ieee80211_channel *channel, - bool beacon); + struct ieee80211_channel *channel); void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss); @@ -1362,8 +1318,9 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local); void ieee80211_offchannel_return(struct ieee80211_local *local); void ieee80211_roc_setup(struct ieee80211_local *local); void ieee80211_start_next_roc(struct ieee80211_local *local); -void ieee80211_roc_purge(struct ieee80211_sub_if_data *sdata); -void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc); +void ieee80211_roc_purge(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); +void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free); void ieee80211_sw_roc_work(struct work_struct *work); void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc); @@ -1377,11 +1334,14 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type); void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata); void ieee80211_remove_interfaces(struct ieee80211_local *local); +u32 ieee80211_idle_off(struct ieee80211_local *local); void ieee80211_recalc_idle(struct ieee80211_local *local); void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset); int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up); void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata); +int ieee80211_add_virtual_monitor(struct ieee80211_local *local); +void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); @@ -1404,10 +1364,10 @@ void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap); -void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, +bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, - struct ieee80211_ht_cap *ht_cap_ie, - struct ieee80211_sta_ht_cap *ht_cap); + const struct ieee80211_ht_cap *ht_cap_ie, + struct sta_info *sta); void ieee80211_send_delba(struct ieee80211_sub_if_data 
*sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code); @@ -1420,7 +1380,8 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); -void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx); +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, + enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); @@ -1434,11 +1395,9 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len); int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator, - bool tx); + enum ieee80211_agg_stop_reason reason); int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator, - bool tx); + enum ieee80211_agg_stop_reason reason); void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); void ieee80211_ba_session_work(struct work_struct *work); @@ -1448,10 +1407,19 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs); /* VHT */ -void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, - struct ieee80211_supported_band *sband, - struct ieee80211_vht_cap *vht_cap_ie, - struct ieee80211_sta_vht_cap *vht_cap); +void +ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_vht_cap *vht_cap_ie, + struct sta_info *sta); +enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta); +void ieee80211_sta_set_rx_nss(struct sta_info *sta); +void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 opmode, + enum ieee80211_band band, bool nss_only); +void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta_vht_cap *vht_cap); + /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, @@ -1528,11 +1496,15 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, ieee80211_tx_skb_tid(sdata, skb, 7); } -void ieee802_11_parse_elems(u8 *start, size_t len, - struct ieee802_11_elems *elems); -u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, +u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, bool action, struct ieee802_11_elems *elems, u64 filter, u32 crc); +static inline void ieee802_11_parse_elems(u8 *start, size_t len, bool action, + struct ieee802_11_elems *elems) +{ + ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0); +} + u32 ieee80211_mandatory_rates(struct ieee80211_local *local, enum ieee80211_band band); @@ -1548,8 +1520,10 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, + unsigned long queues, enum queue_stop_reason reason); void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, + unsigned long queues, enum queue_stop_reason reason); void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason); @@ -1566,11 +1540,14 @@ static inline void 
ieee80211_add_pending_skbs(struct ieee80211_local *local, { ieee80211_add_pending_skbs_fn(local, skbs, NULL, NULL); } +void ieee80211_flush_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, - u8 *extra, size_t extra_len, const u8 *bssid, - const u8 *da, const u8 *key, u8 key_len, u8 key_idx); + const u8 *extra, size_t extra_len, const u8 *bssid, + const u8 *da, const u8 *key, u8 key_len, u8 key_idx, + u32 tx_flags); void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); @@ -1587,7 +1564,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, bool no_cck, + u32 ratemask, bool directed, u32 tx_flags, struct ieee80211_channel *channel, bool scan); void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, @@ -1619,18 +1596,31 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, /* channel management */ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, - struct ieee80211_ht_operation *ht_oper, + const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); int __must_check ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode); +int __must_check +ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + u32 *changed); void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata); +void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, + bool clear); void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); +void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *chanctx); + +void ieee80211_dfs_cac_timer(unsigned long data); +void ieee80211_dfs_cac_timer_work(struct work_struct *work); +void ieee80211_dfs_cac_cancel(struct ieee80211_local *local); +void ieee80211_dfs_radar_detected_work(struct work_struct *work); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 8be854e86cd9..60f1ce5e5e52 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1,5 +1,5 @@ /* - * Interface handling (except master interface) + * Interface handling * * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. 
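Note: ieee80211_stop_queues_by_reason() and ieee80211_wake_queues_by_reason() above gain an explicit queues bitmap, so callers can stop or wake only a subset of hardware queues for a given reason; a queue may transmit only while its per-queue reason mask is zero (the queue_stop_reasons[ac_queue] == 0 checks used elsewhere in this series). A toy model of that reason-tracked gating — a simplified sketch, not the mac80211 implementation:

#include <stdbool.h>
#include <stdint.h>

#define MAX_QUEUES 16

/* one bit per stop reason, per queue; a queue runs only when zero */
static uint32_t queue_stop_reasons[MAX_QUEUES];

static void stop_queues_by_reason(unsigned long queues, unsigned int reason)
{
	for (unsigned int q = 0; q < MAX_QUEUES; q++)
		if (queues & (1UL << q))
			queue_stop_reasons[q] |= 1U << reason;
}

static void wake_queues_by_reason(unsigned long queues, unsigned int reason)
{
	for (unsigned int q = 0; q < MAX_QUEUES; q++)
		if (queues & (1UL << q))
			queue_stop_reasons[q] &= ~(1U << reason);
}

static bool queue_can_run(unsigned int q)
{
	/* the queue wakes only once every reason that stopped it is gone */
	return queue_stop_reasons[q] == 0;
}
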
@@ -78,8 +78,7 @@ void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_TXPOWER); } -static u32 ieee80211_idle_off(struct ieee80211_local *local, - const char *reason) +static u32 __ieee80211_idle_off(struct ieee80211_local *local) { if (!(local->hw.conf.flags & IEEE80211_CONF_IDLE)) return 0; @@ -88,121 +87,67 @@ static u32 ieee80211_idle_off(struct ieee80211_local *local, return IEEE80211_CONF_CHANGE_IDLE; } -static u32 ieee80211_idle_on(struct ieee80211_local *local) +static u32 __ieee80211_idle_on(struct ieee80211_local *local) { if (local->hw.conf.flags & IEEE80211_CONF_IDLE) return 0; - drv_flush(local, false); + ieee80211_flush_queues(local, NULL); local->hw.conf.flags |= IEEE80211_CONF_IDLE; return IEEE80211_CONF_CHANGE_IDLE; } -static u32 __ieee80211_recalc_idle(struct ieee80211_local *local) +static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, + bool force_active) { - struct ieee80211_sub_if_data *sdata; - int count = 0; - bool working = false, scanning = false; + bool working = false, scanning, active; unsigned int led_trig_start = 0, led_trig_stop = 0; struct ieee80211_roc_work *roc; -#ifdef CONFIG_PROVE_LOCKING - WARN_ON(debug_locks && !lockdep_rtnl_is_held() && - !lockdep_is_held(&local->iflist_mtx)); -#endif lockdep_assert_held(&local->mtx); - list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) { - sdata->vif.bss_conf.idle = true; - continue; - } - - sdata->old_idle = sdata->vif.bss_conf.idle; - - /* do not count disabled managed interfaces */ - if (sdata->vif.type == NL80211_IFTYPE_STATION && - !sdata->u.mgd.associated && - !sdata->u.mgd.auth_data && - !sdata->u.mgd.assoc_data) { - sdata->vif.bss_conf.idle = true; - continue; - } - /* do not count unused IBSS interfaces */ - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && - !sdata->u.ibss.ssid_len) { - sdata->vif.bss_conf.idle = true; - continue; - } - - if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) - continue; - - /* count everything else */ - sdata->vif.bss_conf.idle = false; - count++; - } + active = force_active || + !list_empty(&local->chanctx_list) || + local->monitors; if (!local->ops->remain_on_channel) { list_for_each_entry(roc, &local->roc_list, list) { working = true; - roc->sdata->vif.bss_conf.idle = false; + break; } } - sdata = rcu_dereference_protected(local->scan_sdata, - lockdep_is_held(&local->mtx)); - if (sdata && !(local->hw.flags & IEEE80211_HW_SCAN_WHILE_IDLE)) { - scanning = true; - sdata->vif.bss_conf.idle = false; - } - - list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) - continue; - if (sdata->old_idle == sdata->vif.bss_conf.idle) - continue; - if (!ieee80211_sdata_running(sdata)) - continue; - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); - } + scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) || + test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); if (working || scanning) led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_WORK; else led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_WORK; - if (count) + if (active) led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED; else led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED; ieee80211_mod_tpt_led_trig(local, led_trig_start, led_trig_stop); - if (working) - return ieee80211_idle_off(local, "working"); - if (scanning) - return ieee80211_idle_off(local, "scanning"); - if 
(!count) - return ieee80211_idle_on(local); - else - return ieee80211_idle_off(local, "in use"); + if (working || scanning || active) + return __ieee80211_idle_off(local); + return __ieee80211_idle_on(local); +} - return 0; +u32 ieee80211_idle_off(struct ieee80211_local *local) +{ + return __ieee80211_recalc_idle(local, true); } void ieee80211_recalc_idle(struct ieee80211_local *local) { - u32 chg; - - mutex_lock(&local->iflist_mtx); - chg = __ieee80211_recalc_idle(local); - mutex_unlock(&local->iflist_mtx); - if (chg) - ieee80211_hw_config(local, chg); + u32 change = __ieee80211_recalc_idle(local, false); + if (change) + ieee80211_hw_config(local, change); } static int ieee80211_change_mtu(struct net_device *dev, int new_mtu) @@ -360,7 +305,8 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata) } } - if ((sdata->vif.type != NL80211_IFTYPE_AP) || + if ((sdata->vif.type != NL80211_IFTYPE_AP && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT) || !(sdata->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) { sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; return 0; @@ -411,24 +357,22 @@ static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata) sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; } -static int ieee80211_add_virtual_monitor(struct ieee80211_local *local) +int ieee80211_add_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - int ret = 0; + int ret; if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) return 0; - mutex_lock(&local->iflist_mtx); + ASSERT_RTNL(); if (local->monitor_sdata) - goto out_unlock; + return 0; sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); - if (!sdata) { - ret = -ENOMEM; - goto out_unlock; - } + if (!sdata) + return -ENOMEM; /* set up data */ sdata->local = local; @@ -442,13 +386,13 @@ static int ieee80211_add_virtual_monitor(struct ieee80211_local *local) if (WARN_ON(ret)) { /* ok .. stupid driver, it asked for this! 
*/ kfree(sdata); - goto out_unlock; + return ret; } ret = ieee80211_check_queues(sdata); if (ret) { kfree(sdata); - goto out_unlock; + return ret; } ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef, @@ -456,30 +400,37 @@ static int ieee80211_add_virtual_monitor(struct ieee80211_local *local) if (ret) { drv_remove_interface(local, sdata); kfree(sdata); - goto out_unlock; + return ret; } + mutex_lock(&local->iflist_mtx); rcu_assign_pointer(local->monitor_sdata, sdata); - out_unlock: mutex_unlock(&local->iflist_mtx); - return ret; + + return 0; } -static void ieee80211_del_virtual_monitor(struct ieee80211_local *local) +void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) return; + ASSERT_RTNL(); + mutex_lock(&local->iflist_mtx); sdata = rcu_dereference_protected(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx)); - if (!sdata) - goto out_unlock; + if (!sdata) { + mutex_unlock(&local->iflist_mtx); + return; + } rcu_assign_pointer(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + synchronize_net(); ieee80211_vif_release_channel(sdata); @@ -487,8 +438,6 @@ static void ieee80211_del_virtual_monitor(struct ieee80211_local *local) drv_remove_interface(local, sdata); kfree(sdata); - out_unlock: - mutex_unlock(&local->iflist_mtx); } /* @@ -550,8 +499,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) res = drv_start(local); if (res) goto err_del_bss; - if (local->ops->napi_poll) - napi_enable(&local->napi); /* we're brought up, everything changes */ hw_reconf_flags = ~0; ieee80211_led_radio(local, true); @@ -606,6 +553,9 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); netif_carrier_on(dev); break; @@ -645,7 +595,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_P2P_DEVICE: break; default: - netif_carrier_on(dev); + /* not reached */ + WARN_ON(1); } /* @@ -694,10 +645,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (sdata->flags & IEEE80211_SDATA_PROMISC) atomic_inc(&local->iff_promiscs); - mutex_lock(&local->mtx); - hw_reconf_flags |= __ieee80211_recalc_idle(local); - mutex_unlock(&local->mtx); - if (coming_up) local->open_count++; @@ -706,8 +653,28 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) ieee80211_recalc_ps(local, -1); - if (dev) - netif_tx_start_all_queues(dev); + if (dev) { + unsigned long flags; + int n_acs = IEEE80211_NUM_ACS; + int ac; + + if (local->hw.queues < IEEE80211_NUM_ACS) + n_acs = 1; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE || + (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 && + skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) { + for (ac = 0; ac < n_acs; ac++) { + int ac_queue = sdata->vif.hw_queue[ac]; + + if (local->queue_stop_reasons[ac_queue] == 0 && + skb_queue_empty(&local->pending[ac_queue])) + netif_start_subqueue(dev, ac); + } + } + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + } return 0; err_del_interface: @@ -747,7 +714,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, unsigned long flags; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; - int i; + int i, flushed; + struct ps_data *ps; 
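/*
 * Note (annotation, not part of the patch): the teardown below batches
 * RCU waits instead of paying one grace period per station. In outline,
 * the order used further down is:
 *
 *     flushed = sta_info_flush_defer(sdata);   unlink stations now
 *     ...
 *     synchronize_rcu();      RX path no longer uses the interface
 *     rcu_barrier();          all station call_rcu() callbacks finished
 *     sta_info_flush_cleanup(sdata);           free the deferred entries
 *
 * Only WDS interfaces are expected to still hold a station at this
 * point, hence the WARN_ON_ONCE() on the flushed count below.
 */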
clear_bit(SDATA_STATE_RUNNING, &sdata->state); @@ -760,7 +728,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->dev) netif_tx_stop_all_queues(sdata->dev); - ieee80211_roc_purge(sdata); + ieee80211_roc_purge(local, sdata); + + if (sdata->vif.type == NL80211_IFTYPE_STATION) + ieee80211_mgd_stop(sdata); /* * Remove all stations associated with this interface. @@ -772,18 +743,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * - * This is relevant only in AP, WDS and mesh modes, since in - * all other modes we've already removed all stations when - * disconnecting etc. + * This is relevant only in WDS mode, in all other modes we've + * already removed all stations when disconnecting or similar, + * so warn otherwise. + * + * We call sta_info_flush_cleanup() later, to combine RCU waits. */ - sta_info_flush(local, sdata); + flushed = sta_info_flush_defer(sdata); + WARN_ON_ONCE((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || + (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)); - /* - * Don't count this interface for promisc/allmulti while it - * is down. dev_mc_unsync() will invoke set_multicast_list - * on the master interface which will sync these down to the - * hardware as filter flags. - */ + /* don't count this interface for promisc/allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_dec(&local->iff_allmultis); @@ -804,8 +774,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, sdata->dev->addr_len); spin_unlock_bh(&local->filter_lock); netif_addr_unlock_bh(sdata->dev); - - ieee80211_configure_filter(local); } del_timer_sync(&local->dynamic_ps_timer); @@ -813,6 +781,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, cancel_work_sync(&sdata->recalc_smps); + cancel_delayed_work_sync(&sdata->dfs_cac_timer_work); + + if (sdata->wdev.cac_started) { + WARN_ON(local->suspended); + mutex_lock(&local->iflist_mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&local->iflist_mtx); + cfg80211_cac_event(sdata->dev, NL80211_RADAR_CAC_ABORTED, + GFP_KERNEL); + } + /* APs need special treatment */ if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_sub_if_data *vlan, *tmpsdata; @@ -822,8 +801,19 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, u.vlan.list) dev_close(vlan->dev); WARN_ON(!list_empty(&sdata->u.ap.vlans)); - } else if (sdata->vif.type == NL80211_IFTYPE_STATION) { - ieee80211_mgd_stop(sdata); + } else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + /* remove all packets in parent bc_buf pointing to this dev */ + ps = &sdata->bss->ps; + + spin_lock_irqsave(&ps->bc_buf.lock, flags); + skb_queue_walk_safe(&ps->bc_buf, skb, tmp) { + if (skb->dev == sdata->dev) { + __skb_unlink(skb, &ps->bc_buf); + local->total_ps_buffered--; + ieee80211_free_txskb(&local->hw, skb); + } + } + spin_unlock_irqrestore(&ps->bc_buf.lock, flags); } if (going_down) @@ -845,11 +835,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (local->monitors == 0) { local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR; hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; - ieee80211_del_virtual_monitor(local); } ieee80211_adjust_monitor_flags(sdata, -1); - ieee80211_configure_filter(local); break; case NL80211_IFTYPE_P2P_DEVICE: /* relies on synchronize_rcu() below */ @@ -859,45 +847,31 @@ static void 
ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, cancel_work_sync(&sdata->work); /* * When we get here, the interface is marked down. - * Call synchronize_rcu() to wait for the RX path - * should it be using the interface and enqueuing - * frames at this very time on another CPU. + * + * sta_info_flush_cleanup() requires rcu_barrier() + * first to wait for the station call_rcu() calls + * to complete, and we also need synchronize_rcu() + * to wait for the RX path in case it is using the + * interface and enqueuing frames at this very time on + * another CPU. */ synchronize_rcu(); - skb_queue_purge(&sdata->skb_queue); + rcu_barrier(); + sta_info_flush_cleanup(sdata); /* * Free all remaining keys, there shouldn't be any, - * except maybe group keys in AP more or WDS? + * except maybe in WDS mode? */ ieee80211_free_keys(sdata); - if (going_down) - drv_remove_interface(local, sdata); + /* fall through */ + case NL80211_IFTYPE_AP: + skb_queue_purge(&sdata->skb_queue); } sdata->bss = NULL; - mutex_lock(&local->mtx); - hw_reconf_flags |= __ieee80211_recalc_idle(local); - mutex_unlock(&local->mtx); - - ieee80211_recalc_ps(local, -1); - - if (local->open_count == 0) { - if (local->ops->napi_poll) - napi_disable(&local->napi); - ieee80211_clear_tx_pending(local); - ieee80211_stop_device(local); - - /* no reconfiguring after stop! */ - hw_reconf_flags = 0; - } - - /* do after stop to avoid reconfiguring when we stop anyway */ - if (hw_reconf_flags) - ieee80211_hw_config(local, hw_reconf_flags); - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { skb_queue_walk_safe(&local->pending[i], skb, tmp) { @@ -910,7 +884,54 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (local->monitors == local->open_count && local->monitors > 0) + if (local->open_count == 0) + ieee80211_clear_tx_pending(local); + + /* + * If the interface goes down while suspended, presumably because + * the device was unplugged and that happens before our resume, + * then the driver is already unconfigured and the remainder of + * this function isn't needed. + * XXX: what about WoWLAN? If the device has software state, e.g. + * memory allocated, it might expect teardown commands from + * mac80211 here? + */ + if (local->suspended) { + WARN_ON(local->wowlan); + WARN_ON(rtnl_dereference(local->monitor_sdata)); + return; + } + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + break; + case NL80211_IFTYPE_MONITOR: + if (local->monitors == 0) + ieee80211_del_virtual_monitor(local); + + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + break; + default: + if (going_down) + drv_remove_interface(local, sdata); + } + + ieee80211_recalc_ps(local, -1); + + if (local->open_count == 0) { + ieee80211_stop_device(local); + + /* no reconfiguring after stop! */ + return; + } + + /* do after stop to avoid reconfiguring when we stop anyway */ + ieee80211_configure_filter(local); + ieee80211_hw_config(local, hw_reconf_flags); + + if (local->monitors == local->open_count) ieee80211_add_virtual_monitor(local); } @@ -949,6 +970,17 @@ static void ieee80211_set_multicast_list(struct net_device *dev) atomic_dec(&local->iff_promiscs); sdata->flags ^= IEEE80211_SDATA_PROMISC; } + + /* + * TODO: If somebody needs this on AP interfaces, + * it can be enabled easily but multicast + * addresses from VLANs need to be synced. 
+ */ + if (sdata->vif.type != NL80211_IFTYPE_MONITOR && + sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_AP) + drv_set_multicast_list(local, sdata, &dev->mc); + spin_lock_bh(&local->filter_lock); __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len); spin_unlock_bh(&local->filter_lock); @@ -961,7 +993,6 @@ static void ieee80211_set_multicast_list(struct net_device *dev) */ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_local *local = sdata->local; int flushed; int i; @@ -977,7 +1008,7 @@ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata) if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_rmc_free(sdata); - flushed = sta_info_flush(local, sdata); + flushed = sta_info_flush(sdata); WARN_ON(flushed); } @@ -1218,6 +1249,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP: skb_queue_head_init(&sdata->u.ap.ps.bc_buf); INIT_LIST_HEAD(&sdata->u.ap.vlans); + sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_P2P_CLIENT: type = NL80211_IFTYPE_STATION; @@ -1225,9 +1257,11 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->vif.p2p = true; /* fall through */ case NL80211_IFTYPE_STATION: + sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid; ieee80211_sta_setup_sdata(sdata); break; case NL80211_IFTYPE_ADHOC: + sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid; ieee80211_ibss_setup_sdata(sdata); break; case NL80211_IFTYPE_MESH_POINT: @@ -1241,8 +1275,12 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, MONITOR_FLAG_OTHER_BSS; break; case NL80211_IFTYPE_WDS: + sdata->vif.bss_conf.bssid = NULL; + break; case NL80211_IFTYPE_AP_VLAN: + break; case NL80211_IFTYPE_P2P_DEVICE: + sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: @@ -1558,9 +1596,6 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, /* initialise type-independent data */ sdata->wdev.wiphy = local->hw.wiphy; sdata->local = local; -#ifdef CONFIG_INET - sdata->arp_filter_state = true; -#endif for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) skb_queue_head_init(&sdata->fragments[i].skb_list); @@ -1570,6 +1605,10 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, spin_lock_init(&sdata->cleanup_stations_lock); INIT_LIST_HEAD(&sdata->cleanup_stations); INIT_WORK(&sdata->cleanup_stations_wk, ieee80211_cleanup_sdata_stas_wk); + INIT_DELAYED_WORK(&sdata->dfs_cac_timer_work, + ieee80211_dfs_cac_timer_work); + INIT_DELAYED_WORK(&sdata->dec_tailroom_needed_wk, + ieee80211_delayed_tailroom_dec); for (i = 0; i < IEEE80211_NUM_BANDS; i++) { struct ieee80211_supported_band *sband; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 619c5d697999..67059b88fea5 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -204,8 +204,11 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, if (idx >= 0 && idx < NUM_DEFAULT_KEYS) key = key_mtx_dereference(sdata->local, sdata->keys[idx]); - if (uni) + if (uni) { rcu_assign_pointer(sdata->default_unicast_key, key); + drv_set_default_unicast_key(sdata->local, sdata, idx); + } + if (multi) rcu_assign_pointer(sdata->default_multicast_key, key); @@ -245,11 +248,11 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, } -static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - bool pairwise, - struct ieee80211_key *old, 
- struct ieee80211_key *new) +static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + bool pairwise, + struct ieee80211_key *old, + struct ieee80211_key *new) { int idx; bool defunikey, defmultikey, defmgmtkey; @@ -394,7 +397,41 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, return key; } -static void __ieee80211_key_destroy(struct ieee80211_key *key) +static void ieee80211_key_free_common(struct ieee80211_key *key) +{ + if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP) + ieee80211_aes_key_free(key->u.ccmp.tfm); + if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC) + ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); + kfree(key); +} + +static void __ieee80211_key_destroy(struct ieee80211_key *key, + bool delay_tailroom) +{ + if (key->local) + ieee80211_key_disable_hw_accel(key); + + if (key->local) { + struct ieee80211_sub_if_data *sdata = key->sdata; + + ieee80211_debugfs_key_remove(key); + + if (delay_tailroom) { + /* see ieee80211_delayed_tailroom_dec */ + sdata->crypto_tx_tailroom_pending_dec++; + schedule_delayed_work(&sdata->dec_tailroom_needed_wk, + HZ/2); + } else { + sdata->crypto_tx_tailroom_needed_cnt--; + } + } + + ieee80211_key_free_common(key); +} + +static void ieee80211_key_destroy(struct ieee80211_key *key, + bool delay_tailroom) { if (!key) return; @@ -405,19 +442,13 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key) */ synchronize_net(); - if (key->local) - ieee80211_key_disable_hw_accel(key); - - if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP) - ieee80211_aes_key_free(key->u.ccmp.tfm); - if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC) - ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); - if (key->local) { - ieee80211_debugfs_key_remove(key); - key->sdata->crypto_tx_tailroom_needed_cnt--; - } + __ieee80211_key_destroy(key, delay_tailroom); +} - kfree(key); +void ieee80211_key_free_unused(struct ieee80211_key *key) +{ + WARN_ON(key->sdata || key->local); + ieee80211_key_free_common(key); } int ieee80211_key_link(struct ieee80211_key *key, @@ -437,32 +468,6 @@ int ieee80211_key_link(struct ieee80211_key *key, key->sdata = sdata; key->sta = sta; - if (sta) { - /* - * some hardware cannot handle TKIP with QoS, so - * we indicate whether QoS could be in use. - */ - if (test_sta_flag(sta, WLAN_STA_WME)) - key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA; - } else { - if (sdata->vif.type == NL80211_IFTYPE_STATION) { - struct sta_info *ap; - - /* - * We're getting a sta pointer in, so must be under - * appropriate locking for sta_info_get(). 
- */ - - /* same here, the AP could be using QoS */ - ap = sta_info_get(key->sdata, key->sdata->u.mgd.bssid); - if (ap) { - if (test_sta_flag(ap, WLAN_STA_WME)) - key->conf.flags |= - IEEE80211_KEY_FLAG_WMM_STA; - } - } - } - mutex_lock(&sdata->local->key_mtx); if (sta && pairwise) @@ -474,19 +479,22 @@ int ieee80211_key_link(struct ieee80211_key *key, increment_tailroom_need_count(sdata); - __ieee80211_key_replace(sdata, sta, pairwise, old_key, key); - __ieee80211_key_destroy(old_key); + ieee80211_key_replace(sdata, sta, pairwise, old_key, key); + ieee80211_key_destroy(old_key, true); ieee80211_debugfs_key_add(key); ret = ieee80211_key_enable_hw_accel(key); + if (ret) + ieee80211_key_free(key, true); + mutex_unlock(&sdata->local->key_mtx); return ret; } -void __ieee80211_key_free(struct ieee80211_key *key) +void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom) { if (!key) return; @@ -495,18 +503,10 @@ void __ieee80211_key_free(struct ieee80211_key *key) * Replace key with nothingness if it was ever used. */ if (key->sdata) - __ieee80211_key_replace(key->sdata, key->sta, + ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - __ieee80211_key_destroy(key); -} - -void ieee80211_key_free(struct ieee80211_local *local, - struct ieee80211_key *key) -{ - mutex_lock(&local->key_mtx); - __ieee80211_key_free(key); - mutex_unlock(&local->key_mtx); + ieee80211_key_destroy(key, delay_tailroom); } void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) @@ -563,36 +563,109 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_iter_keys); -void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata) +void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_key *key; + struct ieee80211_key *key, *tmp; + LIST_HEAD(keys); - ASSERT_RTNL(); + cancel_delayed_work_sync(&sdata->dec_tailroom_needed_wk); mutex_lock(&sdata->local->key_mtx); - list_for_each_entry(key, &sdata->key_list, list) - ieee80211_key_disable_hw_accel(key); + sdata->crypto_tx_tailroom_needed_cnt -= + sdata->crypto_tx_tailroom_pending_dec; + sdata->crypto_tx_tailroom_pending_dec = 0; + + ieee80211_debugfs_key_remove_mgmt_default(sdata); + + list_for_each_entry_safe(key, tmp, &sdata->key_list, list) { + ieee80211_key_replace(key->sdata, key->sta, + key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, + key, NULL); + list_add_tail(&key->list, &keys); + } + + ieee80211_debugfs_key_update_default(sdata); + + if (!list_empty(&keys)) { + synchronize_net(); + list_for_each_entry_safe(key, tmp, &keys, list) + __ieee80211_key_destroy(key, false); + } + + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || + sdata->crypto_tx_tailroom_pending_dec); mutex_unlock(&sdata->local->key_mtx); } -void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata) +void ieee80211_free_sta_keys(struct ieee80211_local *local, + struct sta_info *sta) { struct ieee80211_key *key, *tmp; + LIST_HEAD(keys); + int i; - mutex_lock(&sdata->local->key_mtx); + mutex_lock(&local->key_mtx); + for (i = 0; i < NUM_DEFAULT_KEYS; i++) { + key = key_mtx_dereference(local, sta->gtk[i]); + if (!key) + continue; + ieee80211_key_replace(key->sdata, key->sta, + key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, + key, NULL); + list_add(&key->list, &keys); + } - ieee80211_debugfs_key_remove_mgmt_default(sdata); + key = key_mtx_dereference(local, sta->ptk); + if (key) { + ieee80211_key_replace(key->sdata, key->sta, + key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, + key, 
NULL); + list_add(&key->list, &keys); + } - list_for_each_entry_safe(key, tmp, &sdata->key_list, list) - __ieee80211_key_free(key); + /* + * NB: the station code relies on this being + * done even if there aren't any keys + */ + synchronize_net(); - ieee80211_debugfs_key_update_default(sdata); + list_for_each_entry_safe(key, tmp, &keys, list) + __ieee80211_key_destroy(key, true); - mutex_unlock(&sdata->local->key_mtx); + mutex_unlock(&local->key_mtx); } +void ieee80211_delayed_tailroom_dec(struct work_struct *wk) +{ + struct ieee80211_sub_if_data *sdata; + + sdata = container_of(wk, struct ieee80211_sub_if_data, + dec_tailroom_needed_wk.work); + + /* + * The reason for the delayed tailroom needed decrementing is to + * make roaming faster: during roaming, all keys are first deleted + * and then new keys are installed. The first new key causes the + * crypto_tx_tailroom_needed_cnt to go from 0 to 1, which invokes + * the cost of synchronize_net() (which can be slow). Avoid this + * by deferring the crypto_tx_tailroom_needed_cnt decrementing on + * key removal for a while, so if we roam the value is larger than + * zero and no 0->1 transition happens. + * + * The cost is that if the AP switching was from an AP with keys + * to one without, we still allocate tailroom while it would no + * longer be needed. However, in the typical (fast) roaming case + * within an ESS this usually won't happen. + */ + + mutex_lock(&sdata->local->key_mtx); + sdata->crypto_tx_tailroom_needed_cnt -= + sdata->crypto_tx_tailroom_pending_dec; + sdata->crypto_tx_tailroom_pending_dec = 0; + mutex_unlock(&sdata->local->key_mtx); +} void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, const u8 *replay_ctr, gfp_t gfp) diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 382dc44ed330..e8de3e6d7804 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -129,23 +129,25 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, size_t seq_len, const u8 *seq); /* * Insert a key into data structures (sdata, sta if necessary) - * to make it used, free old key. + * to make it used, free old key. On failure, also free the new key. 
*/ -int __must_check ieee80211_key_link(struct ieee80211_key *key, - struct ieee80211_sub_if_data *sdata, - struct sta_info *sta); -void __ieee80211_key_free(struct ieee80211_key *key); -void ieee80211_key_free(struct ieee80211_local *local, - struct ieee80211_key *key); +int ieee80211_key_link(struct ieee80211_key *key, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta); +void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom); +void ieee80211_key_free_unused(struct ieee80211_key *key); void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx, bool uni, bool multi); void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx); void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata); +void ieee80211_free_sta_keys(struct ieee80211_local *local, + struct sta_info *sta); void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); -void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata); #define key_mtx_dereference(local, ref) \ rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx))) +void ieee80211_delayed_tailroom_dec(struct work_struct *wk); + #endif /* IEEE80211_KEY_H */ diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 1b087fff93e7..8a7bfc47d577 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -23,6 +23,7 @@ #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <net/cfg80211.h> +#include <net/addrconf.h> #include "ieee80211_i.h" #include "driver-ops.h" @@ -33,8 +34,6 @@ #include "cfg.h" #include "debugfs.h" -static struct lock_class_key ieee80211_rx_skb_queue_class; - void ieee80211_configure_filter(struct ieee80211_local *local) { u64 mc; @@ -96,43 +95,47 @@ static void ieee80211_reconfig_filter(struct work_struct *work) static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - struct ieee80211_channel *chan; + struct cfg80211_chan_def chandef = {}; u32 changed = 0; int power; - enum nl80211_channel_type channel_type; u32 offchannel_flag; - bool scanning = false; offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; + if (local->scan_channel) { - chan = local->scan_channel; + chandef.chan = local->scan_channel; /* If scanning on oper channel, use whatever channel-type * is currently in use. 
*/ - if (chan == local->_oper_channel) - channel_type = local->_oper_channel_type; - else - channel_type = NL80211_CHAN_NO_HT; + if (chandef.chan == local->_oper_chandef.chan) { + chandef = local->_oper_chandef; + } else { + chandef.width = NL80211_CHAN_WIDTH_20_NOHT; + chandef.center_freq1 = chandef.chan->center_freq; + } } else if (local->tmp_channel) { - chan = local->tmp_channel; - channel_type = NL80211_CHAN_NO_HT; - } else { - chan = local->_oper_channel; - channel_type = local->_oper_channel_type; - } - - if (chan != local->_oper_channel || - channel_type != local->_oper_channel_type) + chandef.chan = local->tmp_channel; + chandef.width = NL80211_CHAN_WIDTH_20_NOHT; + chandef.center_freq1 = chandef.chan->center_freq; + } else + chandef = local->_oper_chandef; + + WARN(!cfg80211_chandef_valid(&chandef), + "control:%d MHz width:%d center: %d/%d MHz", + chandef.chan->center_freq, chandef.width, + chandef.center_freq1, chandef.center_freq2); + + if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef)) local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; else local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL; offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; - if (offchannel_flag || chan != local->hw.conf.channel || - channel_type != local->hw.conf.channel_type) { - local->hw.conf.channel = chan; - local->hw.conf.channel_type = channel_type; + if (offchannel_flag || + !cfg80211_chandef_identical(&local->hw.conf.chandef, + &local->_oper_chandef)) { + local->hw.conf.chandef = chandef; changed |= IEEE80211_CONF_CHANGE_CHANNEL; } @@ -148,10 +151,7 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) changed |= IEEE80211_CONF_CHANGE_SMPS; } - scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) || - test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning) || - test_bit(SCAN_HW_SCANNING, &local->scanning); - power = chan->max_power; + power = chandef.chan->max_power; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { @@ -207,76 +207,10 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u32 changed) { struct ieee80211_local *local = sdata->local; - static const u8 zero[ETH_ALEN] = { 0 }; if (!changed) return; - if (sdata->vif.type == NL80211_IFTYPE_STATION) { - sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid; - } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) - sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid; - else if (sdata->vif.type == NL80211_IFTYPE_AP) - sdata->vif.bss_conf.bssid = sdata->vif.addr; - else if (sdata->vif.type == NL80211_IFTYPE_WDS) - sdata->vif.bss_conf.bssid = NULL; - else if (ieee80211_vif_is_mesh(&sdata->vif)) { - sdata->vif.bss_conf.bssid = zero; - } else if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) { - sdata->vif.bss_conf.bssid = sdata->vif.addr; - WARN_ONCE(changed & ~(BSS_CHANGED_IDLE), - "P2P Device BSS changed %#x", changed); - } else { - WARN_ON(1); - return; - } - - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_WDS: - case NL80211_IFTYPE_MESH_POINT: - break; - default: - /* do not warn to simplify caller in scan.c */ - changed &= ~BSS_CHANGED_BEACON_ENABLED; - if (WARN_ON(changed & BSS_CHANGED_BEACON)) - return; - break; - } - - if (changed & BSS_CHANGED_BEACON_ENABLED) { - if (local->quiescing || !ieee80211_sdata_running(sdata) || - test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) { - sdata->vif.bss_conf.enable_beacon = false; - } else { - /* - * Beacon should be enabled, but AP mode must - * check whether 
there is a beacon configured. - */ - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP: - sdata->vif.bss_conf.enable_beacon = - !!sdata->u.ap.beacon; - break; - case NL80211_IFTYPE_ADHOC: - sdata->vif.bss_conf.enable_beacon = - !!sdata->u.ibss.presp; - break; -#ifdef CONFIG_MAC80211_MESH - case NL80211_IFTYPE_MESH_POINT: - sdata->vif.bss_conf.enable_beacon = - !!sdata->u.mesh.mesh_id_len; - break; -#endif - default: - /* not reached */ - WARN_ON(1); - break; - } - } - } - drv_bss_info_changed(local, sdata, &sdata->vif.bss_conf, changed); } @@ -293,8 +227,6 @@ u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) static void ieee80211_tasklet_handler(unsigned long data) { struct ieee80211_local *local = (struct ieee80211_local *) data; - struct sta_info *sta, *tmp; - struct skb_eosp_msg_data *eosp_data; struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue)) || @@ -310,18 +242,6 @@ static void ieee80211_tasklet_handler(unsigned long data) skb->pkt_type = 0; ieee80211_tx_status(&local->hw, skb); break; - case IEEE80211_EOSP_MSG: - eosp_data = (void *)skb->cb; - for_each_sta_info(local, eosp_data->sta, sta, tmp) { - /* skip wrong virtual interface */ - if (memcmp(eosp_data->iface, - sta->sdata->vif.addr, ETH_ALEN)) - continue; - clear_sta_flag(sta, WLAN_STA_SP); - break; - } - dev_kfree_skb(skb); - break; default: WARN(1, "mac80211: Packet is of unknown type %d\n", skb->pkt_type); @@ -362,8 +282,8 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) "Hardware restart was requested\n"); /* use this reason, ieee80211_reconfig will unblock it */ - ieee80211_stop_queues_by_reason(hw, - IEEE80211_QUEUE_STOP_REASON_SUSPEND); + ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_SUSPEND); /* * Stop all Rx during the reconfig. 
We don't want state changes @@ -415,27 +335,19 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, /* Copy the addresses to the bss_conf list */ ifa = idev->ifa_list; - while (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN && ifa) { - bss_conf->arp_addr_list[c] = ifa->ifa_address; + while (ifa) { + if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN) + bss_conf->arp_addr_list[c] = ifa->ifa_address; ifa = ifa->ifa_next; c++; } - /* If not all addresses fit the list, disable filtering */ - if (ifa) { - sdata->arp_filter_state = false; - c = 0; - } else { - sdata->arp_filter_state = true; - } bss_conf->arp_addr_cnt = c; /* Configure driver only if associated (which also implies it is up) */ - if (ifmgd->associated) { - bss_conf->arp_filter_enabled = sdata->arp_filter_state; + if (ifmgd->associated) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_ARP_FILTER); - } mutex_unlock(&ifmgd->mtx); @@ -443,29 +355,36 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, } #endif -static int ieee80211_napi_poll(struct napi_struct *napi, int budget) +#if IS_ENABLED(CONFIG_IPV6) +static int ieee80211_ifa6_changed(struct notifier_block *nb, + unsigned long data, void *arg) { + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)arg; + struct inet6_dev *idev = ifa->idev; + struct net_device *ndev = ifa->idev->dev; struct ieee80211_local *local = - container_of(napi, struct ieee80211_local, napi); + container_of(nb, struct ieee80211_local, ifa6_notifier); + struct wireless_dev *wdev = ndev->ieee80211_ptr; + struct ieee80211_sub_if_data *sdata; - return local->ops->napi_poll(&local->hw, budget); -} + /* Make sure it's our interface that got changed */ + if (!wdev || wdev->wiphy != local->hw.wiphy) + return NOTIFY_DONE; -void ieee80211_napi_schedule(struct ieee80211_hw *hw) -{ - struct ieee80211_local *local = hw_to_local(hw); + sdata = IEEE80211_DEV_TO_SUB_IF(ndev); - napi_schedule(&local->napi); -} -EXPORT_SYMBOL(ieee80211_napi_schedule); + /* + * For now only support station mode. This is mostly because + * doing AP would have to handle AP_VLAN in some way ... 
+ */ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return NOTIFY_DONE; -void ieee80211_napi_complete(struct ieee80211_hw *hw) -{ - struct ieee80211_local *local = hw_to_local(hw); + drv_ipv6_addr_change(local, sdata, idev); - napi_complete(&local->napi); + return NOTIFY_DONE; } -EXPORT_SYMBOL(ieee80211_napi_complete); +#endif /* There isn't a lot of sense in it, but you can transmit anything you like */ static const struct ieee80211_txrx_stypes @@ -537,6 +456,7 @@ static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { .cap_info = cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_MAX_AMSDU | + IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40), .mcs = { .rx_mask = { 0xff, 0xff, 0xff, 0xff, 0xff, @@ -544,6 +464,32 @@ static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { }, }; +static const struct ieee80211_vht_cap mac80211_vht_capa_mod_mask = { + .vht_cap_info = + cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC | + IEEE80211_VHT_CAP_SHORT_GI_80 | + IEEE80211_VHT_CAP_SHORT_GI_160 | + IEEE80211_VHT_CAP_RXSTBC_1 | + IEEE80211_VHT_CAP_RXSTBC_2 | + IEEE80211_VHT_CAP_RXSTBC_3 | + IEEE80211_VHT_CAP_RXSTBC_4 | + IEEE80211_VHT_CAP_TXSTBC | + IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | + IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | + IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN | + IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN | + IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK), + .supp_mcs = { + .rx_mcs_map = cpu_to_le16(~0), + .tx_mcs_map = cpu_to_le16(~0), + }, +}; + +static const u8 extended_capabilities[] = { + 0, 0, 0, 0, 0, 0, 0, + WLAN_EXT_CAPA8_OPMODE_NOTIF, +}; + struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { @@ -600,13 +546,18 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, WIPHY_FLAG_REPORTS_OBSS | WIPHY_FLAG_OFFCHAN_TX; + wiphy->extended_capabilities = extended_capabilities; + wiphy->extended_capabilities_mask = extended_capabilities; + wiphy->extended_capabilities_len = ARRAY_SIZE(extended_capabilities); + if (ops->remain_on_channel) wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL; wiphy->features |= NL80211_FEATURE_SK_TX_STATUS | NL80211_FEATURE_SAE | NL80211_FEATURE_HT_IBSS | - NL80211_FEATURE_VIF_TXPOWER; + NL80211_FEATURE_VIF_TXPOWER | + NL80211_FEATURE_USERSPACE_MPM; if (!ops->hw_scan) wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | @@ -641,8 +592,11 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, IEEE80211_RADIOTAP_MCS_HAVE_BW; local->hw.radiotap_vht_details = IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH; + local->hw.uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES; + local->hw.uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN; local->user_power_level = IEEE80211_UNSET_POWER_LEVEL; wiphy->ht_capa_mod_mask = &mac80211_ht_capa_mod_mask; + wiphy->vht_capa_mod_mask = &mac80211_vht_capa_mod_mask; INIT_LIST_HEAD(&local->interfaces); @@ -653,25 +607,19 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, mutex_init(&local->key_mtx); spin_lock_init(&local->filter_lock); + spin_lock_init(&local->rx_path_lock); spin_lock_init(&local->queue_stop_reason_lock); INIT_LIST_HEAD(&local->chanctx_list); mutex_init(&local->chanctx_mtx); - /* - * The rx_skb_queue is only accessed from tasklets, - * but other SKB queues are used from within IRQ - * context. Therefore, this one needs a different - * locking class so our direct, non-irq-safe use of - * the queue's lock doesn't throw lockdep warnings. 
- */ - skb_queue_head_init_class(&local->rx_skb_queue, - &ieee80211_rx_skb_queue_class); - INIT_DELAYED_WORK(&local->scan_work, ieee80211_scan_work); INIT_WORK(&local->restart_work, ieee80211_restart_work); + INIT_WORK(&local->radar_detected_work, + ieee80211_dfs_radar_detected_work); + INIT_WORK(&local->reconfig_filter, ieee80211_reconfig_filter); local->smps_mode = IEEE80211_SMPS_OFF; @@ -687,8 +635,6 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, spin_lock_init(&local->ack_status_lock); idr_init(&local->ack_status_frames); - /* preallocate at least one entry */ - idr_pre_get(&local->ack_status_frames, GFP_KERNEL); sta_info_init(local); @@ -706,9 +652,6 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); - /* init dummy netdev for use w/ NAPI */ - init_dummy_netdev(&local->napi_dev); - ieee80211_led_names(local); ieee80211_roc_setup(local); @@ -725,6 +668,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int channels, max_bitrates; bool supp_ht, supp_vht; netdev_features_t feature_whitelist; + struct cfg80211_chan_def dflt_chandef = {}; static const u32 cipher_suites[] = { /* keep WEP first, it may be removed below */ WLAN_CIPHER_SUITE_WEP40, @@ -747,9 +691,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return -EINVAL; #endif - if ((hw->flags & IEEE80211_HW_SCAN_WHILE_IDLE) && !local->ops->hw_scan) - return -EINVAL; - if (!local->use_chanctx) { for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) { const struct ieee80211_iface_combination *comb; @@ -767,6 +708,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)) return -EINVAL; + + /* DFS currently not supported with channel context drivers */ + for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) { + const struct ieee80211_iface_combination *comb; + + comb = &local->hw.wiphy->iface_combinations[i]; + + if (comb->radar_detect_widths) + return -EINVAL; + } } /* Only HW csum features are currently compatible with mac80211 */ @@ -795,15 +746,19 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) sband = local->hw.wiphy->bands[band]; if (!sband) continue; - if (!local->use_chanctx && !local->_oper_channel) { + + if (!dflt_chandef.chan) { + cfg80211_chandef_create(&dflt_chandef, + &sband->channels[0], + NL80211_CHAN_NO_HT); /* init channel we're on */ - local->hw.conf.channel = - local->_oper_channel = &sband->channels[0]; - local->hw.conf.channel_type = NL80211_CHAN_NO_HT; + if (!local->use_chanctx && !local->_oper_chandef.chan) { + local->hw.conf.chandef = dflt_chandef; + local->_oper_chandef = dflt_chandef; + } + local->monitor_chandef = dflt_chandef; } - cfg80211_chandef_create(&local->monitor_chandef, - &sband->channels[0], - NL80211_CHAN_NO_HT); + channels += sband->n_channels; if (max_bitrates < sband->n_bitrates) @@ -886,22 +841,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (supp_ht) local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap); - if (supp_vht) { + if (supp_vht) local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap); - /* - * (for now at least), drivers wanting to use VHT must - * support channel contexts, as they contain all the - * necessary VHT information and the global hw config - * doesn't (yet) - */ - if (WARN_ON(!local->use_chanctx)) { - result = -EINVAL; - goto fail_wiphy_register; - } - } - if (!local->ops->hw_scan) { /* For hw_scan, driver needs to set these up. 
*/ local->hw.wiphy->max_scan_ssids = 4; @@ -1049,12 +992,22 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_ifa; #endif - netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll, - local->hw.napi_weight); +#if IS_ENABLED(CONFIG_IPV6) + local->ifa6_notifier.notifier_call = ieee80211_ifa6_changed; + result = register_inet6addr_notifier(&local->ifa6_notifier); + if (result) + goto fail_ifa6; +#endif return 0; +#if IS_ENABLED(CONFIG_IPV6) + fail_ifa6: #ifdef CONFIG_INET + unregister_inetaddr_notifier(&local->ifa_notifier); +#endif +#endif +#if defined(CONFIG_INET) || defined(CONFIG_IPV6) fail_ifa: pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, &local->network_latency_notifier); @@ -1090,6 +1043,9 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) #ifdef CONFIG_INET unregister_inetaddr_notifier(&local->ifa_notifier); #endif +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&local->ifa6_notifier); +#endif rtnl_lock(); @@ -1113,7 +1069,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); - skb_queue_purge(&local->rx_skb_queue); destroy_workqueue(local->workqueue); wiphy_unregister(local->hw.wiphy); @@ -1191,8 +1146,7 @@ static void __exit ieee80211_exit(void) rc80211_minstrel_ht_exit(); rc80211_minstrel_exit(); - if (mesh_allocated) - ieee80211s_stop(); + ieee80211s_stop(); ieee80211_iface_exit(); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 649ad513547f..6952760881c8 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -13,23 +13,14 @@ #include "ieee80211_i.h" #include "mesh.h" -#define TMR_RUNNING_HK 0 -#define TMR_RUNNING_MP 1 -#define TMR_RUNNING_MPR 2 - -int mesh_allocated; +static int mesh_allocated; static struct kmem_cache *rm_cache; -#ifdef CONFIG_MAC80211_MESH bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt) { return (mgmt->u.action.u.mesh_action.action_code == WLAN_MESH_ACTION_HWMP_PATH_SELECTION); } -#else -bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt) -{ return false; } -#endif void ieee80211s_init(void) { @@ -41,6 +32,8 @@ void ieee80211s_init(void) void ieee80211s_stop(void) { + if (!mesh_allocated) + return; mesh_pathtbl_unregister(); kmem_cache_destroy(rm_cache); } @@ -53,11 +46,6 @@ static void ieee80211_mesh_housekeeping_timer(unsigned long data) set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); - if (local->quiescing) { - set_bit(TMR_RUNNING_HK, &ifmsh->timers_running); - return; - } - ieee80211_queue_work(&local->hw, &sdata->work); } @@ -95,24 +83,22 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, (ifmsh->mesh_cc_id == ie->mesh_config->meshconf_congest) && (ifmsh->mesh_sp_id == ie->mesh_config->meshconf_synch) && (ifmsh->mesh_auth_id == ie->mesh_config->meshconf_auth))) - goto mismatch; + return false; ieee80211_sta_get_rates(local, ie, ieee80211_get_sdata_band(sdata), &basic_rates); if (sdata->vif.bss_conf.basic_rates != basic_rates) - goto mismatch; + return false; ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, ie->ht_operation, &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) - goto mismatch; + return false; return true; -mismatch: - return false; } /** @@ -123,7 +109,7 @@ mismatch: bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie) { return (ie->mesh_config->meshconf_cap & - IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS) != 0; + 
IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS) != 0; } /** @@ -154,6 +140,31 @@ u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata) return changed; } +/* + * mesh_sta_cleanup - clean up any mesh sta state + * + * @sta: mesh sta to clean up. + */ +void mesh_sta_cleanup(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed; + + /* + * maybe userspace handles peer allocation and peering, but in either + * case the beacon is still generated by the kernel and we might need + * an update. + */ + changed = mesh_accept_plinks_update(sdata); + if (!sdata->u.mesh.user_mpm) { + changed |= mesh_plink_deactivate(sta); + del_timer_sync(&sta->plink_timer); + } + + if (changed) + ieee80211_mbss_info_change_notify(sdata, changed); +} + int mesh_rmc_init(struct ieee80211_sub_if_data *sdata) { int i; @@ -176,11 +187,12 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata) if (!sdata->u.mesh.rmc) return; - for (i = 0; i < RMC_BUCKETS; i++) + for (i = 0; i < RMC_BUCKETS; i++) { list_for_each_entry_safe(p, n, &rmc->bucket[i], list) { list_del(&p->list); kmem_cache_free(rm_cache, p); } + } kfree(rmc); sdata->u.mesh.rmc = NULL; @@ -189,6 +201,7 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata) /** * mesh_rmc_check - Check frame in recent multicast cache and add if absent. * + * @sdata: interface * @sa: source address * @mesh_hdr: mesh_header * @@ -198,8 +211,8 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata) * received this frame lately. If the frame is not in the cache, it is added to * it. */ -int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr, - struct ieee80211_sub_if_data *sdata) +int mesh_rmc_check(struct ieee80211_sub_if_data *sdata, + const u8 *sa, struct ieee80211s_hdr *mesh_hdr) { struct mesh_rmc *rmc = sdata->u.mesh.rmc; u32 seqnum = 0; @@ -213,12 +226,11 @@ int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr, list_for_each_entry_safe(p, n, &rmc->bucket[idx], list) { ++entries; if (time_after(jiffies, p->exp_time) || - (entries == RMC_QUEUE_MAX_LEN)) { + entries == RMC_QUEUE_MAX_LEN) { list_del(&p->list); kmem_cache_free(rm_cache, p); --entries; - } else if ((seqnum == p->seqnum) && - (ether_addr_equal(sa, p->sa))) + } else if ((seqnum == p->seqnum) && ether_addr_equal(sa, p->sa)) return -1; } @@ -233,8 +245,8 @@ int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr, return 0; } -int -mesh_add_meshconf_ie(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) +int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u8 *pos, neighbors; @@ -265,16 +277,18 @@ mesh_add_meshconf_ie(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) /* Mesh capability */ *pos = IEEE80211_MESHCONF_CAPAB_FORWARDING; *pos |= ifmsh->accepting_plinks ? - IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS : 0x00; + IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS : 0x00; + /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */ + *pos |= ifmsh->ps_peers_deep_sleep ? + IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00; *pos++ |= ifmsh->adjusting_tbtt ? 
- IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00; + IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00; *pos++ = 0x00; return 0; } -int -mesh_add_meshid_ie(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) +int mesh_add_meshid_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u8 *pos; @@ -291,8 +305,31 @@ mesh_add_meshid_ie(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) return 0; } -int -mesh_add_vendor_ies(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) +static int mesh_add_awake_window_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + u8 *pos; + + /* see IEEE802.11-2012 13.14.6 */ + if (ifmsh->ps_peers_light_sleep == 0 && + ifmsh->ps_peers_deep_sleep == 0 && + ifmsh->nonpeer_pm == NL80211_MESH_POWER_ACTIVE) + return 0; + + if (skb_tailroom(skb) < 4) + return -ENOMEM; + + pos = skb_put(skb, 2 + 2); + *pos++ = WLAN_EID_MESH_AWAKE_WINDOW; + *pos++ = 2; + put_unaligned_le16(ifmsh->mshcfg.dot11MeshAwakeWindowDuration, pos); + + return 0; +} + +int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u8 offset, len; @@ -315,8 +352,7 @@ mesh_add_vendor_ies(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) return 0; } -int -mesh_add_rsn_ie(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) +int mesh_add_rsn_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u8 len = 0; @@ -344,11 +380,9 @@ mesh_add_rsn_ie(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) return 0; } -int mesh_add_ds_params_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata) +static int mesh_add_ds_params_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *chan; u8 *pos; @@ -365,19 +399,16 @@ int mesh_add_ds_params_ie(struct sk_buff *skb, chan = chanctx_conf->def.chan; rcu_read_unlock(); - sband = local->hw.wiphy->bands[chan->band]; - if (sband->band == IEEE80211_BAND_2GHZ) { - pos = skb_put(skb, 2 + 1); - *pos++ = WLAN_EID_DS_PARAMS; - *pos++ = 1; - *pos++ = ieee80211_frequency_to_channel(chan->center_freq); - } + pos = skb_put(skb, 2 + 1); + *pos++ = WLAN_EID_DS_PARAMS; + *pos++ = 1; + *pos++ = ieee80211_frequency_to_channel(chan->center_freq); return 0; } -int mesh_add_ht_cap_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata) +int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; enum ieee80211_band band = ieee80211_get_sdata_band(sdata); @@ -398,8 +429,8 @@ int mesh_add_ht_cap_ie(struct sk_buff *skb, return 0; } -int mesh_add_ht_oper_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata) +int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; @@ -434,19 +465,13 @@ int mesh_add_ht_oper_ie(struct sk_buff *skb, return 0; } + static void ieee80211_mesh_path_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct ieee80211_local *local = sdata->local; - if (local->quiescing) { - set_bit(TMR_RUNNING_MP, 
&ifmsh->timers_running); - return; - } - - ieee80211_queue_work(&local->hw, &sdata->work); + ieee80211_queue_work(&sdata->local->hw, &sdata->work); } static void ieee80211_mesh_path_root_timer(unsigned long data) @@ -454,16 +479,10 @@ static void ieee80211_mesh_path_root_timer(unsigned long data) struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct ieee80211_local *local = sdata->local; set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); - if (local->quiescing) { - set_bit(TMR_RUNNING_MPR, &ifmsh->timers_running); - return; - } - - ieee80211_queue_work(&local->hw, &sdata->work); + ieee80211_queue_work(&sdata->local->hw, &sdata->work); } void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) @@ -479,7 +498,7 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) /** * ieee80211_fill_mesh_addresses - fill addresses of a locally originated mesh frame - * @hdr: 802.11 frame header + * @hdr: 802.11 frame header * @fc: frame control field * @meshda: destination address in the mesh * @meshsa: source address address in the mesh. Same as TA, as frame is @@ -510,8 +529,8 @@ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, /** * ieee80211_new_mesh_header - create a new mesh header - * @meshhdr: uninitialized mesh header * @sdata: mesh interface to be used + * @meshhdr: uninitialized mesh header * @addr4or5: 1st address in the ae header, which may correspond to address 4 * (if addr6 is NULL) or address 5 (if addr6 is present). It may * be NULL. @@ -520,42 +539,49 @@ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, * * Return the header length. */ -int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr, - struct ieee80211_sub_if_data *sdata, char *addr4or5, - char *addr6) +int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, + struct ieee80211s_hdr *meshhdr, + const char *addr4or5, const char *addr6) { - int aelen = 0; - BUG_ON(!addr4or5 && addr6); + if (WARN_ON(!addr4or5 && addr6)) + return 0; + memset(meshhdr, 0, sizeof(*meshhdr)); + meshhdr->ttl = sdata->u.mesh.mshcfg.dot11MeshTTL; + + /* FIXME: racy -- TX on multiple queues can be concurrent */ put_unaligned(cpu_to_le32(sdata->u.mesh.mesh_seqnum), &meshhdr->seqnum); sdata->u.mesh.mesh_seqnum++; + if (addr4or5 && !addr6) { meshhdr->flags |= MESH_FLAGS_AE_A4; - aelen += ETH_ALEN; memcpy(meshhdr->eaddr1, addr4or5, ETH_ALEN); + return 2 * ETH_ALEN; } else if (addr4or5 && addr6) { meshhdr->flags |= MESH_FLAGS_AE_A5_A6; - aelen += 2 * ETH_ALEN; memcpy(meshhdr->eaddr1, addr4or5, ETH_ALEN); memcpy(meshhdr->eaddr2, addr6, ETH_ALEN); + return 3 * ETH_ALEN; } - return 6 + aelen; + + return ETH_ALEN; } -static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_mesh *ifmsh) +static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u32 changed; ieee80211_sta_expire(sdata, IEEE80211_MESH_PEER_INACTIVITY_LIMIT); mesh_path_expire(sdata); changed = mesh_accept_plinks_update(sdata); - ieee80211_bss_info_change_notify(sdata, changed); + ieee80211_mbss_info_change_notify(sdata, changed); mod_timer(&ifmsh->housekeeping_timer, - round_jiffies(jiffies + IEEE80211_MESH_HOUSEKEEPING_INTERVAL)); + round_jiffies(jiffies + + IEEE80211_MESH_HOUSEKEEPING_INTERVAL)); } static void ieee80211_mesh_rootpath(struct ieee80211_sub_if_data *sdata) @@ -574,39 +600,147 @@ static void ieee80211_mesh_rootpath(struct 
ieee80211_sub_if_data *sdata) round_jiffies(TU_TO_EXP_TIME(interval))); } -#ifdef CONFIG_PM -void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata) +static int +ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) { - struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct beacon_data *bcn; + int head_len, tail_len; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + struct ieee80211_chanctx_conf *chanctx_conf; + enum ieee80211_band band; + u8 *pos; + struct ieee80211_sub_if_data *sdata; + int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) + + sizeof(mgmt->u.beacon); + + sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + band = chanctx_conf->def.chan->band; + rcu_read_unlock(); - /* use atomic bitops in case all timers fire at the same time */ + head_len = hdr_len + + 2 + /* NULL SSID */ + 2 + 8 + /* supported rates */ + 2 + 3; /* DS params */ + tail_len = 2 + (IEEE80211_MAX_SUPP_RATES - 8) + + 2 + sizeof(struct ieee80211_ht_cap) + + 2 + sizeof(struct ieee80211_ht_operation) + + 2 + ifmsh->mesh_id_len + + 2 + sizeof(struct ieee80211_meshconf_ie) + + 2 + sizeof(__le16) + /* awake window */ + ifmsh->ie_len; + + bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); + /* need an skb for IE builders to operate on */ + skb = dev_alloc_skb(max(head_len, tail_len)); + + if (!bcn || !skb) + goto out_free; - if (del_timer_sync(&ifmsh->housekeeping_timer)) - set_bit(TMR_RUNNING_HK, &ifmsh->timers_running); - if (del_timer_sync(&ifmsh->mesh_path_timer)) - set_bit(TMR_RUNNING_MP, &ifmsh->timers_running); - if (del_timer_sync(&ifmsh->mesh_path_root_timer)) - set_bit(TMR_RUNNING_MPR, &ifmsh->timers_running); + /* + * pointers go into the block we allocated, + * memory is | beacon_data | head | tail | + */ + bcn->head = ((u8 *) bcn) + sizeof(*bcn); + + /* fill in the head */ + mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len); + memset(mgmt, 0, hdr_len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON); + eth_broadcast_addr(mgmt->da); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + ieee80211_mps_set_frame_flags(sdata, NULL, (void *) mgmt); + mgmt->u.beacon.beacon_int = + cpu_to_le16(sdata->vif.bss_conf.beacon_int); + mgmt->u.beacon.capab_info |= cpu_to_le16( + sdata->u.mesh.security ? 
WLAN_CAPABILITY_PRIVACY : 0); + + pos = skb_put(skb, 2); + *pos++ = WLAN_EID_SSID; + *pos++ = 0x0; + + if (ieee80211_add_srates_ie(sdata, skb, true, band) || + mesh_add_ds_params_ie(sdata, skb)) + goto out_free; + + bcn->head_len = skb->len; + memcpy(bcn->head, skb->data, bcn->head_len); + + /* now the tail */ + skb_trim(skb, 0); + bcn->tail = bcn->head + bcn->head_len; + + if (ieee80211_add_ext_srates_ie(sdata, skb, true, band) || + mesh_add_rsn_ie(sdata, skb) || + mesh_add_ht_cap_ie(sdata, skb) || + mesh_add_ht_oper_ie(sdata, skb) || + mesh_add_meshid_ie(sdata, skb) || + mesh_add_meshconf_ie(sdata, skb) || + mesh_add_awake_window_ie(sdata, skb) || + mesh_add_vendor_ies(sdata, skb)) + goto out_free; + + bcn->tail_len = skb->len; + memcpy(bcn->tail, skb->data, bcn->tail_len); + + dev_kfree_skb(skb); + rcu_assign_pointer(ifmsh->beacon, bcn); + return 0; +out_free: + kfree(bcn); + dev_kfree_skb(skb); + return -ENOMEM; } -void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata) +static int +ieee80211_mesh_rebuild_beacon(struct ieee80211_if_mesh *ifmsh) { - struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct beacon_data *old_bcn; + int ret; + + mutex_lock(&ifmsh->mtx); + + old_bcn = rcu_dereference_protected(ifmsh->beacon, + lockdep_is_held(&ifmsh->mtx)); + ret = ieee80211_mesh_build_beacon(ifmsh); + if (ret) + /* just reuse old beacon */ + goto out; + + if (old_bcn) + kfree_rcu(old_bcn, rcu_head); +out: + mutex_unlock(&ifmsh->mtx); + return ret; +} - if (test_and_clear_bit(TMR_RUNNING_HK, &ifmsh->timers_running)) - add_timer(&ifmsh->housekeeping_timer); - if (test_and_clear_bit(TMR_RUNNING_MP, &ifmsh->timers_running)) - add_timer(&ifmsh->mesh_path_timer); - if (test_and_clear_bit(TMR_RUNNING_MPR, &ifmsh->timers_running)) - add_timer(&ifmsh->mesh_path_root_timer); - ieee80211_mesh_root_setup(ifmsh); +void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata, + u32 changed) +{ + if (sdata->vif.bss_conf.enable_beacon && + (changed & (BSS_CHANGED_BEACON | + BSS_CHANGED_HT | + BSS_CHANGED_BASIC_RATES | + BSS_CHANGED_BEACON_INT))) + if (ieee80211_mesh_rebuild_beacon(&sdata->u.mesh)) + return; + ieee80211_bss_info_change_notify(sdata, changed); } -#endif -void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) +int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_local *local = sdata->local; + u32 changed = BSS_CHANGED_BEACON | + BSS_CHANGED_BEACON_ENABLED | + BSS_CHANGED_HT | + BSS_CHANGED_BASIC_RATES | + BSS_CHANGED_BEACON_INT; + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); local->fif_other_bss++; /* mesh ifaces must set allmulti to forward mcast traffic */ @@ -624,34 +758,51 @@ void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) ieee80211_queue_work(&local->hw, &sdata->work); sdata->vif.bss_conf.ht_operation_mode = ifmsh->mshcfg.ht_opmode; - sdata->vif.bss_conf.beacon_int = MESH_DEFAULT_BEACON_INTERVAL; + sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.basic_rates = - ieee80211_mandatory_rates(sdata->local, - ieee80211_get_sdata_band(sdata)); - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON | - BSS_CHANGED_BEACON_ENABLED | - BSS_CHANGED_HT | - BSS_CHANGED_BASIC_RATES | - BSS_CHANGED_BEACON_INT); + ieee80211_mandatory_rates(local, band); + + changed |= ieee80211_mps_local_status_update(sdata); + + if (ieee80211_mesh_build_beacon(ifmsh)) { + ieee80211_stop_mesh(sdata); + return -ENOMEM; + } + + 
ieee80211_bss_info_change_notify(sdata, changed); netif_carrier_on(sdata->dev); + return 0; } void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct beacon_data *bcn; netif_carrier_off(sdata->dev); /* stop the beacon */ ifmsh->mesh_id_len = 0; + sdata->vif.bss_conf.enable_beacon = false; + clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); + mutex_lock(&ifmsh->mtx); + bcn = rcu_dereference_protected(ifmsh->beacon, + lockdep_is_held(&ifmsh->mtx)); + rcu_assign_pointer(ifmsh->beacon, NULL); + kfree_rcu(bcn, rcu_head); + mutex_unlock(&ifmsh->mtx); /* flush STAs and mpaths on this iface */ - sta_info_flush(sdata->local, sdata); + sta_info_flush(sdata); mesh_path_flush_by_iface(sdata); + /* free all potentially still buffered group-addressed frames */ + local->total_ps_buffered -= skb_queue_len(&ifmsh->ps.bc_buf); + skb_queue_purge(&ifmsh->ps.bc_buf); + del_timer_sync(&sdata->u.mesh.housekeeping_timer); del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); del_timer_sync(&sdata->u.mesh.mesh_path_timer); @@ -667,8 +818,61 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) local->fif_other_bss--; atomic_dec(&local->iff_allmultis); ieee80211_configure_filter(local); +} + +static void +ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct sk_buff *presp; + struct beacon_data *bcn; + struct ieee80211_mgmt *hdr; + struct ieee802_11_elems elems; + size_t baselen; + u8 *pos; + + pos = mgmt->u.probe_req.variable; + baselen = (u8 *) pos - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(pos, len - baselen, false, &elems); + + /* 802.11-2012 10.1.4.3.2 */ + if ((!ether_addr_equal(mgmt->da, sdata->vif.addr) && + !is_broadcast_ether_addr(mgmt->da)) || + elems.ssid_len != 0) + return; - sdata->u.mesh.timers_running = 0; + if (elems.mesh_id_len != 0 && + (elems.mesh_id_len != ifmsh->mesh_id_len || + memcmp(elems.mesh_id, ifmsh->mesh_id, ifmsh->mesh_id_len))) + return; + + rcu_read_lock(); + bcn = rcu_dereference(ifmsh->beacon); + + if (!bcn) + goto out; + + presp = dev_alloc_skb(local->tx_headroom + + bcn->head_len + bcn->tail_len); + if (!presp) + goto out; + + skb_reserve(presp, local->tx_headroom); + memcpy(skb_put(presp, bcn->head_len), bcn->head, bcn->head_len); + memcpy(skb_put(presp, bcn->tail_len), bcn->tail, bcn->tail_len); + hdr = (struct ieee80211_mgmt *) presp->data; + hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_RESP); + memcpy(hdr->da, mgmt->sa, ETH_ALEN); + IEEE80211_SKB_CB(presp)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, presp); +out: + rcu_read_unlock(); } static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, @@ -695,7 +899,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, return; ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, - &elems); + false, &elems); /* ignore non-mesh or secure / unsecure mismatch */ if ((!elems.mesh_id || !elems.mesh_config) || @@ -703,7 +907,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, (!elems.rsn && sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE)) return; - if (elems.ds_params && elems.ds_params_len == 
1) + if (elems.ds_params) freq = ieee80211_channel_to_frequency(elems.ds_params[0], band); else freq = rx_status->freq; @@ -760,6 +964,9 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len, rx_status); break; + case IEEE80211_STYPE_PROBE_REQ: + ieee80211_mesh_rx_probe_req(sdata, mgmt, skb->len); + break; case IEEE80211_STYPE_ACTION: ieee80211_mesh_rx_mgmt_action(sdata, mgmt, skb->len, rx_status); break; @@ -782,7 +989,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata) mesh_mpp_table_grow(); if (test_and_clear_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags)) - ieee80211_mesh_housekeeping(sdata, ifmsh); + ieee80211_mesh_housekeeping(sdata); if (test_and_clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags)) ieee80211_mesh_rootpath(sdata); @@ -797,7 +1004,8 @@ void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) - if (ieee80211_vif_is_mesh(&sdata->vif)) + if (ieee80211_vif_is_mesh(&sdata->vif) && + ieee80211_sdata_running(sdata)) ieee80211_queue_work(&local->hw, &sdata->work); rcu_read_unlock(); } @@ -805,6 +1013,7 @@ void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + static u8 zero_addr[ETH_ALEN] = {}; setup_timer(&ifmsh->housekeeping_timer, ieee80211_mesh_housekeeping_timer, @@ -828,6 +1037,11 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) ieee80211_mesh_path_root_timer, (unsigned long) sdata); INIT_LIST_HEAD(&ifmsh->preq_queue.list); + skb_queue_head_init(&ifmsh->ps.bc_buf); spin_lock_init(&ifmsh->mesh_preq_queue_lock); spin_lock_init(&ifmsh->sync_offset_lock); + RCU_INIT_POINTER(ifmsh->beacon, NULL); + mutex_init(&ifmsh->mtx); + + sdata->vif.bss_conf.bssid = zero_addr; } diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 84c28c6101cd..da158774eebb 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -26,12 +26,12 @@ * @MESH_PATH_ACTIVE: the mesh path can be used for forwarding * @MESH_PATH_RESOLVING: the discovery process is running for this mesh path * @MESH_PATH_SN_VALID: the mesh path contains a valid destination sequence - * number + * number * @MESH_PATH_FIXED: the mesh path has been manually set and should not be - * modified + * modified * @MESH_PATH_RESOLVED: the mesh path can has been resolved * @MESH_PATH_REQ_QUEUED: there is an unsent path request for this destination - * already queued up, waiting for the discovery process to start. + * already queued up, waiting for the discovery process to start. * * MESH_PATH_RESOLVED is used by the mesh path timer to * decide when to stop or cancel the mesh path discovery. 
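/*
 * Illustrative aside, not part of this patch: a minimal sketch of the
 * single-allocation beacon layout that ieee80211_mesh_build_beacon()
 * in the net/mac80211/mesh.c hunk above relies on. All names here
 * (bcn_data, bcn_alloc) are hypothetical. The descriptor and both frame
 * buffers share one kzalloc() block, laid out as
 * | descriptor | head | tail |, so a single kfree_rcu() on the
 * descriptor releases the frame data together with it once RCU readers
 * are done.
 */
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct bcn_data {
	u8 *head, *tail;
	int head_len, tail_len;
	struct rcu_head rcu_head;
};

static struct bcn_data *bcn_alloc(int head_len, int tail_len)
{
	struct bcn_data *bcn;

	bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL);
	if (!bcn)
		return NULL;

	/* head storage starts right after the descriptor ... */
	bcn->head = (u8 *)bcn + sizeof(*bcn);
	/* ... and tail storage right after the head buffer */
	bcn->tail = bcn->head + head_len;
	bcn->head_len = head_len;
	bcn->tail_len = tail_len;
	return bcn;
}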
@@ -73,16 +73,16 @@ enum mesh_deferred_task_flags { * @dst: mesh path destination mac address * @sdata: mesh subif * @next_hop: mesh neighbor to which frames for this destination will be - * forwarded + * forwarded * @timer: mesh path discovery timer * @frame_queue: pending queue for frames sent to this destination while the - * path is unresolved + * path is unresolved * @sn: target sequence number * @metric: current metric to this destination * @hop_count: hops to destination * @exp_time: in jiffies, when the path will expire or when it expired * @discovery_timeout: timeout (lapse in jiffies) used for the last discovery - * retry + * retry * @discovery_retries: number of discovery retries * @flags: mesh path flags, as specified on &enum mesh_path_flags * @state_lock: mesh path state lock used to protect changes to the @@ -191,8 +191,6 @@ struct mesh_rmc { #define IEEE80211_MESH_PEER_INACTIVITY_LIMIT (1800 * HZ) #define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ) -#define MESH_DEFAULT_BEACON_INTERVAL 1000 /* in 1024 us units */ - #define MESH_PATH_EXPIRE (600 * HZ) /* Default maximum number of plinks per interface */ @@ -208,104 +206,133 @@ struct mesh_rmc { /* Various */ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, const u8 *da, const u8 *sa); -int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr, - struct ieee80211_sub_if_data *sdata, char *addr4or5, - char *addr6); -int mesh_rmc_check(u8 *addr, struct ieee80211s_hdr *mesh_hdr, - struct ieee80211_sub_if_data *sdata); +int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, + struct ieee80211s_hdr *meshhdr, + const char *addr4or5, const char *addr6); +int mesh_rmc_check(struct ieee80211_sub_if_data *sdata, + const u8 *addr, struct ieee80211s_hdr *mesh_hdr); bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *ie); void mesh_ids_set_default(struct ieee80211_if_mesh *mesh); -void mesh_mgmt_ies_add(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_add_meshconf_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_add_meshid_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_add_rsn_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_add_vendor_ies(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_add_ds_params_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_add_ht_cap_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_add_ht_oper_ie(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); +void mesh_mgmt_ies_add(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_meshid_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_rsn_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb); -void ieee80211s_stop(void); + struct sta_info *sta, struct sk_buff *skb); void 
ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata); -void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); +int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh); const struct ieee80211_mesh_sync_ops *ieee80211_mesh_sync_ops_get(u8 method); +/* wrapper for ieee80211_bss_info_change_notify() */ +void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata, + u32 changed); + +/* mesh power save */ +u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata); +u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, + enum nl80211_mesh_power_mode pm); +void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_hdr *hdr); +void ieee80211_mps_sta_status_update(struct sta_info *sta); +void ieee80211_mps_rx_h_sta_process(struct sta_info *sta, + struct ieee80211_hdr *hdr); +void ieee80211_mpsp_trigger_process(u8 *qc, struct sta_info *sta, + bool tx, bool acked); +void ieee80211_mps_frame_release(struct sta_info *sta, + struct ieee802_11_elems *elems); /* Mesh paths */ -int mesh_nexthop_lookup(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -int mesh_nexthop_resolve(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); +int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata); -struct mesh_path *mesh_path_lookup(u8 *dst, - struct ieee80211_sub_if_data *sdata); -struct mesh_path *mpp_path_lookup(u8 *dst, - struct ieee80211_sub_if_data *sdata); -int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata); -struct mesh_path *mesh_path_lookup_by_idx(int idx, - struct ieee80211_sub_if_data *sdata); +struct mesh_path *mesh_path_lookup(struct ieee80211_sub_if_data *sdata, + const u8 *dst); +struct mesh_path *mpp_path_lookup(struct ieee80211_sub_if_data *sdata, + const u8 *dst); +int mpp_path_add(struct ieee80211_sub_if_data *sdata, + const u8 *dst, const u8 *mpp); +struct mesh_path * +mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx); void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop); void mesh_path_expire(struct ieee80211_sub_if_data *sdata); void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, size_t len); -int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata); + struct ieee80211_mgmt *mgmt, size_t len); +struct mesh_path * +mesh_path_add(struct ieee80211_sub_if_data *sdata, const u8 *dst); int mesh_path_add_gate(struct mesh_path *mpath); int mesh_path_send_to_gates(struct mesh_path *mpath); int mesh_gate_num(struct ieee80211_sub_if_data *sdata); + /* Mesh plinks */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, - u8 *hw_addr, - struct ieee802_11_elems *ie); + u8 *hw_addr, struct ieee802_11_elems *ie); bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); void mesh_plink_broken(struct sta_info *sta); -void mesh_plink_deactivate(struct sta_info *sta); -int mesh_plink_open(struct sta_info *sta); -void mesh_plink_block(struct sta_info *sta); +u32 mesh_plink_deactivate(struct sta_info *sta); +u32 mesh_plink_open(struct sta_info *sta); +u32 mesh_plink_block(struct 
sta_info *sta); void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status); +void mesh_sta_cleanup(struct sta_info *sta); /* Private interfaces */ /* Mesh tables */ void mesh_mpath_table_grow(void); void mesh_mpp_table_grow(void); /* Mesh paths */ -int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn, __le16 target_rcode, - const u8 *ra, struct ieee80211_sub_if_data *sdata); +int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, + u8 ttl, const u8 *target, __le32 target_sn, + __le16 target_rcode, const u8 *ra); void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta); void mesh_path_flush_pending(struct mesh_path *mpath); void mesh_path_tx_pending(struct mesh_path *mpath); int mesh_pathtbl_init(void); void mesh_pathtbl_unregister(void); -int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata); +int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr); void mesh_path_timer(unsigned long data); void mesh_path_flush_by_nexthop(struct sta_info *sta); -void mesh_path_discard_frame(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata); -void mesh_path_quiesce(struct ieee80211_sub_if_data *sdata); -void mesh_path_restart(struct ieee80211_sub_if_data *sdata); +void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata); bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt); extern int mesh_paths_generation; #ifdef CONFIG_MAC80211_MESH -extern int mesh_allocated; +static inline +u32 mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata) +{ + atomic_inc(&sdata->u.mesh.estab_plinks); + return mesh_accept_plinks_update(sdata); +} + +static inline +u32 mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata) +{ + atomic_dec(&sdata->u.mesh.estab_plinks); + return mesh_accept_plinks_update(sdata); +} static inline int mesh_plink_free_count(struct ieee80211_sub_if_data *sdata) { @@ -331,26 +358,17 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local); -void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata); -void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata); -void mesh_plink_quiesce(struct sta_info *sta); -void mesh_plink_restart(struct sta_info *sta); void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata); void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata); +void ieee80211s_stop(void); #else -#define mesh_allocated 0 static inline void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {} -static inline void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata) -{} -static inline void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata) -{} -static inline void mesh_plink_quiesce(struct sta_info *sta) {} -static inline void mesh_plink_restart(struct sta_info *sta) {} static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) { return false; } static inline void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) {} +static inline void ieee80211s_stop(void) {} #endif #endif /* IEEE80211S_H */ diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 2659e428b80c..486819cd02cd 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -30,14 +30,14 @@ static void mesh_queue_preq(struct mesh_path *, u8); -static inline 
u32 u32_field_get(u8 *preq_elem, int offset, bool ae) +static inline u32 u32_field_get(const u8 *preq_elem, int offset, bool ae) { if (ae) offset += 6; return get_unaligned_le32(preq_elem + offset); } -static inline u32 u16_field_get(u8 *preq_elem, int offset, bool ae) +static inline u32 u16_field_get(const u8 *preq_elem, int offset, bool ae) { if (ae) offset += 6; @@ -102,10 +102,13 @@ enum mpath_frame_type { static const u8 broadcast_addr[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, - u8 *orig_addr, __le32 orig_sn, u8 target_flags, u8 *target, - __le32 target_sn, const u8 *da, u8 hop_count, u8 ttl, - __le32 lifetime, __le32 metric, __le32 preq_id, - struct ieee80211_sub_if_data *sdata) + const u8 *orig_addr, __le32 orig_sn, + u8 target_flags, const u8 *target, + __le32 target_sn, const u8 *da, + u8 hop_count, u8 ttl, + __le32 lifetime, __le32 metric, + __le32 preq_id, + struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; @@ -141,7 +144,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, *pos++ = WLAN_EID_PREQ; break; case MPATH_PREP: - mhwmp_dbg(sdata, "sending PREP to %pM\n", target); + mhwmp_dbg(sdata, "sending PREP to %pM\n", orig_addr); ie_len = 31; pos = skb_put(skb, 2 + ie_len); *pos++ = WLAN_EID_PREP; @@ -205,6 +208,7 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; skb_set_mac_header(skb, 0); skb_set_network_header(skb, 0); @@ -217,23 +221,26 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata, info->control.vif = &sdata->vif; info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; ieee80211_set_qos_hdr(sdata, skb); + ieee80211_mps_set_frame_flags(sdata, NULL, hdr); } /** - * mesh_send_path error - Sends a PERR mesh management frame + * mesh_path_error_tx - Sends a PERR mesh management frame * + * @ttl: allowed remaining hops * @target: broken destination * @target_sn: SN of the broken destination * @target_rcode: reason code for this PERR * @ra: node this frame is addressed to + * @sdata: local mesh subif * * Note: This function may be called with driver locks taken that the driver * also acquires in the TX path. To avoid a deadlock we don't transmit the * frame directly but add it to the pending queue instead. */ -int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn, - __le16 target_rcode, const u8 *ra, - struct ieee80211_sub_if_data *sdata) +int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, + u8 ttl, const u8 *target, __le32 target_sn, + __le16 target_rcode, const u8 *ra) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; @@ -353,6 +360,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, * @sdata: local mesh subif * @mgmt: mesh management frame * @hwmp_ie: hwmp information element (PREP or PREQ) + * @action: type of hwmp ie * * This function updates the path routing information to the originator and the * transmitter of a HWMP PREQ or PREP frame. @@ -364,14 +372,14 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, * path routing information is updated. 
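The renamed mesh_path_error_tx() kerneldoc above spells out why the PERR frame is queued rather than sent inline: the caller can hold driver locks that the TX path would take again. A minimal userspace sketch of that defer-to-queue pattern, using pthreads and illustrative names (queue_frame(), drain_pending()) rather than mac80211's pending-queue machinery:

/*
 * Plain-C/pthreads sketch, not kernel code: the producer only appends
 * to a queue (safe with arbitrary caller locks held); a worker drains
 * the queue later, outside the caller's locking context, and does the
 * actual "transmit" there.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct frame {
        struct frame *next;
        int id;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct frame *pending_head, *pending_tail;

/* May be called with caller locks held: it only touches the queue. */
static int queue_frame(int id)
{
        struct frame *f = malloc(sizeof(*f));

        if (!f)
                return -1;
        f->id = id;
        f->next = NULL;

        pthread_mutex_lock(&queue_lock);
        if (pending_tail)
                pending_tail->next = f;
        else
                pending_head = f;
        pending_tail = f;
        pthread_mutex_unlock(&queue_lock);
        return 0;
}

/* Runs from a worker with no caller locks held, so it may transmit. */
static void drain_pending(void)
{
        for (;;) {
                struct frame *f;

                pthread_mutex_lock(&queue_lock);
                f = pending_head;
                if (f) {
                        pending_head = f->next;
                        if (!pending_head)
                                pending_tail = NULL;
                }
                pthread_mutex_unlock(&queue_lock);

                if (!f)
                        break;
                printf("tx frame %d\n", f->id);  /* the deferred transmit */
                free(f);
        }
}

int main(void)
{
        queue_frame(1);
        queue_frame(2);
        drain_pending();
        return 0;
}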
*/ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, - u8 *hwmp_ie, enum mpath_frame_type action) + struct ieee80211_mgmt *mgmt, + const u8 *hwmp_ie, enum mpath_frame_type action) { struct ieee80211_local *local = sdata->local; struct mesh_path *mpath; struct sta_info *sta; bool fresh_info; - u8 *orig_addr, *ta; + const u8 *orig_addr, *ta; u32 orig_sn, orig_metric; unsigned long orig_lifetime, exp_time; u32 last_hop_metric, new_metric; @@ -422,7 +430,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, process = false; fresh_info = false; } else { - mpath = mesh_path_lookup(orig_addr, sdata); + mpath = mesh_path_lookup(sdata, orig_addr); if (mpath) { spin_lock_bh(&mpath->state_lock); if (mpath->flags & MESH_PATH_FIXED) @@ -437,9 +445,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, } } } else { - mesh_path_add(orig_addr, sdata); - mpath = mesh_path_lookup(orig_addr, sdata); - if (!mpath) { + mpath = mesh_path_add(sdata, orig_addr); + if (IS_ERR(mpath)) { rcu_read_unlock(); return 0; } @@ -470,7 +477,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, else { fresh_info = true; - mpath = mesh_path_lookup(ta, sdata); + mpath = mesh_path_lookup(sdata, ta); if (mpath) { spin_lock_bh(&mpath->state_lock); if ((mpath->flags & MESH_PATH_FIXED) || @@ -478,9 +485,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, (last_hop_metric > mpath->metric))) fresh_info = false; } else { - mesh_path_add(ta, sdata); - mpath = mesh_path_lookup(ta, sdata); - if (!mpath) { + mpath = mesh_path_add(sdata, ta); + if (IS_ERR(mpath)) { rcu_read_unlock(); return 0; } @@ -506,11 +512,11 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - u8 *preq_elem, u32 metric) + const u8 *preq_elem, u32 metric) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath = NULL; - u8 *target_addr, *orig_addr; + const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; u32 orig_sn, target_sn, lifetime, orig_metric; @@ -545,7 +551,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, } else if (is_broadcast_ether_addr(target_addr) && (target_flags & IEEE80211_PREQ_TO_FLAG)) { rcu_read_lock(); - mpath = mesh_path_lookup(orig_addr, sdata); + mpath = mesh_path_lookup(sdata, orig_addr); if (mpath) { if (flags & IEEE80211_PREQ_PROACTIVE_PREP_FLAG) { reply = true; @@ -560,7 +566,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); } else { rcu_read_lock(); - mpath = mesh_path_lookup(target_addr, sdata); + mpath = mesh_path_lookup(sdata, target_addr); if (mpath) { if ((!(mpath->flags & MESH_PATH_SN_VALID)) || SN_LT(mpath->sn, target_sn)) { @@ -643,17 +649,17 @@ next_hop_deref_protected(struct mesh_path *mpath) static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - u8 *prep_elem, u32 metric) + const u8 *prep_elem, u32 metric) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath; - u8 *target_addr, *orig_addr; + const u8 *target_addr, *orig_addr; u8 ttl, hopcount, flags; u8 next_hop[ETH_ALEN]; u32 target_sn, orig_sn, lifetime; mhwmp_dbg(sdata, "received PREP from %pM\n", - PREP_IE_ORIG_ADDR(prep_elem)); + PREP_IE_TARGET_ADDR(prep_elem)); orig_addr = PREP_IE_ORIG_ADDR(prep_elem); if (ether_addr_equal(orig_addr, 
sdata->vif.addr)) @@ -670,7 +676,7 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata, } rcu_read_lock(); - mpath = mesh_path_lookup(orig_addr, sdata); + mpath = mesh_path_lookup(sdata, orig_addr); if (mpath) spin_lock_bh(&mpath->state_lock); else @@ -706,12 +712,13 @@ fail: } static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, u8 *perr_elem) + struct ieee80211_mgmt *mgmt, + const u8 *perr_elem) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath; u8 ttl; - u8 *ta, *target_addr; + const u8 *ta, *target_addr; u32 target_sn; u16 target_rcode; @@ -727,7 +734,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, target_rcode = PERR_IE_TARGET_RCODE(perr_elem); rcu_read_lock(); - mpath = mesh_path_lookup(target_addr, sdata); + mpath = mesh_path_lookup(sdata, target_addr); if (mpath) { struct sta_info *sta; @@ -742,9 +749,10 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, spin_unlock_bh(&mpath->state_lock); if (!ifmsh->mshcfg.dot11MeshForwarding) goto endperr; - mesh_path_error_tx(ttl, target_addr, cpu_to_le32(target_sn), + mesh_path_error_tx(sdata, ttl, target_addr, + cpu_to_le32(target_sn), cpu_to_le16(target_rcode), - broadcast_addr, sdata); + broadcast_addr); } else spin_unlock_bh(&mpath->state_lock); } @@ -753,15 +761,15 @@ endperr: } static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, - struct ieee80211_rann_ie *rann) + struct ieee80211_mgmt *mgmt, + const struct ieee80211_rann_ie *rann) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct mesh_path *mpath; u8 ttl, flags, hopcount; - u8 *orig_addr; + const u8 *orig_addr; u32 orig_sn, metric, metric_txsta, interval; bool root_is_gate; @@ -792,11 +800,10 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, metric_txsta = airtime_link_metric_get(local, sta); - mpath = mesh_path_lookup(orig_addr, sdata); + mpath = mesh_path_lookup(sdata, orig_addr); if (!mpath) { - mesh_path_add(orig_addr, sdata); - mpath = mesh_path_lookup(orig_addr, sdata); - if (!mpath) { + mpath = mesh_path_add(sdata, orig_addr); + if (IS_ERR(mpath)) { rcu_read_unlock(); sdata->u.mesh.mshstats.dropped_frames_no_route++; return; @@ -852,8 +859,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, - size_t len) + struct ieee80211_mgmt *mgmt, size_t len) { struct ieee802_11_elems elems; size_t baselen; @@ -874,7 +880,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, - len - baselen, &elems); + len - baselen, false, &elems); if (elems.preq) { if (elems.preq_len != 37) @@ -997,7 +1003,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) spin_unlock_bh(&ifmsh->mesh_preq_queue_lock); rcu_read_lock(); - mpath = mesh_path_lookup(preq_node->dst, sdata); + mpath = mesh_path_lookup(sdata, preq_node->dst); if (!mpath) goto enddiscovery; @@ -1067,8 +1073,8 @@ enddiscovery: * Returns: 0 if the next hop was found and -ENOENT if the frame was queued. * skb is freeed here if no mpath could be allocated. 
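hwmp_rann_frame_process() above, like the PREQ and nexthop paths elsewhere in this series, replaces the old "mesh_path_add() then mesh_path_lookup()" sequence with a mesh_path_add() that returns either the new entry or an errno encoded with ERR_PTR(). A self-contained userspace model of that idiom, with helpers mirroring include/linux/err.h and a hypothetical toy_path_add():

/*
 * Userspace model of the ERR_PTR convention: the top MAX_ERRNO
 * pointer values are reserved to carry negative error codes, so a
 * single return value can be either a valid entry or an error.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)   { return (void *)error; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
        /* the top 4095 addresses are reserved for error codes */
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

struct toy_path { int dst; };

static struct toy_path *toy_path_add(int dst, int table_full)
{
        struct toy_path *p;

        if (table_full)
                return ERR_PTR(-ENOSPC);
        p = malloc(sizeof(*p));
        if (!p)
                return ERR_PTR(-ENOMEM);
        p->dst = dst;
        return p;
}

int main(void)
{
        struct toy_path *p = toy_path_add(42, 1);

        if (IS_ERR(p)) {                /* one call, one check */
                printf("add failed: %ld\n", PTR_ERR(p));
                return 1;
        }
        free(p);
        return 0;
}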
*/ -int mesh_nexthop_resolve(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata) +int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -1077,19 +1083,22 @@ int mesh_nexthop_resolve(struct sk_buff *skb, u8 *target_addr = hdr->addr3; int err = 0; + /* Nulls are only sent to peers for PS and should be pre-addressed */ + if (ieee80211_is_qos_nullfunc(hdr->frame_control)) + return 0; + rcu_read_lock(); - err = mesh_nexthop_lookup(skb, sdata); + err = mesh_nexthop_lookup(sdata, skb); if (!err) goto endlookup; /* no nexthop found, start resolving */ - mpath = mesh_path_lookup(target_addr, sdata); + mpath = mesh_path_lookup(sdata, target_addr); if (!mpath) { - mesh_path_add(target_addr, sdata); - mpath = mesh_path_lookup(target_addr, sdata); - if (!mpath) { - mesh_path_discard_frame(skb, sdata); - err = -ENOSPC; + mpath = mesh_path_add(sdata, target_addr); + if (IS_ERR(mpath)) { + mesh_path_discard_frame(sdata, skb); + err = PTR_ERR(mpath); goto endlookup; } } @@ -1105,12 +1114,13 @@ int mesh_nexthop_resolve(struct sk_buff *skb, skb_queue_tail(&mpath->frame_queue, skb); err = -ENOENT; if (skb_to_free) - mesh_path_discard_frame(skb_to_free, sdata); + mesh_path_discard_frame(sdata, skb_to_free); endlookup: rcu_read_unlock(); return err; } + /** * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame. Calling * this function is considered "using" the associated mpath, so preempt a path @@ -1121,8 +1131,8 @@ endlookup: * * Returns: 0 if the next hop was found. Nonzero otherwise. */ -int mesh_nexthop_lookup(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata) +int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct mesh_path *mpath; struct sta_info *next_hop; @@ -1131,7 +1141,7 @@ int mesh_nexthop_lookup(struct sk_buff *skb, int err = -ENOENT; rcu_read_lock(); - mpath = mesh_path_lookup(target_addr, sdata); + mpath = mesh_path_lookup(sdata, target_addr); if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) goto endlookup; @@ -1148,6 +1158,7 @@ int mesh_nexthop_lookup(struct sk_buff *skb, if (next_hop) { memcpy(hdr->addr1, next_hop->sta.addr, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + ieee80211_mps_set_frame_flags(sdata, next_hop, hdr); err = 0; } @@ -1189,8 +1200,7 @@ void mesh_path_timer(unsigned long data) } } -void -mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata) +void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u32 interval = ifmsh->mshcfg.dot11MeshHWMPRannInterval; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index aa749818860e..89aacfd2756d 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -24,9 +24,12 @@ /* Keep the mean chain length below this constant */ #define MEAN_CHAIN_LEN 2 -#define MPATH_EXPIRED(mpath) ((mpath->flags & MESH_PATH_ACTIVE) && \ - time_after(jiffies, mpath->exp_time) && \ - !(mpath->flags & MESH_PATH_FIXED)) +static inline bool mpath_expired(struct mesh_path *mpath) +{ + return (mpath->flags & MESH_PATH_ACTIVE) && + time_after(jiffies, mpath->exp_time) && + !(mpath->flags & MESH_PATH_FIXED); +} struct mpath_node { struct hlist_node list; @@ -69,9 +72,9 @@ static inline struct mesh_table *resize_dereference_mpp_paths(void) * it's used twice. 
So it is illegal to do * for_each_mesh_entry(rcu_dereference(...), ...) */ -#define for_each_mesh_entry(tbl, p, node, i) \ +#define for_each_mesh_entry(tbl, node, i) \ for (i = 0; i <= tbl->hash_mask; i++) \ - hlist_for_each_entry_rcu(node, p, &tbl->hash_buckets[i], list) + hlist_for_each_entry_rcu(node, &tbl->hash_buckets[i], list) static struct mesh_table *mesh_table_alloc(int size_order) @@ -136,7 +139,7 @@ static void mesh_table_free(struct mesh_table *tbl, bool free_leafs) } if (free_leafs) { spin_lock_bh(&tbl->gates_lock); - hlist_for_each_entry_safe(gate, p, q, + hlist_for_each_entry_safe(gate, q, tbl->known_gates, list) { hlist_del(&gate->list); kfree(gate); @@ -181,12 +184,12 @@ errcopy: return -ENOMEM; } -static u32 mesh_table_hash(u8 *addr, struct ieee80211_sub_if_data *sdata, +static u32 mesh_table_hash(const u8 *addr, struct ieee80211_sub_if_data *sdata, struct mesh_table *tbl) { /* Use last four bytes of hw addr and interface index as hash index */ - return jhash_2words(*(u32 *)(addr+2), sdata->dev->ifindex, tbl->hash_rnd) - & tbl->hash_mask; + return jhash_2words(*(u32 *)(addr+2), sdata->dev->ifindex, + tbl->hash_rnd) & tbl->hash_mask; } @@ -212,6 +215,7 @@ void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta) hdr = (struct ieee80211_hdr *) skb->data; memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); memcpy(hdr->addr2, mpath->sdata->vif.addr, ETH_ALEN); + ieee80211_mps_set_frame_flags(sta->sdata, sta, hdr); } spin_unlock_irqrestore(&mpath->frame_queue.lock, flags); @@ -325,20 +329,19 @@ static void mesh_path_move_to_queue(struct mesh_path *gate_mpath, } -static struct mesh_path *mpath_lookup(struct mesh_table *tbl, u8 *dst, - struct ieee80211_sub_if_data *sdata) +static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst, + struct ieee80211_sub_if_data *sdata) { struct mesh_path *mpath; - struct hlist_node *n; struct hlist_head *bucket; struct mpath_node *node; bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)]; - hlist_for_each_entry_rcu(node, n, bucket, list) { + hlist_for_each_entry_rcu(node, bucket, list) { mpath = node->mpath; if (mpath->sdata == sdata && ether_addr_equal(dst, mpath->dst)) { - if (MPATH_EXPIRED(mpath)) { + if (mpath_expired(mpath)) { spin_lock_bh(&mpath->state_lock); mpath->flags &= ~MESH_PATH_ACTIVE; spin_unlock_bh(&mpath->state_lock); @@ -351,19 +354,21 @@ static struct mesh_path *mpath_lookup(struct mesh_table *tbl, u8 *dst, /** * mesh_path_lookup - look up a path in the mesh path table - * @dst: hardware address (ETH_ALEN length) of destination * @sdata: local subif + * @dst: hardware address (ETH_ALEN length) of destination * * Returns: pointer to the mesh path structure, or NULL if not found * * Locking: must be called within a read rcu section. */ -struct mesh_path *mesh_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata) +struct mesh_path * +mesh_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) { return mpath_lookup(rcu_dereference(mesh_paths), dst, sdata); } -struct mesh_path *mpp_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata) +struct mesh_path * +mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) { return mpath_lookup(rcu_dereference(mpp_paths), dst, sdata); } @@ -378,19 +383,19 @@ struct mesh_path *mpp_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata) * * Locking: must be called within a read rcu section. 
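mesh_table_hash(), reformatted above, derives the bucket index from the last four bytes of the destination MAC plus the interface index, masked down to the power-of-two table size. A userspace sketch of the same shape; mix2() is an arbitrary stand-in mixer, where the kernel uses jhash_2words() with a random per-table seed:

/*
 * Toy bucket selection, not the kernel's jhash: hash (MAC tail,
 * ifindex) under a seed, then mask to the table size.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t mix2(uint32_t a, uint32_t b, uint32_t seed)
{
        uint32_t h = seed ^ a;

        h = (h ^ (h >> 16)) * 0x45d9f3b;  /* arbitrary avalanche step */
        h ^= b;
        h = (h ^ (h >> 16)) * 0x45d9f3b;
        return h ^ (h >> 16);
}

static uint32_t toy_table_hash(const uint8_t addr[6], uint32_t ifindex,
                               uint32_t hash_rnd, uint32_t hash_mask)
{
        uint32_t tail;

        memcpy(&tail, addr + 2, 4);       /* last four bytes of the MAC */
        return mix2(tail, ifindex, hash_rnd) & hash_mask;
}

int main(void)
{
        const uint8_t addr[6] = { 0x02, 0x00, 0xde, 0xad, 0xbe, 0xef };

        /* 16 buckets -> hash_mask = 15 */
        printf("bucket %u\n", toy_table_hash(addr, 3, 0x1234abcd, 15));
        return 0;
}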
*/ -struct mesh_path *mesh_path_lookup_by_idx(int idx, struct ieee80211_sub_if_data *sdata) +struct mesh_path * +mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct mesh_table *tbl = rcu_dereference(mesh_paths); struct mpath_node *node; - struct hlist_node *p; int i; int j = 0; - for_each_mesh_entry(tbl, p, node, i) { + for_each_mesh_entry(tbl, node, i) { if (sdata && node->mpath->sdata != sdata) continue; if (j++ == idx) { - if (MPATH_EXPIRED(node->mpath)) { + if (mpath_expired(node->mpath)) { spin_lock_bh(&node->mpath->state_lock); node->mpath->flags &= ~MESH_PATH_ACTIVE; spin_unlock_bh(&node->mpath->state_lock); @@ -410,13 +415,12 @@ int mesh_path_add_gate(struct mesh_path *mpath) { struct mesh_table *tbl; struct mpath_node *gate, *new_gate; - struct hlist_node *n; int err; rcu_read_lock(); tbl = rcu_dereference(mesh_paths); - hlist_for_each_entry_rcu(gate, n, tbl->known_gates, list) + hlist_for_each_entry_rcu(gate, tbl->known_gates, list) if (gate->mpath == mpath) { err = -EEXIST; goto err_rcu; @@ -434,11 +438,10 @@ int mesh_path_add_gate(struct mesh_path *mpath) spin_lock_bh(&tbl->gates_lock); hlist_add_head_rcu(&new_gate->list, tbl->known_gates); spin_unlock_bh(&tbl->gates_lock); - rcu_read_unlock(); mpath_dbg(mpath->sdata, "Mesh path: Recorded new gate: %pM. %d known gates\n", mpath->dst, mpath->sdata->u.mesh.num_gates); - return 0; + err = 0; err_rcu: rcu_read_unlock(); return err; @@ -449,30 +452,27 @@ err_rcu: * @tbl: table which holds our list of known gates * @mpath: gate mpath * - * Returns: 0 on success - * * Locking: must be called inside rcu_read_lock() section */ -static int mesh_gate_del(struct mesh_table *tbl, struct mesh_path *mpath) +static void mesh_gate_del(struct mesh_table *tbl, struct mesh_path *mpath) { struct mpath_node *gate; - struct hlist_node *p, *q; + struct hlist_node *q; - hlist_for_each_entry_safe(gate, p, q, tbl->known_gates, list) - if (gate->mpath == mpath) { - spin_lock_bh(&tbl->gates_lock); - hlist_del_rcu(&gate->list); - kfree_rcu(gate, rcu); - spin_unlock_bh(&tbl->gates_lock); - mpath->sdata->u.mesh.num_gates--; - mpath->is_gate = false; - mpath_dbg(mpath->sdata, - "Mesh path: Deleted gate: %pM. %d known gates\n", - mpath->dst, mpath->sdata->u.mesh.num_gates); - break; - } - - return 0; + hlist_for_each_entry_safe(gate, q, tbl->known_gates, list) { + if (gate->mpath != mpath) + continue; + spin_lock_bh(&tbl->gates_lock); + hlist_del_rcu(&gate->list); + kfree_rcu(gate, rcu); + spin_unlock_bh(&tbl->gates_lock); + mpath->sdata->u.mesh.num_gates--; + mpath->is_gate = false; + mpath_dbg(mpath->sdata, + "Mesh path: Deleted gate: %pM. 
%d known gates\n", + mpath->dst, mpath->sdata->u.mesh.num_gates); + break; + } } /** @@ -486,14 +486,15 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata) /** * mesh_path_add - allocate and add a new path to the mesh path table - * @addr: destination address of the path (ETH_ALEN length) + * @dst: destination address of the path (ETH_ALEN length) * @sdata: local subif * * Returns: 0 on success * * State: the initial state of the new path is set to 0 */ -int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata) +struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, + const u8 *dst) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_local *local = sdata->local; @@ -501,20 +502,34 @@ int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata) struct mesh_path *mpath, *new_mpath; struct mpath_node *node, *new_node; struct hlist_head *bucket; - struct hlist_node *n; int grow = 0; - int err = 0; + int err; u32 hash_idx; if (ether_addr_equal(dst, sdata->vif.addr)) /* never add ourselves as neighbours */ - return -ENOTSUPP; + return ERR_PTR(-ENOTSUPP); if (is_multicast_ether_addr(dst)) - return -ENOTSUPP; + return ERR_PTR(-ENOTSUPP); if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0) - return -ENOSPC; + return ERR_PTR(-ENOSPC); + + read_lock_bh(&pathtbl_resize_lock); + tbl = resize_dereference_mesh_paths(); + + hash_idx = mesh_table_hash(dst, sdata, tbl); + bucket = &tbl->hash_buckets[hash_idx]; + + spin_lock(&tbl->hashwlock[hash_idx]); + + hlist_for_each_entry(node, bucket, list) { + mpath = node->mpath; + if (mpath->sdata == sdata && + ether_addr_equal(dst, mpath->dst)) + goto found; + } err = -ENOMEM; new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC); @@ -525,7 +540,6 @@ int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata) if (!new_node) goto err_node_alloc; - read_lock_bh(&pathtbl_resize_lock); memcpy(new_mpath->dst, dst, ETH_ALEN); eth_broadcast_addr(new_mpath->rann_snd_addr); new_mpath->is_root = false; @@ -539,21 +553,6 @@ int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata) spin_lock_init(&new_mpath->state_lock); init_timer(&new_mpath->timer); - tbl = resize_dereference_mesh_paths(); - - hash_idx = mesh_table_hash(dst, sdata, tbl); - bucket = &tbl->hash_buckets[hash_idx]; - - spin_lock(&tbl->hashwlock[hash_idx]); - - err = -EEXIST; - hlist_for_each_entry(node, n, bucket, list) { - mpath = node->mpath; - if (mpath->sdata == sdata && - ether_addr_equal(dst, mpath->dst)) - goto err_exists; - } - hlist_add_head_rcu(&new_node->list, bucket); if (atomic_inc_return(&tbl->entries) >= tbl->mean_chain_len * (tbl->hash_mask + 1)) @@ -561,23 +560,23 @@ int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata) mesh_paths_generation++; - spin_unlock(&tbl->hashwlock[hash_idx]); - read_unlock_bh(&pathtbl_resize_lock); if (grow) { set_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags); ieee80211_queue_work(&local->hw, &sdata->work); } - return 0; - -err_exists: + mpath = new_mpath; +found: spin_unlock(&tbl->hashwlock[hash_idx]); read_unlock_bh(&pathtbl_resize_lock); - kfree(new_node); + return mpath; + err_node_alloc: kfree(new_mpath); err_path_alloc: atomic_dec(&sdata->u.mesh.mpaths); - return err; + spin_unlock(&tbl->hashwlock[hash_idx]); + read_unlock_bh(&pathtbl_resize_lock); + return ERR_PTR(err); } static void mesh_table_free_rcu(struct rcu_head *rcu) @@ -628,7 +627,8 @@ void mesh_mpp_table_grow(void) write_unlock_bh(&pathtbl_resize_lock); } -int mpp_path_add(u8 *dst, u8 *mpp, struct 
ieee80211_sub_if_data *sdata) +int mpp_path_add(struct ieee80211_sub_if_data *sdata, + const u8 *dst, const u8 *mpp) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_local *local = sdata->local; @@ -636,7 +636,6 @@ int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata) struct mesh_path *mpath, *new_mpath; struct mpath_node *node, *new_node; struct hlist_head *bucket; - struct hlist_node *n; int grow = 0; int err = 0; u32 hash_idx; @@ -676,7 +675,7 @@ int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata) spin_lock(&tbl->hashwlock[hash_idx]); err = -EEXIST; - hlist_for_each_entry(node, n, bucket, list) { + hlist_for_each_entry(node, bucket, list) { mpath = node->mpath; if (mpath->sdata == sdata && ether_addr_equal(dst, mpath->dst)) @@ -721,14 +720,13 @@ void mesh_plink_broken(struct sta_info *sta) static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct mesh_path *mpath; struct mpath_node *node; - struct hlist_node *p; struct ieee80211_sub_if_data *sdata = sta->sdata; int i; __le16 reason = cpu_to_le16(WLAN_REASON_MESH_PATH_DEST_UNREACHABLE); rcu_read_lock(); tbl = rcu_dereference(mesh_paths); - for_each_mesh_entry(tbl, p, node, i) { + for_each_mesh_entry(tbl, node, i) { mpath = node->mpath; if (rcu_dereference(mpath->next_hop) == sta && mpath->flags & MESH_PATH_ACTIVE && @@ -737,9 +735,10 @@ void mesh_plink_broken(struct sta_info *sta) mpath->flags &= ~MESH_PATH_ACTIVE; ++mpath->sn; spin_unlock_bh(&mpath->state_lock); - mesh_path_error_tx(sdata->u.mesh.mshcfg.element_ttl, - mpath->dst, cpu_to_le32(mpath->sn), - reason, bcast, sdata); + mesh_path_error_tx(sdata, + sdata->u.mesh.mshcfg.element_ttl, + mpath->dst, cpu_to_le32(mpath->sn), + reason, bcast); } } rcu_read_unlock(); @@ -787,13 +786,12 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) struct mesh_table *tbl; struct mesh_path *mpath; struct mpath_node *node; - struct hlist_node *p; int i; rcu_read_lock(); read_lock_bh(&pathtbl_resize_lock); tbl = resize_dereference_mesh_paths(); - for_each_mesh_entry(tbl, p, node, i) { + for_each_mesh_entry(tbl, node, i) { mpath = node->mpath; if (rcu_dereference(mpath->next_hop) == sta) { spin_lock(&tbl->hashwlock[i]); @@ -810,11 +808,10 @@ static void table_flush_by_iface(struct mesh_table *tbl, { struct mesh_path *mpath; struct mpath_node *node; - struct hlist_node *p; int i; WARN_ON(!rcu_read_lock_held()); - for_each_mesh_entry(tbl, p, node, i) { + for_each_mesh_entry(tbl, node, i) { mpath = node->mpath; if (mpath->sdata != sdata) continue; @@ -854,13 +851,12 @@ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) * * Returns: 0 if successful */ -int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata) +int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct mesh_table *tbl; struct mesh_path *mpath; struct mpath_node *node; struct hlist_head *bucket; - struct hlist_node *n; int hash_idx; int err = 0; @@ -870,7 +866,7 @@ int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata) bucket = &tbl->hash_buckets[hash_idx]; spin_lock(&tbl->hashwlock[hash_idx]); - hlist_for_each_entry(node, n, bucket, list) { + hlist_for_each_entry(node, bucket, list) { mpath = node->mpath; if (mpath->sdata == sdata && ether_addr_equal(addr, mpath->dst)) { @@ -915,7 +911,6 @@ void mesh_path_tx_pending(struct mesh_path *mpath) int mesh_path_send_to_gates(struct mesh_path *mpath) { struct ieee80211_sub_if_data *sdata = mpath->sdata; - struct hlist_node *n; struct mesh_table *tbl; struct 
mesh_path *from_mpath = mpath; struct mpath_node *gate = NULL; @@ -930,7 +925,7 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) if (!known_gates) return -EHOSTUNREACH; - hlist_for_each_entry_rcu(gate, n, known_gates, list) { + hlist_for_each_entry_rcu(gate, known_gates, list) { if (gate->mpath->sdata != sdata) continue; @@ -946,7 +941,7 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) } } - hlist_for_each_entry_rcu(gate, n, known_gates, list) + hlist_for_each_entry_rcu(gate, known_gates, list) if (gate->mpath->sdata == sdata) { mpath_dbg(sdata, "Sending to %pM\n", gate->mpath->dst); mesh_path_tx_pending(gate->mpath); @@ -963,8 +958,8 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) * * Locking: the function must me called within a rcu_read_lock region */ -void mesh_path_discard_frame(struct sk_buff *skb, - struct ieee80211_sub_if_data *sdata) +void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { kfree_skb(skb); sdata->u.mesh.mshstats.dropped_frames_no_route++; @@ -982,7 +977,7 @@ void mesh_path_flush_pending(struct mesh_path *mpath) struct sk_buff *skb; while ((skb = skb_dequeue(&mpath->frame_queue)) != NULL) - mesh_path_discard_frame(skb, mpath->sdata); + mesh_path_discard_frame(mpath->sdata, skb); } /** @@ -1091,19 +1086,18 @@ void mesh_path_expire(struct ieee80211_sub_if_data *sdata) struct mesh_table *tbl; struct mesh_path *mpath; struct mpath_node *node; - struct hlist_node *p; int i; rcu_read_lock(); tbl = rcu_dereference(mesh_paths); - for_each_mesh_entry(tbl, p, node, i) { + for_each_mesh_entry(tbl, node, i) { if (node->mpath->sdata != sdata) continue; mpath = node->mpath; if ((!(mpath->flags & MESH_PATH_RESOLVING)) && (!(mpath->flags & MESH_PATH_FIXED)) && time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) - mesh_path_del(mpath->dst, mpath->sdata); + mesh_path_del(mpath->sdata, mpath->dst); } rcu_read_unlock(); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 4b274e9c91a5..09bebed99416 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -37,23 +37,31 @@ enum plink_event { CLS_IGNR }; -static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, - enum ieee80211_self_protected_actioncode action, - u8 *da, __le16 llid, __le16 plid, __le16 reason); +static const char * const mplstates[] = { + [NL80211_PLINK_LISTEN] = "LISTEN", + [NL80211_PLINK_OPN_SNT] = "OPN-SNT", + [NL80211_PLINK_OPN_RCVD] = "OPN-RCVD", + [NL80211_PLINK_CNF_RCVD] = "CNF_RCVD", + [NL80211_PLINK_ESTAB] = "ESTAB", + [NL80211_PLINK_HOLDING] = "HOLDING", + [NL80211_PLINK_BLOCKED] = "BLOCKED" +}; -static inline -u32 mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata) -{ - atomic_inc(&sdata->u.mesh.estab_plinks); - return mesh_accept_plinks_update(sdata); -} +static const char * const mplevents[] = { + [PLINK_UNDEFINED] = "NONE", + [OPN_ACPT] = "OPN_ACPT", + [OPN_RJCT] = "OPN_RJCT", + [OPN_IGNR] = "OPN_IGNR", + [CNF_ACPT] = "CNF_ACPT", + [CNF_RJCT] = "CNF_RJCT", + [CNF_IGNR] = "CNF_IGNR", + [CLS_ACPT] = "CLS_ACPT", + [CLS_IGNR] = "CLS_IGNR" +}; -static inline -u32 mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata) -{ - atomic_dec(&sdata->u.mesh.estab_plinks); - return mesh_accept_plinks_update(sdata); -} +static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, + enum ieee80211_self_protected_actioncode action, + u8 *da, __le16 llid, __le16 plid, __le16 reason); /** * mesh_plink_fsm_restart - restart a mesh peer link finite state machine @@ -70,27 +78,63 @@ static 
inline void mesh_plink_fsm_restart(struct sta_info *sta) } /* - * Allocate mesh sta entry and insert into station table + * mesh_set_short_slot_time - enable / disable ERP short slot time. + * + * The standard indirectly mandates mesh STAs to turn off short slot time by + * disallowing advertising this (802.11-2012 8.4.1.4), but that doesn't mean we + * can't be sneaky about it. Enable short slot time if all mesh STAs in the + * MBSS support ERP rates. + * + * Returns BSS_CHANGED_ERP_SLOT or 0 for no change. */ -static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata, - u8 *hw_addr) +static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_local *local = sdata->local; + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; struct sta_info *sta; + u32 erp_rates = 0, changed = 0; + int i; + bool short_slot = false; - if (sdata->local->num_sta >= MESH_MAX_PLINKS) - return NULL; + if (band == IEEE80211_BAND_5GHZ) { + /* (IEEE 802.11-2012 19.4.5) */ + short_slot = true; + goto out; + } else if (band != IEEE80211_BAND_2GHZ || + (band == IEEE80211_BAND_2GHZ && + local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) + goto out; - sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL); - if (!sta) - return NULL; + for (i = 0; i < sband->n_bitrates; i++) + if (sband->bitrates[i].flags & IEEE80211_RATE_ERP_G) + erp_rates |= BIT(i); - sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); - sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); - sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); + if (!erp_rates) + goto out; - set_sta_flag(sta, WLAN_STA_WME); + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata || + sta->plink_state != NL80211_PLINK_ESTAB) + continue; - return sta; + short_slot = false; + if (erp_rates & sta->sta.supp_rates[band]) + short_slot = true; + else + break; + } + rcu_read_unlock(); + +out: + if (sdata->vif.bss_conf.use_short_slot != short_slot) { + sdata->vif.bss_conf.use_short_slot = short_slot; + changed = BSS_CHANGED_ERP_SLOT; + mpl_dbg(sdata, "mesh_plink %pM: ERP short slot time %d\n", + sdata->vif.addr, short_slot); + } + return changed; } /** @@ -107,7 +151,6 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; - u32 changed = 0; u16 ht_opmode; bool non_ht_sta = false, ht20_sta = false; @@ -120,23 +163,19 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) sta->plink_state != NL80211_PLINK_ESTAB) continue; - switch (sta->ch_width) { - case NL80211_CHAN_WIDTH_20_NOHT: - mpl_dbg(sdata, - "mesh_plink %pM: nonHT sta (%pM) is present\n", - sdata->vif.addr, sta->sta.addr); + if (sta->sta.bandwidth > IEEE80211_STA_RX_BW_20) + continue; + + if (!sta->sta.ht_cap.ht_supported) { + mpl_dbg(sdata, "nonHT sta (%pM) is present\n", + sta->sta.addr); non_ht_sta = true; - goto out; - case NL80211_CHAN_WIDTH_20: - mpl_dbg(sdata, - "mesh_plink %pM: HT20 sta (%pM) is present\n", - sdata->vif.addr, sta->sta.addr); - ht20_sta = true; - default: break; } + + mpl_dbg(sdata, "HT20 sta (%pM) is present\n", sta->sta.addr); + ht20_sta = true; } -out: rcu_read_unlock(); if (non_ht_sta) @@ -147,16 +186,13 @@ out: else ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONE; - if (sdata->vif.bss_conf.ht_operation_mode != ht_opmode) { - sdata->vif.bss_conf.ht_operation_mode = ht_opmode; - sdata->u.mesh.mshcfg.ht_opmode 
= ht_opmode; - changed = BSS_CHANGED_HT; - mpl_dbg(sdata, - "mesh_plink %pM: protection mode changed to %d\n", - sdata->vif.addr, ht_opmode); - } + if (sdata->vif.bss_conf.ht_operation_mode == ht_opmode) + return 0; - return changed; + sdata->vif.bss_conf.ht_operation_mode = ht_opmode; + sdata->u.mesh.mshcfg.ht_opmode = ht_opmode; + mpl_dbg(sdata, "selected new HT protection mode %d\n", ht_opmode); + return BSS_CHANGED_HT; } /** @@ -179,6 +215,9 @@ static u32 __mesh_plink_deactivate(struct sta_info *sta) sta->plink_state = NL80211_PLINK_BLOCKED; mesh_path_flush_by_nexthop(sta); + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_local_status_update(sdata); + return changed; } @@ -189,7 +228,7 @@ static u32 __mesh_plink_deactivate(struct sta_info *sta) * * All mesh paths with this peer as next hop will be flushed */ -void mesh_plink_deactivate(struct sta_info *sta) +u32 mesh_plink_deactivate(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed; @@ -202,12 +241,13 @@ void mesh_plink_deactivate(struct sta_info *sta) sta->reason); spin_unlock_bh(&sta->lock); - ieee80211_bss_info_change_notify(sdata, changed); + return changed; } static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, - enum ieee80211_self_protected_actioncode action, - u8 *da, __le16 llid, __le16 plid, __le16 reason) { + enum ieee80211_self_protected_actioncode action, + u8 *da, __le16 llid, __le16 plid, __le16 reason) +{ struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_tx_info *info; @@ -258,13 +298,13 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, } if (ieee80211_add_srates_ie(sdata, skb, true, band) || ieee80211_add_ext_srates_ie(sdata, skb, true, band) || - mesh_add_rsn_ie(skb, sdata) || - mesh_add_meshid_ie(skb, sdata) || - mesh_add_meshconf_ie(skb, sdata)) + mesh_add_rsn_ie(sdata, skb) || + mesh_add_meshid_ie(sdata, skb) || + mesh_add_meshconf_ie(sdata, skb)) goto free; } else { /* WLAN_SP_MESH_PEERING_CLOSE */ info->flags |= IEEE80211_TX_CTL_NO_ACK; - if (mesh_add_meshid_ie(skb, sdata)) + if (mesh_add_meshid_ie(sdata, skb)) goto free; } @@ -308,12 +348,12 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, } if (action != WLAN_SP_MESH_PEERING_CLOSE) { - if (mesh_add_ht_cap_ie(skb, sdata) || - mesh_add_ht_oper_ie(skb, sdata)) + if (mesh_add_ht_cap_ie(sdata, skb) || + mesh_add_ht_oper_ie(sdata, skb)) goto free; } - if (mesh_add_vendor_ies(skb, sdata)) + if (mesh_add_vendor_ies(sdata, skb)) goto free; ieee80211_tx_skb(sdata, skb); @@ -323,92 +363,147 @@ free: return err; } -/** - * mesh_peer_init - initialize new mesh peer and return resulting sta_info - * - * @sdata: local meshif - * @addr: peer's address - * @elems: IEs from beacon or mesh peering frame - * - * call under RCU - */ -static struct sta_info *mesh_peer_init(struct ieee80211_sub_if_data *sdata, - u8 *addr, - struct ieee802_11_elems *elems) +static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee802_11_elems *elems, bool insert) { struct ieee80211_local *local = sdata->local; enum ieee80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; - u32 rates, basic_rates = 0; - struct sta_info *sta; - bool insert = false; + u32 rates, basic_rates = 0, changed = 0; sband = local->hw.wiphy->bands[band]; rates = ieee80211_sta_get_rates(local, elems, band, &basic_rates); - sta = sta_info_get(sdata, addr); - if (!sta) { - /* Userspace handles peer 
allocation when security is enabled */ - if (sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) { - cfg80211_notify_new_peer_candidate(sdata->dev, addr, - elems->ie_start, - elems->total_len, - GFP_ATOMIC); - return NULL; - } - - sta = mesh_plink_alloc(sdata, addr); - if (!sta) - return NULL; - insert = true; - } - spin_lock_bh(&sta->lock); sta->last_rx = jiffies; - if (sta->plink_state == NL80211_PLINK_ESTAB) { - spin_unlock_bh(&sta->lock); - return sta; - } + /* rates and capabilities don't change during peering */ + if (sta->plink_state == NL80211_PLINK_ESTAB) + goto out; + + if (sta->sta.supp_rates[band] != rates) + changed |= IEEE80211_RC_SUPP_RATES_CHANGED; sta->sta.supp_rates[band] = rates; - if (elems->ht_cap_elem && - sdata->vif.bss_conf.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) - ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - elems->ht_cap_elem, - &sta->sta.ht_cap); - else - memset(&sta->sta.ht_cap, 0, sizeof(sta->sta.ht_cap)); - - if (elems->ht_operation) { - struct cfg80211_chan_def chandef; - - if (!(elems->ht_operation->ht_param & - IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) - sta->sta.ht_cap.cap &= - ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; - ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, - elems->ht_operation, &chandef); - sta->ch_width = chandef.width; + + if (ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, + elems->ht_cap_elem, sta)) + changed |= IEEE80211_RC_BW_CHANGED; + + /* HT peer is operating 20MHz-only */ + if (elems->ht_operation && + !(elems->ht_operation->ht_param & + IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) { + if (sta->sta.bandwidth != IEEE80211_STA_RX_BW_20) + changed |= IEEE80211_RC_BW_CHANGED; + sta->sta.bandwidth = IEEE80211_STA_RX_BW_20; } if (insert) rate_control_rate_init(sta); + else + rate_control_rate_update(local, sband, sta, changed); +out: spin_unlock_bh(&sta->lock); +} - if (insert && sta_info_insert(sta)) +static struct sta_info * +__mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) +{ + struct sta_info *sta; + + if (sdata->local->num_sta >= MESH_MAX_PLINKS) + return NULL; + + sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL); + if (!sta) return NULL; + sta->plink_state = NL80211_PLINK_LISTEN; + + sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); + sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); + sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); + + set_sta_flag(sta, WLAN_STA_WME); + + return sta; +} + +static struct sta_info * +mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, + struct ieee802_11_elems *elems) +{ + struct sta_info *sta = NULL; + + /* Userspace handles station allocation */ + if (sdata->u.mesh.user_mpm || + sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) + cfg80211_notify_new_peer_candidate(sdata->dev, addr, + elems->ie_start, + elems->total_len, + GFP_KERNEL); + else + sta = __mesh_sta_info_alloc(sdata, addr); + return sta; } +/* + * mesh_sta_info_get - return mesh sta info entry for @addr. + * + * @sdata: local meshif + * @addr: peer's address + * @elems: IEs from beacon or mesh peering frame. + * + * Return existing or newly allocated sta_info under RCU read lock. + * (re)initialize with given IEs. 
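The mesh_sta_info_get() kerneldoc above describes a lookup that cannot allocate atomically: on a miss it drops the RCU read lock, allocates the station entry in a sleepable context, and only then inserts it. A pthreads sketch of that lookup/drop-lock/allocate/insert shape under illustrative names; the re-check after reacquiring the lock stands in for the duplicate handling that sta_info_insert_rcu() performs in mac80211:

/*
 * Pthreads model, not kernel code: read-locked lookup, blocking
 * allocation outside any lock, write-locked insert with a re-check
 * in case another thread won the race.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
        int key;
};

static pthread_rwlock_t tbl_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct entry *table;

static struct entry *find_locked(int key)
{
        struct entry *e;

        for (e = table; e; e = e->next)
                if (e->key == key)
                        return e;
        return NULL;
}

static struct entry *get_or_create(int key)
{
        struct entry *e, *new;

        pthread_rwlock_rdlock(&tbl_lock);
        e = find_locked(key);
        pthread_rwlock_unlock(&tbl_lock);
        if (e)
                return e;

        /* miss: allocate outside any lock, since it may block */
        new = malloc(sizeof(*new));
        if (!new)
                return NULL;
        new->key = key;

        pthread_rwlock_wrlock(&tbl_lock);
        e = find_locked(key);             /* lost a race? reuse the winner */
        if (!e) {
                new->next = table;
                table = new;
                e = new;
        } else {
                free(new);
        }
        pthread_rwlock_unlock(&tbl_lock);
        return e;
}

int main(void)
{
        return get_or_create(7) && get_or_create(7) ? 0 : 1;
}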
+ */ +static struct sta_info * +mesh_sta_info_get(struct ieee80211_sub_if_data *sdata, + u8 *addr, struct ieee802_11_elems *elems) __acquires(RCU) +{ + struct sta_info *sta = NULL; + + rcu_read_lock(); + sta = sta_info_get(sdata, addr); + if (sta) { + mesh_sta_info_init(sdata, sta, elems, false); + } else { + rcu_read_unlock(); + /* can't run atomic */ + sta = mesh_sta_info_alloc(sdata, addr, elems); + if (!sta) { + rcu_read_lock(); + return NULL; + } + + mesh_sta_info_init(sdata, sta, elems, true); + + if (sta_info_insert_rcu(sta)) + return NULL; + } + + return sta; +} + +/* + * mesh_neighbour_update - update or initialize new mesh neighbor. + * + * @sdata: local meshif + * @addr: peer's address + * @elems: IEs from beacon or mesh peering frame + * + * Initiates peering if appropriate. + */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, u8 *hw_addr, struct ieee802_11_elems *elems) { struct sta_info *sta; + u32 changed = 0; - rcu_read_lock(); - sta = mesh_peer_init(sdata, hw_addr, elems); + sta = mesh_sta_info_get(sdata, hw_addr, elems); if (!sta) goto out; @@ -417,10 +512,12 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, sdata->u.mesh.accepting_plinks && sdata->u.mesh.mshcfg.auto_open_plinks && rssi_threshold_check(sta, sdata)) - mesh_plink_open(sta); + changed = mesh_plink_open(sta); + ieee80211_mps_frame_release(sta, elems); out: rcu_read_unlock(); + ieee80211_mbss_info_change_notify(sdata, changed); } static void mesh_plink_timer(unsigned long data) @@ -437,10 +534,8 @@ static void mesh_plink_timer(unsigned long data) */ sta = (struct sta_info *) data; - if (sta->sdata->local->quiescing) { - sta->plink_timer_was_running = true; + if (sta->sdata->local->quiescing) return; - } spin_lock_bh(&sta->lock); if (sta->ignore_plink_timer) { @@ -449,8 +544,8 @@ static void mesh_plink_timer(unsigned long data) return; } mpl_dbg(sta->sdata, - "Mesh plink timer for %pM fired on state %d\n", - sta->sta.addr, sta->plink_state); + "Mesh plink timer for %pM fired on state %s\n", + sta->sta.addr, mplstates[sta->plink_state]); reason = 0; llid = sta->llid; plid = sta->plid; @@ -501,22 +596,6 @@ static void mesh_plink_timer(unsigned long data) } } -#ifdef CONFIG_PM -void mesh_plink_quiesce(struct sta_info *sta) -{ - if (del_timer_sync(&sta->plink_timer)) - sta->plink_timer_was_running = true; -} - -void mesh_plink_restart(struct sta_info *sta) -{ - if (sta->plink_timer_was_running) { - add_timer(&sta->plink_timer); - sta->plink_timer_was_running = false; - } -} -#endif - static inline void mesh_plink_timer_set(struct sta_info *sta, int timeout) { sta->plink_timer.expires = jiffies + (HZ * timeout / 1000); @@ -526,13 +605,14 @@ static inline void mesh_plink_timer_set(struct sta_info *sta, int timeout) add_timer(&sta->plink_timer); } -int mesh_plink_open(struct sta_info *sta) +u32 mesh_plink_open(struct sta_info *sta) { __le16 llid; struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed; if (!test_sta_flag(sta, WLAN_STA_AUTH)) - return -EPERM; + return 0; spin_lock_bh(&sta->lock); get_random_bytes(&llid, 2); @@ -540,7 +620,7 @@ int mesh_plink_open(struct sta_info *sta) if (sta->plink_state != NL80211_PLINK_LISTEN && sta->plink_state != NL80211_PLINK_BLOCKED) { spin_unlock_bh(&sta->lock); - return -EBUSY; + return 0; } sta->plink_state = NL80211_PLINK_OPN_SNT; mesh_plink_timer_set(sta, sdata->u.mesh.mshcfg.dot11MeshRetryTimeout); @@ -549,13 +629,16 @@ int mesh_plink_open(struct sta_info *sta) "Mesh plink: starting establishment with %pM\n", sta->sta.addr); - 
return mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN, - sta->sta.addr, llid, 0, 0); + /* set the non-peer mode to active during peering */ + changed = ieee80211_mps_local_status_update(sdata); + + mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN, + sta->sta.addr, llid, 0, 0); + return changed; } -void mesh_plink_block(struct sta_info *sta) +u32 mesh_plink_block(struct sta_info *sta) { - struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed; spin_lock_bh(&sta->lock); @@ -563,12 +646,13 @@ void mesh_plink_block(struct sta_info *sta) sta->plink_state = NL80211_PLINK_BLOCKED; spin_unlock_bh(&sta->lock); - ieee80211_bss_info_change_notify(sdata, changed); + return changed; } -void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - size_t len, struct ieee80211_rx_status *rx_status) +void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status) { struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; struct ieee802_11_elems elems; @@ -581,20 +665,15 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m u8 *baseaddr; u32 changed = 0; __le16 plid, llid, reason; - static const char *mplstates[] = { - [NL80211_PLINK_LISTEN] = "LISTEN", - [NL80211_PLINK_OPN_SNT] = "OPN-SNT", - [NL80211_PLINK_OPN_RCVD] = "OPN-RCVD", - [NL80211_PLINK_CNF_RCVD] = "CNF_RCVD", - [NL80211_PLINK_ESTAB] = "ESTAB", - [NL80211_PLINK_HOLDING] = "HOLDING", - [NL80211_PLINK_BLOCKED] = "BLOCKED" - }; /* need action_code, aux */ if (len < IEEE80211_MIN_ACTION_SIZE + 3) return; + if (sdata->u.mesh.user_mpm) + /* userspace must register for these */ + return; + if (is_multicast_ether_addr(mgmt->da)) { mpl_dbg(sdata, "Mesh plink: ignore frame from multicast address\n"); @@ -608,14 +687,16 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m baseaddr += 4; baselen += 4; } - ieee802_11_parse_elems(baseaddr, len - baselen, &elems); + ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems); + if (!elems.peering) { mpl_dbg(sdata, "Mesh plink: missing necessary peer link ie\n"); return; } + if (elems.rsn_len && - sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) { + sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) { mpl_dbg(sdata, "Mesh plink: can't establish link with secure peer\n"); return; @@ -634,7 +715,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m } if (ftype != WLAN_SP_MESH_PEERING_CLOSE && - (!elems.mesh_id || !elems.mesh_config)) { + (!elems.mesh_id || !elems.mesh_config)) { mpl_dbg(sdata, "Mesh plink: missing necessary ie\n"); return; } @@ -646,6 +727,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len == 8)) memcpy(&llid, PLINK_GET_PLID(elems.peering), 2); + /* WARNING: Only for sta pointer, is dropped & re-acquired */ rcu_read_lock(); sta = sta_info_get(sdata, mgmt->sa); @@ -749,8 +831,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m } if (event == OPN_ACPT) { + rcu_read_unlock(); /* allocate sta entry if necessary and update info */ - sta = mesh_peer_init(sdata, mgmt->sa, &elems); + sta = mesh_sta_info_get(sdata, mgmt->sa, &elems); if (!sta) { mpl_dbg(sdata, "Mesh plink: failed to init peer!\n"); rcu_read_unlock(); @@ -758,11 +841,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m } } - mpl_dbg(sdata, - "Mesh plink (peer, state, llid, 
plid, event): %pM %s %d %d %d\n", - mgmt->sa, mplstates[sta->plink_state], - le16_to_cpu(sta->llid), le16_to_cpu(sta->plid), - event); + mpl_dbg(sdata, "peer %pM in state %s got event %s\n", mgmt->sa, + mplstates[sta->plink_state], mplevents[event]); reason = 0; spin_lock_bh(&sta->lock); switch (sta->plink_state) { @@ -780,6 +860,10 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m sta->llid = llid; mesh_plink_timer_set(sta, mshcfg->dot11MeshRetryTimeout); + + /* set the non-peer mode to active during peering */ + changed |= ieee80211_mps_local_status_update(sdata); + spin_unlock_bh(&sta->lock); mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN, @@ -870,8 +954,12 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m spin_unlock_bh(&sta->lock); changed |= mesh_plink_inc_estab_count(sdata); changed |= mesh_set_ht_prot_mode(sdata); + changed |= mesh_set_short_slot_time(sdata); mpl_dbg(sdata, "Mesh plink with %pM ESTABLISHED\n", sta->sta.addr); + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + mshcfg->power_mode); break; default: spin_unlock_bh(&sta->lock); @@ -905,11 +993,15 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m spin_unlock_bh(&sta->lock); changed |= mesh_plink_inc_estab_count(sdata); changed |= mesh_set_ht_prot_mode(sdata); + changed |= mesh_set_short_slot_time(sdata); mpl_dbg(sdata, "Mesh plink with %pM ESTABLISHED\n", sta->sta.addr); mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CONFIRM, sta->sta.addr, llid, plid, 0); + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + mshcfg->power_mode); break; default: spin_unlock_bh(&sta->lock); @@ -928,6 +1020,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout); spin_unlock_bh(&sta->lock); changed |= mesh_set_ht_prot_mode(sdata); + changed |= mesh_set_short_slot_time(sdata); mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, sta->sta.addr, llid, plid, reason); break; @@ -976,5 +1069,5 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m rcu_read_unlock(); if (changed) - ieee80211_bss_info_change_notify(sdata, changed); + ieee80211_mbss_info_change_notify(sdata, changed); } diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c new file mode 100644 index 000000000000..3b7bfc01ee36 --- /dev/null +++ b/net/mac80211/mesh_ps.c @@ -0,0 +1,598 @@ +/* + * Copyright 2012-2013, Marco Porsch <marco.porsch@s2005.tu-chemnitz.de> + * Copyright 2012-2013, cozybit Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "mesh.h" +#include "wme.h" + + +/* mesh PS management */ + +/** + * mps_qos_null_get - create pre-addressed QoS Null frame for mesh powersave + */ +static struct sk_buff *mps_qos_null_get(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct ieee80211_hdr *nullfunc; /* use 4addr header */ + struct sk_buff *skb; + int size = sizeof(*nullfunc); + __le16 fc; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + size + 2); + if (!skb) + return NULL; + skb_reserve(skb, local->hw.extra_tx_headroom); + + nullfunc = (struct ieee80211_hdr *) skb_put(skb, size); + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC); + ieee80211_fill_mesh_addresses(nullfunc, &fc, sta->sta.addr, + sdata->vif.addr); + nullfunc->frame_control = fc; + nullfunc->duration_id = 0; + /* no address resolution for this frame -> set addr 1 immediately */ + memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); + memset(skb_put(skb, 2), 0, 2); /* append QoS control field */ + ieee80211_mps_set_frame_flags(sdata, sta, nullfunc); + + return skb; +} + +/** + * mps_qos_null_tx - send a QoS Null to indicate link-specific power mode + */ +static void mps_qos_null_tx(struct sta_info *sta) +{ + struct sk_buff *skb; + + skb = mps_qos_null_get(sta); + if (!skb) + return; + + mps_dbg(sta->sdata, "announcing peer-specific power mode to %pM\n", + sta->sta.addr); + + /* don't unintentionally start a MPSP */ + if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { + u8 *qc = ieee80211_get_qos_ctl((void *) skb->data); + + qc[0] |= IEEE80211_QOS_CTL_EOSP; + } + + ieee80211_tx_skb(sta->sdata, skb); +} + +/** + * ieee80211_mps_local_status_update - track status of local link-specific PMs + * + * @sdata: local mesh subif + * + * sets the non-peer power mode and triggers the driver PS (re-)configuration + * Return BSS_CHANGED_BEACON if a beacon update is necessary. + */ +u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct sta_info *sta; + bool peering = false; + int light_sleep_cnt = 0; + int deep_sleep_cnt = 0; + u32 changed = 0; + enum nl80211_mesh_power_mode nonpeer_pm; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (sdata != sta->sdata) + continue; + + switch (sta->plink_state) { + case NL80211_PLINK_OPN_SNT: + case NL80211_PLINK_OPN_RCVD: + case NL80211_PLINK_CNF_RCVD: + peering = true; + break; + case NL80211_PLINK_ESTAB: + if (sta->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP) + light_sleep_cnt++; + else if (sta->local_pm == NL80211_MESH_POWER_DEEP_SLEEP) + deep_sleep_cnt++; + break; + default: + break; + } + } + rcu_read_unlock(); + + /* + * Set non-peer mode to active during peering/scanning/authentication + * (see IEEE802.11-2012 13.14.8.3). The non-peer mesh power mode is + * deep sleep if the local STA is in light or deep sleep towards at + * least one mesh peer (see 13.14.3.1). Otherwise, set it to the + * user-configured default value. 
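+ *
+ * Summarised as a decision table (illustrative sketch only, not
+ * normative text from the standard; "PS peer links" counts
+ * established peers towards which the local PM is light or deep
+ * sleep):
+ *
+ * | peering | PS peer links | resulting non-peer PM |
+ * +---------+---------------+-----------------------+
+ * |   yes   |      any      | ACTIVE                |
+ * |   no    |     >= 1      | DEEP SLEEP            |
+ * |   no    |     none      | mshcfg.power_mode     |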
+ */ + if (peering) { + mps_dbg(sdata, "setting non-peer PM to active for peering\n"); + nonpeer_pm = NL80211_MESH_POWER_ACTIVE; + } else if (light_sleep_cnt || deep_sleep_cnt) { + mps_dbg(sdata, "setting non-peer PM to deep sleep\n"); + nonpeer_pm = NL80211_MESH_POWER_DEEP_SLEEP; + } else { + mps_dbg(sdata, "setting non-peer PM to user value\n"); + nonpeer_pm = ifmsh->mshcfg.power_mode; + } + + /* need update if sleep counts move between 0 and non-zero */ + if (ifmsh->nonpeer_pm != nonpeer_pm || + !ifmsh->ps_peers_light_sleep != !light_sleep_cnt || + !ifmsh->ps_peers_deep_sleep != !deep_sleep_cnt) + changed = BSS_CHANGED_BEACON; + + ifmsh->nonpeer_pm = nonpeer_pm; + ifmsh->ps_peers_light_sleep = light_sleep_cnt; + ifmsh->ps_peers_deep_sleep = deep_sleep_cnt; + + return changed; +} + +/** + * ieee80211_mps_set_sta_local_pm - set local PM towards a mesh STA + * + * @sta: mesh STA + * @pm: the power mode to set + * Return BSS_CHANGED_BEACON if a beacon update is in order. + */ +u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, + enum nl80211_mesh_power_mode pm) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + + mps_dbg(sdata, "local STA operates in mode %d with %pM\n", + pm, sta->sta.addr); + + sta->local_pm = pm; + + /* + * announce peer-specific power mode transition + * (see IEEE802.11-2012 13.14.3.2 and 13.14.3.3) + */ + if (sta->plink_state == NL80211_PLINK_ESTAB) + mps_qos_null_tx(sta); + + return ieee80211_mps_local_status_update(sdata); +} + +/** + * ieee80211_mps_set_frame_flags - set mesh PS flags in FC (and QoS Control) + * + * @sdata: local mesh subif + * @sta: mesh STA + * @hdr: 802.11 frame header + * + * see IEEE802.11-2012 8.2.4.1.7 and 8.2.4.5.11 + * + * NOTE: sta must be given when an individually-addressed QoS frame header + * is handled, for group-addressed and management frames it is not used + */ +void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_hdr *hdr) +{ + enum nl80211_mesh_power_mode pm; + u8 *qc; + + if (WARN_ON(is_unicast_ether_addr(hdr->addr1) && + ieee80211_is_data_qos(hdr->frame_control) && + !sta)) + return; + + if (is_unicast_ether_addr(hdr->addr1) && + ieee80211_is_data_qos(hdr->frame_control) && + sta->plink_state == NL80211_PLINK_ESTAB) + pm = sta->local_pm; + else + pm = sdata->u.mesh.nonpeer_pm; + + if (pm == NL80211_MESH_POWER_ACTIVE) + hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_PM); + else + hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); + + if (!ieee80211_is_data_qos(hdr->frame_control)) + return; + + qc = ieee80211_get_qos_ctl(hdr); + + if ((is_unicast_ether_addr(hdr->addr1) && + pm == NL80211_MESH_POWER_DEEP_SLEEP) || + (is_multicast_ether_addr(hdr->addr1) && + sdata->u.mesh.ps_peers_deep_sleep > 0)) + qc[1] |= (IEEE80211_QOS_CTL_MESH_PS_LEVEL >> 8); + else + qc[1] &= ~(IEEE80211_QOS_CTL_MESH_PS_LEVEL >> 8); +} + +/** + * ieee80211_mps_sta_status_update - update buffering status of neighbor STA + * + * @sta: mesh STA + * + * called after change of peering status or non-peer/peer-specific power mode + */ +void ieee80211_mps_sta_status_update(struct sta_info *sta) +{ + enum nl80211_mesh_power_mode pm; + bool do_buffer; + + /* + * use peer-specific power mode if peering is established and the + * peer's power mode is known + */ + if (sta->plink_state == NL80211_PLINK_ESTAB && + sta->peer_pm != NL80211_MESH_POWER_UNKNOWN) + pm = sta->peer_pm; + else + pm = sta->nonpeer_pm; + + do_buffer = (pm != NL80211_MESH_POWER_ACTIVE); + + /* Don't let the same PS state 
be set twice */ + if (test_sta_flag(sta, WLAN_STA_PS_STA) == do_buffer) + return; + + if (do_buffer) { + set_sta_flag(sta, WLAN_STA_PS_STA); + atomic_inc(&sta->sdata->u.mesh.ps.num_sta_ps); + mps_dbg(sta->sdata, "start PS buffering frames towards %pM\n", + sta->sta.addr); + } else { + ieee80211_sta_ps_deliver_wakeup(sta); + } + + /* clear the MPSP flags for non-peers or active STA */ + if (sta->plink_state != NL80211_PLINK_ESTAB) { + clear_sta_flag(sta, WLAN_STA_MPSP_OWNER); + clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); + } else if (!do_buffer) { + clear_sta_flag(sta, WLAN_STA_MPSP_OWNER); + } +} + +static void mps_set_sta_peer_pm(struct sta_info *sta, + struct ieee80211_hdr *hdr) +{ + enum nl80211_mesh_power_mode pm; + u8 *qc = ieee80211_get_qos_ctl(hdr); + + /* + * Test Power Management field of frame control (PW) and + * mesh power save level subfield of QoS control field (PSL) + * + * | PM | PSL| Mesh PM | + * +----+----+---------+ + * | 0 |Rsrv| Active | + * | 1 | 0 | Light | + * | 1 | 1 | Deep | + */ + if (ieee80211_has_pm(hdr->frame_control)) { + if (qc[1] & (IEEE80211_QOS_CTL_MESH_PS_LEVEL >> 8)) + pm = NL80211_MESH_POWER_DEEP_SLEEP; + else + pm = NL80211_MESH_POWER_LIGHT_SLEEP; + } else { + pm = NL80211_MESH_POWER_ACTIVE; + } + + if (sta->peer_pm == pm) + return; + + mps_dbg(sta->sdata, "STA %pM enters mode %d\n", + sta->sta.addr, pm); + + sta->peer_pm = pm; + + ieee80211_mps_sta_status_update(sta); +} + +static void mps_set_sta_nonpeer_pm(struct sta_info *sta, + struct ieee80211_hdr *hdr) +{ + enum nl80211_mesh_power_mode pm; + + if (ieee80211_has_pm(hdr->frame_control)) + pm = NL80211_MESH_POWER_DEEP_SLEEP; + else + pm = NL80211_MESH_POWER_ACTIVE; + + if (sta->nonpeer_pm == pm) + return; + + mps_dbg(sta->sdata, "STA %pM sets non-peer mode to %d\n", + sta->sta.addr, pm); + + sta->nonpeer_pm = pm; + + ieee80211_mps_sta_status_update(sta); +} + +/** + * ieee80211_mps_rx_h_sta_process - frame receive handler for mesh powersave + * + * @sta: STA info that transmitted the frame + * @hdr: IEEE 802.11 (QoS) Header + */ +void ieee80211_mps_rx_h_sta_process(struct sta_info *sta, + struct ieee80211_hdr *hdr) +{ + if (is_unicast_ether_addr(hdr->addr1) && + ieee80211_is_data_qos(hdr->frame_control)) { + /* + * individually addressed QoS Data/Null frames contain + * peer link-specific PS mode towards the local STA + */ + mps_set_sta_peer_pm(sta, hdr); + + /* check for mesh Peer Service Period trigger frames */ + ieee80211_mpsp_trigger_process(ieee80211_get_qos_ctl(hdr), + sta, false, false); + } else { + /* + * can only determine non-peer PS mode + * (see IEEE802.11-2012 8.2.4.1.7) + */ + mps_set_sta_nonpeer_pm(sta, hdr); + } +} + + +/* mesh PS frame release */ + +static void mpsp_trigger_send(struct sta_info *sta, bool rspi, bool eosp) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct sk_buff *skb; + struct ieee80211_hdr *nullfunc; + struct ieee80211_tx_info *info; + u8 *qc; + + skb = mps_qos_null_get(sta); + if (!skb) + return; + + nullfunc = (struct ieee80211_hdr *) skb->data; + if (!eosp) + nullfunc->frame_control |= + cpu_to_le16(IEEE80211_FCTL_MOREDATA); + /* + * | RSPI | EOSP | MPSP triggering | + * +------+------+--------------------+ + * | 0 | 0 | local STA is owner | + * | 0 | 1 | no MPSP (MPSP end) | + * | 1 | 0 | both STA are owner | + * | 1 | 1 | peer STA is owner | see IEEE802.11-2012 13.14.9.2 + */ + qc = ieee80211_get_qos_ctl(nullfunc); + if (rspi) + qc[1] |= (IEEE80211_QOS_CTL_RSPI >> 8); + if (eosp) + qc[0] |= IEEE80211_QOS_CTL_EOSP; + + info = 
IEEE80211_SKB_CB(skb); + + info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | + IEEE80211_TX_CTL_REQ_TX_STATUS; + + mps_dbg(sdata, "sending MPSP trigger%s%s to %pM\n", + rspi ? " RSPI" : "", eosp ? " EOSP" : "", sta->sta.addr); + + ieee80211_tx_skb(sdata, skb); +} + +/** + * mpsp_qos_null_append - append QoS Null frame to MPSP skb queue if needed + * + * To properly end a mesh MPSP the last transmitted frame has to set the EOSP + * flag in the QoS Control field. In case the current tailing frame is not a + * QoS Data frame, append a QoS Null to carry the flag. + */ +static void mpsp_qos_null_append(struct sta_info *sta, + struct sk_buff_head *frames) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct sk_buff *new_skb, *skb = skb_peek_tail(frames); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info; + + if (ieee80211_is_data_qos(hdr->frame_control)) + return; + + new_skb = mps_qos_null_get(sta); + if (!new_skb) + return; + + mps_dbg(sdata, "appending QoS Null in MPSP towards %pM\n", + sta->sta.addr); + /* + * This frame has to be transmitted last. Assign lowest priority to + * make sure it cannot pass other frames when releasing multiple ACs. + */ + new_skb->priority = 1; + skb_set_queue_mapping(new_skb, IEEE80211_AC_BK); + ieee80211_set_qos_hdr(sdata, new_skb); + + info = IEEE80211_SKB_CB(new_skb); + info->control.vif = &sdata->vif; + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + + __skb_queue_tail(frames, new_skb); +} + +/** + * mps_frame_deliver - transmit frames during mesh powersave + * + * @sta: STA info to transmit to + * @n_frames: number of frames to transmit. -1 for all + */ +static void mps_frame_deliver(struct sta_info *sta, int n_frames) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + int ac; + struct sk_buff_head frames; + struct sk_buff *skb; + bool more_data = false; + + skb_queue_head_init(&frames); + + /* collect frame(s) from buffers */ + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + while (n_frames != 0) { + skb = skb_dequeue(&sta->tx_filtered[ac]); + if (!skb) { + skb = skb_dequeue( + &sta->ps_tx_buf[ac]); + if (skb) + local->total_ps_buffered--; + } + if (!skb) + break; + n_frames--; + __skb_queue_tail(&frames, skb); + } + + if (!skb_queue_empty(&sta->tx_filtered[ac]) || + !skb_queue_empty(&sta->ps_tx_buf[ac])) + more_data = true; + } + + /* nothing to send? -> EOSP */ + if (skb_queue_empty(&frames)) { + mpsp_trigger_send(sta, false, true); + return; + } + + /* in a MPSP make sure the last skb is a QoS Data frame */ + if (test_sta_flag(sta, WLAN_STA_MPSP_OWNER)) + mpsp_qos_null_append(sta, &frames); + + mps_dbg(sta->sdata, "sending %d frames to PS STA %pM\n", + skb_queue_len(&frames), sta->sta.addr); + + /* prepare collected frames for transmission */ + skb_queue_walk(&frames, skb) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *) skb->data; + + /* + * Tell TX path to send this frame even though the + * STA may still remain is PS mode after this frame + * exchange. 
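+ *
+ * (Illustrative note: without IEEE80211_TX_CTL_NO_PS_BUFFER the
+ * TX path would route unicast frames for a STA in PS mode back
+ * into the PS buffers, and the service period could never
+ * drain.)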
+ */ + info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; + + if (more_data || !skb_queue_is_last(&frames, skb)) + hdr->frame_control |= + cpu_to_le16(IEEE80211_FCTL_MOREDATA); + else + hdr->frame_control &= + cpu_to_le16(~IEEE80211_FCTL_MOREDATA); + + if (skb_queue_is_last(&frames, skb) && + ieee80211_is_data_qos(hdr->frame_control)) { + u8 *qoshdr = ieee80211_get_qos_ctl(hdr); + + /* MPSP trigger frame ends service period */ + *qoshdr |= IEEE80211_QOS_CTL_EOSP; + info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + } + } + + ieee80211_add_pending_skbs(local, &frames); + sta_info_recalc_tim(sta); +} + +/** + * ieee80211_mpsp_trigger_process - track status of mesh Peer Service Periods + * + * @qc: QoS Control field + * @sta: peer to start a MPSP with + * @tx: frame was transmitted by the local STA + * @acked: frame has been transmitted successfully + * + * NOTE: active mode STA may only serve as MPSP owner + */ +void ieee80211_mpsp_trigger_process(u8 *qc, struct sta_info *sta, + bool tx, bool acked) +{ + u8 rspi = qc[1] & (IEEE80211_QOS_CTL_RSPI >> 8); + u8 eosp = qc[0] & IEEE80211_QOS_CTL_EOSP; + + if (tx) { + if (rspi && acked) + set_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); + + if (eosp) + clear_sta_flag(sta, WLAN_STA_MPSP_OWNER); + else if (acked && + test_sta_flag(sta, WLAN_STA_PS_STA) && + !test_and_set_sta_flag(sta, WLAN_STA_MPSP_OWNER)) + mps_frame_deliver(sta, -1); + } else { + if (eosp) + clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); + else if (sta->local_pm != NL80211_MESH_POWER_ACTIVE) + set_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); + + if (rspi && !test_and_set_sta_flag(sta, WLAN_STA_MPSP_OWNER)) + mps_frame_deliver(sta, -1); + } +} + +/** + * ieee80211_mps_frame_release - release buffered frames in response to beacon + * + * @sta: mesh STA + * @elems: beacon IEs + * + * For peers if we have individually-addressed frames buffered or the peer + * indicates buffered frames, send a corresponding MPSP trigger frame. Since + * we do not evaluate the awake window duration, QoS Nulls are used as MPSP + * trigger frames. If the neighbour STA is not a peer, only send single frames. 
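+ *
+ * Dispatch summary (illustrative sketch of the logic below, after
+ * the awake window check):
+ *
+ *   established peer, frames pending     -> send MPSP trigger
+ *   established peer, nothing pending    -> do nothing
+ *   non-peer neighbour, frames buffered  -> deliver a single frame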
+ */ +void ieee80211_mps_frame_release(struct sta_info *sta, + struct ieee802_11_elems *elems) +{ + int ac, buffer_local = 0; + bool has_buffered = false; + + /* TIM map only for LLID <= IEEE80211_MAX_AID */ + if (sta->plink_state == NL80211_PLINK_ESTAB) + has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, + le16_to_cpu(sta->llid) % IEEE80211_MAX_AID); + + if (has_buffered) + mps_dbg(sta->sdata, "%pM indicates buffered frames\n", + sta->sta.addr); + + /* only transmit to PS STA with announced, non-zero awake window */ + if (test_sta_flag(sta, WLAN_STA_PS_STA) && + (!elems->awake_window || !le16_to_cpu(*elems->awake_window))) + return; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + buffer_local += skb_queue_len(&sta->ps_tx_buf[ac]) + + skb_queue_len(&sta->tx_filtered[ac]); + + if (!has_buffered && !buffer_local) + return; + + if (sta->plink_state == NL80211_PLINK_ESTAB) + mpsp_trigger_send(sta, has_buffered, !buffer_local); + else + mps_frame_deliver(sta, 1); +} diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index aa8d1e437385..05a256b38e24 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -43,7 +43,7 @@ struct sync_method { static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie) { return (ie->mesh_config->meshconf_cap & - IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0; + IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0; } void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) @@ -112,7 +112,8 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) { clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN); - msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", sta->sta.addr); + msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", + sta->sta.addr); goto no_sync; } @@ -129,18 +130,15 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, sta->t_offset = t_t - t_r; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - s64 t_clockdrift = sta->t_offset_setpoint - - sta->t_offset; + s64 t_clockdrift = sta->t_offset_setpoint - sta->t_offset; msync_dbg(sdata, "STA %pM : sta->t_offset=%lld, sta->t_offset_setpoint=%lld, t_clockdrift=%lld\n", - sta->sta.addr, - (long long) sta->t_offset, - (long long) - sta->t_offset_setpoint, + sta->sta.addr, (long long) sta->t_offset, + (long long) sta->t_offset_setpoint, (long long) t_clockdrift); if (t_clockdrift > TOFFSET_MAXIMUM_ADJUSTMENT || - t_clockdrift < -TOFFSET_MAXIMUM_ADJUSTMENT) { + t_clockdrift < -TOFFSET_MAXIMUM_ADJUSTMENT) { msync_dbg(sdata, "STA %pM : t_clockdrift=%lld too large, setpoint reset\n", sta->sta.addr, @@ -149,15 +147,10 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, goto no_sync; } - rcu_read_unlock(); - spin_lock_bh(&ifmsh->sync_offset_lock); - if (t_clockdrift > - ifmsh->sync_offset_clockdrift_max) - ifmsh->sync_offset_clockdrift_max - = t_clockdrift; + if (t_clockdrift > ifmsh->sync_offset_clockdrift_max) + ifmsh->sync_offset_clockdrift_max = t_clockdrift; spin_unlock_bh(&ifmsh->sync_offset_lock); - } else { sta->t_offset_setpoint = sta->t_offset - TOFFSET_SET_MARGIN; set_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN); @@ -165,9 +158,7 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, "STA %pM : offset was invalid, sta->t_offset=%lld\n", sta->sta.addr, (long long) sta->t_offset); - rcu_read_unlock(); } - return; no_sync: rcu_read_unlock(); @@ -177,14 +168,12 @@ static void mesh_sync_offset_adjust_tbtt(struct 
ieee80211_sub_if_data *sdata) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - WARN_ON(ifmsh->mesh_sp_id - != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); + WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); BUG_ON(!rcu_read_lock_held()); spin_lock_bh(&ifmsh->sync_offset_lock); - if (ifmsh->sync_offset_clockdrift_max > - TOFFSET_MINIMUM_ADJUSTMENT) { + if (ifmsh->sync_offset_clockdrift_max > TOFFSET_MINIMUM_ADJUSTMENT) { /* Since ajusting the tsf here would * require a possibly blocking call * to the driver tsf setter, we punt @@ -193,8 +182,7 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata) msync_dbg(sdata, "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n", ifmsh->sync_offset_clockdrift_max); - set_bit(MESH_WORK_DRIFT_ADJUST, - &ifmsh->wrkq_flags); + set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags); ifmsh->adjusting_tbtt = true; } else { @@ -220,14 +208,11 @@ static const struct sync_method sync_methods[] = { const struct ieee80211_mesh_sync_ops *ieee80211_mesh_sync_ops_get(u8 method) { - const struct ieee80211_mesh_sync_ops *ops = NULL; - u8 i; + int i; for (i = 0 ; i < ARRAY_SIZE(sync_methods); ++i) { - if (sync_methods[i].method == method) { - ops = &sync_methods[i].ops; - break; - } + if (sync_methods[i].method == method) + return &sync_methods[i].ops; } - return ops; + return NULL; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5107248af7fb..29620bfc7a69 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -30,11 +30,13 @@ #include "rate.h" #include "led.h" -#define IEEE80211_AUTH_TIMEOUT (HZ / 5) -#define IEEE80211_AUTH_MAX_TRIES 3 -#define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) -#define IEEE80211_ASSOC_TIMEOUT (HZ / 5) -#define IEEE80211_ASSOC_MAX_TRIES 3 +#define IEEE80211_AUTH_TIMEOUT (HZ / 5) +#define IEEE80211_AUTH_TIMEOUT_SHORT (HZ / 10) +#define IEEE80211_AUTH_MAX_TRIES 3 +#define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) +#define IEEE80211_ASSOC_TIMEOUT (HZ / 5) +#define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) +#define IEEE80211_ASSOC_MAX_TRIES 3 static int max_nullfunc_tries = 2; module_param(max_nullfunc_tries, int, 0644); @@ -54,7 +56,10 @@ MODULE_PARM_DESC(max_probe_tries, * probe on beacon miss before declaring the connection lost * default to what we want. */ -#define IEEE80211_BEACON_LOSS_COUNT 7 +static int beacon_loss_count = 7; +module_param(beacon_loss_count, int, 0644); +MODULE_PARM_DESC(beacon_loss_count, + "Number of beacon intervals before we decide beacon was lost."); /* * Time the connection can be idle before we probe @@ -85,9 +90,6 @@ MODULE_PARM_DESC(probe_wait_ms, */ #define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 -#define TMR_RUNNING_TIMER 0 -#define TMR_RUNNING_CHANSW 1 - /* * All cfg80211 functions have to be called outside a locked * section so that they can acquire a lock themselves... 
This @@ -112,6 +114,9 @@ enum rx_mgmt_action { /* caller must call cfg80211_send_assoc_timeout() */ RX_MGMT_CFG80211_ASSOC_TIMEOUT, + + /* used when a processed beacon causes a deauth */ + RX_MGMT_CFG80211_TX_DEAUTH, }; /* utils */ @@ -172,79 +177,331 @@ static int ecw2cw(int ecw) return (1 << ecw) - 1; } -static u32 ieee80211_config_ht_tx(struct ieee80211_sub_if_data *sdata, - struct ieee80211_ht_operation *ht_oper, - const u8 *bssid, bool reconfig) +static u32 chandef_downgrade(struct cfg80211_chan_def *c) +{ + u32 ret; + int tmp; + + switch (c->width) { + case NL80211_CHAN_WIDTH_20: + c->width = NL80211_CHAN_WIDTH_20_NOHT; + ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; + break; + case NL80211_CHAN_WIDTH_40: + c->width = NL80211_CHAN_WIDTH_20; + c->center_freq1 = c->chan->center_freq; + ret = IEEE80211_STA_DISABLE_40MHZ | + IEEE80211_STA_DISABLE_VHT; + break; + case NL80211_CHAN_WIDTH_80: + tmp = (30 + c->chan->center_freq - c->center_freq1)/20; + /* n_P40 */ + tmp /= 2; + /* freq_P40 */ + c->center_freq1 = c->center_freq1 - 20 + 40 * tmp; + c->width = NL80211_CHAN_WIDTH_40; + ret = IEEE80211_STA_DISABLE_VHT; + break; + case NL80211_CHAN_WIDTH_80P80: + c->center_freq2 = 0; + c->width = NL80211_CHAN_WIDTH_80; + ret = IEEE80211_STA_DISABLE_80P80MHZ | + IEEE80211_STA_DISABLE_160MHZ; + break; + case NL80211_CHAN_WIDTH_160: + /* n_P20 */ + tmp = (70 + c->chan->center_freq - c->center_freq1)/20; + /* n_P80 */ + tmp /= 4; + c->center_freq1 = c->center_freq1 - 40 + 80 * tmp; + c->width = NL80211_CHAN_WIDTH_80; + ret = IEEE80211_STA_DISABLE_80P80MHZ | + IEEE80211_STA_DISABLE_160MHZ; + break; + default: + case NL80211_CHAN_WIDTH_20_NOHT: + WARN_ON_ONCE(1); + c->width = NL80211_CHAN_WIDTH_20_NOHT; + ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; + break; + } + + WARN_ON_ONCE(!cfg80211_chandef_valid(c)); + + return ret; +} + +static u32 +ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct ieee80211_channel *channel, + const struct ieee80211_ht_operation *ht_oper, + const struct ieee80211_vht_operation *vht_oper, + struct cfg80211_chan_def *chandef, bool verbose) +{ + struct cfg80211_chan_def vht_chandef; + u32 ht_cfreq, ret; + + chandef->chan = channel; + chandef->width = NL80211_CHAN_WIDTH_20_NOHT; + chandef->center_freq1 = channel->center_freq; + chandef->center_freq2 = 0; + + if (!ht_oper || !sband->ht_cap.ht_supported) { + ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; + goto out; + } + + chandef->width = NL80211_CHAN_WIDTH_20; + + ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, + channel->band); + /* check that channel matches the right operating channel */ + if (channel->center_freq != ht_cfreq) { + /* + * It's possible that some APs are confused here; + * Netgear WNDR3700 sometimes reports 4 higher than + * the actual channel in association responses, but + * since we look at probe response/beacon data here + * it should be OK. 
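+ *
+ * Worked example (illustrative): an AP on 2.4 GHz channel 1 has
+ * channel->center_freq 2412 MHz, and primary_chan 1 also maps to
+ * ht_cfreq 2412 MHz, so the check passes; a buggy AP reporting
+ * primary_chan 5 yields ht_cfreq 2432 MHz, the compare fails and
+ * HT is disabled.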
+ */ + if (verbose) + sdata_info(sdata, + "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", + channel->center_freq, ht_cfreq, + ht_oper->primary_chan, channel->band); + ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; + goto out; + } + + /* check 40 MHz support, if we have it */ + if (sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) { + switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { + case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: + chandef->width = NL80211_CHAN_WIDTH_40; + chandef->center_freq1 += 10; + break; + case IEEE80211_HT_PARAM_CHA_SEC_BELOW: + chandef->width = NL80211_CHAN_WIDTH_40; + chandef->center_freq1 -= 10; + break; + } + } else { + /* 40 MHz (and 80 MHz) must be supported for VHT */ + ret = IEEE80211_STA_DISABLE_VHT; + /* also mark 40 MHz disabled */ + ret |= IEEE80211_STA_DISABLE_40MHZ; + goto out; + } + + if (!vht_oper || !sband->vht_cap.vht_supported) { + ret = IEEE80211_STA_DISABLE_VHT; + goto out; + } + + vht_chandef.chan = channel; + vht_chandef.center_freq1 = + ieee80211_channel_to_frequency(vht_oper->center_freq_seg1_idx, + channel->band); + vht_chandef.center_freq2 = 0; + + switch (vht_oper->chan_width) { + case IEEE80211_VHT_CHANWIDTH_USE_HT: + vht_chandef.width = chandef->width; + break; + case IEEE80211_VHT_CHANWIDTH_80MHZ: + vht_chandef.width = NL80211_CHAN_WIDTH_80; + break; + case IEEE80211_VHT_CHANWIDTH_160MHZ: + vht_chandef.width = NL80211_CHAN_WIDTH_160; + break; + case IEEE80211_VHT_CHANWIDTH_80P80MHZ: + vht_chandef.width = NL80211_CHAN_WIDTH_80P80; + vht_chandef.center_freq2 = + ieee80211_channel_to_frequency( + vht_oper->center_freq_seg2_idx, + channel->band); + break; + default: + if (verbose) + sdata_info(sdata, + "AP VHT operation IE has invalid channel width (%d), disable VHT\n", + vht_oper->chan_width); + ret = IEEE80211_STA_DISABLE_VHT; + goto out; + } + + if (!cfg80211_chandef_valid(&vht_chandef)) { + if (verbose) + sdata_info(sdata, + "AP VHT information is invalid, disable VHT\n"); + ret = IEEE80211_STA_DISABLE_VHT; + goto out; + } + + if (cfg80211_chandef_identical(chandef, &vht_chandef)) { + ret = 0; + goto out; + } + + if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { + if (verbose) + sdata_info(sdata, + "AP VHT information doesn't match HT, disable VHT\n"); + ret = IEEE80211_STA_DISABLE_VHT; + goto out; + } + + *chandef = vht_chandef; + + ret = 0; + +out: + /* don't print the message below for VHT mismatch if VHT is disabled */ + if (ret & IEEE80211_STA_DISABLE_VHT) + vht_chandef = *chandef; + + while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, + IEEE80211_CHAN_DISABLED)) { + if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) { + ret = IEEE80211_STA_DISABLE_HT | + IEEE80211_STA_DISABLE_VHT; + goto out; + } + + ret |= chandef_downgrade(chandef); + } + + if (chandef->width != vht_chandef.width && verbose) + sdata_info(sdata, + "capabilities/regulatory prevented using AP HT/VHT configuration, downgraded\n"); + + WARN_ON_ONCE(!cfg80211_chandef_valid(chandef)); + return ret; +} + +static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + const struct ieee80211_ht_operation *ht_oper, + const struct ieee80211_vht_operation *vht_oper, + const u8 *bssid, u32 *changed) { struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_supported_band *sband; - struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *chan; - struct sta_info 
*sta; - u32 changed = 0; + struct cfg80211_chan_def chandef; u16 ht_opmode; - bool disable_40 = false; + u32 flags; + enum ieee80211_sta_rx_bandwidth new_sta_bw; + int ret; - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); + /* if HT was/is disabled, don't track any bandwidth changes */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || !ht_oper) return 0; - } - chan = chanctx_conf->def.chan; - rcu_read_unlock(); + + /* don't check VHT if we associated as non-VHT station */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) + vht_oper = NULL; + + if (WARN_ON_ONCE(!sta)) + return -EINVAL; + + chan = sdata->vif.bss_conf.chandef.chan; sband = local->hw.wiphy->bands[chan->band]; - switch (sdata->vif.bss_conf.chandef.width) { + /* calculate new channel (type) based on HT/VHT operation IEs */ + flags = ieee80211_determine_chantype(sdata, sband, chan, ht_oper, + vht_oper, &chandef, false); + + /* + * Downgrade the new channel if we associated with restricted + * capabilities. For example, if we associated as a 20 MHz STA + * to a 40 MHz AP (due to regulatory, capabilities or config + * reasons) then switching to a 40 MHz channel now won't do us + * any good -- we couldn't use it with the AP. + */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_80P80MHZ && + chandef.width == NL80211_CHAN_WIDTH_80P80) + flags |= chandef_downgrade(&chandef); + if (ifmgd->flags & IEEE80211_STA_DISABLE_160MHZ && + chandef.width == NL80211_CHAN_WIDTH_160) + flags |= chandef_downgrade(&chandef); + if (ifmgd->flags & IEEE80211_STA_DISABLE_40MHZ && + chandef.width > NL80211_CHAN_WIDTH_20) + flags |= chandef_downgrade(&chandef); + + if (cfg80211_chandef_identical(&chandef, &sdata->vif.bss_conf.chandef)) + return 0; + + sdata_info(sdata, + "AP %pM changed bandwidth, new config is %d MHz, width %d (%d/%d MHz)\n", + ifmgd->bssid, chandef.chan->center_freq, chandef.width, + chandef.center_freq1, chandef.center_freq2); + + if (flags != (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | + IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_40MHZ | + IEEE80211_STA_DISABLE_80P80MHZ | + IEEE80211_STA_DISABLE_160MHZ)) || + !cfg80211_chandef_valid(&chandef)) { + sdata_info(sdata, + "AP %pM changed bandwidth in a way we can't support - disconnect\n", + ifmgd->bssid); + return -EINVAL; + } + + switch (chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + new_sta_bw = IEEE80211_STA_RX_BW_20; + break; case NL80211_CHAN_WIDTH_40: - if (sdata->vif.bss_conf.chandef.chan->center_freq > - sdata->vif.bss_conf.chandef.center_freq1 && - chan->flags & IEEE80211_CHAN_NO_HT40PLUS) - disable_40 = true; - if (sdata->vif.bss_conf.chandef.chan->center_freq < - sdata->vif.bss_conf.chandef.center_freq1 && - chan->flags & IEEE80211_CHAN_NO_HT40MINUS) - disable_40 = true; + new_sta_bw = IEEE80211_STA_RX_BW_40; break; - default: + case NL80211_CHAN_WIDTH_80: + new_sta_bw = IEEE80211_STA_RX_BW_80; + break; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + new_sta_bw = IEEE80211_STA_RX_BW_160; break; + default: + return -EINVAL; } - /* This can change during the lifetime of the BSS */ - if (!(ht_oper->ht_param & IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) - disable_40 = true; - - mutex_lock(&local->sta_mtx); - sta = sta_info_get(sdata, bssid); - - WARN_ON_ONCE(!sta); - - if (sta && !sta->supports_40mhz) - disable_40 = true; + if (new_sta_bw > sta->cur_max_bandwidth) + new_sta_bw = sta->cur_max_bandwidth; - if (sta && (!reconfig || - (disable_40 != 
!(sta->sta.ht_cap.cap & - IEEE80211_HT_CAP_SUP_WIDTH_20_40)))) { + if (new_sta_bw < sta->sta.bandwidth) { + sta->sta.bandwidth = new_sta_bw; + rate_control_rate_update(local, sband, sta, + IEEE80211_RC_BW_CHANGED); + } - if (disable_40) - sta->sta.ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; - else - sta->sta.ht_cap.cap |= IEEE80211_HT_CAP_SUP_WIDTH_20_40; + ret = ieee80211_vif_change_bandwidth(sdata, &chandef, changed); + if (ret) { + sdata_info(sdata, + "AP %pM changed bandwidth to incompatible one - disconnect\n", + ifmgd->bssid); + return ret; + } + if (new_sta_bw > sta->sta.bandwidth) { + sta->sta.bandwidth = new_sta_bw; rate_control_rate_update(local, sband, sta, IEEE80211_RC_BW_CHANGED); } - mutex_unlock(&local->sta_mtx); ht_opmode = le16_to_cpu(ht_oper->operation_mode); /* if bss configuration changed store the new one */ - if (!reconfig || (sdata->vif.bss_conf.ht_operation_mode != ht_opmode)) { - changed |= BSS_CHANGED_HT; + if (sdata->vif.bss_conf.ht_operation_mode != ht_opmode) { + *changed |= BSS_CHANGED_HT; sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } - return changed; + return 0; } /* frame sending functions */ @@ -341,7 +598,8 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, - struct ieee80211_supported_band *sband) + struct ieee80211_supported_band *sband, + struct ieee80211_vht_cap *ap_vht_cap) { u8 *pos; u32 cap; @@ -350,6 +608,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, BUILD_BUG_ON(sizeof(vht_cap) != sizeof(sband->vht_cap)); memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); + ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); /* determine capability flags */ cap = vht_cap.cap; @@ -364,6 +623,14 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; } + /* + * Some APs apparently get confused if our capabilities are better + * than theirs, so restrict what we advertise in the assoc request. 
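+ *
+ * Concretely, the check below keeps SU beamformee support in our
+ * advertised VHT capabilities only if the AP itself advertises
+ * SU beamformer capability.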
+ */ + if (!(ap_vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) + cap &= ~IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE; + /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); @@ -562,7 +829,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) sband, chan, sdata->smps_mode); if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) - ieee80211_add_vht_ie(sdata, skb, sband); + ieee80211_add_vht_ie(sdata, skb, sband, + &assoc_data->ap_vht_cap); /* if present, add any custom non-vendor IEs that go after HT */ if (assoc_data->ie_len && assoc_data->ie) { @@ -605,6 +873,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) drv_mgd_prepare_tx(local, sdata); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | + IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); } @@ -641,7 +912,8 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, if (powersave) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); - IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | + IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ifmgd->flags & (IEEE80211_STA_BEACON_POLL | IEEE80211_STA_CONNECTION_POLL)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE; @@ -684,6 +956,7 @@ static void ieee80211_chswitch_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work); + struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (!ieee80211_sdata_running(sdata)) @@ -693,21 +966,22 @@ static void ieee80211_chswitch_work(struct work_struct *work) if (!ifmgd->associated) goto out; - sdata->local->_oper_channel = sdata->local->csa_channel; - if (!sdata->local->ops->channel_switch) { + local->_oper_chandef = local->csa_chandef; + + if (!local->ops->channel_switch) { /* call "hw_config" only if doing sw channel switch */ - ieee80211_hw_config(sdata->local, - IEEE80211_CONF_CHANGE_CHANNEL); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { /* update the device channel directly */ - sdata->local->hw.conf.channel = sdata->local->_oper_channel; + local->hw.conf.chandef = local->_oper_chandef; } /* XXX: shouldn't really modify cfg80211-owned data! */ - ifmgd->associated->channel = sdata->local->_oper_channel; + ifmgd->associated->channel = local->_oper_chandef.chan; /* XXX: wait for a beacon first? 
*/ - ieee80211_wake_queues_by_reason(&sdata->local->hw, + ieee80211_wake_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_CSA); out: ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; @@ -735,66 +1009,197 @@ static void ieee80211_chswitch_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - - if (sdata->local->quiescing) { - set_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running); - return; - } - ieee80211_queue_work(&sdata->local->hw, &ifmgd->chswitch_work); + ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.chswitch_work); } -void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, - struct ieee80211_channel_sw_ie *sw_elem, - struct ieee80211_bss *bss, - u64 timestamp) +static void +ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, + u64 timestamp, struct ieee802_11_elems *elems) { - struct cfg80211_bss *cbss = - container_of((void *)bss, struct cfg80211_bss, priv); - struct ieee80211_channel *new_ch; + struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num, - cbss->channel->band); + struct cfg80211_bss *cbss = ifmgd->associated; + struct ieee80211_bss *bss; struct ieee80211_chanctx *chanctx; + enum ieee80211_band new_band; + int new_freq; + u8 new_chan_no; + u8 count; + u8 mode; + struct ieee80211_channel *new_chan; + struct cfg80211_chan_def new_chandef = {}; + struct cfg80211_chan_def new_vht_chandef = {}; + const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; + const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; + int secondary_channel_offset = -1; ASSERT_MGD_MTX(ifmgd); - if (!ifmgd->associated) + if (!cbss) return; - if (sdata->local->scanning) + if (local->scanning) return; - /* Disregard subsequent beacons if we are already running a timer - processing a CSA */ - + /* disregard subsequent announcements if we are already processing */ if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED) return; - new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); - if (!new_ch || new_ch->flags & IEEE80211_CHAN_DISABLED) { + sec_chan_offs = elems->sec_chan_offs; + wide_bw_chansw_ie = elems->wide_bw_chansw_ie; + + if (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | + IEEE80211_STA_DISABLE_40MHZ)) { + sec_chan_offs = NULL; + wide_bw_chansw_ie = NULL; + } + + if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) + wide_bw_chansw_ie = NULL; + + if (elems->ext_chansw_ie) { + if (!ieee80211_operating_class_to_band( + elems->ext_chansw_ie->new_operating_class, + &new_band)) { + sdata_info(sdata, + "cannot understand ECSA IE operating class %d, disconnecting\n", + elems->ext_chansw_ie->new_operating_class); + ieee80211_queue_work(&local->hw, + &ifmgd->csa_connection_drop_work); + } + new_chan_no = elems->ext_chansw_ie->new_ch_num; + count = elems->ext_chansw_ie->count; + mode = elems->ext_chansw_ie->mode; + } else if (elems->ch_switch_ie) { + new_band = cbss->channel->band; + new_chan_no = elems->ch_switch_ie->new_ch_num; + count = elems->ch_switch_ie->count; + mode = elems->ch_switch_ie->mode; + } else { + /* nothing here we understand */ + return; + } + + bss = (void *)cbss->priv; + + new_freq = ieee80211_channel_to_frequency(new_chan_no, new_band); + new_chan = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); + if (!new_chan || new_chan->flags & IEEE80211_CHAN_DISABLED) { sdata_info(sdata, "AP %pM switches 
to unsupported channel (%d MHz), disconnecting\n", ifmgd->associated->bssid, new_freq); - ieee80211_queue_work(&sdata->local->hw, + ieee80211_queue_work(&local->hw, + &ifmgd->csa_connection_drop_work); + return; + } + + if (sec_chan_offs) { + secondary_channel_offset = sec_chan_offs->sec_chan_offs; + } else if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + /* if HT is enabled and the IE not present, it's still HT */ + secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE; + } + + switch (secondary_channel_offset) { + default: + /* secondary_channel_offset was present but is invalid */ + case IEEE80211_HT_PARAM_CHA_SEC_NONE: + cfg80211_chandef_create(&new_chandef, new_chan, + NL80211_CHAN_HT20); + break; + case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: + cfg80211_chandef_create(&new_chandef, new_chan, + NL80211_CHAN_HT40PLUS); + break; + case IEEE80211_HT_PARAM_CHA_SEC_BELOW: + cfg80211_chandef_create(&new_chandef, new_chan, + NL80211_CHAN_HT40MINUS); + break; + case -1: + cfg80211_chandef_create(&new_chandef, new_chan, + NL80211_CHAN_NO_HT); + break; + } + + if (wide_bw_chansw_ie) { + new_vht_chandef.chan = new_chan; + new_vht_chandef.center_freq1 = + ieee80211_channel_to_frequency( + wide_bw_chansw_ie->new_center_freq_seg0, + new_band); + + switch (wide_bw_chansw_ie->new_channel_width) { + default: + /* hmmm, ignore VHT and use HT if present */ + case IEEE80211_VHT_CHANWIDTH_USE_HT: + new_vht_chandef.chan = NULL; + break; + case IEEE80211_VHT_CHANWIDTH_80MHZ: + new_vht_chandef.width = NL80211_CHAN_WIDTH_80; + break; + case IEEE80211_VHT_CHANWIDTH_160MHZ: + new_vht_chandef.width = NL80211_CHAN_WIDTH_160; + break; + case IEEE80211_VHT_CHANWIDTH_80P80MHZ: + /* field is otherwise reserved */ + new_vht_chandef.center_freq2 = + ieee80211_channel_to_frequency( + wide_bw_chansw_ie->new_center_freq_seg1, + new_band); + new_vht_chandef.width = NL80211_CHAN_WIDTH_80P80; + break; + } + if (ifmgd->flags & IEEE80211_STA_DISABLE_80P80MHZ && + new_vht_chandef.width == NL80211_CHAN_WIDTH_80P80) + chandef_downgrade(&new_vht_chandef); + if (ifmgd->flags & IEEE80211_STA_DISABLE_160MHZ && + new_vht_chandef.width == NL80211_CHAN_WIDTH_160) + chandef_downgrade(&new_vht_chandef); + if (ifmgd->flags & IEEE80211_STA_DISABLE_40MHZ && + new_vht_chandef.width > NL80211_CHAN_WIDTH_20) + chandef_downgrade(&new_vht_chandef); + } + + /* if VHT data is there validate & use it */ + if (new_vht_chandef.chan) { + if (!cfg80211_chandef_compatible(&new_vht_chandef, + &new_chandef)) { + sdata_info(sdata, + "AP %pM CSA has inconsistent channel data, disconnecting\n", + ifmgd->associated->bssid); + ieee80211_queue_work(&local->hw, + &ifmgd->csa_connection_drop_work); + return; + } + new_chandef = new_vht_chandef; + } + + if (!cfg80211_chandef_usable(local->hw.wiphy, &new_chandef, + IEEE80211_CHAN_DISABLED)) { + sdata_info(sdata, + "AP %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", + ifmgd->associated->bssid, new_freq, + new_chandef.width, new_chandef.center_freq1, + new_chandef.center_freq2); + ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); return; } ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED; - if (sdata->local->use_chanctx) { + if (local->use_chanctx) { sdata_info(sdata, "not handling channel switch with channel contexts\n"); - ieee80211_queue_work(&sdata->local->hw, + ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); return; } - mutex_lock(&sdata->local->chanctx_mtx); + mutex_lock(&local->chanctx_mtx); if 
(WARN_ON(!rcu_access_pointer(sdata->vif.chanctx_conf))) { - mutex_unlock(&sdata->local->chanctx_mtx); + mutex_unlock(&local->chanctx_mtx); return; } chanctx = container_of(rcu_access_pointer(sdata->vif.chanctx_conf), @@ -802,39 +1207,39 @@ void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (chanctx->refcount > 1) { sdata_info(sdata, "channel switch with multiple interfaces on the same channel, disconnecting\n"); - ieee80211_queue_work(&sdata->local->hw, + ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); - mutex_unlock(&sdata->local->chanctx_mtx); + mutex_unlock(&local->chanctx_mtx); return; } - mutex_unlock(&sdata->local->chanctx_mtx); + mutex_unlock(&local->chanctx_mtx); - sdata->local->csa_channel = new_ch; + local->csa_chandef = new_chandef; - if (sw_elem->mode) - ieee80211_stop_queues_by_reason(&sdata->local->hw, + if (mode) + ieee80211_stop_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_CSA); - if (sdata->local->ops->channel_switch) { + if (local->ops->channel_switch) { /* use driver's channel switch callback */ struct ieee80211_channel_switch ch_switch = { .timestamp = timestamp, - .block_tx = sw_elem->mode, - .channel = new_ch, - .count = sw_elem->count, + .block_tx = mode, + .chandef = new_chandef, + .count = count, }; - drv_channel_switch(sdata->local, &ch_switch); + drv_channel_switch(local, &ch_switch); return; } /* channel switch handled in software */ - if (sw_elem->count <= 1) - ieee80211_queue_work(&sdata->local->hw, &ifmgd->chswitch_work); + if (count <= 1) + ieee80211_queue_work(&local->hw, &ifmgd->chswitch_work); else mod_timer(&ifmgd->chswitch_timer, - TU_TO_EXP_TIME(sw_elem->count * - cbss->beacon_interval)); + TU_TO_EXP_TIME(count * cbss->beacon_interval)); } static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, @@ -907,39 +1312,6 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, return 0; } -void ieee80211_enable_dyn_ps(struct ieee80211_vif *vif) -{ - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - struct ieee80211_local *local = sdata->local; - struct ieee80211_conf *conf = &local->hw.conf; - - WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION || - !(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) || - (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)); - - local->disable_dynamic_ps = false; - conf->dynamic_ps_timeout = local->dynamic_ps_user_timeout; -} -EXPORT_SYMBOL(ieee80211_enable_dyn_ps); - -void ieee80211_disable_dyn_ps(struct ieee80211_vif *vif) -{ - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - struct ieee80211_local *local = sdata->local; - struct ieee80211_conf *conf = &local->hw.conf; - - WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION || - !(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) || - (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)); - - local->disable_dynamic_ps = true; - conf->dynamic_ps_timeout = 0; - del_timer_sync(&local->dynamic_ps_timer); - ieee80211_queue_work(&local->hw, - &local->dynamic_ps_enable_work); -} -EXPORT_SYMBOL(ieee80211_disable_dyn_ps); - /* powersave */ static void ieee80211_enable_ps(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) @@ -1042,7 +1414,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) } if (count == 1 && ieee80211_powersave_allowed(found)) { - struct ieee80211_conf *conf = &local->hw.conf; s32 beaconint_us; if (latency < 0) @@ -1066,10 +1437,7 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 
latency) else timeout = 100; } - local->dynamic_ps_user_timeout = timeout; - if (!local->disable_dynamic_ps) - conf->dynamic_ps_timeout = - local->dynamic_ps_user_timeout; + local->hw.conf.dynamic_ps_timeout = timeout; if (beaconint_us > latency) { local->ps_sdata = NULL; @@ -1117,6 +1485,7 @@ void ieee80211_dynamic_ps_disable_work(struct work_struct *work) } ieee80211_wake_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_PS); } @@ -1139,8 +1508,7 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) if (local->hw.conf.flags & IEEE80211_CONF_PS) return; - if (!local->disable_dynamic_ps && - local->hw.conf.dynamic_ps_timeout > 0) { + if (local->hw.conf.dynamic_ps_timeout > 0) { /* don't enter PS if TX frames are pending */ if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + @@ -1170,16 +1538,14 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { - netif_tx_stop_all_queues(sdata->dev); - - if (drv_tx_frames_pending(local)) + if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); - else { + } else { ieee80211_send_nullfunc(local, sdata, 1); /* Flush to get the tx status of nullfunc frame */ - drv_flush(local, false); + ieee80211_flush_queues(local, sdata); } } @@ -1190,9 +1556,6 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) local->hw.conf.flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } - - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) - netif_tx_wake_all_queues(sdata->dev); } void ieee80211_dynamic_ps_timer(unsigned long data) @@ -1205,16 +1568,30 @@ void ieee80211_dynamic_ps_timer(unsigned long data) ieee80211_queue_work(&local->hw, &local->dynamic_ps_enable_work); } +void ieee80211_dfs_cac_timer_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = + container_of(work, struct delayed_work, work); + struct ieee80211_sub_if_data *sdata = + container_of(delayed_work, struct ieee80211_sub_if_data, + dfs_cac_timer_work); + + ieee80211_vif_release_channel(sdata); + + cfg80211_cac_event(sdata->dev, NL80211_RADAR_CAC_FINISHED, GFP_KERNEL); +} + /* MLME */ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - u8 *wmm_param, size_t wmm_param_len) + const u8 *wmm_param, size_t wmm_param_len) { struct ieee80211_tx_queue_params params; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t left; int count; - u8 *pos, uapsd_queues = 0; + const u8 *pos; + u8 uapsd_queues = 0; if (!local->ops->conf_tx) return false; @@ -1284,6 +1661,7 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4); params.cw_min = ecw2cw(pos[1] & 0x0f); params.txop = get_unaligned_le16(pos + 2); + params.acm = acm; params.uapsd = uapsd; mlme_dbg(sdata, @@ -1371,7 +1749,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, bss_conf->assoc_capability, bss->has_erp_value, bss->erp_value); sdata->u.mgd.beacon_timeout = usecs_to_jiffies(ieee80211_tu_to_usec( - IEEE80211_BEACON_LOSS_COUNT * bss_conf->beacon_int)); + beacon_loss_count * bss_conf->beacon_int)); sdata->u.mgd.associated = cbss; memcpy(sdata->u.mgd.bssid, cbss->bssid, ETH_ALEN); @@ -1384,18 +1762,17 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, 
rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) { - u8 noa[2]; int ret; ret = cfg80211_get_p2p_attr( ies->data, ies->len, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, - noa, sizeof(noa)); + (u8 *) &bss_conf->p2p_noa_attr, + sizeof(bss_conf->p2p_noa_attr)); if (ret >= 2) { - bss_conf->p2p_oppps = noa[1] & 0x80; - bss_conf->p2p_ctwindow = noa[1] & 0x7f; + sdata->u.mgd.p2p_noa_index = + bss_conf->p2p_noa_attr.index; bss_info_changed |= BSS_CHANGED_P2P_PS; - sdata->u.mgd.p2p_noa_index = noa[0]; } } rcu_read_unlock(); @@ -1406,7 +1783,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, ieee80211_led_assoc(local, 1); - if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) { + if (sdata->u.mgd.assoc_data->have_beacon) { /* * If the AP is buggy we may get here with no DTIM period * known, so assume it's 1 which is the only safe assumption @@ -1414,6 +1791,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, * probably just won't work at all. */ bss_conf->dtim_period = sdata->u.mgd.dtim_period ?: 1; + bss_info_changed |= BSS_CHANGED_DTIM_PERIOD; } else { bss_conf->dtim_period = 0; } @@ -1426,10 +1804,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, bss_info_changed |= BSS_CHANGED_CQM; /* Enable ARP filtering */ - if (bss_conf->arp_filter_enabled != sdata->arp_filter_state) { - bss_conf->arp_filter_enabled = sdata->arp_filter_state; + if (bss_conf->arp_addr_cnt) bss_info_changed |= BSS_CHANGED_ARP_FILTER; - } ieee80211_bss_info_change_notify(sdata, bss_info_changed); @@ -1440,7 +1816,6 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_smps(sdata); ieee80211_recalc_ps_vif(sdata); - netif_tx_start_all_queues(sdata->dev); netif_carrier_on(sdata->dev); } @@ -1450,7 +1825,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - struct sta_info *sta; u32 changed = 0; ASSERT_MGD_MTX(ifmgd); @@ -1464,32 +1838,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_stop_poll(sdata); ifmgd->associated = NULL; - - /* - * we need to commit the associated = NULL change because the - * scan code uses that to determine whether this iface should - * go to/wake up from powersave or not -- and could otherwise - * wake the queues erroneously. - */ - smp_mb(); - - /* - * Thus, we can only afterwards stop the queues -- to account - * for the case where another CPU is finishing a scan at this - * time -- we don't want the scan code to enable queues. - */ - - netif_tx_stop_all_queues(sdata->dev); netif_carrier_off(sdata->dev); - mutex_lock(&local->sta_mtx); - sta = sta_info_get(sdata, ifmgd->bssid); - if (sta) { - set_sta_flag(sta, WLAN_STA_BLOCK_BA); - ieee80211_sta_tear_down_BA_sessions(sta, false); - } - mutex_unlock(&local->sta_mtx); - /* * if we want to get out of ps before disassoc (why?) we have * to do it before sending disassoc, as otherwise the null-packet @@ -1506,7 +1856,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, /* flush out any pending frame (e.g. 
DELBA) before deauth/disassoc */ if (tx) - drv_flush(local, false); + ieee80211_flush_queues(local, sdata); /* deauthenticate/disassociate now */ if (tx || frame_buf) @@ -1515,13 +1865,13 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, /* flush out frame */ if (tx) - drv_flush(local, false); + ieee80211_flush_queues(local, sdata); /* clear bssid only after building the needed mgmt frames */ memset(ifmgd->bssid, 0, ETH_ALEN); /* remove AP and TDLS peers */ - sta_info_flush(local, sdata); + sta_info_flush_defer(sdata); /* finally reset all BSS / config parameters */ changed |= ieee80211_reset_erp_info(sdata); @@ -1530,12 +1880,15 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, changed |= BSS_CHANGED_ASSOC; sdata->vif.bss_conf.assoc = false; - sdata->vif.bss_conf.p2p_ctwindow = 0; - sdata->vif.bss_conf.p2p_oppps = false; + ifmgd->p2p_noa_index = -1; + memset(&sdata->vif.bss_conf.p2p_noa_attr, 0, + sizeof(sdata->vif.bss_conf.p2p_noa_attr)); - /* on the next assoc, re-program HT parameters */ + /* on the next assoc, re-program HT/VHT parameters */ memset(&ifmgd->ht_capa, 0, sizeof(ifmgd->ht_capa)); memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask)); + memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); + memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; @@ -1543,10 +1896,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, cancel_work_sync(&local->dynamic_ps_enable_work); /* Disable ARP filtering */ - if (sdata->vif.bss_conf.arp_filter_enabled) { - sdata->vif.bss_conf.arp_filter_enabled = false; + if (sdata->vif.bss_conf.arp_addr_cnt) changed |= BSS_CHANGED_ARP_FILTER; - } sdata->vif.bss_conf.qos = false; changed |= BSS_CHANGED_QOS; @@ -1563,8 +1914,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, del_timer_sync(&sdata->u.mgd.timer); del_timer_sync(&sdata->u.mgd.chswitch_timer); - sdata->u.mgd.timers_running = 0; - sdata->vif.bss_conf.dtim_period = 0; ifmgd->flags = 0; @@ -1629,17 +1978,18 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, if (!ieee80211_is_data(hdr->frame_control)) return; - if (ack) - ieee80211_sta_reset_conn_monitor(sdata); - if (ieee80211_is_nullfunc(hdr->frame_control) && sdata->u.mgd.probe_send_count > 0) { if (ack) - sdata->u.mgd.probe_send_count = 0; + ieee80211_sta_reset_conn_monitor(sdata); else sdata->u.mgd.nullfunc_failed = true; ieee80211_queue_work(&sdata->local->hw, &sdata->work); + return; } + + if (ack) + ieee80211_sta_reset_conn_monitor(sdata); } static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) @@ -1680,7 +2030,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) ssid_len = ssid[1]; ieee80211_send_probe_req(sdata, dst, ssid + 2, ssid_len, NULL, - 0, (u32) -1, true, false, + 0, (u32) -1, true, 0, ifmgd->associated->channel, false); rcu_read_unlock(); } @@ -1688,7 +2038,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms); run_again(ifmgd, ifmgd->probe_timeout); if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) - drv_flush(sdata->local, false); + ieee80211_flush_queues(sdata->local, sdata); } static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, @@ -1712,12 +2062,15 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, goto out; } - if (beacon) + if (beacon) { 
mlme_dbg_ratelimited(sdata, - "detected beacon loss from AP - sending probe request\n"); + "detected beacon loss from AP (missed %d beacons) - probing\n", + beacon_loss_count); - ieee80211_cqm_rssi_notify(&sdata->vif, - NL80211_CQM_RSSI_BEACON_LOSS_EVENT, GFP_KERNEL); + ieee80211_cqm_rssi_notify(&sdata->vif, + NL80211_CQM_RSSI_BEACON_LOSS_EVENT, + GFP_KERNEL); + } /* * The driver/our work has already reported this event or the @@ -1795,11 +2148,9 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_ap_probereq_get); -static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata, - bool transmit_frame) +static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_local *local = sdata->local; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; mutex_lock(&ifmgd->mtx); @@ -1810,8 +2161,11 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, - transmit_frame, frame_buf); + true, frame_buf); ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; + ieee80211_wake_queues_by_reason(&sdata->local->hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); mutex_unlock(&ifmgd->mtx); /* @@ -1819,10 +2173,6 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata, * but that's not a problem. */ cfg80211_send_deauth(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); - - mutex_lock(&local->mtx); - ieee80211_recalc_idle(local); - mutex_unlock(&local->mtx); } static void ieee80211_beacon_connection_loss_work(struct work_struct *work) @@ -1841,10 +2191,10 @@ static void ieee80211_beacon_connection_loss_work(struct work_struct *work) rcu_read_unlock(); } - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) { + if (ifmgd->connection_loss) { sdata_info(sdata, "Connection to AP %pM lost\n", ifmgd->bssid); - __ieee80211_disconnect(sdata, false); + __ieee80211_disconnect(sdata); } else { ieee80211_mgd_probe_ap(sdata, true); } @@ -1856,9 +2206,7 @@ static void ieee80211_csa_connection_drop_work(struct work_struct *work) container_of(work, struct ieee80211_sub_if_data, u.mgd.csa_connection_drop_work); - ieee80211_wake_queues_by_reason(&sdata->local->hw, - IEEE80211_QUEUE_STOP_REASON_CSA); - __ieee80211_disconnect(sdata, true); + __ieee80211_disconnect(sdata); } void ieee80211_beacon_loss(struct ieee80211_vif *vif) @@ -1868,7 +2216,7 @@ void ieee80211_beacon_loss(struct ieee80211_vif *vif) trace_api_beacon_loss(sdata); - WARN_ON(hw->flags & IEEE80211_HW_CONNECTION_MONITOR); + sdata->u.mgd.connection_loss = false; ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_beacon_loss); @@ -1880,7 +2228,7 @@ void ieee80211_connection_loss(struct ieee80211_vif *vif) trace_api_connection_loss(sdata); - WARN_ON(!(hw->flags & IEEE80211_HW_CONNECTION_MONITOR)); + sdata->u.mgd.connection_loss = true; ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_connection_loss); @@ -1902,7 +2250,7 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, ieee80211_vif_release_channel(sdata); } - cfg80211_put_bss(auth_data->bss); + cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); kfree(auth_data); sdata->u.mgd.auth_data = NULL; } @@ -1910,21 +2258,26 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, static void 
ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { + struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; u8 *pos; struct ieee802_11_elems elems; + u32 tx_flags = 0; pos = mgmt->u.auth.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems); if (!elems.challenge) return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata); + if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | + IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, elems.challenge - 2, elems.challenge_len + 2, auth_data->bss->bssid, auth_data->bss->bssid, auth_data->key, auth_data->key_len, - auth_data->key_idx); + auth_data->key_idx, tx_flags); } static enum rx_mgmt_action __must_check @@ -1991,6 +2344,7 @@ ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; + ifmgd->auth_data->timeout_started = true; run_again(ifmgd, ifmgd->auth_data->timeout); if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && @@ -2049,10 +2403,6 @@ ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, 0, 0, false, NULL); - mutex_lock(&sdata->local->mtx); - ieee80211_recalc_idle(sdata->local); - mutex_unlock(&sdata->local->mtx); - return RX_MGMT_CFG80211_DEAUTH; } @@ -2080,10 +2430,6 @@ ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, 0, 0, false, NULL); - mutex_lock(&sdata->local->mtx); - ieee80211_recalc_idle(sdata->local); - mutex_unlock(&sdata->local->mtx); - return RX_MGMT_CFG80211_DISASSOC; } @@ -2184,7 +2530,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems); if (!elems.supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); @@ -2193,6 +2539,24 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ifmgd->aid = aid; + /* + * We previously checked these in the beacon/probe response, so + * they should be present here. This is just a safety net. 
+ */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) { + sdata_info(sdata, + "HT AP is missing WMM params or HT capability/operation in AssocResp\n"); + return false; + } + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + (!elems.vht_cap_elem || !elems.vht_operation)) { + sdata_info(sdata, + "VHT AP is missing VHT capability/operation in AssocResp\n"); + return false; + } + mutex_lock(&sdata->local->sta_mtx); /* * station info was already allocated and inserted before @@ -2206,17 +2570,36 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[ieee80211_get_sdata_band(sdata)]; + /* Set up internal HT/VHT capabilities */ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, - elems.ht_cap_elem, &sta->sta.ht_cap); - - sta->supports_40mhz = - sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40; + elems.ht_cap_elem, sta); if (elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - elems.vht_cap_elem, - &sta->sta.vht_cap); + elems.vht_cap_elem, sta); + + /* + * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data + * in their association response, so ignore that data for our own + * configuration. If it changed since the last beacon, we'll get the + * next beacon and update then. + */ + + /* + * If an operating mode notification IE is present, override the + * NSS calculation (that would be done in rate_control_rate_init()) + * and use the # of streams from that element. + */ + if (elems.opmode_notif && + !(*elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { + u8 nss; + + nss = *elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; + nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; + nss += 1; + sta->sta.rx_nss = nss; + } rate_control_rate_init(sta); @@ -2226,9 +2609,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (elems.wmm_param) set_sta_flag(sta, WLAN_STA_WME); - err = sta_info_move_state(sta, IEEE80211_STA_AUTH); - if (!err) - err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); + err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) err = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); if (err) { @@ -2257,11 +2638,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_set_wmm_default(sdata, false); changed |= BSS_CHANGED_QOS; - if (elems.ht_operation && elems.wmm_param && - !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) - changed |= ieee80211_config_ht_tx(sdata, elems.ht_operation, - cbss->bssid, false); - /* set AID and assoc capability, * ieee80211_set_associated() will tell the driver */ bss_conf->aid = aid; @@ -2323,18 +2699,19 @@ ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems); if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && - elems.timeout_int && elems.timeout_int_len == 5 && - elems.timeout_int[0] == WLAN_TIMEOUT_ASSOC_COMEBACK) { + elems.timeout_int && + elems.timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) { u32 tu, ms; - tu = get_unaligned_le32(elems.timeout_int + 1); + tu = le32_to_cpu(elems.timeout_int->value); ms = tu 
* 1024 / 1000; sdata_info(sdata, "%pM rejected association temporarily; comeback duration %u TU (%u ms)\n", mgmt->sa, tu, ms); assoc_data->timeout = jiffies + msecs_to_jiffies(ms); + assoc_data->timeout_started = true; if (ms > IEEE80211_ASSOC_TIMEOUT) run_again(ifmgd, assoc_data->timeout); return RX_MGMT_NONE; @@ -2350,7 +2727,7 @@ ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (!ieee80211_assoc_success(sdata, *bss, mgmt, len)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false); - cfg80211_put_bss(*bss); + cfg80211_put_bss(sdata->local->hw.wiphy, *bss); return RX_MGMT_CFG80211_ASSOC_TIMEOUT; } sdata_info(sdata, "associated\n"); @@ -2369,8 +2746,7 @@ ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, - struct ieee802_11_elems *elems, - bool beacon) + struct ieee802_11_elems *elems) { struct ieee80211_local *local = sdata->local; int freq; @@ -2378,6 +2754,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel; bool need_ps = false; + lockdep_assert_held(&sdata->u.mgd.mtx); + if ((sdata->u.mgd.associated && ether_addr_equal(mgmt->bssid, sdata->u.mgd.associated->bssid)) || (sdata->u.mgd.assoc_data && @@ -2387,12 +2765,12 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, need_ps = sdata->u.mgd.associated && !sdata->u.mgd.dtim_period; if (elems->tim && !elems->parse_error) { - struct ieee80211_tim_ie *tim_ie = elems->tim; + const struct ieee80211_tim_ie *tim_ie = elems->tim; sdata->u.mgd.dtim_period = tim_ie->dtim_period; } } - if (elems->ds_params && elems->ds_params_len == 1) + if (elems->ds_params) freq = ieee80211_channel_to_frequency(elems->ds_params[0], rx_status->band); else @@ -2404,11 +2782,12 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, - channel, beacon); + channel); if (bss) ieee80211_rx_bss_put(local, bss); - if (!sdata->u.mgd.associated) + if (!sdata->u.mgd.associated || + !ether_addr_equal(mgmt->bssid, sdata->u.mgd.associated->bssid)) return; if (need_ps) { @@ -2417,10 +2796,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, mutex_unlock(&local->iflist_mtx); } - if (elems->ch_switch_ie && - memcmp(mgmt->bssid, sdata->u.mgd.associated->bssid, ETH_ALEN) == 0) - ieee80211_sta_process_chanswitch(sdata, elems->ch_switch_ie, - bss, rx_status->mactime); + ieee80211_sta_process_chanswitch(sdata, rx_status->mactime, elems); + } @@ -2445,9 +2822,9 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, return; ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, - &elems); + false, &elems); - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false); + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); if (ifmgd->associated && ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) @@ -2459,6 +2836,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "direct probe responded\n"); ifmgd->auth_data->tries = 0; ifmgd->auth_data->timeout = jiffies; + ifmgd->auth_data->timeout_started = true; run_again(ifmgd, ifmgd->auth_data->timeout); } } @@ -2484,10 +2862,10 @@ static const u64 care_about_ies = (1ULL << WLAN_EID_HT_CAPABILITY) | (1ULL << 
WLAN_EID_HT_OPERATION); -static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, - size_t len, - struct ieee80211_rx_status *rx_status) +static enum rx_mgmt_action +ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + u8 *deauth_buf, struct ieee80211_rx_status *rx_status) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; @@ -2496,6 +2874,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *chan; + struct sta_info *sta; u32 changed = 0; bool erp_valid; u8 erp_value = 0; @@ -2507,40 +2886,51 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, /* Process beacon from the current BSS */ baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; if (baselen > len) - return; + return RX_MGMT_NONE; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); - return; + return RX_MGMT_NONE; } if (rx_status->freq != chanctx_conf->def.chan->center_freq) { rcu_read_unlock(); - return; + return RX_MGMT_NONE; } chan = chanctx_conf->def.chan; rcu_read_unlock(); - if (ifmgd->assoc_data && !ifmgd->assoc_data->have_beacon && + if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { ieee802_11_parse_elems(mgmt->u.beacon.variable, - len - baselen, &elems); + len - baselen, false, &elems); - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, - false); + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); ifmgd->assoc_data->have_beacon = true; - ifmgd->assoc_data->sent_assoc = false; + ifmgd->assoc_data->need_beacon = false; + if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + sdata->vif.bss_conf.sync_tsf = + le64_to_cpu(mgmt->u.beacon.timestamp); + sdata->vif.bss_conf.sync_device_ts = + rx_status->device_timestamp; + if (elems.tim) + sdata->vif.bss_conf.sync_dtim_count = + elems.tim->dtim_count; + else + sdata->vif.bss_conf.sync_dtim_count = 0; + } /* continue assoc process */ ifmgd->assoc_data->timeout = jiffies; + ifmgd->assoc_data->timeout_started = true; run_again(ifmgd, ifmgd->assoc_data->timeout); - return; + return RX_MGMT_NONE; } if (!ifmgd->associated || !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) - return; + return RX_MGMT_NONE; bssid = ifmgd->associated->bssid; /* Track average RSSI from the Beacon frames of the current AP */ @@ -2571,12 +2961,12 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (sig > ifmgd->rssi_max_thold && (last_sig <= ifmgd->rssi_min_thold || last_sig == 0)) { ifmgd->last_ave_beacon_signal = sig; - drv_rssi_callback(local, RSSI_EVENT_HIGH); + drv_rssi_callback(local, sdata, RSSI_EVENT_HIGH); } else if (sig < ifmgd->rssi_min_thold && (last_sig >= ifmgd->rssi_max_thold || last_sig == 0)) { ifmgd->last_ave_beacon_signal = sig; - drv_rssi_callback(local, RSSI_EVENT_LOW); + drv_rssi_callback(local, sdata, RSSI_EVENT_LOW); } } @@ -2606,7 +2996,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_BEACON_POLL) { mlme_dbg_ratelimited(sdata, - "cancelling probereq poll due to a received beacon\n"); + "cancelling AP probe due to a received beacon\n"); mutex_lock(&local->mtx); ifmgd->flags &= ~IEEE80211_STA_BEACON_POLL; 
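/*
 * Sketch of the "beacon CRC" trick applied just below: hash only the
 * information elements we care about and skip the expensive reprocessing
 * when the hash matches the cached one.  mac80211 itself uses crc32_be()
 * with the care_about_ies bitmask; this standalone version substitutes
 * FNV-1a (and a 0..63 element-ID range) purely so it compiles anywhere.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t fnv1a(uint32_t h, const uint8_t *d, size_t len)
{
	while (len--)
		h = (h ^ *d++) * 16777619u;
	return h;
}

struct beacon_cache {
	uint32_t crc;
	bool	 crc_valid;
};

/*
 * ies is a sequence of (id, len, data...) elements; care is a bitmask of
 * element IDs whose changes we must act on.  Returns true when something
 * tracked actually changed.
 */
static bool beacon_changed(struct beacon_cache *c, const uint8_t *ies,
			   size_t len, uint64_t care)
{
	uint32_t h = 2166136261u;
	size_t i = 0;

	while (i + 2 <= len && i + 2 + ies[i + 1] <= len) {
		uint8_t id = ies[i], elen = ies[i + 1];

		if (id < 64 && (care & (1ULL << id)))
			h = fnv1a(h, &ies[i], 2 + (size_t)elen);
		i += 2 + elen;
	}

	if (c->crc_valid && c->crc == h)
		return false;	/* nothing we track changed: skip work */

	c->crc = h;
	c->crc_valid = true;
	return true;
}

int main(void)
{
	struct beacon_cache c = { 0 };
	const uint8_t b1[] = { 0x00, 3, 'a', 'p', '1', 0x2a, 1, 0x04 };

	/* the first beacon always "changes", an identical one does not */
	return (beacon_changed(&c, b1, sizeof(b1), 1ULL << 0x2a) &&
		!beacon_changed(&c, b1, sizeof(b1), 1ULL << 0x2a)) ? 0 : 1;
}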
ieee80211_run_deferred_scan(local); @@ -2625,7 +3015,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable, - len - baselen, &elems, + len - baselen, false, &elems, care_about_ies, ncrc); if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) { @@ -2657,39 +3047,72 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, } if (sdata->vif.p2p) { - u8 noa[2]; + struct ieee80211_p2p_noa_attr noa = {}; int ret; ret = cfg80211_get_p2p_attr(mgmt->u.beacon.variable, len - baselen, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, - noa, sizeof(noa)); - if (ret >= 2 && sdata->u.mgd.p2p_noa_index != noa[0]) { - bss_conf->p2p_oppps = noa[1] & 0x80; - bss_conf->p2p_ctwindow = noa[1] & 0x7f; + (u8 *) &noa, sizeof(noa)); + if (ret >= 2) { + if (sdata->u.mgd.p2p_noa_index != noa.index) { + /* valid noa_attr and index changed */ + sdata->u.mgd.p2p_noa_index = noa.index; + memcpy(&bss_conf->p2p_noa_attr, &noa, sizeof(noa)); + changed |= BSS_CHANGED_P2P_PS; + /* + * make sure we update all information, the CRC + * mechanism doesn't look at P2P attributes. + */ + ifmgd->beacon_crc_valid = false; + } + } else if (sdata->u.mgd.p2p_noa_index != -1) { + /* noa_attr not found and we had valid noa_attr before */ + sdata->u.mgd.p2p_noa_index = -1; + memset(&bss_conf->p2p_noa_attr, 0, sizeof(bss_conf->p2p_noa_attr)); changed |= BSS_CHANGED_P2P_PS; - sdata->u.mgd.p2p_noa_index = noa[0]; - /* - * make sure we update all information, the CRC - * mechanism doesn't look at P2P attributes. - */ ifmgd->beacon_crc_valid = false; } } if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid) - return; + return RX_MGMT_NONE; ifmgd->beacon_crc = ncrc; ifmgd->beacon_crc_valid = true; - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, - true); + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); if (ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, elems.wmm_param_len)) changed |= BSS_CHANGED_QOS; - if (elems.erp_info && elems.erp_info_len >= 1) { + /* + * If we haven't had a beacon before, tell the driver about the + * DTIM period (and beacon timing if desired) now. 
+ */ + if (!bss_conf->dtim_period) { + /* a few bogus AP send dtim_period = 0 or no TIM IE */ + if (elems.tim) + bss_conf->dtim_period = elems.tim->dtim_period ?: 1; + else + bss_conf->dtim_period = 1; + + if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + sdata->vif.bss_conf.sync_tsf = + le64_to_cpu(mgmt->u.beacon.timestamp); + sdata->vif.bss_conf.sync_device_ts = + rx_status->device_timestamp; + if (elems.tim) + sdata->vif.bss_conf.sync_dtim_count = + elems.tim->dtim_count; + else + sdata->vif.bss_conf.sync_dtim_count = 0; + } + + changed |= BSS_CHANGED_DTIM_PERIOD; + } + + if (elems.erp_info) { erp_valid = true; erp_value = elems.erp_info[0]; } else { @@ -2699,11 +3122,22 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, le16_to_cpu(mgmt->u.beacon.capab_info), erp_valid, erp_value); + mutex_lock(&local->sta_mtx); + sta = sta_info_get(sdata, bssid); + + if (ieee80211_config_bw(sdata, sta, elems.ht_operation, + elems.vht_operation, bssid, &changed)) { + mutex_unlock(&local->sta_mtx); + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, + WLAN_REASON_DEAUTH_LEAVING, + true, deauth_buf); + return RX_MGMT_CFG80211_TX_DEAUTH; + } - if (elems.ht_cap_elem && elems.ht_operation && elems.wmm_param && - !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) - changed |= ieee80211_config_ht_tx(sdata, elems.ht_operation, - bssid, true); + if (sta && elems.opmode_notif) + ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif, + rx_status->band, true); + mutex_unlock(&local->sta_mtx); if (elems.country_elem && elems.pwr_constr_elem && mgmt->u.probe_resp.capab_info & @@ -2714,6 +3148,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, elems.pwr_constr_elem); ieee80211_bss_info_change_notify(sdata, changed); + + return RX_MGMT_NONE; } void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, @@ -2724,7 +3160,10 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss = NULL; enum rx_mgmt_action rma = RX_MGMT_NONE; + u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; u16 fc; + struct ieee802_11_elems elems; + int ies_len; rx_status = (struct ieee80211_rx_status *) skb->cb; mgmt = (struct ieee80211_mgmt *) skb->data; @@ -2734,7 +3173,8 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_BEACON: - ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, rx_status); + rma = ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, + deauth_buf, rx_status); break; case IEEE80211_STYPE_PROBE_RESP: ieee80211_rx_mgmt_probe_resp(sdata, skb); @@ -2753,14 +3193,48 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, rma = ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, &bss); break; case IEEE80211_STYPE_ACTION: - switch (mgmt->u.action.category) { - case WLAN_CATEGORY_SPECTRUM_MGMT: + if (mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) { + ies_len = skb->len - + offsetof(struct ieee80211_mgmt, + u.action.u.chan_switch.variable); + + if (ies_len < 0) + break; + + ieee802_11_parse_elems( + mgmt->u.action.u.chan_switch.variable, + ies_len, true, &elems); + + if (elems.parse_error) + break; + ieee80211_sta_process_chanswitch(sdata, - &mgmt->u.action.u.chan_switch.sw_elem, - (void *)ifmgd->associated->priv, - rx_status->mactime); - break; + rx_status->mactime, + &elems); + } else if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { + ies_len = skb->len - + offsetof(struct ieee80211_mgmt, + 
u.action.u.ext_chan_switch.variable); + + if (ies_len < 0) + break; + + ieee802_11_parse_elems( + mgmt->u.action.u.ext_chan_switch.variable, + ies_len, true, &elems); + + if (elems.parse_error) + break; + + /* for the handling code pretend this was also an IE */ + elems.ext_chansw_ie = + &mgmt->u.action.u.ext_chan_switch.data; + + ieee80211_sta_process_chanswitch(sdata, + rx_status->mactime, + &elems); } + break; } mutex_unlock(&ifmgd->mtx); @@ -2783,6 +3257,10 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, case RX_MGMT_CFG80211_ASSOC_TIMEOUT: cfg80211_send_assoc_timeout(sdata->dev, mgmt->bssid); break; + case RX_MGMT_CFG80211_TX_DEAUTH: + cfg80211_send_deauth(sdata->dev, deauth_buf, + sizeof(deauth_buf)); + break; default: WARN(1, "unexpected: %d", rma); } @@ -2792,26 +3270,18 @@ static void ieee80211_sta_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_local *local = sdata->local; - - if (local->quiescing) { - set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running); - return; - } - ieee80211_queue_work(&local->hw, &sdata->work); + ieee80211_queue_work(&sdata->local->hw, &sdata->work); } static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, - u8 *bssid, u8 reason) + u8 *bssid, u8 reason, bool tx) { - struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, reason, - false, frame_buf); + tx, frame_buf); mutex_unlock(&ifmgd->mtx); /* @@ -2820,10 +3290,6 @@ static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, */ cfg80211_send_deauth(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); - mutex_lock(&local->mtx); - ieee80211_recalc_idle(local); - mutex_unlock(&local->mtx); - mutex_lock(&ifmgd->mtx); } @@ -2832,12 +3298,17 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data; + u32 tx_flags = 0; lockdep_assert_held(&ifmgd->mtx); if (WARN_ON_ONCE(!auth_data)) return -EINVAL; + if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | + IEEE80211_TX_INTFL_MLME_CONN_TX; + auth_data->tries++; if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) { @@ -2874,7 +3345,8 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, auth_data->data, auth_data->data_len, auth_data->bss->bssid, - auth_data->bss->bssid, NULL, 0, 0); + auth_data->bss->bssid, NULL, 0, 0, + tx_flags); } else { const u8 *ssidie; @@ -2893,13 +3365,18 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) * will not answer to direct packet in unassociated state. 
*/ ieee80211_send_probe_req(sdata, NULL, ssidie + 2, ssidie[1], - NULL, 0, (u32) -1, true, false, + NULL, 0, (u32) -1, true, tx_flags, auth_data->bss->channel, false); rcu_read_unlock(); } - auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; - run_again(ifmgd, auth_data->timeout); + if (!(local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; + ifmgd->auth_data->timeout_started = true; + run_again(ifmgd, auth_data->timeout); + } else { + auth_data->timeout_started = false; + } return 0; } @@ -2930,12 +3407,29 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) IEEE80211_ASSOC_MAX_TRIES); ieee80211_send_assoc(sdata); - assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; - run_again(&sdata->u.mgd, assoc_data->timeout); + if (!(local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; + assoc_data->timeout_started = true; + run_again(&sdata->u.mgd, assoc_data->timeout); + } else { + assoc_data->timeout_started = false; + } return 0; } +void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, + __le16 fc, bool acked) +{ + struct ieee80211_local *local = sdata->local; + + sdata->u.mgd.status_fc = fc; + sdata->u.mgd.status_acked = acked; + sdata->u.mgd.status_received = true; + + ieee80211_queue_work(&local->hw, &sdata->work); +} + void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; @@ -2943,7 +3437,36 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) mutex_lock(&ifmgd->mtx); - if (ifmgd->auth_data && + if (ifmgd->status_received) { + __le16 fc = ifmgd->status_fc; + bool status_acked = ifmgd->status_acked; + + ifmgd->status_received = false; + if (ifmgd->auth_data && + (ieee80211_is_probe_req(fc) || ieee80211_is_auth(fc))) { + if (status_acked) { + ifmgd->auth_data->timeout = + jiffies + IEEE80211_AUTH_TIMEOUT_SHORT; + run_again(ifmgd, ifmgd->auth_data->timeout); + } else { + ifmgd->auth_data->timeout = jiffies - 1; + } + ifmgd->auth_data->timeout_started = true; + } else if (ifmgd->assoc_data && + (ieee80211_is_assoc_req(fc) || + ieee80211_is_reassoc_req(fc))) { + if (status_acked) { + ifmgd->assoc_data->timeout = + jiffies + IEEE80211_ASSOC_TIMEOUT_SHORT; + run_again(ifmgd, ifmgd->assoc_data->timeout); + } else { + ifmgd->assoc_data->timeout = jiffies - 1; + } + ifmgd->assoc_data->timeout_started = true; + } + } + + if (ifmgd->auth_data && ifmgd->auth_data->timeout_started && time_after(jiffies, ifmgd->auth_data->timeout)) { if (ifmgd->auth_data->done) { /* @@ -2962,12 +3485,13 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) cfg80211_send_auth_timeout(sdata->dev, bssid); mutex_lock(&ifmgd->mtx); } - } else if (ifmgd->auth_data) + } else if (ifmgd->auth_data && ifmgd->auth_data->timeout_started) run_again(ifmgd, ifmgd->auth_data->timeout); - if (ifmgd->assoc_data && + if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started && time_after(jiffies, ifmgd->assoc_data->timeout)) { - if (!ifmgd->assoc_data->have_beacon || + if ((ifmgd->assoc_data->need_beacon && + !ifmgd->assoc_data->have_beacon) || ieee80211_do_assoc(sdata)) { u8 bssid[ETH_ALEN]; @@ -2979,7 +3503,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) cfg80211_send_assoc_timeout(sdata->dev, bssid); mutex_lock(&ifmgd->mtx); } - } else if (ifmgd->assoc_data) + } else if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started) run_again(ifmgd, ifmgd->assoc_data->timeout); if 
(ifmgd->flags & (IEEE80211_STA_BEACON_POLL | @@ -3010,7 +3534,8 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) "No ack for nullfunc frame to AP %pM, disconnecting.\n", bssid); ieee80211_sta_connection_lost(sdata, bssid, - WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY); + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, + false); } } else if (time_is_after_jiffies(ifmgd->probe_timeout)) run_again(ifmgd, ifmgd->probe_timeout); @@ -3019,7 +3544,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) "Failed to send nullfunc to AP %pM after %dms, disconnecting\n", bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, bssid, - WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY); + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } else if (ifmgd->probe_send_count < max_tries) { mlme_dbg(sdata, "No probe response from AP %pM after %dms, try %d/%i\n", @@ -3038,15 +3563,11 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, bssid, - WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY); + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } } mutex_unlock(&ifmgd->mtx); - - mutex_lock(&local->mtx); - ieee80211_recalc_idle(local); - mutex_unlock(&local->mtx); } static void ieee80211_sta_bcn_mon_timer(unsigned long data) @@ -3058,6 +3579,7 @@ static void ieee80211_sta_bcn_mon_timer(unsigned long data) if (local->quiescing) return; + sdata->u.mgd.connection_loss = false; ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.beacon_connection_loss_work); } @@ -3101,68 +3623,6 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) } } -#ifdef CONFIG_PM -void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - - /* - * we need to use atomic bitops for the running bits - * only because both timers might fire at the same - * time -- the code here is properly synchronised. 
- */ - - cancel_work_sync(&ifmgd->request_smps_work); - - cancel_work_sync(&ifmgd->monitor_work); - cancel_work_sync(&ifmgd->beacon_connection_loss_work); - cancel_work_sync(&ifmgd->csa_connection_drop_work); - if (del_timer_sync(&ifmgd->timer)) - set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running); - - cancel_work_sync(&ifmgd->chswitch_work); - if (del_timer_sync(&ifmgd->chswitch_timer)) - set_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running); - - /* these will just be re-established on connection */ - del_timer_sync(&ifmgd->conn_mon_timer); - del_timer_sync(&ifmgd->bcn_mon_timer); -} - -void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - - if (!ifmgd->associated) - return; - - if (sdata->flags & IEEE80211_SDATA_DISCONNECT_RESUME) { - sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_RESUME; - mutex_lock(&ifmgd->mtx); - if (ifmgd->associated) { - mlme_dbg(sdata, - "driver requested disconnect after resume\n"); - ieee80211_sta_connection_lost(sdata, - ifmgd->associated->bssid, - WLAN_REASON_UNSPECIFIED); - mutex_unlock(&ifmgd->mtx); - return; - } - mutex_unlock(&ifmgd->mtx); - } - - if (test_and_clear_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running)) - add_timer(&ifmgd->timer); - if (test_and_clear_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running)) - add_timer(&ifmgd->chswitch_timer); - ieee80211_sta_reset_beacon_monitor(sdata); - - mutex_lock(&sdata->local->mtx); - ieee80211_restart_sta_timer(sdata); - mutex_unlock(&sdata->local->mtx); -} -#endif - /* interface setup */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) { @@ -3187,8 +3647,9 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ifmgd->flags = 0; ifmgd->powersave = sdata->wdev.ps; - ifmgd->uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES; - ifmgd->uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN; + ifmgd->uapsd_queues = sdata->local->hw.uapsd_queues; + ifmgd->uapsd_max_sp_len = sdata->local->hw.uapsd_max_sp_len; + ifmgd->p2p_noa_index = -1; mutex_init(&ifmgd->mtx); @@ -3205,8 +3666,10 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) /* Restart STA timers */ rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) - ieee80211_restart_sta_timer(sdata); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (ieee80211_sdata_running(sdata)) + ieee80211_restart_sta_timer(sdata); + } rcu_read_unlock(); } @@ -3225,201 +3688,6 @@ int ieee80211_max_network_latency(struct notifier_block *nb, return 0; } -static u32 chandef_downgrade(struct cfg80211_chan_def *c) -{ - u32 ret; - int tmp; - - switch (c->width) { - case NL80211_CHAN_WIDTH_20: - c->width = NL80211_CHAN_WIDTH_20_NOHT; - ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; - break; - case NL80211_CHAN_WIDTH_40: - c->width = NL80211_CHAN_WIDTH_20; - c->center_freq1 = c->chan->center_freq; - ret = IEEE80211_STA_DISABLE_40MHZ | - IEEE80211_STA_DISABLE_VHT; - break; - case NL80211_CHAN_WIDTH_80: - tmp = (30 + c->chan->center_freq - c->center_freq1)/20; - /* n_P40 */ - tmp /= 2; - /* freq_P40 */ - c->center_freq1 = c->center_freq1 - 20 + 40 * tmp; - c->width = NL80211_CHAN_WIDTH_40; - ret = IEEE80211_STA_DISABLE_VHT; - break; - case NL80211_CHAN_WIDTH_80P80: - c->center_freq2 = 0; - c->width = NL80211_CHAN_WIDTH_80; - ret = IEEE80211_STA_DISABLE_80P80MHZ | - IEEE80211_STA_DISABLE_160MHZ; - break; - case NL80211_CHAN_WIDTH_160: - /* n_P20 */ - tmp = (70 + c->chan->center_freq - c->center_freq1)/20; - /* n_P80 */ - tmp /= 4; - 
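/*
 * The chandef_downgrade() code removed here relies on this index
 * arithmetic to re-center the channel definition when halving the width.
 * A standalone sketch with a worked check: primary channel 36 (5180 MHz)
 * inside an 80 MHz block centered at 5210 MHz downgrades to the 40 MHz
 * pair centered at 5190 MHz.  Only the frequency math is modeled here;
 * the function names are illustrative.
 */
#include <assert.h>

/* 80 MHz -> 40 MHz: pick the 40 MHz half containing the primary channel */
static int center_80_to_40(int primary, int cf1_80)
{
	int n_p20 = (30 + primary - cf1_80) / 20;	/* 20 MHz index, 0..3 */
	int n_p40 = n_p20 / 2;				/* 40 MHz index, 0..1 */

	return cf1_80 - 20 + 40 * n_p40;
}

/* 160 MHz -> 80 MHz: the same idea one level up */
static int center_160_to_80(int primary, int cf1_160)
{
	int n_p20 = (70 + primary - cf1_160) / 20;	/* 20 MHz index, 0..7 */
	int n_p80 = n_p20 / 4;				/* 80 MHz index, 0..1 */

	return cf1_160 - 40 + 80 * n_p80;
}

int main(void)
{
	assert(center_80_to_40(5180, 5210) == 5190);
	assert(center_160_to_80(5180, 5250) == 5210);
	assert(center_160_to_80(5300, 5250) == 5290);
	return 0;
}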
c->center_freq1 = c->center_freq1 - 40 + 80 * tmp; - c->width = NL80211_CHAN_WIDTH_80; - ret = IEEE80211_STA_DISABLE_80P80MHZ | - IEEE80211_STA_DISABLE_160MHZ; - break; - default: - case NL80211_CHAN_WIDTH_20_NOHT: - WARN_ON_ONCE(1); - c->width = NL80211_CHAN_WIDTH_20_NOHT; - ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; - break; - } - - WARN_ON_ONCE(!cfg80211_chandef_valid(c)); - - return ret; -} - -static u32 -ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, - struct ieee80211_supported_band *sband, - struct ieee80211_channel *channel, - const struct ieee80211_ht_operation *ht_oper, - const struct ieee80211_vht_operation *vht_oper, - struct cfg80211_chan_def *chandef) -{ - struct cfg80211_chan_def vht_chandef; - u32 ht_cfreq, ret; - - chandef->chan = channel; - chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = channel->center_freq; - chandef->center_freq2 = 0; - - if (!ht_oper || !sband->ht_cap.ht_supported) { - ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; - goto out; - } - - chandef->width = NL80211_CHAN_WIDTH_20; - - ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, - channel->band); - /* check that channel matches the right operating channel */ - if (channel->center_freq != ht_cfreq) { - /* - * It's possible that some APs are confused here; - * Netgear WNDR3700 sometimes reports 4 higher than - * the actual channel in association responses, but - * since we look at probe response/beacon data here - * it should be OK. - */ - sdata_info(sdata, - "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", - channel->center_freq, ht_cfreq, - ht_oper->primary_chan, channel->band); - ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; - goto out; - } - - /* check 40 MHz support, if we have it */ - if (sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) { - switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { - case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: - chandef->width = NL80211_CHAN_WIDTH_40; - chandef->center_freq1 += 10; - break; - case IEEE80211_HT_PARAM_CHA_SEC_BELOW: - chandef->width = NL80211_CHAN_WIDTH_40; - chandef->center_freq1 -= 10; - break; - } - } else { - /* 40 MHz (and 80 MHz) must be supported for VHT */ - ret = IEEE80211_STA_DISABLE_VHT; - goto out; - } - - if (!vht_oper || !sband->vht_cap.vht_supported) { - ret = IEEE80211_STA_DISABLE_VHT; - goto out; - } - - vht_chandef.chan = channel; - vht_chandef.center_freq1 = - ieee80211_channel_to_frequency(vht_oper->center_freq_seg1_idx, - channel->band); - vht_chandef.center_freq2 = 0; - - if (vht_oper->center_freq_seg2_idx) - vht_chandef.center_freq2 = - ieee80211_channel_to_frequency( - vht_oper->center_freq_seg2_idx, - channel->band); - - switch (vht_oper->chan_width) { - case IEEE80211_VHT_CHANWIDTH_USE_HT: - vht_chandef.width = chandef->width; - break; - case IEEE80211_VHT_CHANWIDTH_80MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_80; - break; - case IEEE80211_VHT_CHANWIDTH_160MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_160; - break; - case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - vht_chandef.width = NL80211_CHAN_WIDTH_80P80; - break; - default: - sdata_info(sdata, - "AP VHT operation IE has invalid channel width (%d), disable VHT\n", - vht_oper->chan_width); - ret = IEEE80211_STA_DISABLE_VHT; - goto out; - } - - if (!cfg80211_chandef_valid(&vht_chandef)) { - sdata_info(sdata, - "AP VHT information is invalid, disable VHT\n"); - ret = IEEE80211_STA_DISABLE_VHT; - goto out; - } - - if 
(cfg80211_chandef_identical(chandef, &vht_chandef)) { - ret = 0; - goto out; - } - - if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { - sdata_info(sdata, - "AP VHT information doesn't match HT, disable VHT\n"); - ret = IEEE80211_STA_DISABLE_VHT; - goto out; - } - - *chandef = vht_chandef; - - ret = 0; - -out: - while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, - IEEE80211_CHAN_DISABLED)) { - if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) { - ret = IEEE80211_STA_DISABLE_HT | - IEEE80211_STA_DISABLE_VHT; - goto out; - } - - ret |= chandef_downgrade(chandef); - } - - if (chandef->width != vht_chandef.width) - sdata_info(sdata, - "capabilities/regulatory prevented using AP HT/VHT configuration, downgraded\n"); - - WARN_ON_ONCE(!cfg80211_chandef_valid(chandef)); - return ret; -} - static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss) { @@ -3485,16 +3753,22 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && sband->ht_cap.ht_supported) { - const u8 *ht_oper_ie; + const u8 *ht_oper_ie, *ht_cap; ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); if (ht_oper_ie && ht_oper_ie[1] >= sizeof(*ht_oper)) ht_oper = (void *)(ht_oper_ie + 2); + + ht_cap = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY); + if (!ht_cap || ht_cap[1] < sizeof(struct ieee80211_ht_cap)) { + ifmgd->flags |= IEEE80211_STA_DISABLE_HT; + ht_oper = NULL; + } } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && sband->vht_cap.vht_supported) { - const u8 *vht_oper_ie; + const u8 *vht_oper_ie, *vht_cap; vht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_VHT_OPERATION); @@ -3504,15 +3778,21 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, vht_oper = NULL; sdata_info(sdata, "AP advertised VHT without HT, disabling both\n"); - sdata->flags |= IEEE80211_STA_DISABLE_HT; - sdata->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HT; + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + } + + vht_cap = ieee80211_bss_get_ie(cbss, WLAN_EID_VHT_CAPABILITY); + if (!vht_cap || vht_cap[1] < sizeof(struct ieee80211_vht_cap)) { + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + vht_oper = NULL; } } ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, ht_oper, vht_oper, - &chandef); + &chandef, true); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), local->rx_chains); @@ -3562,15 +3842,12 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return -ENOMEM; } - mutex_lock(&local->mtx); - ieee80211_recalc_idle(sdata->local); - mutex_unlock(&local->mtx); - if (new_sta) { u32 rates = 0, basic_rates = 0; bool have_higher_than_11mbit; int min_rate = INT_MAX, min_rate_index = -1; struct ieee80211_supported_band *sband; + const struct cfg80211_bss_ies *ies; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -3614,8 +3891,34 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, /* set timing information */ sdata->vif.bss_conf.beacon_int = cbss->beacon_interval; - sdata->vif.bss_conf.sync_tsf = cbss->tsf; - sdata->vif.bss_conf.sync_device_ts = bss->device_ts; + rcu_read_lock(); + ies = rcu_dereference(cbss->beacon_ies); + if (ies) { + const u8 *tim_ie; + + sdata->vif.bss_conf.sync_tsf = ies->tsf; + sdata->vif.bss_conf.sync_device_ts = + bss->device_ts_beacon; + tim_ie = cfg80211_find_ie(WLAN_EID_TIM, + ies->data, ies->len); + if (tim_ie && tim_ie[1] >= 2) + 
sdata->vif.bss_conf.sync_dtim_count = tim_ie[2]; + else + sdata->vif.bss_conf.sync_dtim_count = 0; + } else if (!(local->hw.flags & + IEEE80211_HW_TIMING_BEACON_ONLY)) { + ies = rcu_dereference(cbss->proberesp_ies); + /* must be non-NULL since beacon IEs were NULL */ + sdata->vif.bss_conf.sync_tsf = ies->tsf; + sdata->vif.bss_conf.sync_device_ts = + bss->device_ts_presp; + sdata->vif.bss_conf.sync_dtim_count = 0; + } else { + sdata->vif.bss_conf.sync_tsf = 0; + sdata->vif.bss_conf.sync_device_ts = 0; + sdata->vif.bss_conf.sync_dtim_count = 0; + } + rcu_read_unlock(); /* tell driver about BSSID, basic rates and timing */ ieee80211_bss_info_change_notify(sdata, @@ -3719,8 +4022,16 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, /* prep auth_data so we don't go into idle on disassoc */ ifmgd->auth_data = auth_data; - if (ifmgd->associated) - ieee80211_set_disassoc(sdata, 0, 0, false, NULL); + if (ifmgd->associated) { + u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, + WLAN_REASON_UNSPECIFIED, + false, frame_buf); + + __cfg80211_send_deauth(sdata->dev, frame_buf, + sizeof(frame_buf)); + } sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); @@ -3735,7 +4046,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, } /* hold our own reference */ - cfg80211_ref_bss(auth_data->bss); + cfg80211_ref_bss(local->hw.wiphy, auth_data->bss); err = 0; goto out_unlock; @@ -3758,8 +4069,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)req->bss->priv; struct ieee80211_mgd_assoc_data *assoc_data; + const struct cfg80211_bss_ies *beacon_ies; struct ieee80211_supported_band *sband; - const u8 *ssidie, *ht_ie; + const u8 *ssidie, *ht_ie, *vht_ie; int i, err; assoc_data = kzalloc(sizeof(*assoc_data) + req->ie_len, GFP_KERNEL); @@ -3779,8 +4091,16 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, mutex_lock(&ifmgd->mtx); - if (ifmgd->associated) - ieee80211_set_disassoc(sdata, 0, 0, false, NULL); + if (ifmgd->associated) { + u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, + WLAN_REASON_UNSPECIFIED, + false, frame_buf); + + __cfg80211_send_deauth(sdata->dev, frame_buf, + sizeof(frame_buf)); + } if (ifmgd->auth_data && !ifmgd->auth_data->done) { err = -EBUSY; @@ -3827,6 +4147,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; } + if (req->flags & ASSOC_REQ_DISABLE_VHT) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + /* Also disable HT if we don't support it or the AP doesn't use WMM */ sband = local->hw.wiphy->bands[req->bss->channel->band]; if (!sband->ht_cap.ht_supported || @@ -3850,6 +4173,10 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask, sizeof(ifmgd->ht_capa_mask)); + memcpy(&ifmgd->vht_capa, &req->vht_capa, sizeof(ifmgd->vht_capa)); + memcpy(&ifmgd->vht_capa_mask, &req->vht_capa_mask, + sizeof(ifmgd->vht_capa_mask)); + if (req->ie && req->ie_len) { memcpy(assoc_data->ie, req->ie, req->ie_len); assoc_data->ie_len = req->ie_len; @@ -3878,10 +4205,17 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ((struct ieee80211_ht_operation *)(ht_ie + 2))->ht_param; else ifmgd->flags |= IEEE80211_STA_DISABLE_HT; + vht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_VHT_CAPABILITY); + if (vht_ie && vht_ie[1] >= sizeof(struct 
ieee80211_vht_cap)) + memcpy(&assoc_data->ap_vht_cap, vht_ie + 2, + sizeof(struct ieee80211_vht_cap)); + else + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; rcu_read_unlock(); if (bss->wmm_used && bss->uapsd_supported && - (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) { + (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) && + sdata->wmm_acm != 0xff) { assoc_data->uapsd = true; ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; } else { @@ -3917,40 +4251,48 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (err) goto err_clear; - if (sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) { - const struct cfg80211_bss_ies *beacon_ies; + rcu_read_lock(); + beacon_ies = rcu_dereference(req->bss->beacon_ies); - rcu_read_lock(); - beacon_ies = rcu_dereference(req->bss->beacon_ies); - if (!beacon_ies) { - /* - * Wait up to one beacon interval ... - * should this be more if we miss one? - */ - sdata_info(sdata, "waiting for beacon from %pM\n", - ifmgd->bssid); - assoc_data->timeout = - TU_TO_EXP_TIME(req->bss->beacon_interval); - } else { - const u8 *tim_ie = cfg80211_find_ie(WLAN_EID_TIM, - beacon_ies->data, - beacon_ies->len); - if (tim_ie && tim_ie[1] >= - sizeof(struct ieee80211_tim_ie)) { - const struct ieee80211_tim_ie *tim; - tim = (void *)(tim_ie + 2); - ifmgd->dtim_period = tim->dtim_period; - } - assoc_data->have_beacon = true; - assoc_data->sent_assoc = false; - assoc_data->timeout = jiffies; + if (sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC && + !beacon_ies) { + /* + * Wait up to one beacon interval ... + * should this be more if we miss one? + */ + sdata_info(sdata, "waiting for beacon from %pM\n", + ifmgd->bssid); + assoc_data->timeout = TU_TO_EXP_TIME(req->bss->beacon_interval); + assoc_data->timeout_started = true; + assoc_data->need_beacon = true; + } else if (beacon_ies) { + const u8 *tim_ie = cfg80211_find_ie(WLAN_EID_TIM, + beacon_ies->data, + beacon_ies->len); + u8 dtim_count = 0; + + if (tim_ie && tim_ie[1] >= sizeof(struct ieee80211_tim_ie)) { + const struct ieee80211_tim_ie *tim; + tim = (void *)(tim_ie + 2); + ifmgd->dtim_period = tim->dtim_period; + dtim_count = tim->dtim_count; } - rcu_read_unlock(); - } else { assoc_data->have_beacon = true; - assoc_data->sent_assoc = false; assoc_data->timeout = jiffies; + assoc_data->timeout_started = true; + + if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + sdata->vif.bss_conf.sync_tsf = beacon_ies->tsf; + sdata->vif.bss_conf.sync_device_ts = + bss->device_ts_beacon; + sdata->vif.bss_conf.sync_dtim_count = dtim_count; + } + } else { + assoc_data->timeout = jiffies; + assoc_data->timeout_started = true; } + rcu_read_unlock(); + run_again(ifmgd, assoc_data->timeout); if (bss->corrupt_data) { @@ -4017,10 +4359,6 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, mutex_unlock(&ifmgd->mtx); out: - mutex_lock(&sdata->local->mtx); - ieee80211_recalc_idle(sdata->local); - mutex_unlock(&sdata->local->mtx); - if (sent_frame) __cfg80211_send_deauth(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); @@ -4061,10 +4399,6 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, __cfg80211_send_disassoc(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); - mutex_lock(&sdata->local->mtx); - ieee80211_recalc_idle(sdata->local); - mutex_unlock(&sdata->local->mtx); - return 0; } @@ -4072,6 +4406,17 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + /* + * Make sure some work items will not run after this, + * 
they will not do anything but might not have been + * cancelled when disconnecting. + */ + cancel_work_sync(&ifmgd->monitor_work); + cancel_work_sync(&ifmgd->beacon_connection_loss_work); + cancel_work_sync(&ifmgd->request_smps_work); + cancel_work_sync(&ifmgd->csa_connection_drop_work); + cancel_work_sync(&ifmgd->chswitch_work); + mutex_lock(&ifmgd->mtx); if (ifmgd->assoc_data) ieee80211_destroy_assoc_data(sdata, false); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index a3ad4c3c80a3..acd1f71adc03 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -113,6 +113,15 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) * notify the AP about us leaving the channel and stop all * STA interfaces. */ + + /* + * Stop queues and transmit all frames queued by the driver + * before sending nullfunc to enable powersave at the AP. + */ + ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL); + ieee80211_flush_queues(local, NULL); + mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) @@ -125,18 +134,17 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); /* Check to see if we should disable beaconing. */ - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_ADHOC || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + if (sdata->vif.bss_conf.enable_beacon) { + set_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, + &sdata->state); + sdata->vif.bss_conf.enable_beacon = false; ieee80211_bss_info_change_notify( sdata, BSS_CHANGED_BEACON_ENABLED); - - if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { - netif_tx_stop_all_queues(sdata->dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION && - sdata->u.mgd.associated) - ieee80211_offchannel_ps_enable(sdata); } + + if (sdata->vif.type == NL80211_IFTYPE_STATION && + sdata->u.mgd.associated) + ieee80211_offchannel_ps_enable(sdata); } mutex_unlock(&local->iflist_mtx); } @@ -164,27 +172,17 @@ void ieee80211_offchannel_return(struct ieee80211_local *local) sdata->u.mgd.associated) ieee80211_offchannel_ps_disable(sdata); - if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { - /* - * This may wake up queues even though the driver - * currently has them stopped. This is not very - * likely, since the driver won't have gotten any - * (or hardly any) new packets while we weren't - * on the right channel, and even if it happens - * it will at most lead to queueing up one more - * packet per queue in mac80211 rather than on - * the interface qdisc. 
- */ - netif_tx_wake_all_queues(sdata->dev); - } - - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_ADHOC || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + if (test_and_clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, + &sdata->state)) { + sdata->vif.bss_conf.enable_beacon = true; ieee80211_bss_info_change_notify( sdata, BSS_CHANGED_BEACON_ENABLED); + } } mutex_unlock(&local->iflist_mtx); + + ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL); } void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc) @@ -279,7 +277,7 @@ void ieee80211_start_next_roc(struct ieee80211_local *local) duration = 10; ret = drv_remain_on_channel(local, roc->sdata, roc->chan, - duration); + duration, roc->type); roc->started = true; @@ -299,10 +297,13 @@ void ieee80211_start_next_roc(struct ieee80211_local *local) } } -void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) +void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free) { struct ieee80211_roc_work *dep, *tmp; + if (WARN_ON(roc->to_be_freed)) + return; + /* was never transmitted */ if (roc->frame) { cfg80211_mgmt_tx_status(&roc->sdata->wdev, @@ -318,9 +319,12 @@ void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) GFP_KERNEL); list_for_each_entry_safe(dep, tmp, &roc->dependents, list) - ieee80211_roc_notify_destroy(dep); + ieee80211_roc_notify_destroy(dep, true); - kfree(roc); + if (free) + kfree(roc); + else + roc->to_be_freed = true; } void ieee80211_sw_roc_work(struct work_struct *work) @@ -333,6 +337,9 @@ void ieee80211_sw_roc_work(struct work_struct *work) mutex_lock(&local->mtx); + if (roc->to_be_freed) + goto out_unlock; + if (roc->abort) goto finish; @@ -372,10 +379,10 @@ void ieee80211_sw_roc_work(struct work_struct *work) finish: list_del(&roc->list); started = roc->started; - ieee80211_roc_notify_destroy(roc); + ieee80211_roc_notify_destroy(roc, !roc->abort); if (started) { - drv_flush(local, false); + ieee80211_flush_queues(local, NULL); local->tmp_channel = NULL; ieee80211_hw_config(local, 0); @@ -412,7 +419,7 @@ static void ieee80211_hw_roc_done(struct work_struct *work) list_del(&roc->list); - ieee80211_roc_notify_destroy(roc); + ieee80211_roc_notify_destroy(roc, true); /* if there's another roc, start it now */ ieee80211_start_next_roc(local); @@ -438,15 +445,15 @@ void ieee80211_roc_setup(struct ieee80211_local *local) INIT_LIST_HEAD(&local->roc_list); } -void ieee80211_roc_purge(struct ieee80211_sub_if_data *sdata) +void ieee80211_roc_purge(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { - struct ieee80211_local *local = sdata->local; struct ieee80211_roc_work *roc, *tmp; LIST_HEAD(tmp_list); mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { - if (roc->sdata != sdata) + if (sdata && roc->sdata != sdata) continue; if (roc->started && local->ops->remain_on_channel) { @@ -462,12 +469,14 @@ void ieee80211_roc_purge(struct ieee80211_sub_if_data *sdata) list_for_each_entry_safe(roc, tmp, &tmp_list, list) { if (local->ops->remain_on_channel) { list_del(&roc->list); - ieee80211_roc_notify_destroy(roc); + ieee80211_roc_notify_destroy(roc, true); } else { ieee80211_queue_delayed_work(&local->hw, &roc->work, 0); /* work will clean up etc */ flush_delayed_work(&roc->work); + WARN_ON(!roc->to_be_freed); + kfree(roc); } } diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 79a48f37d409..7fc5d0d8149a 100644 --- a/net/mac80211/pm.c +++ 
b/net/mac80211/pm.c @@ -6,56 +6,42 @@ #include "driver-ops.h" #include "led.h" -/* return value indicates whether the driver should be further notified */ -static bool ieee80211_quiesce(struct ieee80211_sub_if_data *sdata) -{ - switch (sdata->vif.type) { - case NL80211_IFTYPE_STATION: - ieee80211_sta_quiesce(sdata); - return true; - case NL80211_IFTYPE_ADHOC: - ieee80211_ibss_quiesce(sdata); - return true; - case NL80211_IFTYPE_MESH_POINT: - ieee80211_mesh_quiesce(sdata); - return true; - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_MONITOR: - /* don't tell driver about this */ - return false; - default: - return true; - } -} - int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; struct sta_info *sta; - struct ieee80211_chanctx *ctx; if (!local->open_count) goto suspend; ieee80211_scan_cancel(local); + ieee80211_dfs_cac_cancel(local); + + ieee80211_roc_purge(local, NULL); + + ieee80211_del_virtual_monitor(local); + if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); - ieee80211_sta_tear_down_BA_sessions(sta, true); + ieee80211_sta_tear_down_BA_sessions( + sta, AGG_STOP_LOCAL_REQUEST); } mutex_unlock(&local->sta_mtx); } ieee80211_stop_queues_by_reason(hw, - IEEE80211_QUEUE_STOP_REASON_SUSPEND); + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_SUSPEND); - /* flush out all packets */ + /* flush out all packets and station cleanup call_rcu()s */ synchronize_net(); + rcu_barrier(); - drv_flush(local, false); + ieee80211_flush_queues(local, NULL); local->quiescing = true; /* make quiescing visible to timers everywhere */ @@ -88,24 +74,17 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) mutex_unlock(&local->sta_mtx); } ieee80211_wake_queues_by_reason(hw, + IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND); return err; } else if (err > 0) { WARN_ON(err != 1); - local->wowlan = false; + return err; } else { - list_for_each_entry(sdata, &local->interfaces, list) { - cancel_work_sync(&sdata->work); - ieee80211_quiesce(sdata); - } goto suspend; } } - /* disable keys */ - list_for_each_entry(sdata, &local->interfaces, list) - ieee80211_disable_keys(sdata); - /* tear down aggregation sessions and remove STAs */ mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { @@ -117,74 +96,21 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) WARN_ON(drv_sta_state(local, sta->sdata, sta, state, state - 1)); } - - mesh_plink_quiesce(sta); } mutex_unlock(&local->sta_mtx); /* remove all interfaces */ list_for_each_entry(sdata, &local->interfaces, list) { - cancel_work_sync(&sdata->work); - - if (!ieee80211_quiesce(sdata)) - continue; - if (!ieee80211_sdata_running(sdata)) continue; - - /* disable beaconing */ - ieee80211_bss_info_change_notify(sdata, - BSS_CHANGED_BEACON_ENABLED); - - if (sdata->vif.type == NL80211_IFTYPE_AP && - rcu_access_pointer(sdata->u.ap.beacon)) - drv_stop_ap(local, sdata); - - if (local->use_chanctx) { - struct ieee80211_chanctx_conf *conf; - - mutex_lock(&local->chanctx_mtx); - conf = rcu_dereference_protected( - sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (conf) { - ctx = container_of(conf, - struct ieee80211_chanctx, - conf); - drv_unassign_vif_chanctx(local, sdata, ctx); - } - - mutex_unlock(&local->chanctx_mtx); - } 
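/*
 * The suspend rework in this hunk boils down to a strict teardown order:
 * stop new TX, flush what is in flight, disconnect, and only then assert
 * that no channel context survived.  A toy model of that ordering, with
 * plain counters standing in for the real mac80211 state (toy_local and
 * its helpers are hypothetical, not kernel API):
 */
#include <assert.h>
#include <stdbool.h>

struct toy_local {
	bool queues_stopped;
	int  pending_frames;	/* frames still owned by the "driver" */
	int  num_chanctx;	/* live channel contexts */
};

static void stop_queues(struct toy_local *l)  { l->queues_stopped = true; }
static void flush_queues(struct toy_local *l) { l->pending_frames = 0; }

/* disconnecting every interface is what releases the channel contexts */
static void disconnect_all(struct toy_local *l) { l->num_chanctx = 0; }

static void toy_suspend(struct toy_local *l)
{
	stop_queues(l);		/* no new frames may be queued ... */
	flush_queues(l);	/* ... and everything in flight goes out */
	disconnect_all(l);

	/* mirrors the WARN_ON(!list_empty(&local->chanctx_list)) check */
	assert(l->num_chanctx == 0);
	assert(l->queues_stopped && l->pending_frames == 0);
}

int main(void)
{
	struct toy_local l = { false, 3, 2 };

	toy_suspend(&l);
	return 0;
}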
- drv_remove_interface(local, sdata); - } - - sdata = rtnl_dereference(local->monitor_sdata); - if (sdata) { - if (local->use_chanctx) { - struct ieee80211_chanctx_conf *conf; - - mutex_lock(&local->chanctx_mtx); - conf = rcu_dereference_protected( - sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (conf) { - ctx = container_of(conf, - struct ieee80211_chanctx, - conf); - drv_unassign_vif_chanctx(local, sdata, ctx); - } - - mutex_unlock(&local->chanctx_mtx); - } - drv_remove_interface(local, sdata); } - mutex_lock(&local->chanctx_mtx); - list_for_each_entry(ctx, &local->chanctx_list, list) - drv_remove_chanctx(local, ctx); - mutex_unlock(&local->chanctx_mtx); + /* + * We disconnected on all interfaces before suspend, so all channel + * contexts should be released. + */ + WARN_ON(!list_empty(&local->chanctx_list)); /* stop hardware - this must stop RX */ if (local->open_count) @@ -204,3 +130,13 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) * ieee80211_reconfig(), which is also needed for hardware * hang/firmware failure/etc. recovery. */ + +void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, + struct cfg80211_wowlan_wakeup *wakeup, + gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + cfg80211_report_wowlan_wakeup(&sdata->wdev, wakeup, gfp); +} +EXPORT_SYMBOL(ieee80211_report_wowlan_wakeup);
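Aside, not part of the patch: a minimal sketch of how a WoWLAN-capable driver might use the ieee80211_report_wowlan_wakeup() helper exported just above when it resumes and finds the hardware woke the host. The mydrv_* name is hypothetical; the struct cfg80211_wowlan_wakeup fields are the standard ones.

#include <net/mac80211.h>

/* Hypothetical driver resume path: tell cfg80211/userspace why the
 * hardware woke us. pattern_idx = -1 means "no pattern matched". */
static void mydrv_report_wakeup(struct ieee80211_vif *vif, bool magic)
{
        struct cfg80211_wowlan_wakeup wakeup = {
                .magic_pkt = magic,
                .pattern_idx = -1,
        };

        ieee80211_report_wowlan_wakeup(vif, &wakeup, GFP_KERNEL);
}

diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index dd88381c53b7..0d51877efdb7 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -252,6 +252,25 @@ rate_lowest_non_cck_index(struct ieee80211_supported_band *sband, return 0; } +static void __rate_control_send_low(struct ieee80211_hw *hw, + struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, + struct ieee80211_tx_info *info) +{ + if ((sband->band != IEEE80211_BAND_2GHZ) || + !(info->flags & IEEE80211_TX_CTL_NO_CCK_RATE)) + info->control.rates[0].idx = rate_lowest_index(sband, sta); + else + info->control.rates[0].idx = + rate_lowest_non_cck_index(sband, sta); + + info->control.rates[0].count = + (info->flags & IEEE80211_TX_CTL_NO_ACK) ?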
- 1 : txrc->hw->max_rate_tries; + __rate_control_send_low(txrc->hw, sband, sta, info); + if (!sta && txrc->bss) { mcast_rate = txrc->bss_conf->mcast_rate[sband->band]; if (mcast_rate > 0) { @@ -355,7 +366,8 @@ static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate, static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, - struct ieee80211_tx_rate_control *txrc, + struct ieee80211_supported_band *sband, + enum nl80211_chan_width chan_width, u32 mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) { @@ -375,27 +387,17 @@ static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, IEEE80211_TX_RC_USE_SHORT_PREAMBLE); alt_rate.count = rate->count; if (rate_idx_match_legacy_mask(&alt_rate, - txrc->sband->n_bitrates, - mask)) { + sband->n_bitrates, mask)) { *rate = alt_rate; return; } } else { - struct sk_buff *skb = txrc->skb; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - __le16 fc; - /* handle legacy rates */ - if (rate_idx_match_legacy_mask(rate, txrc->sband->n_bitrates, - mask)) + if (rate_idx_match_legacy_mask(rate, sband->n_bitrates, mask)) return; /* if HT BSS, and we handle a data frame, also try HT rates */ - if (txrc->bss_conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT) - return; - - fc = hdr->frame_control; - if (!ieee80211_is_data(fc)) + if (chan_width == NL80211_CHAN_WIDTH_20_NOHT) return; alt_rate.idx = 0; @@ -408,7 +410,7 @@ static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, alt_rate.flags |= IEEE80211_TX_RC_MCS; - if (txrc->bss_conf->chandef.width == NL80211_CHAN_WIDTH_40) + if (chan_width == NL80211_CHAN_WIDTH_40) alt_rate.flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (rate_idx_match_mcs_mask(&alt_rate, mcs_mask)) { @@ -426,6 +428,228 @@ static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, */ } +static void rate_fixup_ratelist(struct ieee80211_vif *vif, + struct ieee80211_supported_band *sband, + struct ieee80211_tx_info *info, + struct ieee80211_tx_rate *rates, + int max_rates) +{ + struct ieee80211_rate *rate; + bool inval = false; + int i; + + /* + * Set up the RTS/CTS rate as the fastest basic rate + * that is not faster than the data rate unless there + * is no basic rate slower than the data rate, in which + * case we pick the slowest basic rate + * + * XXX: Should this check all retry rates? + */ + if (!(rates[0].flags & IEEE80211_TX_RC_MCS)) { + u32 basic_rates = vif->bss_conf.basic_rates; + s8 baserate = basic_rates ? ffs(basic_rates) - 1 : 0; + + rate = &sband->bitrates[rates[0].idx]; + + for (i = 0; i < sband->n_bitrates; i++) { + /* must be a basic rate */ + if (!(basic_rates & BIT(i))) + continue; + /* must not be faster than the data rate */ + if (sband->bitrates[i].bitrate > rate->bitrate) + continue; + /* maximum */ + if (sband->bitrates[baserate].bitrate < + sband->bitrates[i].bitrate) + baserate = i; + } + + info->control.rts_cts_rate_idx = baserate; + } + + for (i = 0; i < max_rates; i++) { + /* + * make sure there's no valid rate following + * an invalid one, just in case drivers don't + * take the API seriously to stop at -1. + */ + if (inval) { + rates[i].idx = -1; + continue; + } + if (rates[i].idx < 0) { + inval = true; + continue; + } + + /* + * For now assume MCS is already set up correctly, this + * needs to be fixed.
+ */ + if (rates[i].flags & IEEE80211_TX_RC_MCS) { + WARN_ON(rates[i].idx > 76); + + if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && + info->control.use_cts_prot) + rates[i].flags |= + IEEE80211_TX_RC_USE_CTS_PROTECT; + continue; + } + + if (rates[i].flags & IEEE80211_TX_RC_VHT_MCS) { + WARN_ON(ieee80211_rate_get_vht_mcs(&rates[i]) > 9); + continue; + } + + /* set up RTS protection if desired */ + if (info->control.use_rts) { + rates[i].flags |= IEEE80211_TX_RC_USE_RTS_CTS; + info->control.use_cts_prot = false; + } + + /* RC is busted */ + if (WARN_ON_ONCE(rates[i].idx >= sband->n_bitrates)) { + rates[i].idx = -1; + continue; + } + + rate = &sband->bitrates[rates[i].idx]; + + /* set up short preamble */ + if (info->control.short_preamble && + rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) + rates[i].flags |= IEEE80211_TX_RC_USE_SHORT_PREAMBLE; + + /* set up G protection */ + if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && + info->control.use_cts_prot && + rate->flags & IEEE80211_RATE_ERP_G) + rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT; + } +} + + +static void rate_control_fill_sta_table(struct ieee80211_sta *sta, + struct ieee80211_tx_info *info, + struct ieee80211_tx_rate *rates, + int max_rates) +{ + struct ieee80211_sta_rates *ratetbl = NULL; + int i; + + if (sta && !info->control.skip_table) + ratetbl = rcu_dereference(sta->rates); + + /* Fill remaining rate slots with data from the sta rate table. */ + max_rates = min_t(int, max_rates, IEEE80211_TX_RATE_TABLE_SIZE); + for (i = 0; i < max_rates; i++) { + if (i < ARRAY_SIZE(info->control.rates) && + info->control.rates[i].idx >= 0 && + info->control.rates[i].count) { + if (rates != info->control.rates) + rates[i] = info->control.rates[i]; + } else if (ratetbl) { + rates[i].idx = ratetbl->rate[i].idx; + rates[i].flags = ratetbl->rate[i].flags; + if (info->control.use_rts) + rates[i].count = ratetbl->rate[i].count_rts; + else if (info->control.use_cts_prot) + rates[i].count = ratetbl->rate[i].count_cts; + else + rates[i].count = ratetbl->rate[i].count; + } else { + rates[i].idx = -1; + rates[i].count = 0; + } + + if (rates[i].idx < 0 || !rates[i].count) + break; + } +} + +static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + struct ieee80211_supported_band *sband, + struct ieee80211_tx_info *info, + struct ieee80211_tx_rate *rates, + int max_rates) +{ + enum nl80211_chan_width chan_width; + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; + bool has_mcs_mask; + u32 mask; + int i; + + /* + * Try to enforce the rateidx mask the user wanted. skip this if the + * default mask (allow all rates) is used to save some processing for + * the common case. + */ + mask = sdata->rc_rateidx_mask[info->band]; + has_mcs_mask = sdata->rc_has_mcs_mask[info->band]; + if (mask == (1 << sband->n_bitrates) - 1 && !has_mcs_mask) + return; + + if (has_mcs_mask) + memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[info->band], + sizeof(mcs_mask)); + else + memset(mcs_mask, 0xff, sizeof(mcs_mask)); + + if (sta) { + /* Filter out rates that the STA does not support */ + mask &= sta->supp_rates[info->band]; + for (i = 0; i < sizeof(mcs_mask); i++) + mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; + } + + /* + * Make sure the rate index selected for each TX rate is + * included in the configured mask and change the rate indexes + * if needed. 
+ */ + chan_width = sdata->vif.bss_conf.chandef.width; + for (i = 0; i < max_rates; i++) { + /* Skip invalid rates */ + if (rates[i].idx < 0) + break; + + rate_idx_match_mask(&rates[i], sband, chan_width, mask, + mcs_mask); + } +} + +void ieee80211_get_tx_rates(struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct sk_buff *skb, + struct ieee80211_tx_rate *dest, + int max_rates) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_supported_band *sband; + + rate_control_fill_sta_table(sta, info, dest, max_rates); + + if (!vif) + return; + + sdata = vif_to_sdata(vif); + sband = sdata->local->hw.wiphy->bands[info->band]; + + if (ieee80211_is_data(hdr->frame_control)) + rate_control_apply_mask(sdata, sta, sband, info, dest, max_rates); + + if (dest[0].idx < 0) + __rate_control_send_low(&sdata->local->hw, sband, sta, info); + + if (sta) + rate_fixup_ratelist(vif, sband, info, dest, max_rates); +} +EXPORT_SYMBOL(ieee80211_get_tx_rates); +
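Aside, not from the patch: a minimal sketch of how a driver that opts into the new rate-table scheme might consume the ieee80211_get_tx_rates() helper exported just above, looking rates up at dequeue time instead of relying on rates chosen at enqueue time. The mydrv_* name and the descriptor step are hypothetical.

#include <net/mac80211.h>

static void mydrv_tx_dequeue(struct ieee80211_vif *vif,
                             struct ieee80211_sta *sta,
                             struct sk_buff *skb)
{
        struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES];
        int i;

        /* Fill rates[] from info->control.rates and, where enabled,
         * from the station's current rate table; slots past the last
         * valid entry are terminated with idx == -1. */
        ieee80211_get_tx_rates(vif, sta, skb, rates, ARRAY_SIZE(rates));

        for (i = 0; i < ARRAY_SIZE(rates); i++) {
                if (rates[i].idx < 0)
                        break;
                /* program rates[i].idx / rates[i].count into the
                 * hardware TX descriptor here */
        }
}

void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc) @@ -435,8 +659,6 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *ista = NULL; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); int i; - u32 mask; - u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { ista = &sta->sta; @@ -454,37 +676,27 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); - /* - * Try to enforce the rateidx mask the user wanted. skip this if the - * default mask (allow all rates) is used to save some processing for - * the common case. - */ - mask = sdata->rc_rateidx_mask[info->band]; - memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[info->band], - sizeof(mcs_mask)); - if (mask != (1 << txrc->sband->n_bitrates) - 1) { - if (sta) { - /* Filter out rates that the STA does not support */ - mask &= sta->sta.supp_rates[info->band]; - for (i = 0; i < sizeof(mcs_mask); i++) - mcs_mask[i] &= sta->sta.ht_cap.mcs.rx_mask[i]; - } - /* - * Make sure the rate index selected for each TX rate is - * included in the configured mask and change the rate indexes - * if needed.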
- */ - for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { - /* Skip invalid rates */ - if (info->control.rates[i].idx < 0) - break; - rate_idx_match_mask(&info->control.rates[i], txrc, - mask, mcs_mask); - } - } + if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_RC_TABLE) + return; + + ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb, + info->control.rates, + ARRAY_SIZE(info->control.rates)); +} - BUG_ON(info->control.rates[0].idx < 0); +int rate_control_set_rates(struct ieee80211_hw *hw, + struct ieee80211_sta *pubsta, + struct ieee80211_sta_rates *rates) +{ + struct ieee80211_sta_rates *old = rcu_dereference(pubsta->rates); + + rcu_assign_pointer(pubsta->rates, rates); + if (old) + kfree_rcu(old, rcu_head); + + return 0; } +EXPORT_SYMBOL(rate_control_set_rates); int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name) diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 301386dabf88..d35a5dd3fb13 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -68,6 +68,8 @@ static inline void rate_control_rate_init(struct sta_info *sta) sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; rcu_read_unlock(); + ieee80211_sta_set_rx_nss(sta); + ref->ops->rate_init(ref->priv, sband, ista, priv_sta); set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 8c5acdc06226..ac7ef5414bde 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -55,7 +55,6 @@ #include "rate.h" #include "rc80211_minstrel.h" -#define SAMPLE_COLUMNS 10 #define SAMPLE_TBL(_mi, _idx, _col) \ _mi->sample_table[(_idx * SAMPLE_COLUMNS) + _col] @@ -70,16 +69,75 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix) return i; } +/* find & sort topmost throughput rates */ +static inline void +minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) +{ + int j = MAX_THR_RATES; + + while (j > 0 && mi->r[i].cur_tp > mi->r[tp_list[j - 1]].cur_tp) + j--; + if (j < MAX_THR_RATES - 1) + memmove(&tp_list[j + 1], &tp_list[j], MAX_THR_RATES - (j + 1)); + if (j < MAX_THR_RATES) + tp_list[j] = i; +} + +static void +minstrel_set_rate(struct minstrel_sta_info *mi, struct ieee80211_sta_rates *ratetbl, + int offset, int idx) +{ + struct minstrel_rate *r = &mi->r[idx]; + + ratetbl->rate[offset].idx = r->rix; + ratetbl->rate[offset].count = r->adjusted_retry_count; + ratetbl->rate[offset].count_cts = r->retry_count_cts; + ratetbl->rate[offset].count_rts = r->retry_count_rtscts; +} + +static void +minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) +{ + struct ieee80211_sta_rates *ratetbl; + int i = 0; + + ratetbl = kzalloc(sizeof(*ratetbl), GFP_ATOMIC); + if (!ratetbl) + return; + + /* Start with max_tp_rate */ + minstrel_set_rate(mi, ratetbl, i++, mi->max_tp_rate[0]); + + if (mp->hw->max_rates >= 3) { + /* At least 3 tx rates supported, use max_tp_rate2 next */ + minstrel_set_rate(mi, ratetbl, i++, mi->max_tp_rate[1]); + } + + if (mp->hw->max_rates >= 2) { + /* At least 2 tx rates supported, use max_prob_rate next */ + minstrel_set_rate(mi, ratetbl, i++, mi->max_prob_rate); + } + + /* Use lowest rate last */ + ratetbl->rate[i].idx = mi->lowest_rix; + ratetbl->rate[i].count = mp->max_retry; + ratetbl->rate[i].count_cts = mp->max_retry; + ratetbl->rate[i].count_rts = mp->max_retry; + + rate_control_set_rates(mp->hw, mi->sta, ratetbl); +} + static void minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) { - u32 max_tp = 0, index_max_tp 
= 0, index_max_tp2 = 0; - u32 max_prob = 0, index_max_prob = 0; + u8 tmp_tp_rate[MAX_THR_RATES]; + u8 tmp_prob_rate = 0; u32 usecs; - u32 p; int i; - mi->stats_update = jiffies; + for (i = 0; i < MAX_THR_RATES; i++) + tmp_tp_rate[i] = 0; + for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; @@ -87,27 +145,32 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) if (!usecs) usecs = 1000000; - /* To avoid rounding issues, probabilities scale from 0 (0%) - * to 18000 (100%) */ - if (mr->attempts) { - p = (mr->success * 18000) / mr->attempts; + if (unlikely(mr->attempts > 0)) { + mr->sample_skipped = 0; + mr->cur_prob = MINSTREL_FRAC(mr->success, mr->attempts); mr->succ_hist += mr->success; mr->att_hist += mr->attempts; - mr->cur_prob = p; - p = ((p * (100 - mp->ewma_level)) + (mr->probability * - mp->ewma_level)) / 100; - mr->probability = p; - mr->cur_tp = p * (1000000 / usecs); - } + mr->probability = minstrel_ewma(mr->probability, + mr->cur_prob, + EWMA_LEVEL); + } else + mr->sample_skipped++; mr->last_success = mr->success; mr->last_attempts = mr->attempts; mr->success = 0; mr->attempts = 0; + /* Update throughput per rate, reset thr. below 10% success */ + if (mr->probability < MINSTREL_FRAC(10, 100)) + mr->cur_tp = 0; + else + mr->cur_tp = mr->probability * (1000000 / usecs); + /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. */ - if ((mr->probability > 17100) || (mr->probability < 1800)) { + if (mr->probability > MINSTREL_FRAC(95, 100) || + mr->probability < MINSTREL_FRAC(10, 100)) { mr->adjusted_retry_count = mr->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; @@ -118,35 +181,32 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) } if (!mr->adjusted_retry_count) mr->adjusted_retry_count = 2; - } - for (i = 0; i < mi->n_rates; i++) { - struct minstrel_rate *mr = &mi->r[i]; - if (max_tp < mr->cur_tp) { - index_max_tp = i; - max_tp = mr->cur_tp; - } - if (max_prob < mr->probability) { - index_max_prob = i; - max_prob = mr->probability; + minstrel_sort_best_tp_rates(mi, i, tmp_tp_rate); + + /* To determine the most robust rate (max_prob_rate) used at + * 3rd MRR stage we distinguish between two cases: + * (1) if any success probability >= 95%, out of those rates + * choose the maximum throughput rate as max_prob_rate + * (2) if all success probabilities < 95%, the rate with + * highest success probability is chosen as max_prob_rate */ + if (mr->probability >= MINSTREL_FRAC(95, 100)) { + if (mr->cur_tp >= mi->r[tmp_prob_rate].cur_tp) + tmp_prob_rate = i; + } else { + if (mr->probability >= mi->r[tmp_prob_rate].probability) + tmp_prob_rate = i; } } - max_tp = 0; - for (i = 0; i < mi->n_rates; i++) { - struct minstrel_rate *mr = &mi->r[i]; + /* Assign the new rate set */ + memcpy(mi->max_tp_rate, tmp_tp_rate, sizeof(mi->max_tp_rate)); + mi->max_prob_rate = tmp_prob_rate; - if (i == index_max_tp) - continue; + /* Reset update timer */ + mi->stats_update = jiffies; - if (max_tp < mr->cur_tp) { - index_max_tp2 = i; - max_tp = mr->cur_tp; - } - } - mi->max_tp_rate = index_max_tp; - mi->max_tp_rate2 = index_max_tp2; - mi->max_prob_rate = index_max_prob; + minstrel_update_rates(mp, mi); }
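Aside, not part of the patch: the hunk above replaces the old 0..18000 probability scale with MINSTREL_SCALE fixed point and an EWMA weighted by EWMA_LEVEL/EWMA_DIV = 96/128 (i.e. 75% history), per the rc80211_minstrel.h hunk later in this patch. A standalone sketch of that arithmetic, with illustrative numbers only:

/* Compile with: cc ewma_demo.c && ./a.out */
#include <stdio.h>

#define MINSTREL_SCALE 16
#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / (div))
#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
#define EWMA_LEVEL 96
#define EWMA_DIV 128

static int minstrel_ewma(int old, int new, int weight)
{
        return (new * (EWMA_DIV - weight) + old * weight) / EWMA_DIV;
}

int main(void)
{
        /* 12 of 16 frames ACKed in this interval: 75% */
        int cur_prob = MINSTREL_FRAC(12, 16);
        /* long-term estimate so far: 50% */
        int prob = MINSTREL_FRAC(50, 100);

        prob = minstrel_ewma(prob, cur_prob, EWMA_LEVEL);

        /* prints 56: (75 * 32 + 50 * 96) / 128 = 56.25, truncated */
        printf("%d%%\n", MINSTREL_TRUNC(prob * 100));
        return 0;
}

static void @@ -195,9 +255,9 @@ minstrel_get_retry_count(struct minstrel_rate *mr, { unsigned int retry = mr->adjusted_retry_count; - if (info->control.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) + if (info->control.use_rts) retry = max(2U,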
min(mr->retry_count_rtscts, retry)); - else if (info->control.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) + else if (info->control.use_cts_prot) retry = max(2U, min(mr->retry_count_cts, retry)); return retry; } @@ -207,10 +267,10 @@ static int minstrel_get_next_sample(struct minstrel_sta_info *mi) { unsigned int sample_ndx; - sample_ndx = SAMPLE_TBL(mi, mi->sample_idx, mi->sample_column); - mi->sample_idx++; - if ((int) mi->sample_idx > (mi->n_rates - 2)) { - mi->sample_idx = 0; + sample_ndx = SAMPLE_TBL(mi, mi->sample_row, mi->sample_column); + mi->sample_row++; + if ((int) mi->sample_row >= mi->n_rates) { + mi->sample_row = 0; mi->sample_column++; if (mi->sample_column >= SAMPLE_COLUMNS) mi->sample_column = 0; @@ -226,111 +286,96 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct minstrel_sta_info *mi = priv_sta; struct minstrel_priv *mp = priv; - struct ieee80211_tx_rate *ar = info->control.rates; - unsigned int ndx, sample_ndx = 0; - bool mrr; - bool sample_slower = false; - bool sample = false; - int i, delta; - int mrr_ndx[3]; - int sample_rate; - + struct ieee80211_tx_rate *rate = &info->control.rates[0]; + struct minstrel_rate *msr, *mr; + unsigned int ndx; + bool mrr_capable; + bool prev_sample = mi->prev_sample; + int delta; + int sampling_ratio; + + /* management/no-ack frames do not use rate control */ if (rate_control_send_low(sta, priv_sta, txrc)) return; - mrr = mp->has_mrr && !txrc->rts && !txrc->bss_conf->use_cts_prot; - - ndx = mi->max_tp_rate; - - if (mrr) - sample_rate = mp->lookaround_rate_mrr; + /* check multi-rate-retry capabilities & adjust lookaround_rate */ + mrr_capable = mp->has_mrr && + !txrc->rts && + !txrc->bss_conf->use_cts_prot; + if (mrr_capable) + sampling_ratio = mp->lookaround_rate_mrr; else - sample_rate = mp->lookaround_rate; + sampling_ratio = mp->lookaround_rate; + /* increase sum packet counter */ mi->packet_count++; - delta = (mi->packet_count * sample_rate / 100) - + + delta = (mi->packet_count * sampling_ratio / 100) - (mi->sample_count + mi->sample_deferred / 2); - /* delta > 0: sampling required */ - if ((delta > 0) && (mrr || !mi->prev_sample)) { - struct minstrel_rate *msr; - if (mi->packet_count >= 10000) { - mi->sample_deferred = 0; - mi->sample_count = 0; - mi->packet_count = 0; - } else if (delta > mi->n_rates * 2) { - /* With multi-rate retry, not every planned sample - * attempt actually gets used, due to the way the retry - * chain is set up - [max_tp,sample,prob,lowest] for - * sample_rate < max_tp. - * - * If there's too much sampling backlog and the link - * starts getting worse, minstrel would start bursting - * out lots of sampling frames, which would result - * in a large throughput loss. */ - mi->sample_count += (delta - mi->n_rates * 2); - } + /* delta < 0: no sampling required */ + mi->prev_sample = false; + if (delta < 0 || (!mrr_capable && prev_sample)) + return; - sample_ndx = minstrel_get_next_sample(mi); - msr = &mi->r[sample_ndx]; - sample = true; - sample_slower = mrr && (msr->perfect_tx_time > - mi->r[ndx].perfect_tx_time); - - if (!sample_slower) { - if (msr->sample_limit != 0) { - ndx = sample_ndx; - mi->sample_count++; - if (msr->sample_limit > 0) - msr->sample_limit--; - } else { - sample = false; - } - } else { - /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark - * packets that have the sampling rate deferred to the - * second MRR stage. Increase the sample counter only - * if the deferred sample rate was actually used. 
- * Use the sample_deferred counter to make sure that - * the sampling is not done in large bursts */ - info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; - mi->sample_deferred++; - } + if (mi->packet_count >= 10000) { + mi->sample_deferred = 0; + mi->sample_count = 0; + mi->packet_count = 0; + } else if (delta > mi->n_rates * 2) { + /* With multi-rate retry, not every planned sample + * attempt actually gets used, due to the way the retry + * chain is set up - [max_tp,sample,prob,lowest] for + * sample_rate < max_tp. + * + * If there's too much sampling backlog and the link + * starts getting worse, minstrel would start bursting + * out lots of sampling frames, which would result + * in a large throughput loss. */ + mi->sample_count += (delta - mi->n_rates * 2); + } + + /* get next random rate sample */ + ndx = minstrel_get_next_sample(mi); + msr = &mi->r[ndx]; + mr = &mi->r[mi->max_tp_rate[0]]; + + /* Decide if direct (1st MRR stage) or indirect (2nd MRR stage) + * rate sampling method should be used. + * Respect such rates that are not sampled for 20 iterations. + */ + if (mrr_capable && + msr->perfect_tx_time > mr->perfect_tx_time && + msr->sample_skipped < 20) { + /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark + * packets that have the sampling rate deferred to the + * second MRR stage. Increase the sample counter only + * if the deferred sample rate was actually used. + * Use the sample_deferred counter to make sure that + * the sampling is not done in large bursts */ + info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; + rate++; + mi->sample_deferred++; + } else { + if (!msr->sample_limit) + return; + + mi->sample_count++; + if (msr->sample_limit > 0) + msr->sample_limit--; } - mi->prev_sample = sample; /* If we're not using MRR and the sampling rate already * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ - if (!mrr && sample && (mi->r[ndx].probability > 17100)) - ndx = mi->max_tp_rate; - - ar[0].idx = mi->r[ndx].rix; - ar[0].count = minstrel_get_retry_count(&mi->r[ndx], info); - - if (!mrr) { - if (!sample) - ar[0].count = mp->max_retry; - ar[1].idx = mi->lowest_rix; - ar[1].count = mp->max_retry; + if (!mrr_capable && + (mi->r[ndx].probability > MINSTREL_FRAC(95, 100))) return; - } - /* MRR setup */ - if (sample) { - if (sample_slower) - mrr_ndx[0] = sample_ndx; - else - mrr_ndx[0] = mi->max_tp_rate; - } else { - mrr_ndx[0] = mi->max_tp_rate2; - } - mrr_ndx[1] = mi->max_prob_rate; - mrr_ndx[2] = 0; - for (i = 1; i < 4; i++) { - ar[i].idx = mi->r[mrr_ndx[i - 1]].rix; - ar[i].count = mi->r[mrr_ndx[i - 1]].adjusted_retry_count; - } + mi->prev_sample = true; + + rate->idx = mi->r[ndx].rix; + rate->count = minstrel_get_retry_count(&mi->r[ndx], info); }
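Aside, not from the patch: the init_sample_table() hunk that follows switches the free-slot sentinel from 0 to 0xff so rate index 0 can itself be stored, and fills each of the SAMPLE_COLUMNS columns with a random permutation of all rates. A standalone sketch of the same shuffle, with rand() standing in for get_random_bytes() and the table size chosen arbitrarily:

/* Compile with: cc sample_table_demo.c && ./a.out */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SAMPLE_COLUMNS 10

static void build_sample_table(unsigned char *tbl, int n_rates)
{
        int col, i, new_idx;

        memset(tbl, 0xff, SAMPLE_COLUMNS * n_rates);
        for (col = 0; col < SAMPLE_COLUMNS; col++) {
                for (i = 0; i < n_rates; i++) {
                        /* random start slot, probe forward until free */
                        new_idx = (i + (rand() & 0xff)) % n_rates;
                        while (tbl[new_idx * SAMPLE_COLUMNS + col] != 0xff)
                                new_idx = (new_idx + 1) % n_rates;
                        tbl[new_idx * SAMPLE_COLUMNS + col] = i;
                }
        }
}

int main(void)
{
        unsigned char tbl[SAMPLE_COLUMNS * 12];
        int i;

        build_sample_table(tbl, 12);
        /* the first column is a permutation of 0..11 */
        for (i = 0; i < 12; i++)
                printf("%d ", tbl[i * SAMPLE_COLUMNS + 0]);
        printf("\n");
        return 0;
}

@@ -351,26 +396,21 @@ static void init_sample_table(struct minstrel_sta_info *mi) { unsigned int i, col, new_idx; - unsigned int n_srates = mi->n_rates - 1; u8 rnd[8]; mi->sample_column = 0; - mi->sample_idx = 0; - memset(mi->sample_table, 0, SAMPLE_COLUMNS * mi->n_rates); + mi->sample_row = 0; + memset(mi->sample_table, 0xff, SAMPLE_COLUMNS * mi->n_rates); for (col = 0; col < SAMPLE_COLUMNS; col++) { - for (i = 0; i < n_srates; i++) { + for (i = 0; i < mi->n_rates; i++) { get_random_bytes(rnd, sizeof(rnd)); - new_idx = (i + rnd[i & 7]) % n_srates; + new_idx = (i + rnd[i & 7]) % mi->n_rates; - while (SAMPLE_TBL(mi, new_idx, col) != 0) - new_idx = (new_idx + 1) % n_srates; + while (SAMPLE_TBL(mi, new_idx, col) != 0xff) + new_idx = (new_idx + 1) % mi->n_rates; - /* Don't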
sample the slowest rate (i.e. slowest base - * rate). We must presume that the slowest rate works - * fine, or else other management frames will also be - * failing and the link will break */ - SAMPLE_TBL(mi, new_idx, col) = i + 1; + SAMPLE_TBL(mi, new_idx, col) = i; } } } @@ -385,12 +425,16 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, unsigned int i, n = 0; unsigned int t_slot = 9; /* FIXME: get real slot time */ + mi->sta = sta; mi->lowest_rix = rate_lowest_index(sband, sta); ctl_rate = &sband->bitrates[mi->lowest_rix]; mi->sp_ack_dur = ieee80211_frame_duration(sband->band, 10, ctl_rate->bitrate, !!(ctl_rate->flags & IEEE80211_RATE_ERP_G), 1); + memset(mi->max_tp_rate, 0, sizeof(mi->max_tp_rate)); + mi->max_prob_rate = 0; + for (i = 0; i < sband->n_bitrates; i++) { struct minstrel_rate *mr = &mi->r[n]; unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0; @@ -433,6 +477,8 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, } while ((tx_time < mp->segment_size) && (++mr->retry_count < mp->max_retry)); mr->adjusted_retry_count = mr->retry_count; + if (!(sband->bitrates[i].flags & IEEE80211_RATE_ERP_G)) + mr->retry_count_cts = mr->retry_count; } for (i = n; i < sband->n_bitrates; i++) { @@ -444,6 +490,7 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, mi->stats_update = jiffies; init_sample_table(mi); + minstrel_update_rates(mp, mi); } static void * @@ -494,6 +541,33 @@ minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) kfree(mi); } +static void +minstrel_init_cck_rates(struct minstrel_priv *mp) +{ + static const int bitrates[4] = { 10, 20, 55, 110 }; + struct ieee80211_supported_band *sband; + int i, j; + + sband = mp->hw->wiphy->bands[IEEE80211_BAND_2GHZ]; + if (!sband) + return; + + for (i = 0, j = 0; i < sband->n_bitrates; i++) { + struct ieee80211_rate *rate = &sband->bitrates[i]; + + if (rate->flags & IEEE80211_RATE_ERP_G) + continue; + + for (j = 0; j < ARRAY_SIZE(bitrates); j++) { + if (rate->bitrate != bitrates[j]) + continue; + + mp->cck_rates[j] = i; + break; + } + } +} + static void * minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) { @@ -515,9 +589,6 @@ minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) mp->lookaround_rate = 5; mp->lookaround_rate_mrr = 10; - /* moving average weight for EWMA */ - mp->ewma_level = 75; - /* maximum time that the hw is allowed to stay in one MRR segment */ mp->segment_size = 6000; @@ -539,6 +610,8 @@ minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); #endif + minstrel_init_cck_rates(mp); + return mp; } diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 5d278eccaef0..f4301f4b2e41 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -9,6 +9,29 @@ #ifndef __RC_MINSTREL_H #define __RC_MINSTREL_H +#define EWMA_LEVEL 96 /* ewma weighting factor [/EWMA_DIV] */ +#define EWMA_DIV 128 +#define SAMPLE_COLUMNS 10 /* number of columns in sample table */ + + +/* scaled fraction values */ +#define MINSTREL_SCALE 16 +#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) +#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) + +/* number of highest throughput rates to consider*/ +#define MAX_THR_RATES 4 + +/* + * Perform EWMA (Exponentially Weighted Moving Average) calculation + */ +static inline int +minstrel_ewma(int old, int new, int weight) +{ + return (new * (EWMA_DIV - 
weight) + old * weight) / EWMA_DIV; +} + + struct minstrel_rate { int bitrate; int rix; @@ -26,6 +49,7 @@ struct minstrel_rate { u32 attempts; u32 last_attempts; u32 last_success; + u8 sample_skipped; /* parts per thousand */ u32 cur_prob; @@ -39,20 +63,21 @@ struct minstrel_rate { }; struct minstrel_sta_info { + struct ieee80211_sta *sta; + unsigned long stats_update; unsigned int sp_ack_dur; unsigned int rate_avg; unsigned int lowest_rix; - unsigned int max_tp_rate; - unsigned int max_tp_rate2; - unsigned int max_prob_rate; + u8 max_tp_rate[MAX_THR_RATES]; + u8 max_prob_rate; unsigned int packet_count; unsigned int sample_count; int sample_deferred; - unsigned int sample_idx; + unsigned int sample_row; unsigned int sample_column; int n_rates; @@ -73,12 +98,13 @@ struct minstrel_priv { unsigned int cw_min; unsigned int cw_max; unsigned int max_retry; - unsigned int ewma_level; unsigned int segment_size; unsigned int update_interval; unsigned int lookaround_rate; unsigned int lookaround_rate_mrr; + u8 cck_rates[4]; + #ifdef CONFIG_MAC80211_DEBUGFS /* * enable fixed rate processing per RC diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index d5a56226e675..fd0b9ca1570e 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -68,23 +68,25 @@ minstrel_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; - p += sprintf(p, "rate throughput ewma prob this prob " + p += sprintf(p, "rate throughput ewma prob this prob " "this succ/attempt success attempts\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; - *(p++) = (i == mi->max_tp_rate) ? 'T' : ' '; - *(p++) = (i == mi->max_tp_rate2) ? 't' : ' '; + *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; + *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; + *(p++) = (i == mi->max_tp_rate[2]) ? 'C' : ' '; + *(p++) = (i == mi->max_tp_rate[3]) ? 'D' : ' '; *(p++) = (i == mi->max_prob_rate) ? 'P' : ' '; p += sprintf(p, "%3u%s", mr->bitrate / 2, (mr->bitrate & 1 ? ".5" : " ")); - tp = mr->cur_tp / ((18000 << 10) / 96); - prob = mr->cur_prob / 18; - eprob = mr->probability / 18; + tp = MINSTREL_TRUNC(mr->cur_tp / 10); + prob = MINSTREL_TRUNC(mr->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mr->probability * 1000); p += sprintf(p, " %6u.%1u %6u.%1u %6u.%1u " - "%3u(%3u) %8llu %8llu\n", + " %3u(%3u) %8llu %8llu\n", tp / 10, tp % 10, eprob / 10, eprob % 10, prob / 10, prob % 10, diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 9f9c453bc45d..5b2d3012b983 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org> + * Copyright (C) 2010-2013 Felix Fietkau <nbd@openwrt.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,8 +17,6 @@ #include "rc80211_minstrel_ht.h" #define AVG_PKT_SIZE 1200 -#define SAMPLE_COLUMNS 10 -#define EWMA_LEVEL 75 /* Number of bits for an average sized packet */ #define MCS_NBITS (AVG_PKT_SIZE << 3) @@ -26,11 +24,11 @@ /* Number of symbols for a packet with (bps) bits per symbol */ #define MCS_NSYMS(bps) ((MCS_NBITS + (bps) - 1) / (bps)) -/* Transmission time for a packet containing (syms) symbols */ +/* Transmission time (nanoseconds) for a packet containing (syms) symbols */ #define MCS_SYMBOL_TIME(sgi, syms) \ (sgi ? 
\ - ((syms) * 18 + 4) / 5 : /* syms * 3.6 us */ \ - (syms) << 2 /* syms * 4 us */ \ + ((syms) * 18000 + 4000) / 5 : /* syms * 3.6 us */ \ + ((syms) * 1000) << 2 /* syms * 4 us */ \ ) /* Transmit duration for the raw data part of an average sized packet */ @@ -63,6 +61,30 @@ } \ } +#define CCK_DURATION(_bitrate, _short, _len) \ + (1000 * (10 /* SIFS */ + \ + (_short ? 72 + 24 : 144 + 48) + \ + (8 * (_len + 4) * 10) / (_bitrate))) + +#define CCK_ACK_DURATION(_bitrate, _short) \ + (CCK_DURATION((_bitrate > 10 ? 20 : 10), false, 60) + \ + CCK_DURATION(_bitrate, _short, AVG_PKT_SIZE)) + +#define CCK_DURATION_LIST(_short) \ + CCK_ACK_DURATION(10, _short), \ + CCK_ACK_DURATION(20, _short), \ + CCK_ACK_DURATION(55, _short), \ + CCK_ACK_DURATION(110, _short) + +#define CCK_GROUP \ + [MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS] = { \ + .streams = 0, \ + .duration = { \ + CCK_DURATION_LIST(false), \ + CCK_DURATION_LIST(true) \ + } \ + } + /* * To enable sufficiently targeted rate sampling, MCS rates are divided into * groups, based on the number of streams and flags (HT40, SGI) that they @@ -95,18 +117,17 @@ const struct mcs_group minstrel_mcs_groups[] = { #if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 1, 1), #endif + + /* must be last */ + CCK_GROUP }; +#define MINSTREL_CCK_GROUP (ARRAY_SIZE(minstrel_mcs_groups) - 1) + static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES]; -/* - * Perform EWMA (Exponentially Weighted Moving Average) calculation - */ -static int -minstrel_ewma(int old, int new, int weight) -{ - return (new * (100 - weight) + old * weight) / 100; -} +static void +minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); /* * Look up an MCS group index based on mac80211 rate information */ @@ -119,6 +140,29 @@ minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate) !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)); } +static struct minstrel_rate_stats * +minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + struct ieee80211_tx_rate *rate) +{ + int group, idx; + + if (rate->flags & IEEE80211_TX_RC_MCS) { + group = minstrel_ht_get_group_idx(rate); + idx = rate->idx % MCS_GROUP_RATES; + } else { + group = MINSTREL_CCK_GROUP; + + for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) + if (rate->idx == mp->cck_rates[idx]) + break; + + /* short preamble */ + if (!(mi->groups[group].supported & BIT(idx))) + idx += 4; + } + return &mi->groups[group].rates[idx]; +} + static inline struct minstrel_rate_stats * minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index) {
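Aside, not from the patch: the minstrel_ht_calc_tp() hunk that follows now tracks airtime in nanoseconds and caps the per-rate success probability at 90% before converting it into a throughput estimate, so a marginally higher raw probability cannot win on collision noise alone. A standalone sketch of the fixed-point arithmetic, with an illustrative airtime value:

/* Compile with: cc calc_tp_demo.c && ./a.out */
#include <stdio.h>

#define MINSTREL_SCALE 16
#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / (div))
#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)

int main(void)
{
        unsigned int prob = MINSTREL_FRAC(97, 100); /* 97% measured */
        unsigned int nsecs = 32000;                 /* ~32 us per avg frame */
        unsigned int tp;

        /* cap at 90% to damp collision-related fluctuation */
        if (prob > MINSTREL_FRAC(9, 10))
                prob = MINSTREL_FRAC(9, 10);

        tp = 1000000 * ((prob * 1000) / nsecs);
        printf("cur_tp = %u\n", MINSTREL_TRUNC(tp)); /* prints 28121 */
        return 0;
}

@@ -159,18 +203,32 @@ static void minstrel_ht_calc_tp(struct minstrel_ht_sta *mi, int group, int rate) { struct minstrel_rate_stats *mr; - unsigned int usecs; + unsigned int nsecs = 0; + unsigned int tp; + unsigned int prob; mr = &mi->groups[group].rates[rate]; + prob = mr->probability; - if (mr->probability < MINSTREL_FRAC(1, 10)) { + if (prob < MINSTREL_FRAC(1, 10)) { mr->cur_tp = 0; return; } - usecs = mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len); - usecs += minstrel_mcs_groups[group].duration[rate]; - mr->cur_tp = MINSTREL_TRUNC((1000000 / usecs) * mr->probability); + /* + * For the throughput calculation, limit the probability value to 90% to + * account for collision related packet error rate fluctuation + */ + if (prob > MINSTREL_FRAC(9, 10)) + prob = MINSTREL_FRAC(9, 10); + + if (group != MINSTREL_CCK_GROUP) + nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len); + + nsecs += minstrel_mcs_groups[group].duration[rate]; + tp = 1000000 * ((prob * 1000)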
/ nsecs); + + mr->cur_tp = MINSTREL_TRUNC(tp); } /* @@ -189,6 +247,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) struct minstrel_rate_stats *mr; int cur_prob, cur_prob_tp, cur_tp, cur_tp2; int group, i, index; + bool mi_rates_valid = false; if (mi->ampdu_packets > 0) { mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, @@ -199,11 +258,10 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mi->sample_slow = 0; mi->sample_count = 0; - mi->max_tp_rate = 0; - mi->max_tp_rate2 = 0; - mi->max_prob_rate = 0; for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + bool mg_rates_valid = false; + cur_prob = 0; cur_prob_tp = 0; cur_tp = 0; @@ -213,15 +271,24 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!mg->supported) continue; - mg->max_tp_rate = 0; - mg->max_tp_rate2 = 0; - mg->max_prob_rate = 0; mi->sample_count++; for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mg->supported & BIT(i))) continue; + /* initialize rates selections starting indexes */ + if (!mg_rates_valid) { + mg->max_tp_rate = mg->max_tp_rate2 = + mg->max_prob_rate = i; + if (!mi_rates_valid) { + mi->max_tp_rate = mi->max_tp_rate2 = + mi->max_prob_rate = i; + mi_rates_valid = true; + } + mg_rates_valid = true; + } + mr = &mg->rates[i]; mr->retry_updated = false; index = MCS_GROUP_RATES * group + i; @@ -231,10 +298,6 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!mr->cur_tp) continue; - /* ignore the lowest rate of each single-stream group */ - if (!i && minstrel_mcs_groups[group].streams == 1) - continue; - if ((mr->cur_tp > cur_prob_tp && mr->probability > MINSTREL_FRAC(3, 4)) || mr->probability > cur_prob) { mg->max_prob_rate = index; @@ -258,8 +321,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) } } - /* try to sample up to half of the available rates during each interval */ - mi->sample_count *= 4; + /* try to sample all available rates during each interval */ + mi->sample_count *= 8; cur_prob = 0; cur_prob_tp = 0; @@ -270,20 +333,13 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!mg->supported) continue; - mr = minstrel_get_ratestats(mi, mg->max_prob_rate); - if (cur_prob_tp < mr->cur_tp && - minstrel_mcs_groups[group].streams == 1) { - mi->max_prob_rate = mg->max_prob_rate; - cur_prob = mr->cur_prob; - cur_prob_tp = mr->cur_tp; - } - mr = minstrel_get_ratestats(mi, mg->max_tp_rate); if (cur_tp < mr->cur_tp) { mi->max_tp_rate2 = mi->max_tp_rate; cur_tp2 = cur_tp; mi->max_tp_rate = mg->max_tp_rate; cur_tp = mr->cur_tp; + mi->max_prob_streams = minstrel_mcs_groups[group].streams - 1; } mr = minstrel_get_ratestats(mi, mg->max_tp_rate2); @@ -293,11 +349,28 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) } } + if (mi->max_prob_streams < 1) + mi->max_prob_streams = 1; + + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + mg = &mi->groups[group]; + if (!mg->supported) + continue; + mr = minstrel_get_ratestats(mi, mg->max_prob_rate); + if (cur_prob_tp < mr->cur_tp && + minstrel_mcs_groups[group].streams <= mi->max_prob_streams) { + mi->max_prob_rate = mg->max_prob_rate; + cur_prob = mr->cur_prob; + cur_prob_tp = mr->cur_tp; + } + } + + mi->stats_update = jiffies; } static bool -minstrel_ht_txstat_valid(struct ieee80211_tx_rate *rate) +minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct ieee80211_tx_rate *rate) { if (rate->idx < 0) return 
false; @@ -305,7 +378,13 @@ minstrel_ht_txstat_valid(struct ieee80211_tx_rate *rate) if (!rate->count) return false; - return !!(rate->flags & IEEE80211_TX_RC_MCS); + if (rate->flags & IEEE80211_TX_RC_MCS) + return true; + + return rate->idx == mp->cck_rates[0] || + rate->idx == mp->cck_rates[1] || + rate->idx == mp->cck_rates[2] || + rate->idx == mp->cck_rates[3]; } static void @@ -389,8 +468,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_tx_rate *ar = info->status.rates; struct minstrel_rate_stats *rate, *rate2; struct minstrel_priv *mp = priv; - bool last; - int group; + bool last, update = false; int i; if (!msp->is_ht) @@ -412,20 +490,19 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, if (!mi->sample_wait && !mi->sample_tries && mi->sample_count > 0) { mi->sample_wait = 16 + 2 * MINSTREL_TRUNC(mi->avg_ampdu_len); - mi->sample_tries = 2; + mi->sample_tries = 1; mi->sample_count--; } if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) mi->sample_packets += info->status.ampdu_len; - last = !minstrel_ht_txstat_valid(&ar[0]); + last = !minstrel_ht_txstat_valid(mp, &ar[0]); for (i = 0; !last; i++) { last = (i == IEEE80211_TX_MAX_RATES - 1) || - !minstrel_ht_txstat_valid(&ar[i + 1]); + !minstrel_ht_txstat_valid(mp, &ar[i + 1]); - group = minstrel_ht_get_group_idx(&ar[i]); - rate = &mi->groups[group].rates[ar[i].idx % 8]; + rate = minstrel_ht_get_stats(mp, mi, &ar[i]); if (last) rate->success += info->status.ampdu_ack_len; @@ -440,20 +517,29 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, rate = minstrel_get_ratestats(mi, mi->max_tp_rate); if (rate->attempts > 30 && MINSTREL_FRAC(rate->success, rate->attempts) < - MINSTREL_FRAC(20, 100)) + MINSTREL_FRAC(20, 100)) { minstrel_downgrade_rate(mi, &mi->max_tp_rate, true); + update = true; + } rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate2); if (rate2->attempts > 30 && MINSTREL_FRAC(rate2->success, rate2->attempts) < - MINSTREL_FRAC(20, 100)) + MINSTREL_FRAC(20, 100)) { minstrel_downgrade_rate(mi, &mi->max_tp_rate2, false); + update = true; + } if (time_after(jiffies, mi->stats_update + (mp->update_interval / 2 * HZ) / 1000)) { + update = true; minstrel_ht_update_stats(mp, mi); - if (!(info->flags & IEEE80211_TX_CTL_AMPDU)) + if (!(info->flags & IEEE80211_TX_CTL_AMPDU) && + mi->max_prob_rate / MCS_GROUP_RATES != MINSTREL_CCK_GROUP) minstrel_aggr_check(sta, skb); } + + if (update) + minstrel_ht_update_rates(mp, mi); } static void @@ -467,6 +553,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, unsigned int ctime = 0; unsigned int t_slot = 9; /* FIXME */ unsigned int ampdu_len = MINSTREL_TRUNC(mi->avg_ampdu_len); + unsigned int overhead = 0, overhead_rtscts = 0; mr = minstrel_get_ratestats(mi, index); if (mr->probability < MINSTREL_FRAC(1, 10)) { @@ -480,7 +567,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mr->retry_updated = true; group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; - tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len; + tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len / 1000; /* Contention time for first 2 tries */ ctime = (t_slot * cw) >> 1; @@ -488,9 +575,14 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, ctime += (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); + if (index / MCS_GROUP_RATES != MINSTREL_CCK_GROUP) { + overhead = mi->overhead; + overhead_rtscts = 
mi->overhead_rtscts; + } + + /* Total TX time for data and Contention after first 2 tries */ - tx_time = ctime + 2 * (mi->overhead + tx_time_data); - tx_time_rtscts = ctime + 2 * (mi->overhead_rtscts + tx_time_data); + tx_time = ctime + 2 * (overhead + tx_time_data); + tx_time_rtscts = ctime + 2 * (overhead_rtscts + tx_time_data); /* See how many more tries we can fit inside segment size */ do { @@ -499,8 +591,8 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, cw = min((cw << 1) | 1, mp->cw_max); /* Total TX time after this try */ - tx_time += ctime + mi->overhead + tx_time_data; - tx_time_rtscts += ctime + mi->overhead_rtscts + tx_time_data; + tx_time += ctime + overhead + tx_time_data; + tx_time_rtscts += ctime + overhead_rtscts + tx_time_data; if (tx_time_rtscts < mp->segment_size) mr->retry_count_rtscts++; @@ -511,29 +603,71 @@ static void minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, - struct ieee80211_tx_rate *rate, int index, - bool sample, bool rtscts) + struct ieee80211_sta_rates *ratetbl, int offset, int index) { const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; struct minstrel_rate_stats *mr; + u8 idx; + u16 flags; mr = minstrel_get_ratestats(mi, index); if (!mr->retry_updated) minstrel_calc_retransmit(mp, mi, index); - if (sample) - rate->count = 1; - else if (mr->probability < MINSTREL_FRAC(20, 100)) - rate->count = 2; - else if (rtscts) - rate->count = mr->retry_count_rtscts; - else - rate->count = mr->retry_count; + if (mr->probability < MINSTREL_FRAC(20, 100) || !mr->retry_count) { + ratetbl->rate[offset].count = 2; + ratetbl->rate[offset].count_rts = 2; + ratetbl->rate[offset].count_cts = 2; + } else { + ratetbl->rate[offset].count = mr->retry_count; + ratetbl->rate[offset].count_cts = mr->retry_count; + ratetbl->rate[offset].count_rts = mr->retry_count_rtscts; + } + + if (index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) { + idx = mp->cck_rates[index % ARRAY_SIZE(mp->cck_rates)]; + flags = 0; + } else { + idx = index % MCS_GROUP_RATES + + (group->streams - 1) * MCS_GROUP_RATES; + flags = IEEE80211_TX_RC_MCS | group->flags; + } + + if (offset > 0) { + ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; + flags |= IEEE80211_TX_RC_USE_RTS_CTS; + } - rate->flags = IEEE80211_TX_RC_MCS | group->flags; - if (rtscts) - rate->flags |= IEEE80211_TX_RC_USE_RTS_CTS; - rate->idx = index % MCS_GROUP_RATES + (group->streams - 1) * MCS_GROUP_RATES; + ratetbl->rate[offset].idx = idx; + ratetbl->rate[offset].flags = flags; +} + +static void +minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) +{ + struct ieee80211_sta_rates *rates; + int i = 0; + + rates = kzalloc(sizeof(*rates), GFP_ATOMIC); + if (!rates) + return; + + /* Start with max_tp_rate */ + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate); + + if (mp->hw->max_rates >= 3) { + /* At least 3 tx rates supported, use max_tp_rate2 next */ + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate2); + } + + if (mp->hw->max_rates >= 2) { + /* At least 2 tx rates supported, use max_prob_rate next */ + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate); + } + + rates->rate[i].idx = -1; + rate_control_set_rates(mp->hw, mi->sta, rates); }
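Aside, not from the patch: the table built by minstrel_ht_update_rates() above is published through rate_control_set_rates(), which swaps the station's rate pointer with rcu_assign_pointer() and reclaims the old table with kfree_rcu() (see the rate.c hunk earlier in this diff). A reader, typically a driver filling a TX descriptor, dereferences it under rcu_read_lock(); the mydrv_* name below is hypothetical.

#include <net/mac80211.h>

static void mydrv_tx_fill_first_rate(struct ieee80211_sta *pubsta,
                                     struct ieee80211_tx_rate *dest)
{
        struct ieee80211_sta_rates *tbl;

        rcu_read_lock();
        tbl = rcu_dereference(pubsta->rates);
        if (tbl) {
                /* a stale table is harmless: it stays valid until all
                 * readers drop out of the RCU read-side section */
                dest->idx = tbl->rate[0].idx;
                dest->count = tbl->rate[0].count;
        }
        rcu_read_unlock();
}

static inline int @@ -548,6 +682,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_rate_stats *mr; struct minstrel_mcs_group_data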
*mg; + unsigned int sample_dur, sample_group; int sample_idx = 0; if (mi->sample_wait > 0) { @@ -558,54 +693,77 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!mi->sample_tries) return -1; - mi->sample_tries--; mg = &mi->groups[mi->sample_group]; sample_idx = sample_table[mg->column][mg->index]; mr = &mg->rates[sample_idx]; - sample_idx += mi->sample_group * MCS_GROUP_RATES; + sample_group = mi->sample_group; + sample_idx += sample_group * MCS_GROUP_RATES; minstrel_next_sample_idx(mi); /* * Sampling might add some overhead (RTS, no aggregation) * to the frame. Hence, don't use sampling for the currently - * used max TP rate. + * used rates. */ - if (sample_idx == mi->max_tp_rate) + if (sample_idx == mi->max_tp_rate || + sample_idx == mi->max_tp_rate2 || + sample_idx == mi->max_prob_rate) return -1; + /* - * When not using MRR, do not sample if the probability is already - * higher than 95% to avoid wasting airtime + * Do not sample if the probability is already higher than 95% + * to avoid wasting airtime. */ - if (!mp->has_mrr && (mr->probability > MINSTREL_FRAC(95, 100))) + if (mr->probability > MINSTREL_FRAC(95, 100)) return -1; /* * Make sure that lower rates get sampled only occasionally, * if the link is working perfectly. */ - if (minstrel_get_duration(sample_idx) > - minstrel_get_duration(mi->max_tp_rate)) { + sample_dur = minstrel_get_duration(sample_idx); + if (sample_dur >= minstrel_get_duration(mi->max_tp_rate2) && + (mi->max_prob_streams < + minstrel_mcs_groups[sample_group].streams || + sample_dur >= minstrel_get_duration(mi->max_prob_rate))) { if (mr->sample_skipped < 20) return -1; if (mi->sample_slow++ > 2) return -1; } + mi->sample_tries--; return sample_idx; } static void +minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp, + struct minstrel_ht_sta *mi, bool val) +{ + u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported; + + if (!supported || !mi->cck_supported_short) + return; + + if (supported & (mi->cck_supported_short << (val * 4))) + return; + + supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4); + mi->groups[MINSTREL_CCK_GROUP].supported = supported; +} + +static void minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc) { + const struct mcs_group *sample_group; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); - struct ieee80211_tx_rate *ar = info->status.rates; + struct ieee80211_tx_rate *rate = &info->status.rates[0]; struct minstrel_ht_sta_priv *msp = priv_sta; struct minstrel_ht_sta *mi = &msp->ht; struct minstrel_priv *mp = priv; int sample_idx; - bool sample = false; if (rate_control_send_low(sta, priv_sta, txrc)) return; @@ -614,6 +772,7 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc); info->flags |= mi->tx_flags; + minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble); /* Don't use EAPOL frames for sampling on non-mrr hw */ if (mp->hw->max_rates == 1 && @@ -632,51 +791,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, } #endif - if (sample_idx >= 0) { - sample = true; - minstrel_ht_set_rate(mp, mi, &ar[0], sample_idx, - true, false); - info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; - } else { - minstrel_ht_set_rate(mp, mi, &ar[0], mi->max_tp_rate, - false, false); - } - - if (mp->hw->max_rates >= 3) { - /* - * At least 3 tx rates supported, use - * sample_rate -> max_tp_rate 
-> max_prob_rate for sampling and - * max_tp_rate -> max_tp_rate2 -> max_prob_rate by default. - */ - if (sample_idx >= 0) - minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate, - false, false); - else - minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate2, - false, true); - - minstrel_ht_set_rate(mp, mi, &ar[2], mi->max_prob_rate, - false, !sample); - - ar[3].count = 0; - ar[3].idx = -1; - } else if (mp->hw->max_rates == 2) { - /* - * Only 2 tx rates supported, use - * sample_rate -> max_prob_rate for sampling and - * max_tp_rate -> max_prob_rate by default. - */ - minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_prob_rate, - false, !sample); - - ar[2].count = 0; - ar[2].idx = -1; - } else { - /* Not using MRR, only use the first rate */ - ar[1].count = 0; - ar[1].idx = -1; - } - mi->total_packets++; /* wraparound */ @@ -684,6 +798,40 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, mi->total_packets = 0; mi->sample_packets = 0; } + + if (sample_idx < 0) + return; + + sample_group = &minstrel_mcs_groups[sample_idx / MCS_GROUP_RATES]; + info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; + rate->idx = sample_idx % MCS_GROUP_RATES + + (sample_group->streams - 1) * MCS_GROUP_RATES; + rate->flags = IEEE80211_TX_RC_MCS | sample_group->flags; + rate->count = 1; +} + +static void +minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta) +{ + int i; + + if (sband->band != IEEE80211_BAND_2GHZ) + return; + + mi->cck_supported = 0; + mi->cck_supported_short = 0; + for (i = 0; i < 4; i++) { + if (!rate_supported(sta, sband->band, mp->cck_rates[i])) + continue; + + mi->cck_supported |= BIT(i); + if (sband->bitrates[i].flags & IEEE80211_RATE_SHORT_PREAMBLE) + mi->cck_supported_short |= BIT(i); + } + + mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported; } static void @@ -699,17 +847,18 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, int ack_dur; int stbc; int i; - unsigned int smps; /* fall back to the old minstrel for legacy stations */ if (!sta->ht_cap.ht_supported) goto use_legacy; BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != - MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS); + MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS + 1); msp->is_ht = true; memset(mi, 0, sizeof(*mi)); + + mi->sta = sta; mi->stats_update = jiffies; ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1); @@ -735,28 +884,29 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING) mi->tx_flags |= IEEE80211_TX_CTL_LDPC; - smps = (sta_cap & IEEE80211_HT_CAP_SM_PS) >> - IEEE80211_HT_CAP_SM_PS_SHIFT; - for (i = 0; i < ARRAY_SIZE(mi->groups); i++) { - u16 req = 0; - mi->groups[i].supported = 0; - if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI) { - if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) - req |= IEEE80211_HT_CAP_SGI_40; - else - req |= IEEE80211_HT_CAP_SGI_20; + if (i == MINSTREL_CCK_GROUP) { + minstrel_ht_update_cck(mp, mi, sband, sta); + continue; } - if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) - req |= IEEE80211_HT_CAP_SUP_WIDTH_20_40; + if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI) { + if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) { + if (!(sta_cap & IEEE80211_HT_CAP_SGI_40)) + continue; + } else { + if (!(sta_cap & IEEE80211_HT_CAP_SGI_20)) + continue; + } + } - if ((sta_cap & req) != req) + if 
(minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH && + sta->bandwidth < IEEE80211_STA_RX_BW_40) continue; /* Mark MCS > 7 as unsupported if STA is in static SMPS mode */ - if (smps == WLAN_HT_CAP_SM_PS_STATIC && + if (sta->smps_mode == IEEE80211_SMPS_STATIC && minstrel_mcs_groups[i].streams > 1) continue; @@ -770,6 +920,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, if (!n_supported) goto use_legacy; + /* create an initial rate table with the lowest supported rates */ + minstrel_ht_update_stats(mp, mi); + minstrel_ht_update_rates(mp, mi); + return; use_legacy: diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index 462d2b227ed5..d655586773ac 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -16,11 +16,6 @@ #define MINSTREL_MAX_STREAMS 3 #define MINSTREL_STREAM_GROUPS 4 -/* scaled fraction values */ -#define MINSTREL_SCALE 16 -#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) -#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) - #define MCS_GROUP_RATES 8 struct mcs_group { @@ -70,6 +65,8 @@ struct minstrel_mcs_group_data { }; struct minstrel_ht_sta { + struct ieee80211_sta *sta; + /* ampdu length (average, per sampling interval) */ unsigned int ampdu_len; unsigned int ampdu_packets; @@ -85,6 +82,7 @@ struct minstrel_ht_sta { /* best probability rate */ unsigned int max_prob_rate; + unsigned int max_prob_streams; /* time of last status update */ unsigned long stats_update; @@ -107,8 +105,11 @@ struct minstrel_ht_sta { /* current MCS group to be sampled */ u8 sample_group; + u8 cck_supported; + u8 cck_supported_short; + /* MCS rate group info and statistics */ - struct minstrel_mcs_group_data groups[MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS]; + struct minstrel_mcs_group_data groups[MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS + 1]; }; struct minstrel_ht_sta_priv { diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index e788f76a1dfe..df44a5ad8270 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -15,13 +15,76 @@ #include "rc80211_minstrel.h" #include "rc80211_minstrel_ht.h" +static char * +minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) +{ + unsigned int max_mcs = MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS; + const struct mcs_group *mg; + unsigned int j, tp, prob, eprob; + char htmode = '2'; + char gimode = 'L'; + + if (!mi->groups[i].supported) + return p; + + mg = &minstrel_mcs_groups[i]; + if (mg->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + htmode = '4'; + if (mg->flags & IEEE80211_TX_RC_SHORT_GI) + gimode = 'S'; + + for (j = 0; j < MCS_GROUP_RATES; j++) { + struct minstrel_rate_stats *mr = &mi->groups[i].rates[j]; + static const int bitrates[4] = { 10, 20, 55, 110 }; + int idx = i * MCS_GROUP_RATES + j; + + if (!(mi->groups[i].supported & BIT(j))) + continue; + + if (i == max_mcs) + p += sprintf(p, "CCK/%cP ", j < 4 ? 'L' : 'S'); + else + p += sprintf(p, "HT%c0/%cGI ", htmode, gimode); + + *(p++) = (idx == mi->max_tp_rate) ? 'T' : ' '; + *(p++) = (idx == mi->max_tp_rate2) ? 't' : ' '; + *(p++) = (idx == mi->max_prob_rate) ? 
'P' : ' '; + + if (i == max_mcs) { + int r = bitrates[j % 4]; + p += sprintf(p, " %2u.%1uM", r / 10, r % 10); + } else { + p += sprintf(p, " MCS%-2u", (mg->streams - 1) * + MCS_GROUP_RATES + j); + } + + tp = mr->cur_tp / 10; + prob = MINSTREL_TRUNC(mr->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mr->probability * 1000); + + p += sprintf(p, " %6u.%1u %6u.%1u %6u.%1u " + "%3u %3u(%3u) %8llu %8llu\n", + tp / 10, tp % 10, + eprob / 10, eprob % 10, + prob / 10, prob % 10, + mr->retry_count, + mr->last_success, + mr->last_attempts, + (unsigned long long)mr->succ_hist, + (unsigned long long)mr->att_hist); + } + + return p; +} + static int minstrel_ht_stats_open(struct inode *inode, struct file *file) { struct minstrel_ht_sta_priv *msp = inode->i_private; struct minstrel_ht_sta *mi = &msp->ht; struct minstrel_debugfs_info *ms; - unsigned int i, j, tp, prob, eprob; + unsigned int i; + unsigned int max_mcs = MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS; char *p; int ret; @@ -38,50 +101,13 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; - p += sprintf(p, "type rate throughput ewma prob this prob " - "this succ/attempt success attempts\n"); - for (i = 0; i < MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS; i++) { - char htmode = '2'; - char gimode = 'L'; - - if (!mi->groups[i].supported) - continue; - - if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) - htmode = '4'; - if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI) - gimode = 'S'; + p += sprintf(p, "type rate throughput ewma prob this prob " + "retry this succ/attempt success attempts\n"); - for (j = 0; j < MCS_GROUP_RATES; j++) { - struct minstrel_rate_stats *mr = &mi->groups[i].rates[j]; - int idx = i * MCS_GROUP_RATES + j; + p = minstrel_ht_stats_dump(mi, max_mcs, p); + for (i = 0; i < max_mcs; i++) + p = minstrel_ht_stats_dump(mi, i, p); - if (!(mi->groups[i].supported & BIT(j))) - continue; - - p += sprintf(p, "HT%c0/%cGI ", htmode, gimode); - - *(p++) = (idx == mi->max_tp_rate) ? 'T' : ' '; - *(p++) = (idx == mi->max_tp_rate2) ? 't' : ' '; - *(p++) = (idx == mi->max_prob_rate) ? 
'P' : ' '; - p += sprintf(p, "MCS%-2u", (minstrel_mcs_groups[i].streams - 1) * - MCS_GROUP_RATES + j); - - tp = mr->cur_tp / 10; - prob = MINSTREL_TRUNC(mr->cur_prob * 1000); - eprob = MINSTREL_TRUNC(mr->probability * 1000); - - p += sprintf(p, " %6u.%1u %6u.%1u %6u.%1u " - "%3u(%3u) %8llu %8llu\n", - tp / 10, tp % 10, - eprob / 10, eprob % 10, - prob / 10, prob % 10, - mr->last_success, - mr->last_attempts, - (unsigned long long)mr->succ_hist, - (unsigned long long)mr->att_hist); - } - } p += sprintf(p, "\nTotal packet count:: ideal %d " "lookaround %d\n", max(0, (int) mi->total_packets - (int) mi->sample_packets), diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 580704eba8b8..c8447af76ead 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -648,29 +648,11 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) return RX_CONTINUE; } -#define SEQ_MODULO 0x1000 -#define SEQ_MASK 0xfff - -static inline int seq_less(u16 sq1, u16 sq2) -{ - return ((sq1 - sq2) & SEQ_MASK) > (SEQ_MODULO >> 1); -} - -static inline u16 seq_inc(u16 sq) -{ - return (sq + 1) & SEQ_MASK; -} - -static inline u16 seq_sub(u16 sq1, u16 sq2) -{ - return (sq1 - sq2) & SEQ_MASK; -} - static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, - int index) + int index, + struct sk_buff_head *frames) { - struct ieee80211_local *local = sdata->local; struct sk_buff *skb = tid_agg_rx->reorder_buf[index]; struct ieee80211_rx_status *status; @@ -684,24 +666,27 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, tid_agg_rx->reorder_buf[index] = NULL; status = IEEE80211_SKB_RXCB(skb); status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE; - skb_queue_tail(&local->rx_skb_queue, skb); + __skb_queue_tail(frames, skb); no_frame: - tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); + tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num); } static void ieee80211_release_reorder_frames(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, - u16 head_seq_num) + u16 head_seq_num, + struct sk_buff_head *frames) { int index; lockdep_assert_held(&tid_agg_rx->reorder_lock); - while (seq_less(tid_agg_rx->head_seq_num, head_seq_num)) { - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + while (ieee80211_sn_less(tid_agg_rx->head_seq_num, head_seq_num)) { + index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; - ieee80211_release_reorder_frame(sdata, tid_agg_rx, index); + ieee80211_release_reorder_frame(sdata, tid_agg_rx, index, + frames); } } @@ -717,15 +702,16 @@ static void ieee80211_release_reorder_frames(struct ieee80211_sub_if_data *sdata #define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10) static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, - struct tid_ampdu_rx *tid_agg_rx) + struct tid_ampdu_rx *tid_agg_rx, + struct sk_buff_head *frames) { int index, j; lockdep_assert_held(&tid_agg_rx->reorder_lock); /* release the buffer until next missing frame */ - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % - tid_agg_rx->buf_size; + index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; if (!tid_agg_rx->reorder_buf[index] && tid_agg_rx->stored_mpdu_num) { /* @@ -746,24 +732,29 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, ht_dbg_ratelimited(sdata, "release an RX reorder frame due to timeout on earlier frames\n"); - 
ieee80211_release_reorder_frame(sdata, tid_agg_rx, j); + ieee80211_release_reorder_frame(sdata, tid_agg_rx, j, + frames); /* * Increment the head seq# also for the skipped slots. */ tid_agg_rx->head_seq_num = - (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK; + (tid_agg_rx->head_seq_num + + skipped) & IEEE80211_SN_MASK; skipped = 0; } } else while (tid_agg_rx->reorder_buf[index]) { - ieee80211_release_reorder_frame(sdata, tid_agg_rx, index); - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + ieee80211_release_reorder_frame(sdata, tid_agg_rx, index, + frames); + index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; } if (tid_agg_rx->stored_mpdu_num) { - j = index = seq_sub(tid_agg_rx->head_seq_num, - tid_agg_rx->ssn) % tid_agg_rx->buf_size; + j = index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % + tid_agg_rx->buf_size; for (; j != (index - 1) % tid_agg_rx->buf_size; j = (j + 1) % tid_agg_rx->buf_size) { @@ -788,7 +779,8 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, */ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, - struct sk_buff *skb) + struct sk_buff *skb, + struct sk_buff_head *frames) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u16 sc = le16_to_cpu(hdr->seq_ctrl); @@ -803,7 +795,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata head_seq_num = tid_agg_rx->head_seq_num; /* frame with out of date sequence number */ - if (seq_less(mpdu_seq_num, head_seq_num)) { + if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { dev_kfree_skb(skb); goto out; } @@ -812,16 +804,18 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata * If the frame sequence number exceeds our buffering window * size, release some previous frames to make room for this one. */ - if (!seq_less(mpdu_seq_num, head_seq_num + buf_size)) { - head_seq_num = seq_inc(seq_sub(mpdu_seq_num, buf_size)); + if (!ieee80211_sn_less(mpdu_seq_num, head_seq_num + buf_size)) { + head_seq_num = ieee80211_sn_inc( + ieee80211_sn_sub(mpdu_seq_num, buf_size)); /* release stored frames up to new head to stack */ ieee80211_release_reorder_frames(sdata, tid_agg_rx, - head_seq_num); + head_seq_num, frames); } /* Now the new frame is always in the range of the reordering buffer */ - index = seq_sub(mpdu_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; + index = ieee80211_sn_sub(mpdu_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; /* check if we already stored this frame */ if (tid_agg_rx->reorder_buf[index]) { @@ -837,7 +831,8 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata */ if (mpdu_seq_num == tid_agg_rx->head_seq_num && tid_agg_rx->stored_mpdu_num == 0) { - tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); + tid_agg_rx->head_seq_num = + ieee80211_sn_inc(tid_agg_rx->head_seq_num); ret = false; goto out; } @@ -846,7 +841,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata tid_agg_rx->reorder_buf[index] = skb; tid_agg_rx->reorder_time[index] = jiffies; tid_agg_rx->stored_mpdu_num++; - ieee80211_sta_reorder_release(sdata, tid_agg_rx, frames); + ieee80211_sta_reorder_release(sdata, tid_agg_rx, frames); out: spin_unlock(&tid_agg_rx->reorder_lock); @@ -857,7 +852,8 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata * Reorder MPDUs from A-MPDUs, keeping them on a buffer. 
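The seq_less()/seq_inc()/seq_sub() helpers removed from rx.c above become the ieee80211_sn_*() equivalents with the same 12-bit semantics. A minimal user-space sketch of that modular arithmetic (names hypothetical, constants taken from the removed SEQ_MODULO/SEQ_MASK definitions):

#include <stdint.h>
#include <stdio.h>

#define SN_MODULO 0x1000	/* 802.11 sequence numbers are 12 bits wide */
#define SN_MASK   0xfff

/* a precedes b iff the modular distance from b to a exceeds half the space */
static int sn_less(uint16_t a, uint16_t b)
{
	return ((a - b) & SN_MASK) > (SN_MODULO >> 1);
}

static uint16_t sn_inc(uint16_t a) { return (a + 1) & SN_MASK; }
static uint16_t sn_sub(uint16_t a, uint16_t b) { return (a - b) & SN_MASK; }

int main(void)
{
	/* wraparound: 0xffe precedes 0x001 although it is numerically larger */
	printf("%d %d %d\n", sn_less(0xffe, 0x001), sn_inc(0xfff), sn_sub(0x001, 0xffe));
	return 0;
}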
Returns * true if the MPDU was buffered, false if it should be processed. */ -static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx) +static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, + struct sk_buff_head *frames) { struct sk_buff *skb = rx->skb; struct ieee80211_local *local = rx->local; @@ -922,11 +918,12 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx) * sure that we cannot get to it any more before doing * anything with it. */ - if (ieee80211_sta_manage_reorder_buf(rx->sdata, tid_agg_rx, skb)) + if (ieee80211_sta_manage_reorder_buf(rx->sdata, tid_agg_rx, skb, + frames)) return; dont_reorder: - skb_queue_tail(&local->rx_skb_queue, skb); + __skb_queue_tail(frames, skb); } static ieee80211_rx_result debug_noinline @@ -1452,6 +1449,10 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) } } + /* mesh power save support */ + if (ieee80211_vif_is_mesh(&rx->sdata->vif)) + ieee80211_mps_rx_h_sta_process(sta, hdr); + /* * Drop (qos-)data::nullfunc frames silently, since they * are used only to control station power saving mode. @@ -1882,8 +1883,10 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) * 'align' will only take the values 0 or 2 here * since all frames are required to be aligned * to 2-byte boundaries when being passed to - * mac80211. That also explains the __skb_push() - * below. + * mac80211; the code here works just as well if + * that isn't true, but mac80211 assumes it can + * access fields as 2-byte aligned (e.g. for + * compare_ether_addr) */ align = ((unsigned long)(skb->data + sizeof(struct ethhdr))) & 3; if (align) { @@ -2015,7 +2018,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) /* frame is in RMC, don't forward */ if (ieee80211_is_data(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && - mesh_rmc_check(hdr->addr3, mesh_hdr, rx->sdata)) + mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr)) return RX_DROP_MONITOR; if (!ieee80211_is_data(hdr->frame_control) || @@ -2042,9 +2045,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) } rcu_read_lock(); - mppath = mpp_path_lookup(proxied_addr, sdata); + mppath = mpp_path_lookup(sdata, proxied_addr); if (!mppath) { - mpp_path_add(proxied_addr, mpp_addr, sdata); + mpp_path_add(sdata, proxied_addr, mpp_addr); } else { spin_lock_bh(&mppath->state_lock); if (!ether_addr_equal(mppath->mpp, mpp_addr)) @@ -2082,6 +2085,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) } fwd_hdr = (struct ieee80211_hdr *) fwd_skb->data; + fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY); info = IEEE80211_SKB_CB(fwd_skb); memset(info, 0, sizeof(*info)); info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; @@ -2090,12 +2094,15 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(fwd_hdr->addr1)) { IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_mcast); memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN); - } else if (!mesh_nexthop_lookup(fwd_skb, sdata)) { + /* update power mode indication when forwarding */ + ieee80211_mps_set_frame_flags(sdata, NULL, fwd_hdr); + } else if (!mesh_nexthop_lookup(sdata, fwd_skb)) { + /* mesh power mode flags updated in mesh_nexthop_lookup */ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_unicast); } else { /* unable to resolve next hop */ - mesh_path_error_tx(ifmsh->mshcfg.element_ttl, fwd_hdr->addr3, - 0, reason, fwd_hdr->addr2, sdata); + mesh_path_error_tx(sdata, ifmsh->mshcfg.element_ttl, + fwd_hdr->addr3, 0, reason, fwd_hdr->addr2); IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, 
dropped_frames_no_route); kfree_skb(fwd_skb); return RX_DROP_MONITOR; @@ -2177,7 +2184,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) } static ieee80211_rx_result debug_noinline -ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx) +ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) { struct sk_buff *skb = rx->skb; struct ieee80211_bar *bar = (struct ieee80211_bar *)skb->data; @@ -2216,7 +2223,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx) spin_lock(&tid_agg_rx->reorder_lock); /* release stored frames up to start of BAR */ ieee80211_release_reorder_frames(rx->sdata, tid_agg_rx, - start_seq_num); + start_seq_num, frames); spin_unlock(&tid_agg_rx->reorder_lock); kfree_skb(skb); @@ -2353,38 +2360,34 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) sdata->vif.type != NL80211_IFTYPE_ADHOC) break; - /* verify action & smps_control are present */ + /* verify action & smps_control/chanwidth are present */ if (len < IEEE80211_MIN_ACTION_SIZE + 2) goto invalid; switch (mgmt->u.action.u.ht_smps.action) { case WLAN_HT_ACTION_SMPS: { struct ieee80211_supported_band *sband; - u8 smps; + enum ieee80211_smps_mode smps_mode; /* convert to HT capability */ switch (mgmt->u.action.u.ht_smps.smps_control) { case WLAN_HT_SMPS_CONTROL_DISABLED: - smps = WLAN_HT_CAP_SM_PS_DISABLED; + smps_mode = IEEE80211_SMPS_OFF; break; case WLAN_HT_SMPS_CONTROL_STATIC: - smps = WLAN_HT_CAP_SM_PS_STATIC; + smps_mode = IEEE80211_SMPS_STATIC; break; case WLAN_HT_SMPS_CONTROL_DYNAMIC: - smps = WLAN_HT_CAP_SM_PS_DYNAMIC; + smps_mode = IEEE80211_SMPS_DYNAMIC; break; default: goto invalid; } - smps <<= IEEE80211_HT_CAP_SM_PS_SHIFT; /* if no change do nothing */ - if ((rx->sta->sta.ht_cap.cap & - IEEE80211_HT_CAP_SM_PS) == smps) + if (rx->sta->sta.smps_mode == smps_mode) goto handled; - - rx->sta->sta.ht_cap.cap &= ~IEEE80211_HT_CAP_SM_PS; - rx->sta->sta.ht_cap.cap |= smps; + rx->sta->sta.smps_mode = smps_mode; sband = rx->local->hw.wiphy->bands[status->band]; @@ -2392,11 +2395,82 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) IEEE80211_RC_SMPS_CHANGED); goto handled; } + case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { + struct ieee80211_supported_band *sband; + u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; + enum ieee80211_sta_rx_bandwidth new_bw; + + /* If it doesn't support 40 MHz it can't change ... 
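A reduced sketch of the notify-chanwidth decision that follows (stand-in enum and names; the real code derives the wide value from the peer's capabilities via ieee80211_sta_cur_vht_bw()):

enum rx_bw { RX_BW_20, RX_BW_40, RX_BW_80 };

/* A 20 MHz notification pins the peer to 20 MHz RX; any other value
 * restores whatever width its HT/VHT capabilities allow. */
static enum rx_bw notify_chanwidth_to_bw(int notified_20mhz, enum rx_bw cap_bw)
{
	return notified_20mhz ? RX_BW_20 : cap_bw;
}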
*/ + if (!(rx->sta->sta.ht_cap.cap & + IEEE80211_HT_CAP_SUP_WIDTH_20_40)) + goto handled; + + if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ) + new_bw = IEEE80211_STA_RX_BW_20; + else + new_bw = ieee80211_sta_cur_vht_bw(rx->sta); + + if (rx->sta->sta.bandwidth == new_bw) + goto handled; + + sband = rx->local->hw.wiphy->bands[status->band]; + + rate_control_rate_update(local, sband, rx->sta, + IEEE80211_RC_BW_CHANGED); + goto handled; + } default: goto invalid; } break; + case WLAN_CATEGORY_PUBLIC: + if (len < IEEE80211_MIN_ACTION_SIZE + 1) + goto invalid; + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + if (!rx->sta) + break; + if (!ether_addr_equal(mgmt->bssid, sdata->u.mgd.bssid)) + break; + if (mgmt->u.action.u.ext_chan_switch.action_code != + WLAN_PUB_ACTION_EXT_CHANSW_ANN) + break; + if (len < offsetof(struct ieee80211_mgmt, + u.action.u.ext_chan_switch.variable)) + goto invalid; + goto queue; + case WLAN_CATEGORY_VHT: + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT && + sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_AP && + sdata->vif.type != NL80211_IFTYPE_ADHOC) + break; + + /* verify action code is present */ + if (len < IEEE80211_MIN_ACTION_SIZE + 1) + goto invalid; + + switch (mgmt->u.action.u.vht_opmode_notif.action_code) { + case WLAN_VHT_ACTION_OPMODE_NOTIF: { + u8 opmode; + + /* verify opmode is present */ + if (len < IEEE80211_MIN_ACTION_SIZE + 2) + goto invalid; + + opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; + + ieee80211_vht_handle_opmode(rx->sdata, rx->sta, + opmode, status->band, + false); + goto handled; + } + default: + break; + } + break; case WLAN_CATEGORY_BACK: if (sdata->vif.type != NL80211_IFTYPE_STATION && sdata->vif.type != NL80211_IFTYPE_MESH_POINT && @@ -2449,10 +2523,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) ieee80211_process_measurement_req(sdata, mgmt, len); goto handled; case WLAN_ACTION_SPCT_CHL_SWITCH: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.chan_switch))) - break; - if (sdata->vif.type != NL80211_IFTYPE_STATION) break; @@ -2486,7 +2556,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) case WLAN_SP_MESH_PEERING_CONFIRM: if (!ieee80211_vif_is_mesh(&sdata->vif)) goto invalid; - if (sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE) + if (sdata->u.mesh.user_mpm) /* userspace handles this frame */ break; goto queue; @@ -2609,7 +2679,19 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) memset(nskb->cb, 0, sizeof(nskb->cb)); - ieee80211_tx_skb(rx->sdata, nskb); + if (rx->sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(nskb); + + info->flags = IEEE80211_TX_CTL_TX_OFFCHAN | + IEEE80211_TX_INTFL_OFFCHAN_TX_OK | + IEEE80211_TX_CTL_NO_CCK_RATE; + if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + info->hw_queue = + local->hw.offchannel_tx_hw_queue; + } + + __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, + status->band); } dev_kfree_skb(rx->skb); return RX_QUEUED; @@ -2648,8 +2730,9 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; break; case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ): - /* process only for ibss */ - if (sdata->vif.type != NL80211_IFTYPE_ADHOC) + /* process only for ibss and mesh */ + if (sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT) return RX_DROP_MONITOR; break; default: @@ -2772,7 +2855,8 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data 
*rx, } } -static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx) +static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, + struct sk_buff_head *frames) { ieee80211_rx_result res = RX_DROP_MONITOR; struct sk_buff *skb; @@ -2784,15 +2868,9 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx) goto rxh_next; \ } while (0); - spin_lock(&rx->local->rx_skb_queue.lock); - if (rx->local->running_rx_handler) - goto unlock; - - rx->local->running_rx_handler = true; - - while ((skb = __skb_dequeue(&rx->local->rx_skb_queue))) { - spin_unlock(&rx->local->rx_skb_queue.lock); + spin_lock_bh(&rx->local->rx_path_lock); + while ((skb = __skb_dequeue(frames))) { /* * all the other fields are valid across frames * that belong to an aMPDU since they are on the @@ -2813,7 +2891,12 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx) #endif CALL_RXH(ieee80211_rx_h_amsdu) CALL_RXH(ieee80211_rx_h_data) - CALL_RXH(ieee80211_rx_h_ctrl); + + /* special treatment -- needs the queue */ + res = ieee80211_rx_h_ctrl(rx, frames); + if (res != RX_CONTINUE) + goto rxh_next; + CALL_RXH(ieee80211_rx_h_mgmt_check) CALL_RXH(ieee80211_rx_h_action) CALL_RXH(ieee80211_rx_h_userspace_mgmt) @@ -2822,20 +2905,20 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx) rxh_next: ieee80211_rx_handlers_result(rx, res); - spin_lock(&rx->local->rx_skb_queue.lock); + #undef CALL_RXH } - rx->local->running_rx_handler = false; - - unlock: - spin_unlock(&rx->local->rx_skb_queue.lock); + spin_unlock_bh(&rx->local->rx_path_lock); } static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) { + struct sk_buff_head reorder_release; ieee80211_rx_result res = RX_DROP_MONITOR; + __skb_queue_head_init(&reorder_release); + #define CALL_RXH(rxh) \ do { \ res = rxh(rx); \ @@ -2845,9 +2928,9 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) CALL_RXH(ieee80211_rx_h_check) - ieee80211_rx_reorder_ampdu(rx); + ieee80211_rx_reorder_ampdu(rx, &reorder_release); - ieee80211_rx_handlers(rx); + ieee80211_rx_handlers(rx, &reorder_release); return; rxh_next: @@ -2862,6 +2945,7 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) */ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) { + struct sk_buff_head frames; struct ieee80211_rx_data rx = { .sta = sta, .sdata = sta->sdata, @@ -2877,11 +2961,13 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) if (!tid_agg_rx) return; + __skb_queue_head_init(&frames); + spin_lock(&tid_agg_rx->reorder_lock); - ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx); + ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames); spin_unlock(&tid_agg_rx->reorder_lock); - ieee80211_rx_handlers(&rx); + ieee80211_rx_handlers(&rx, &frames); } /* main receive path */ @@ -2969,7 +3055,8 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, !ieee80211_is_probe_resp(hdr->frame_control) && !ieee80211_is_beacon(hdr->frame_control)) return 0; - if (!ether_addr_equal(sdata->vif.addr, hdr->addr1)) + if (!ether_addr_equal(sdata->vif.addr, hdr->addr1) && + !multicast) status->rx_flags &= ~IEEE80211_RX_RA_MATCH; break; default: diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index bf82e69d0601..99b103921a4b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -27,22 +27,15 @@ #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) -#define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 8) - -static void ieee80211_rx_bss_free(struct cfg80211_bss *cbss) -{ 
- struct ieee80211_bss *bss = (void *)cbss->priv; - - kfree(bss_mesh_id(bss)); - kfree(bss_mesh_cfg(bss)); -} +#define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9) void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { if (!bss) return; - cfg80211_put_bss(container_of((void *)bss, struct cfg80211_bss, priv)); + cfg80211_put_bss(local->hw.wiphy, + container_of((void *)bss, struct cfg80211_bss, priv)); } static bool is_uapsd_supported(struct ieee802_11_elems *elems) @@ -65,12 +58,11 @@ static bool is_uapsd_supported(struct ieee802_11_elems *elems) struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, - struct ieee80211_mgmt *mgmt, - size_t len, + struct ieee80211_mgmt *mgmt, size_t len, struct ieee802_11_elems *elems, - struct ieee80211_channel *channel, - bool beacon) + struct ieee80211_channel *channel) { + bool beacon = ieee80211_is_beacon(mgmt->frame_control); struct cfg80211_bss *cbss; struct ieee80211_bss *bss; int clen, srlen; @@ -86,10 +78,12 @@ ieee80211_bss_info_update(struct ieee80211_local *local, if (!cbss) return NULL; - cbss->free_priv = ieee80211_rx_bss_free; bss = (void *)cbss->priv; - bss->device_ts = rx_status->device_timestamp; + if (beacon) + bss->device_ts_beacon = rx_status->device_timestamp; + else + bss->device_ts_presp = rx_status->device_timestamp; if (elems->parse_error) { if (beacon) @@ -104,9 +98,8 @@ ieee80211_bss_info_update(struct ieee80211_local *local, } /* save the ERP value so that it is available at association time */ - if (elems->erp_info && elems->erp_info_len >= 1 && - (!elems->parse_error || - !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { + if (elems->erp_info && (!elems->parse_error || + !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { bss->erp_value = elems->erp_info[0]; bss->has_erp_value = true; if (!elems->parse_error) @@ -147,9 +140,6 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bss->valid_data |= IEEE80211_BSS_VALID_WMM; } - if (!beacon) - bss->last_probe_resp = jiffies; - return bss; } @@ -162,7 +152,6 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) u8 *elements; struct ieee80211_channel *channel; size_t baselen; - bool beacon; struct ieee802_11_elems elems; if (skb->len < 24 || @@ -184,17 +173,15 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) elements = mgmt->u.probe_resp.variable; baselen = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); - beacon = false; } else { baselen = offsetof(struct ieee80211_mgmt, u.beacon.variable); elements = mgmt->u.beacon.variable; - beacon = true; } if (baselen > skb->len) return; - ieee802_11_parse_elems(elements, skb->len - baselen, &elems); + ieee802_11_parse_elems(elements, skb->len - baselen, false, &elems); channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); @@ -203,7 +190,7 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, &elems, - channel, beacon); + channel); if (bss) ieee80211_rx_bss_put(local, bss); } @@ -343,6 +330,9 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local) ieee80211_offchannel_stop_vifs(local); + /* ensure nullfunc is transmitted before leaving operating channel */ + ieee80211_flush_queues(local, NULL); + ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. 
*/ @@ -357,6 +347,9 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local) static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { + if (local->radar_detect_enabled) + return false; + if (!list_empty(&local->roc_list)) return false; @@ -390,7 +383,12 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, { int i; struct ieee80211_sub_if_data *sdata; - enum ieee80211_band band = local->hw.conf.channel->band; + enum ieee80211_band band = local->hw.conf.chandef.chan->band; + u32 tx_flags; + + tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; + if (local->scan_req->no_cck) + tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->mtx)); @@ -402,8 +400,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, local->scan_req->ssids[i].ssid_len, local->scan_req->ie, local->scan_req->ie_len, local->scan_req->rates[band], false, - local->scan_req->no_cck, - local->hw.conf.channel, true); + tx_flags, local->hw.conf.chandef.chan, true); /* * After sending probe requests, wait for probe responses @@ -469,7 +466,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, if (local->ops->hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && - (req->channels[0] == local->_oper_channel)) { + (req->channels[0] == local->_oper_chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities @@ -547,8 +544,6 @@ static void ieee80211_scan_state_decision(struct ieee80211_local *local, bool associated = false; bool tx_empty = true; bool bad_latency; - bool listen_int_exceeded; - unsigned long min_beacon_int = 0; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; enum mac80211_scan_state next_scan_state; @@ -567,11 +562,6 @@ static void ieee80211_scan_state_decision(struct ieee80211_local *local, if (sdata->u.mgd.associated) { associated = true; - if (sdata->vif.bss_conf.beacon_int < - min_beacon_int || min_beacon_int == 0) - min_beacon_int = - sdata->vif.bss_conf.beacon_int; - if (!qdisc_all_tx_empty(sdata->dev)) { tx_empty = false; break; @@ -588,34 +578,19 @@ static void ieee80211_scan_state_decision(struct ieee80211_local *local, * see if we can scan another channel without interfering * with the current traffic situation. * - * Since we don't know if the AP has pending frames for us - * we can only check for our tx queues and use the current - * pm_qos requirements for rx. Hence, if no tx traffic occurs - * at all we will scan as many channels in a row as the pm_qos - * latency allows us to. Additionally we also check for the - * currently negotiated listen interval to prevent losing - * frames unnecessarily. - * - * Otherwise switch back to the operating channel. + * Keep good latency, do not stay off-channel more than 125 ms. 
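The single bound that replaces the old pm_qos and listen-interval checks reads as one predicate; a standalone sketch, with hypothetical names and the kernel's wrap-safe comparison idiom:

#include <stdbool.h>

typedef unsigned long jiffies_t;	/* stand-in for the kernel's jiffies */

static bool time_after_sketch(jiffies_t a, jiffies_t b)
{
	return (long)(b - a) < 0;	/* wrap-safe, as in <linux/jiffies.h> */
}

/* Suspend the scan if visiting the next channel would keep us away from
 * the operating channel for more than 125 ms (HZ / 8). */
static bool scan_latency_bad(jiffies_t now, jiffies_t leave_oper_time,
			     jiffies_t next_chan_time, unsigned long hz)
{
	return time_after_sketch(now + next_chan_time, leave_oper_time + hz / 8);
}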
*/ bad_latency = time_after(jiffies + - ieee80211_scan_get_channel_time(next_chan), - local->leave_oper_channel_time + - usecs_to_jiffies(pm_qos_request(PM_QOS_NETWORK_LATENCY))); - - listen_int_exceeded = time_after(jiffies + - ieee80211_scan_get_channel_time(next_chan), - local->leave_oper_channel_time + - usecs_to_jiffies(min_beacon_int * 1024) * - local->hw.conf.listen_interval); + ieee80211_scan_get_channel_time(next_chan), + local->leave_oper_channel_time + HZ / 8); if (associated && !tx_empty) { if (local->scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) next_scan_state = SCAN_ABORT; else next_scan_state = SCAN_SUSPEND; - } else if (associated && (bad_latency || listen_int_exceeded)) { + } else if (associated && bad_latency) { next_scan_state = SCAN_SUSPEND; } else { next_scan_state = SCAN_SET_CHANNEL; @@ -692,7 +667,7 @@ static void ieee80211_scan_state_resume(struct ieee80211_local *local, ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { - drv_flush(local, false); + ieee80211_flush_queues(local, NULL); *next_delay = 0; } else *next_delay = HZ / 10; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index ca9fde198188..11216bc13b27 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -104,12 +104,24 @@ static void cleanup_single_sta(struct sta_info *sta) * neither mac80211 nor the driver can reference this * sta struct any more except by still existing timers * associated with this station that we clean up below. + * + * Note though that this still uses the sdata and even + * calls the driver in AP and mesh mode, so interfaces + * of those types must call sta_info_flush_cleanup() + * (typically via sta_info_flush()) before deconfiguring + * the driver. + * + * In station mode, nothing happens here so it doesn't + * have to (and doesn't) do that; this is intentional to + * speed up roaming. 
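The ordering this note demands is exactly what the sta_info_flush() inline added to sta_info.h later in this patch provides; restated as a sketch in kernel context:

static inline int sta_info_flush_sketch(struct ieee80211_sub_if_data *sdata)
{
	int ret = sta_info_flush_defer(sdata);	/* unlink STAs, queues call_rcu() */

	rcu_barrier();				/* wait for the call_rcu() callbacks */
	sta_info_flush_cleanup(sdata);		/* drain the per-sdata cleanup queue */

	return ret;
}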
*/ if (test_sta_flag(sta, WLAN_STA_PS_STA)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; + else if (ieee80211_vif_is_mesh(&sdata->vif)) + ps = &sdata->u.mesh.ps; else return; @@ -125,13 +137,8 @@ static void cleanup_single_sta(struct sta_info *sta) ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } -#ifdef CONFIG_MAC80211_MESH - if (ieee80211_vif_is_mesh(&sdata->vif)) { - mesh_accept_plinks_update(sdata); - mesh_plink_deactivate(sta); - del_timer_sync(&sta->plink_timer); - } -#endif + if (ieee80211_vif_is_mesh(&sdata->vif)) + mesh_sta_cleanup(sta); cancel_work_sync(&sta->drv_unblock_wk); @@ -335,6 +342,11 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, INIT_WORK(&sta->drv_unblock_wk, sta_unblock); INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); +#ifdef CONFIG_MAC80211_MESH + if (ieee80211_vif_is_mesh(&sdata->vif) && + !sdata->u.mesh.user_mpm) + init_timer(&sta->plink_timer); +#endif memcpy(sta->sta.addr, addr, ETH_ALEN); sta->local = local; @@ -368,12 +380,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); - sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); + sta->sta.smps_mode = IEEE80211_SMPS_OFF; -#ifdef CONFIG_MAC80211_MESH - sta->plink_state = NL80211_PLINK_LISTEN; - init_timer(&sta->plink_timer); -#endif + sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; } @@ -547,6 +556,15 @@ static inline void __bss_tim_clear(u8 *tim, u16 id) tim[id / 8] &= ~(1 << (id % 8)); } +static inline bool __bss_tim_get(u8 *tim, u16 id) +{ + /* + * This format has been mandated by the IEEE specifications, + * so this line may not be changed to use the test_bit() format. 
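A small user-space sketch of the partial-virtual-bitmap indexing these helpers implement (AID n lives in octet n / 8, bit n % 8, LSB first; demo only):

#include <stdint.h>
#include <stdio.h>

static void tim_set(uint8_t *tim, uint16_t id)   { tim[id / 8] |= 1 << (id % 8); }
static void tim_clear(uint8_t *tim, uint16_t id) { tim[id / 8] &= ~(1 << (id % 8)); }
static int  tim_get(const uint8_t *tim, uint16_t id) { return tim[id / 8] & (1 << (id % 8)); }

int main(void)
{
	uint8_t tim[251] = { 0 };	/* enough octets for AIDs up to 2007 */

	tim_set(tim, 13);		/* AID 13: octet 1, bit 5 */
	printf("%d\n", !!tim_get(tim, 13));	/* 1 */
	tim_clear(tim, 13);
	printf("%d\n", !!tim_get(tim, 13));	/* 0 */
	return 0;
}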
+ */ + return tim[id / 8] & (1 << (id % 8)); +} + static unsigned long ieee80211_tids_for_ac(int ac) { /* If we ever support TIDs > 7, this obviously needs to be adjusted */ @@ -569,7 +587,6 @@ void sta_info_recalc_tim(struct sta_info *sta) { struct ieee80211_local *local = sta->local; struct ps_data *ps; - unsigned long flags; bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; @@ -582,6 +599,12 @@ void sta_info_recalc_tim(struct sta_info *sta) ps = &sta->sdata->bss->ps; id = sta->sta.aid; +#ifdef CONFIG_MAC80211_MESH + } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { + ps = &sta->sdata->u.mesh.ps; + /* TIM map only for PLID <= IEEE80211_MAX_AID */ + id = le16_to_cpu(sta->plid) % IEEE80211_MAX_AID; +#endif } else { return; } @@ -620,7 +643,10 @@ void sta_info_recalc_tim(struct sta_info *sta) } done: - spin_lock_irqsave(&local->tim_lock, flags); + spin_lock_bh(&local->tim_lock); + + if (indicate_tim == __bss_tim_get(ps->tim, id)) + goto out_unlock; if (indicate_tim) __bss_tim_set(ps->tim, id); @@ -633,7 +659,8 @@ void sta_info_recalc_tim(struct sta_info *sta) local->tim_in_locked_section = false; } - spin_unlock_irqrestore(&local->tim_lock, flags); +out_unlock: + spin_unlock_bh(&local->tim_lock); } static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb) @@ -740,8 +767,9 @@ static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, bool have_buffered = false; int ac; - /* This is only necessary for stations on BSS interfaces */ - if (!sta->sdata->bss) + /* This is only necessary for stations on BSS/MBSS interfaces */ + if (!sta->sdata->bss && + !ieee80211_vif_is_mesh(&sta->sdata->vif)) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) @@ -755,7 +783,7 @@ int __must_check __sta_info_destroy(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; - int ret, i; + int ret; might_sleep(); @@ -774,7 +802,7 @@ int __must_check __sta_info_destroy(struct sta_info *sta) * will be sufficient. */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); - ieee80211_sta_tear_down_BA_sessions(sta, false); + ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); ret = sta_info_hash_del(local, sta); if (ret) @@ -782,12 +810,8 @@ int __must_check __sta_info_destroy(struct sta_info *sta) list_del_rcu(&sta->list); - mutex_lock(&local->key_mtx); - for (i = 0; i < NUM_DEFAULT_KEYS; i++) - __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i])); - if (sta->ptk) - __ieee80211_key_free(key_mtx_dereference(local, sta->ptk)); - mutex_unlock(&local->key_mtx); + /* this always calls synchronize_net() */ + ieee80211_free_sta_keys(local, sta); sta->dead = true; @@ -885,20 +909,12 @@ void sta_info_init(struct ieee80211_local *local) void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); - sta_info_flush(local, NULL); } -/** - * sta_info_flush - flush matching STA entries from the STA table - * - * Returns the number of removed STA entries. 
- * - * @local: local interface data - * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs - */ -int sta_info_flush(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) + +int sta_info_flush_defer(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; int ret = 0; @@ -906,30 +922,22 @@ int sta_info_flush(struct ieee80211_local *local, mutex_lock(&local->sta_mtx); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { - if (!sdata || sdata == sta->sdata) { + if (sdata == sta->sdata) { WARN_ON(__sta_info_destroy(sta)); ret++; } } mutex_unlock(&local->sta_mtx); - rcu_barrier(); - - if (sdata) { - ieee80211_cleanup_sdata_stas(sdata); - cancel_work_sync(&sdata->cleanup_stations_wk); - } else { - mutex_lock(&local->iflist_mtx); - list_for_each_entry(sdata, &local->interfaces, list) { - ieee80211_cleanup_sdata_stas(sdata); - cancel_work_sync(&sdata->cleanup_stations_wk); - } - mutex_unlock(&local->iflist_mtx); - } - return ret; } +void sta_info_flush_cleanup(struct ieee80211_sub_if_data *sdata) +{ + ieee80211_cleanup_sdata_stas(sdata); + cancel_work_sync(&sdata->cleanup_stations_wk); +} + void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { @@ -945,6 +953,11 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, if (time_after(jiffies, sta->last_rx + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); + + if (ieee80211_vif_is_mesh(&sdata->vif) && + test_sta_flag(sta, WLAN_STA_PS_STA)) + atomic_dec(&sdata->u.mesh.ps.num_sta_ps); + WARN_ON(__sta_info_destroy(sta)); } } @@ -1003,6 +1016,8 @@ static void clear_sta_ps_flags(void *_sta) if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; + else if (ieee80211_vif_is_mesh(&sdata->vif)) + ps = &sdata->u.mesh.ps; else return; @@ -1120,6 +1135,8 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); + skb->dev = sdata->dev; + rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { @@ -1380,30 +1397,16 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_sta_block_awake); -void ieee80211_sta_eosp_irqsafe(struct ieee80211_sta *pubsta) +void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->local; - struct sk_buff *skb; - struct skb_eosp_msg_data *data; trace_api_eosp(local, pubsta); - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) { - /* too bad ... but race is better than loss */ - clear_sta_flag(sta, WLAN_STA_SP); - return; - } - - data = (void *)skb->cb; - memcpy(data->sta, pubsta->addr, ETH_ALEN); - memcpy(data->iface, sta->sdata->vif.addr, ETH_ALEN); - skb->pkt_type = IEEE80211_EOSP_MSG; - skb_queue_tail(&local->skb_queue, skb); - tasklet_schedule(&local->tasklet); + clear_sta_flag(sta, WLAN_STA_SP); } -EXPORT_SYMBOL(ieee80211_sta_eosp_irqsafe); +EXPORT_SYMBOL(ieee80211_sta_eosp); void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 37c1889afd3a..adc30045f99e 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -56,6 +56,8 @@ * @WLAN_STA_INSERTED: This station is inserted into the hash table. 
* @WLAN_STA_RATE_CONTROL: rate control was initialized for this station. * @WLAN_STA_TOFFSET_KNOWN: toffset calculated for this station is valid. + * @WLAN_STA_MPSP_OWNER: local STA is owner of a mesh Peer Service Period. + * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH, @@ -78,6 +80,8 @@ enum ieee80211_sta_info_flags { WLAN_STA_INSERTED, WLAN_STA_RATE_CONTROL, WLAN_STA_TOFFSET_KNOWN, + WLAN_STA_MPSP_OWNER, + WLAN_STA_MPSP_RECIPIENT, }; #define ADDBA_RESP_INTERVAL HZ @@ -92,6 +96,13 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_WANT_START 4 #define HT_AGG_STATE_WANT_STOP 5 +enum ieee80211_agg_stop_reason { + AGG_STOP_DECLINED, + AGG_STOP_LOCAL_REQUEST, + AGG_STOP_PEER_REQUEST, + AGG_STOP_DESTROY_STA, +}; + /** * struct tid_ampdu_tx - TID aggregation information (Tx). * @@ -270,11 +281,12 @@ struct sta_ampdu_mlme { * @plink_state: peer link state * @plink_timeout: timeout of peer link * @plink_timer: peer link watch timer - * @plink_timer_was_running: used by suspend/resume to restore timers * @t_offset: timing offset relative to this host * @t_offset_setpoint: reference timing offset of this sta to be used when * calculating clockdrift - * @ch_width: peer's channel width + * @local_pm: local link-specific power save mode + * @peer_pm: peer-specific power save mode towards local STA + * @nonpeer_pm: STA power save mode towards non-peer neighbors * @debugfs: debug filesystem info * @dead: set to true when sta is unlinked * @uploaded: set to true when sta is uploaded to the driver @@ -282,8 +294,9 @@ struct sta_ampdu_mlme { * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) * @beacon_loss_count: number of times beacon loss has triggered - * @supports_40mhz: tracks whether the station advertised 40 MHz support - * as we overwrite its HT parameters with the currently used value + * @rcu_head: RCU head used for freeing this station struct + * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, + * taken from HT/VHT capabilities or VHT operating mode notification */ struct sta_info { /* General information, mostly static */ @@ -320,7 +333,8 @@ struct sta_info { unsigned long driver_buffered_tids; /* Updated from RX path only, no locking requirements */ - unsigned long rx_packets, rx_bytes; + unsigned long rx_packets; + u64 rx_bytes; unsigned long wep_weak_iv_count; unsigned long last_rx; long last_connected; @@ -340,9 +354,9 @@ struct sta_info { unsigned int fail_avg; /* Updated from TX path only, no locking requirements */ - unsigned long tx_packets; - unsigned long tx_bytes; - unsigned long tx_fragments; + u32 tx_fragments; + u64 tx_packets[IEEE80211_NUM_ACS]; + u64 tx_bytes[IEEE80211_NUM_ACS]; struct ieee80211_tx_rate last_tx_rate; int last_rx_rate_idx; u32 last_rx_rate_flag; @@ -365,13 +379,15 @@ struct sta_info { __le16 reason; u8 plink_retries; bool ignore_plink_timer; - bool plink_timer_was_running; enum nl80211_plink_state plink_state; u32 plink_timeout; struct timer_list plink_timer; s64 t_offset; s64 t_offset_setpoint; - enum nl80211_chan_width ch_width; + /* mesh power save */ + enum nl80211_mesh_power_mode local_pm; + enum nl80211_mesh_power_mode peer_pm; + enum nl80211_mesh_power_mode nonpeer_pm; #endif #ifdef CONFIG_MAC80211_DEBUGFS @@ -381,11 +397,11 @@ struct sta_info { } debugfs; #endif + enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; + unsigned int lost_packets; unsigned int beacon_loss_count; - bool supports_40mhz; 
- /* keep last! */ struct ieee80211_sta sta; }; @@ -548,8 +564,39 @@ void sta_info_recalc_tim(struct sta_info *sta); void sta_info_init(struct ieee80211_local *local); void sta_info_stop(struct ieee80211_local *local); -int sta_info_flush(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata); +int sta_info_flush_defer(struct ieee80211_sub_if_data *sdata); + +/** + * sta_info_flush_cleanup - flush the sta_info cleanup queue + * @sdata: the interface + * + * Flushes the sta_info cleanup queue for a given interface; + * this is necessary before the interface is removed or, for + * AP/mesh interfaces, before it is deconfigured. + * + * Note an rcu_barrier() must precede the function, after all + * stations have been flushed/removed to ensure the call_rcu() + * calls that add stations to the cleanup queue have completed. + */ +void sta_info_flush_cleanup(struct ieee80211_sub_if_data *sdata); + +/** + * sta_info_flush - flush matching STA entries from the STA table + * + * Returns the number of removed STA entries. + * + * @sdata: sdata to remove all stations from + */ +static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata) +{ + int ret = sta_info_flush_defer(sdata); + + rcu_barrier(); + sta_info_flush_cleanup(sdata); + + return ret; +} + void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 07d99578a2b1..43439203f4e4 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -335,7 +335,8 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, if (dropped) acked = false; - if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { + if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_INTFL_MLME_CONN_TX)) { struct ieee80211_sub_if_data *sdata = NULL; struct ieee80211_sub_if_data *iter_sdata; u64 cookie = (unsigned long)skb; @@ -357,10 +358,13 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, sdata = rcu_dereference(local->p2p_sdata); } - if (!sdata) + if (!sdata) { skb->dev = NULL; - else if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) { + } else if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { + ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, + acked); + } else if (ieee80211_is_nullfunc(hdr->frame_control) || + ieee80211_is_qos_nullfunc(hdr->frame_control)) { cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, GFP_ATOMIC); } else { @@ -468,6 +472,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) return; } + /* mesh Peer Service Period support */ + if (ieee80211_vif_is_mesh(&sta->sdata->vif) && + ieee80211_is_data_qos(fc)) + ieee80211_mpsp_trigger_process( + ieee80211_get_qos_ctl(hdr), + sta, true, acked); + if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) && (rates_idx != -1)) sta->last_tx_rate = info->status.rates[rates_idx]; @@ -502,11 +513,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) IEEE80211_BAR_CTRL_TID_INFO_MASK) >> IEEE80211_BAR_CTRL_TID_INFO_SHIFT; - if (local->hw.flags & - IEEE80211_HW_TEARDOWN_AGGR_ON_BAR_FAIL) - ieee80211_stop_tx_ba_session(&sta->sta, tid); - else - ieee80211_set_bar_pending(sta, tid, ssn); + ieee80211_set_bar_pending(sta, tid, ssn); } } diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index 57e14d59e12f..3ed801d90f1e 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -177,12 +177,11 @@ void 
ieee80211_get_tkip_p1k_iv(struct ieee80211_key_conf *keyconf, struct ieee80211_key *key = (struct ieee80211_key *) container_of(keyconf, struct ieee80211_key, conf); struct tkip_ctx *ctx = &key->u.tkip.tx; - unsigned long flags; - spin_lock_irqsave(&key->u.tkip.txlock, flags); + spin_lock_bh(&key->u.tkip.txlock); ieee80211_compute_tkip_p1k(key, iv32); memcpy(p1k, ctx->p1k, sizeof(ctx->p1k)); - spin_unlock_irqrestore(&key->u.tkip.txlock, flags); + spin_unlock_bh(&key->u.tkip.txlock); } EXPORT_SYMBOL(ieee80211_get_tkip_p1k_iv); @@ -208,12 +207,11 @@ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, const u8 *data = (u8 *)hdr + ieee80211_hdrlen(hdr->frame_control); u32 iv32 = get_unaligned_le32(&data[4]); u16 iv16 = data[2] | (data[0] << 8); - unsigned long flags; - spin_lock_irqsave(&key->u.tkip.txlock, flags); + spin_lock_bh(&key->u.tkip.txlock); ieee80211_compute_tkip_p1k(key, iv32); tkip_mixing_phase2(tk, ctx, iv16, p2k); - spin_unlock_irqrestore(&key->u.tkip.txlock, flags); + spin_unlock_bh(&key->u.tkip.txlock); } EXPORT_SYMBOL(ieee80211_get_tkip_p2k); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index a8270b441a6f..c215fafd7a2f 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -28,21 +28,27 @@ #define VIF_PR_FMT " vif:%s(%d%s)" #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" -#define CHANCTX_ENTRY __field(u32, control_freq) \ - __field(u32, chan_width) \ - __field(u32, center_freq1) \ - __field(u32, center_freq2) \ - __field(u8, rx_chains_static) \ +#define CHANDEF_ENTRY __field(u32, control_freq) \ + __field(u32, chan_width) \ + __field(u32, center_freq1) \ + __field(u32, center_freq2) +#define CHANDEF_ASSIGN(c) \ + __entry->control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ + __entry->chan_width = (c)->width; \ + __entry->center_freq1 = (c)->center_freq1; \ + __entry->center_freq2 = (c)->center_freq2; +#define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz" +#define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \ + __entry->center_freq1, __entry->center_freq2 + +#define CHANCTX_ENTRY CHANDEF_ENTRY \ + __field(u8, rx_chains_static) \ __field(u8, rx_chains_dynamic) -#define CHANCTX_ASSIGN __entry->control_freq = ctx->conf.def.chan->center_freq;\ - __entry->chan_width = ctx->conf.def.width; \ - __entry->center_freq1 = ctx->conf.def.center_freq1; \ - __entry->center_freq2 = ctx->conf.def.center_freq2; \ - __entry->rx_chains_static = ctx->conf.rx_chains_static; \ +#define CHANCTX_ASSIGN CHANDEF_ASSIGN(&ctx->conf.def) \ + __entry->rx_chains_static = ctx->conf.rx_chains_static; \ __entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic -#define CHANCTX_PR_FMT " control:%d MHz width:%d center: %d/%d MHz chains:%d/%d" -#define CHANCTX_PR_ARG __entry->control_freq, __entry->chan_width, \ - __entry->center_freq1, __entry->center_freq2, \ +#define CHANCTX_PR_FMT CHANDEF_PR_FMT " chains:%d/%d" +#define CHANCTX_PR_ARG CHANDEF_PR_ARG, \ __entry->rx_chains_static, __entry->rx_chains_dynamic @@ -280,8 +286,7 @@ TRACE_EVENT(drv_config, __field(u16, listen_interval) __field(u8, long_frame_max_tx_count) __field(u8, short_frame_max_tx_count) - __field(int, center_freq) - __field(int, channel_type) + CHANDEF_ENTRY __field(int, smps) ), @@ -297,15 +302,13 @@ TRACE_EVENT(drv_config, local->hw.conf.long_frame_max_tx_count; __entry->short_frame_max_tx_count = local->hw.conf.short_frame_max_tx_count; - __entry->center_freq = local->hw.conf.channel ? 
- local->hw.conf.channel->center_freq : 0; - __entry->channel_type = local->hw.conf.channel_type; + CHANDEF_ASSIGN(&local->hw.conf.chandef) __entry->smps = local->hw.conf.smps_mode; ), TP_printk( - LOCAL_PR_FMT " ch:%#x freq:%d", - LOCAL_PR_ARG, __entry->changed, __entry->center_freq + LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT, + LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG ) ); @@ -334,6 +337,7 @@ TRACE_EVENT(drv_bss_info_changed, __field(u16, assoc_cap) __field(u64, sync_tsf) __field(u32, sync_device_ts) + __field(u8, sync_dtim_count) __field(u32, basic_rates) __array(int, mcast_rate, IEEE80211_NUM_BANDS) __field(u16, ht_operation_mode) @@ -341,16 +345,18 @@ TRACE_EVENT(drv_bss_info_changed, __field(s32, cqm_rssi_hyst); __field(u32, channel_width); __field(u32, channel_cfreq1); - __dynamic_array(u32, arp_addr_list, info->arp_addr_cnt); - __field(bool, arp_filter_enabled); + __dynamic_array(u32, arp_addr_list, + info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? + IEEE80211_BSS_ARP_ADDR_LIST_LEN : + info->arp_addr_cnt); + __field(int, arp_addr_cnt); __field(bool, qos); __field(bool, idle); __field(bool, ps); __dynamic_array(u8, ssid, info->ssid_len); __field(bool, hidden_ssid); __field(int, txpower) - __field(u8, p2p_ctwindow) - __field(bool, p2p_oppps) + __field(u8, p2p_oppps_ctwindow) ), TP_fast_assign( @@ -370,6 +376,7 @@ TRACE_EVENT(drv_bss_info_changed, __entry->assoc_cap = info->assoc_capability; __entry->sync_tsf = info->sync_tsf; __entry->sync_device_ts = info->sync_device_ts; + __entry->sync_dtim_count = info->sync_dtim_count; __entry->basic_rates = info->basic_rates; memcpy(__entry->mcast_rate, info->mcast_rate, sizeof(__entry->mcast_rate)); @@ -378,17 +385,18 @@ TRACE_EVENT(drv_bss_info_changed, __entry->cqm_rssi_hyst = info->cqm_rssi_hyst; __entry->channel_width = info->chandef.width; __entry->channel_cfreq1 = info->chandef.center_freq1; + __entry->arp_addr_cnt = info->arp_addr_cnt; memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list, - sizeof(u32) * info->arp_addr_cnt); - __entry->arp_filter_enabled = info->arp_filter_enabled; + sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? 
+ IEEE80211_BSS_ARP_ADDR_LIST_LEN : + info->arp_addr_cnt)); __entry->qos = info->qos; __entry->idle = info->idle; __entry->ps = info->ps; memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len); __entry->hidden_ssid = info->hidden_ssid; __entry->txpower = info->txpower; - __entry->p2p_ctwindow = info->p2p_ctwindow; - __entry->p2p_oppps = info->p2p_oppps; + __entry->p2p_oppps_ctwindow = info->p2p_noa_attr.oppps_ctwindow; ), TP_printk( @@ -418,6 +426,30 @@ TRACE_EVENT(drv_prepare_multicast, ) ); +TRACE_EVENT(drv_set_multicast_list, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, int mc_count), + + TP_ARGS(local, sdata, mc_count), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, allmulti) + __field(int, mc_count) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->allmulti = sdata->flags & IEEE80211_SDATA_ALLMULTI; + __entry->mc_count = mc_count; + ), + + TP_printk( + LOCAL_PR_FMT " configure mc filter, count=%d, allmulti=%d", + LOCAL_PR_ARG, __entry->mc_count, __entry->allmulti + ) +); + TRACE_EVENT(drv_configure_filter, TP_PROTO(struct ieee80211_local *local, unsigned int changed_flags, @@ -466,7 +498,7 @@ TRACE_EVENT(drv_set_tim, TP_printk( LOCAL_PR_FMT STA_PR_FMT " set:%d", - LOCAL_PR_ARG, STA_PR_FMT, __entry->set + LOCAL_PR_ARG, STA_PR_ARG, __entry->set ) ); @@ -927,23 +959,26 @@ TRACE_EVENT(drv_get_survey, ); TRACE_EVENT(drv_flush, - TP_PROTO(struct ieee80211_local *local, bool drop), + TP_PROTO(struct ieee80211_local *local, + u32 queues, bool drop), - TP_ARGS(local, drop), + TP_ARGS(local, queues, drop), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, drop) + __field(u32, queues) ), TP_fast_assign( LOCAL_ASSIGN; __entry->drop = drop; + __entry->queues = queues; ), TP_printk( - LOCAL_PR_FMT " drop:%d", - LOCAL_PR_ARG, __entry->drop + LOCAL_PR_FMT " queues:0x%x drop:%d", + LOCAL_PR_ARG, __entry->queues, __entry->drop ) ); @@ -955,23 +990,23 @@ TRACE_EVENT(drv_channel_switch, TP_STRUCT__entry( LOCAL_ENTRY + CHANDEF_ENTRY __field(u64, timestamp) __field(bool, block_tx) - __field(u16, freq) __field(u8, count) ), TP_fast_assign( LOCAL_ASSIGN; + CHANDEF_ASSIGN(&ch_switch->chandef) __entry->timestamp = ch_switch->timestamp; __entry->block_tx = ch_switch->block_tx; - __entry->freq = ch_switch->channel->center_freq; __entry->count = ch_switch->count; ), TP_printk( - LOCAL_PR_FMT " new freq:%u count:%d", - LOCAL_PR_ARG, __entry->freq, __entry->count + LOCAL_PR_FMT " new " CHANDEF_PR_FMT " count:%d", + LOCAL_PR_ARG, CHANDEF_PR_ARG, __entry->count ) ); @@ -1029,15 +1064,17 @@ TRACE_EVENT(drv_remain_on_channel, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, - unsigned int duration), + unsigned int duration, + enum ieee80211_roc_type type), - TP_ARGS(local, sdata, chan, duration), + TP_ARGS(local, sdata, chan, duration, type), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(int, center_freq) __field(unsigned int, duration) + __field(u32, type) ), TP_fast_assign( @@ -1045,12 +1082,13 @@ TRACE_EVENT(drv_remain_on_channel, VIF_ASSIGN; __entry->center_freq = chan->center_freq; __entry->duration = duration; + __entry->type = type; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " freq:%dMHz duration:%dms", + LOCAL_PR_FMT VIF_PR_FMT " freq:%dMHz duration:%dms type=%d", LOCAL_PR_ARG, VIF_PR_ARG, - __entry->center_freq, __entry->duration + __entry->center_freq, __entry->duration, __entry->type ) ); @@ -1178,23 +1216,26 @@ TRACE_EVENT(drv_set_rekey_data, TRACE_EVENT(drv_rssi_callback, TP_PROTO(struct ieee80211_local 
*local, + struct ieee80211_sub_if_data *sdata, enum ieee80211_rssi_event rssi_event), - TP_ARGS(local, rssi_event), + TP_ARGS(local, sdata, rssi_event), TP_STRUCT__entry( LOCAL_ENTRY + VIF_ENTRY __field(u32, rssi_event) ), TP_fast_assign( LOCAL_ASSIGN; + VIF_ASSIGN; __entry->rssi_event = rssi_event; ), TP_printk( - LOCAL_PR_FMT " rssi_event:%d", - LOCAL_PR_ARG, __entry->rssi_event + LOCAL_PR_FMT VIF_PR_FMT " rssi_event:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->rssi_event ) ); @@ -1426,6 +1467,14 @@ DEFINE_EVENT(local_only_evt, drv_restart_complete, TP_ARGS(local) ); +#if IS_ENABLED(CONFIG_IPV6) +DEFINE_EVENT(local_sdata_evt, drv_ipv6_addr_change, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); +#endif + /* * Tracing for API calls that drivers call. */ @@ -1660,7 +1709,7 @@ TRACE_EVENT(api_sta_block_awake, TP_printk( LOCAL_PR_FMT STA_PR_FMT " block:%d", - LOCAL_PR_ARG, STA_PR_FMT, __entry->block + LOCAL_PR_ARG, STA_PR_ARG, __entry->block ) ); @@ -1758,7 +1807,7 @@ TRACE_EVENT(api_eosp, TP_printk( LOCAL_PR_FMT STA_PR_FMT, - LOCAL_PR_ARG, STA_PR_FMT + LOCAL_PR_ARG, STA_PR_ARG ) ); @@ -1815,6 +1864,48 @@ TRACE_EVENT(stop_queue, ) ); +TRACE_EVENT(drv_set_default_unicast_key, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + int key_idx), + + TP_ARGS(local, sdata, key_idx), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(int, key_idx) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->key_idx = key_idx; + ), + + TP_printk(LOCAL_PR_FMT VIF_PR_FMT " key_idx:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->key_idx) +); + +TRACE_EVENT(api_radar_detected, + TP_PROTO(struct ieee80211_local *local), + + TP_ARGS(local), + + TP_STRUCT__entry( + LOCAL_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT " radar detected", + LOCAL_PR_ARG + ) +); + #ifdef CONFIG_MAC80211_MESSAGE_TRACING #undef TRACE_SYSTEM #define TRACE_SYSTEM mac80211_msg diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 467c1d1b66f2..9972e07a2f96 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -48,15 +48,15 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); /* assume HW handles this */ - if (info->control.rates[0].flags & IEEE80211_TX_RC_MCS) + if (tx->rate.flags & IEEE80211_TX_RC_MCS) return 0; /* uh huh? 
*/ - if (WARN_ON_ONCE(info->control.rates[0].idx < 0)) + if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; sband = local->hw.wiphy->bands[info->band]; - txrate = &sband->bitrates[info->control.rates[0].idx]; + txrate = &sband->bitrates[tx->rate.idx]; erp = txrate->flags & IEEE80211_RATE_ERP_G; @@ -233,6 +233,7 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) if (local->hw.conf.flags & IEEE80211_CONF_PS) { ieee80211_stop_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_PS); ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; ieee80211_queue_work(&local->hw, @@ -329,6 +330,8 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->u.ap.ps; + else if (ieee80211_vif_is_mesh(&sdata->vif)) + ps = &sdata->u.mesh.ps; else continue; @@ -372,18 +375,20 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) /* * broadcast/multicast frame * - * If any of the associated stations is in power save mode, + * If any of the associated/peer stations is in power save mode, * the frame is buffered to be sent after DTIM beacon frame. * This is done either by the hardware or us. */ - /* powersaving STAs currently only in AP/VLAN mode */ + /* powersaving STAs currently only in AP/VLAN/mesh mode */ if (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!tx->sdata->bss) return TX_CONTINUE; ps = &tx->sdata->bss->ps; + } else if (ieee80211_vif_is_mesh(&tx->sdata->vif)) { + ps = &tx->sdata->u.mesh.ps; } else { return TX_CONTINUE; } @@ -594,7 +599,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) break; } - if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED)) + if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED && + !ieee80211_is_deauth(hdr->frame_control))) return TX_DROP; if (!skip_hw && tx->key && @@ -611,11 +617,9 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (void *)tx->skb->data; struct ieee80211_supported_band *sband; - struct ieee80211_rate *rate; - int i; u32 len; - bool inval = false, rts = false, short_preamble = false; struct ieee80211_tx_rate_control txrc; + struct ieee80211_sta_rates *ratetbl = NULL; bool assoc = false; memset(&txrc, 0, sizeof(txrc)); @@ -636,18 +640,23 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) txrc.max_rate_idx = -1; else txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1; - memcpy(txrc.rate_idx_mcs_mask, - tx->sdata->rc_rateidx_mcs_mask[info->band], - sizeof(txrc.rate_idx_mcs_mask)); + + if (tx->sdata->rc_has_mcs_mask[info->band]) + txrc.rate_idx_mcs_mask = + tx->sdata->rc_rateidx_mcs_mask[info->band]; + txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT || tx->sdata->vif.type == NL80211_IFTYPE_ADHOC); /* set up RTS protection if desired */ if (len > tx->local->hw.wiphy->rts_threshold) { - txrc.rts = rts = true; + txrc.rts = true; } + info->control.use_rts = txrc.rts; + info->control.use_cts_prot = tx->sdata->vif.bss_conf.use_cts_prot; + /* * Use short preamble if the BSS can handle it, but not for * management frames unless we know the receiver can handle @@ -657,7 +666,9 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) if (tx->sdata->vif.bss_conf.use_short_preamble && (ieee80211_is_data(hdr->frame_control) || (tx->sta && test_sta_flag(tx->sta, WLAN_STA_SHORT_PREAMBLE)))) - txrc.short_preamble = short_preamble = true; + 
txrc.short_preamble = true; + + info->control.short_preamble = txrc.short_preamble; if (tx->sta) assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC); @@ -681,16 +692,38 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) */ rate_control_get_rate(tx->sdata, tx->sta, &txrc); - if (unlikely(info->control.rates[0].idx < 0)) - return TX_DROP; + if (tx->sta && !info->control.skip_table) + ratetbl = rcu_dereference(tx->sta->sta.rates); + + if (unlikely(info->control.rates[0].idx < 0)) { + if (ratetbl) { + struct ieee80211_tx_rate rate = { + .idx = ratetbl->rate[0].idx, + .flags = ratetbl->rate[0].flags, + .count = ratetbl->rate[0].count + }; + + if (ratetbl->rate[0].idx < 0) + return TX_DROP; + + tx->rate = rate; + } else { + return TX_DROP; + } + } else { + tx->rate = info->control.rates[0]; + } if (txrc.reported_rate.idx < 0) { - txrc.reported_rate = info->control.rates[0]; + txrc.reported_rate = tx->rate; if (tx->sta && ieee80211_is_data(hdr->frame_control)) tx->sta->last_tx_rate = txrc.reported_rate; } else if (tx->sta) tx->sta->last_tx_rate = txrc.reported_rate; + if (ratetbl) + return TX_CONTINUE; + if (unlikely(!info->control.rates[0].count)) info->control.rates[0].count = 1; @@ -698,91 +731,6 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) (info->flags & IEEE80211_TX_CTL_NO_ACK))) info->control.rates[0].count = 1; - if (is_multicast_ether_addr(hdr->addr1)) { - /* - * XXX: verify the rate is in the basic rateset - */ - return TX_CONTINUE; - } - - /* - * set up the RTS/CTS rate as the fastest basic rate - * that is not faster than the data rate - * - * XXX: Should this check all retry rates? - */ - if (!(info->control.rates[0].flags & IEEE80211_TX_RC_MCS)) { - s8 baserate = 0; - - rate = &sband->bitrates[info->control.rates[0].idx]; - - for (i = 0; i < sband->n_bitrates; i++) { - /* must be a basic rate */ - if (!(tx->sdata->vif.bss_conf.basic_rates & BIT(i))) - continue; - /* must not be faster than the data rate */ - if (sband->bitrates[i].bitrate > rate->bitrate) - continue; - /* maximum */ - if (sband->bitrates[baserate].bitrate < - sband->bitrates[i].bitrate) - baserate = i; - } - - info->control.rts_cts_rate_idx = baserate; - } - - for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { - /* - * make sure there's no valid rate following - * an invalid one, just in case drivers don't - * take the API seriously to stop at -1. - */ - if (inval) { - info->control.rates[i].idx = -1; - continue; - } - if (info->control.rates[i].idx < 0) { - inval = true; - continue; - } - - /* - * For now assume MCS is already set up correctly, this - * needs to be fixed. 
- */ - if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS) { - WARN_ON(info->control.rates[i].idx > 76); - continue; - } - - /* set up RTS protection if desired */ - if (rts) - info->control.rates[i].flags |= - IEEE80211_TX_RC_USE_RTS_CTS; - - /* RC is busted */ - if (WARN_ON_ONCE(info->control.rates[i].idx >= - sband->n_bitrates)) { - info->control.rates[i].idx = -1; - continue; - } - - rate = &sband->bitrates[info->control.rates[i].idx]; - - /* set up short preamble */ - if (short_preamble && - rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) - info->control.rates[i].flags |= - IEEE80211_TX_RC_USE_SHORT_PREAMBLE; - - /* set up G protection */ - if (!rts && tx->sdata->vif.bss_conf.use_cts_prot && - rate->flags & IEEE80211_RATE_ERP_G) - info->control.rates[i].flags |= - IEEE80211_TX_RC_USE_CTS_PROTECT; - } - return TX_CONTINUE; } @@ -986,15 +934,18 @@ static ieee80211_tx_result debug_noinline ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) { struct sk_buff *skb; + int ac = -1; if (!tx->sta) return TX_CONTINUE; - tx->sta->tx_packets++; skb_queue_walk(&tx->skbs, skb) { + ac = skb_get_queue_mapping(skb); tx->sta->tx_fragments++; - tx->sta->tx_bytes += skb->len; + tx->sta->tx_bytes[ac] += skb->len; } + if (ac >= 0) + tx->sta->tx_packets[ac]++; return TX_CONTINUE; } @@ -1225,20 +1176,41 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { - /* - * Since queue is stopped, queue up frames for later - * transmission from the tx-pending tasklet when the - * queue is woken again. - */ - if (txpending) - skb_queue_splice_init(skbs, &local->pending[q]); - else - skb_queue_splice_tail_init(skbs, - &local->pending[q]); + if (unlikely(info->flags & + IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) { + if (local->queue_stop_reasons[q] & + ~BIT(IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL)) { + /* + * Drop off-channel frames if queues + * are stopped for any reason other + * than off-channel operation. Never + * queue them. + */ + spin_unlock_irqrestore( + &local->queue_stop_reason_lock, + flags); + ieee80211_purge_tx_queue(&local->hw, + skbs); + return true; + } + } else { - spin_unlock_irqrestore(&local->queue_stop_reason_lock, - flags); - return false; + /* + * Since queue is stopped, queue up frames for + * later transmission from the tx-pending + * tasklet when the queue is woken again. 
+ */ + if (txpending) + skb_queue_splice_init(skbs, + &local->pending[q]); + else + skb_queue_splice_tail_init(skbs, + &local->pending[q]); + + spin_unlock_irqrestore(&local->queue_stop_reason_lock, + flags); + return false; + } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); @@ -1472,12 +1444,14 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, hdr = (struct ieee80211_hdr *) skb->data; info->control.vif = &sdata->vif; - if (ieee80211_vif_is_mesh(&sdata->vif) && - ieee80211_is_data(hdr->frame_control) && - !is_multicast_ether_addr(hdr->addr1) && - mesh_nexthop_resolve(skb, sdata)) { - /* skb queued: don't free */ - return; + if (ieee80211_vif_is_mesh(&sdata->vif)) { + if (ieee80211_is_data(hdr->frame_control) && + is_unicast_ether_addr(hdr->addr1)) { + if (mesh_nexthop_resolve(sdata, skb)) + return; /* skb queued: don't free */ + } else { + ieee80211_mps_set_frame_flags(sdata, NULL, hdr); + } } ieee80211_set_qos_hdr(sdata, skb); @@ -1677,7 +1651,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (chanctx_conf) chan = chanctx_conf->def.chan; else if (!local->use_chanctx) - chan = local->_oper_channel; + chan = local->_oper_chandef.chan; else goto fail_rcu; @@ -1787,16 +1761,16 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, break; /* fall through */ case NL80211_IFTYPE_AP: + if (sdata->vif.type == NL80211_IFTYPE_AP) + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (!chanctx_conf) + goto fail_rcu; fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 24; - if (sdata->vif.type == NL80211_IFTYPE_AP) - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (!chanctx_conf) - goto fail_rcu; band = chanctx_conf->def.chan->band; break; case NL80211_IFTYPE_WDS: @@ -1811,7 +1785,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, * This is the exception! WDS style interfaces are prohibited * when channel contexts are in used so this must be valid */ - band = local->hw.conf.channel->band; + band = local->hw.conf.chandef.chan->band; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: @@ -1822,9 +1796,24 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, } if (!is_multicast_ether_addr(skb->data)) { - mpath = mesh_path_lookup(skb->data, sdata); - if (!mpath) - mppath = mpp_path_lookup(skb->data, sdata); + struct sta_info *next_hop; + bool mpp_lookup = true; + + mpath = mesh_path_lookup(sdata, skb->data); + if (mpath) { + mpp_lookup = false; + next_hop = rcu_dereference(mpath->next_hop); + if (!next_hop || + !(mpath->flags & (MESH_PATH_ACTIVE | + MESH_PATH_RESOLVING))) + mpp_lookup = true; + } + + if (mpp_lookup) + mppath = mpp_path_lookup(sdata, skb->data); + + if (mppath && mpath) + mesh_path_del(mpath->sdata, mpath->dst); } /* @@ -1837,8 +1826,8 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, !(mppath && !ether_addr_equal(mppath->mpp, skb->data))) { hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, skb->data, skb->data + ETH_ALEN); - meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, - sdata, NULL, NULL); + meshhdrlen = ieee80211_new_mesh_header(sdata, &mesh_hdr, + NULL, NULL); } else { /* DS -> MBSS (802.11-2012 13.11.3.3). 
* For unicast with unknown forwarding information, @@ -1857,18 +1846,14 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, mesh_da, sdata->vif.addr); if (is_multicast_ether_addr(mesh_da)) /* DA TA mSA AE:SA */ - meshhdrlen = - ieee80211_new_mesh_header(&mesh_hdr, - sdata, - skb->data + ETH_ALEN, - NULL); + meshhdrlen = ieee80211_new_mesh_header( + sdata, &mesh_hdr, + skb->data + ETH_ALEN, NULL); else /* RA TA mDA mSA AE:DA SA */ - meshhdrlen = - ieee80211_new_mesh_header(&mesh_hdr, - sdata, - skb->data, - skb->data + ETH_ALEN); + meshhdrlen = ieee80211_new_mesh_header( + sdata, &mesh_hdr, skb->data, + skb->data + ETH_ALEN); } chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); @@ -1999,24 +1984,14 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, skb = skb_clone(skb, GFP_ATOMIC); if (skb) { unsigned long flags; - int id, r; + int id; spin_lock_irqsave(&local->ack_status_lock, flags); - r = idr_get_new_above(&local->ack_status_frames, - orig_skb, 1, &id); - if (r == -EAGAIN) { - idr_pre_get(&local->ack_status_frames, - GFP_ATOMIC); - r = idr_get_new_above(&local->ack_status_frames, - orig_skb, 1, &id); - } - if (WARN_ON(!id) || id > 0xffff) { - idr_remove(&local->ack_status_frames, id); - r = -ERANGE; - } + id = idr_alloc(&local->ack_status_frames, orig_skb, + 1, 0x10000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); - if (!r) { + if (id >= 0) { info_id = id; info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; } else if (skb_shared(skb)) { @@ -2056,7 +2031,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, encaps_data = bridge_tunnel_header; encaps_len = sizeof(bridge_tunnel_header); skip_header_bytes -= 2; - } else if (ethertype >= 0x600) { + } else if (ethertype >= ETH_P_802_3_MIN) { encaps_data = rfc1042_header; encaps_len = sizeof(rfc1042_header); skip_header_bytes -= 2; @@ -2264,9 +2239,8 @@ void ieee80211_tx_pending(unsigned long data) /* functions for drivers to get certain frames */ -static void ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, - struct ps_data *ps, - struct sk_buff *skb) +static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, + struct ps_data *ps, struct sk_buff *skb) { u8 *pos, *tim; int aid0 = 0; @@ -2328,6 +2302,29 @@ static void ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, } } +static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, + struct ps_data *ps, struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + + /* + * Not very nice, but we want to allow the driver to call + * ieee80211_beacon_get() as a response to the set_tim() + * callback. That, however, is already invoked under the + * sta_lock to guarantee consistent and race-free update + * of the tim bitmap in mac80211 and the driver. + */ + if (local->tim_in_locked_section) { + __ieee80211_beacon_add_tim(sdata, ps, skb); + } else { + spin_lock_bh(&local->tim_lock); + __ieee80211_beacon_add_tim(sdata, ps, skb); + spin_unlock_bh(&local->tim_lock); + } + + return 0; +} + struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length) @@ -2372,22 +2369,7 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, memcpy(skb_put(skb, beacon->head_len), beacon->head, beacon->head_len); - /* - * Not very nice, but we want to allow the driver to call - * ieee80211_beacon_get() as a response to the set_tim() - * callback. 
That, however, is already invoked under the - * sta_lock to guarantee consistent and race-free update - * of the tim bitmap in mac80211 and the driver. - */ - if (local->tim_in_locked_section) { - ieee80211_beacon_add_tim(sdata, &ap->ps, skb); - } else { - unsigned long flags; - - spin_lock_irqsave(&local->tim_lock, flags); - ieee80211_beacon_add_tim(sdata, &ap->ps, skb); - spin_unlock_irqrestore(&local->tim_lock, flags); - } + ieee80211_beacon_add_tim(sdata, &ap->ps, skb); if (tim_offset) *tim_offset = beacon->head_len; @@ -2402,79 +2384,42 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; - struct sk_buff *presp = rcu_dereference(ifibss->presp); + struct beacon_data *presp = rcu_dereference(ifibss->presp); if (!presp) goto out; - skb = skb_copy(presp, GFP_ATOMIC); + skb = dev_alloc_skb(local->tx_headroom + presp->head_len); if (!skb) goto out; + skb_reserve(skb, local->tx_headroom); + memcpy(skb_put(skb, presp->head_len), presp->head, + presp->head_len); hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); } else if (ieee80211_vif_is_mesh(&sdata->vif)) { - struct ieee80211_mgmt *mgmt; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u8 *pos; - int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) + - sizeof(mgmt->u.beacon); + struct beacon_data *bcn = rcu_dereference(ifmsh->beacon); -#ifdef CONFIG_MAC80211_MESH - if (!sdata->u.mesh.mesh_id_len) + if (!bcn) goto out; -#endif if (ifmsh->sync_ops) ifmsh->sync_ops->adjust_tbtt( sdata); skb = dev_alloc_skb(local->tx_headroom + - hdr_len + - 2 + /* NULL SSID */ - 2 + 8 + /* supported rates */ - 2 + 3 + /* DS params */ - 2 + (IEEE80211_MAX_SUPP_RATES - 8) + - 2 + sizeof(struct ieee80211_ht_cap) + - 2 + sizeof(struct ieee80211_ht_operation) + - 2 + sdata->u.mesh.mesh_id_len + - 2 + sizeof(struct ieee80211_meshconf_ie) + - sdata->u.mesh.ie_len); + bcn->head_len + + 256 + /* TIM IE */ + bcn->tail_len); if (!skb) goto out; - - skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len); - memset(mgmt, 0, hdr_len); - mgmt->frame_control = - cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); - eth_broadcast_addr(mgmt->da); - memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); - memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); - mgmt->u.beacon.beacon_int = - cpu_to_le16(sdata->vif.bss_conf.beacon_int); - mgmt->u.beacon.capab_info |= cpu_to_le16( - sdata->u.mesh.security ? 
WLAN_CAPABILITY_PRIVACY : 0); - - pos = skb_put(skb, 2); - *pos++ = WLAN_EID_SSID; - *pos++ = 0x0; - - band = chanctx_conf->def.chan->band; - - if (ieee80211_add_srates_ie(sdata, skb, true, band) || - mesh_add_ds_params_ie(skb, sdata) || - ieee80211_add_ext_srates_ie(sdata, skb, true, band) || - mesh_add_rsn_ie(skb, sdata) || - mesh_add_ht_cap_ie(skb, sdata) || - mesh_add_ht_oper_ie(skb, sdata) || - mesh_add_meshid_ie(skb, sdata) || - mesh_add_meshconf_ie(skb, sdata) || - mesh_add_vendor_ies(skb, sdata)) { - pr_err("o11s: couldn't add ies!\n"); - goto out; - } + skb_reserve(skb, local->tx_headroom); + memcpy(skb_put(skb, bcn->head_len), bcn->head, bcn->head_len); + ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb); + memcpy(skb_put(skb, bcn->tail_len), bcn->tail, bcn->tail_len); } else { WARN_ON(1); goto out; @@ -2499,8 +2444,6 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, txrc.max_rate_idx = -1; else txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1; - memcpy(txrc.rate_idx_mcs_mask, sdata->rc_rateidx_mcs_mask[band], - sizeof(txrc.rate_idx_mcs_mask)); txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); @@ -2724,6 +2667,8 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw, goto out; ps = &sdata->u.ap.ps; + } else if (ieee80211_vif_is_mesh(&sdata->vif)) { + ps = &sdata->u.mesh.ps; } else { goto out; } @@ -2747,6 +2692,8 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw, cpu_to_le16(IEEE80211_FCTL_MOREDATA); } + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); if (!ieee80211_tx_prepare(sdata, &tx, skb)) break; dev_kfree_skb_any(skb); @@ -2779,6 +2726,8 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, skb_set_queue_mapping(skb, ac); skb->priority = tid; + skb->dev = sdata->dev; + /* * The other path calling ieee80211_xmit is from the tasklet, * and while we can handle concurrent transmissions locking diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f11e8c540db4..3f87fa468b1f 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -453,7 +453,8 @@ void ieee80211_add_pending_skbs_fn(struct ieee80211_local *local, } void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, - enum queue_stop_reason reason) + unsigned long queues, + enum queue_stop_reason reason) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; @@ -461,7 +462,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - for (i = 0; i < hw->queues; i++) + for_each_set_bit(i, &queues, hw->queues) __ieee80211_stop_queue(hw, i, reason); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); @@ -469,7 +470,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, void ieee80211_stop_queues(struct ieee80211_hw *hw) { - ieee80211_stop_queues_by_reason(hw, + ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER); } EXPORT_SYMBOL(ieee80211_stop_queues); @@ -484,13 +485,15 @@ int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) return true; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - ret = !!local->queue_stop_reasons[queue]; + ret = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER, + &local->queue_stop_reasons[queue]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return ret; } EXPORT_SYMBOL(ieee80211_queue_stopped); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, + unsigned long queues, enum queue_stop_reason reason) { 
struct ieee80211_local *local = hw_to_local(hw); @@ -499,7 +502,7 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - for (i = 0; i < hw->queues; i++) + for_each_set_bit(i, &queues, hw->queues) __ieee80211_wake_queue(hw, i, reason); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); @@ -507,10 +510,42 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, void ieee80211_wake_queues(struct ieee80211_hw *hw) { - ieee80211_wake_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_DRIVER); + ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_DRIVER); } EXPORT_SYMBOL(ieee80211_wake_queues); +void ieee80211_flush_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + u32 queues; + + if (!local->ops->flush) + return; + + if (sdata && local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { + int ac; + + queues = 0; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + queues |= BIT(sdata->vif.hw_queue[ac]); + if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE) + queues |= BIT(sdata->vif.cab_queue); + } else { + /* all queues */ + queues = BIT(local->hw.queues) - 1; + } + + ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_FLUSH); + + drv_flush(local, queues, false); + + ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_FLUSH); +} + void ieee80211_iterate_active_interfaces( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, @@ -626,7 +661,7 @@ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_queue_delayed_work); -u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, +u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, bool action, struct ieee802_11_elems *elems, u64 filter, u32 crc) { @@ -634,6 +669,7 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, u8 *pos = start; bool calc_crc = filter != 0; DECLARE_BITMAP(seen_elems, 256); + const u8 *ie; bitmap_zero(seen_elems, 256); memset(elems, 0, sizeof(*elems)); @@ -681,6 +717,12 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, case WLAN_EID_COUNTRY: case WLAN_EID_PWR_CONSTRAINT: case WLAN_EID_TIMEOUT_INTERVAL: + case WLAN_EID_SECONDARY_CHANNEL_OFFSET: + case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: + /* + * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible + * that if the content gets bigger it might be needed more than once + */ if (test_bit(id, seen_elems)) { elems->parse_error = true; left -= elen; @@ -704,17 +746,11 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, elems->supp_rates = pos; elems->supp_rates_len = elen; break; - case WLAN_EID_FH_PARAMS: - elems->fh_params = pos; - elems->fh_params_len = elen; - break; case WLAN_EID_DS_PARAMS: - elems->ds_params = pos; - elems->ds_params_len = elen; - break; - case WLAN_EID_CF_PARAMS: - elems->cf_params = pos; - elems->cf_params_len = elen; + if (elen >= 1) + elems->ds_params = pos; + else + elem_parse_failed = true; break; case WLAN_EID_TIM: if (elen >= sizeof(struct ieee80211_tim_ie)) { @@ -723,10 +759,6 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, } else elem_parse_failed = true; break; - case WLAN_EID_IBSS_PARAMS: - elems->ibss_params = pos; - elems->ibss_params_len = elen; - break; case WLAN_EID_CHALLENGE: elems->challenge = pos; elems->challenge_len = elen; @@ -739,11 +771,7 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, if (calc_crc) crc = 
crc32_be(crc, pos - 2, elen + 2); - if (pos[3] == 1) { - /* OUI Type 1 - WPA IE */ - elems->wpa = pos; - elems->wpa_len = elen; - } else if (elen >= 5 && pos[3] == 2) { + if (elen >= 5 && pos[3] == 2) { /* OUI Type 2 - WMM IE */ if (pos[4] == 0) { elems->wmm_info = pos; @@ -760,8 +788,10 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, elems->rsn_len = elen; break; case WLAN_EID_ERP_INFO: - elems->erp_info = pos; - elems->erp_info_len = elen; + if (elen >= 1) + elems->erp_info = pos; + else + elem_parse_failed = true; break; case WLAN_EID_EXT_SUPP_RATES: elems->ext_supp_rates = pos; @@ -791,6 +821,12 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, else elem_parse_failed = true; break; + case WLAN_EID_OPMODE_NOTIF: + if (elen > 0) + elems->opmode_notif = pos; + else + elem_parse_failed = true; + break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; elems->mesh_id_len = elen; @@ -805,6 +841,10 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, elems->peering = pos; elems->peering_len = elen; break; + case WLAN_EID_MESH_AWAKE_WINDOW: + if (elen >= 2) + elems->awake_window = (void *)pos; + break; case WLAN_EID_PREQ: elems->preq = pos; elems->preq_len = elen; @@ -830,12 +870,47 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, } elems->ch_switch_ie = (void *)pos; break; - case WLAN_EID_QUIET: - if (!elems->quiet_elem) { - elems->quiet_elem = pos; - elems->quiet_elem_len = elen; + case WLAN_EID_EXT_CHANSWITCH_ANN: + if (elen != sizeof(struct ieee80211_ext_chansw_ie)) { + elem_parse_failed = true; + break; + } + elems->ext_chansw_ie = (void *)pos; + break; + case WLAN_EID_SECONDARY_CHANNEL_OFFSET: + if (elen != sizeof(struct ieee80211_sec_chan_offs_ie)) { + elem_parse_failed = true; + break; + } + elems->sec_chan_offs = (void *)pos; + break; + case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: + if (!action || + elen != sizeof(*elems->wide_bw_chansw_ie)) { + elem_parse_failed = true; + break; + } + elems->wide_bw_chansw_ie = (void *)pos; + break; + case WLAN_EID_CHANNEL_SWITCH_WRAPPER: + if (action) { + elem_parse_failed = true; + break; + } + /* + * This is a bit tricky, but as we only care about + * the wide bandwidth channel switch element, + * just parse it out manually.
+ */ + ie = cfg80211_find_ie(WLAN_EID_WIDE_BW_CHANNEL_SWITCH, + pos, elen); + if (ie) { + if (ie[1] == sizeof(*elems->wide_bw_chansw_ie)) + elems->wide_bw_chansw_ie = + (void *)(ie + 2); + else + elem_parse_failed = true; } - elems->num_of_quiet_elem++; break; case WLAN_EID_COUNTRY: elems->country_elem = pos; @@ -849,8 +924,10 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, elems->pwr_constr_elem = pos; break; case WLAN_EID_TIMEOUT_INTERVAL: - elems->timeout_int = pos; - elems->timeout_int_len = elen; + if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) + elems->timeout_int = (void *)pos; + else + elem_parse_failed = true; break; default: break; @@ -871,12 +948,6 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, return crc; } -void ieee802_11_parse_elems(u8 *start, size_t len, - struct ieee802_11_elems *elems) -{ - ieee802_11_parse_elems_crc(start, len, elems, 0, 0); -} - void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify) { @@ -1029,8 +1100,9 @@ u32 ieee80211_mandatory_rates(struct ieee80211_local *local, void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, - u8 *extra, size_t extra_len, const u8 *da, - const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx) + const u8 *extra, size_t extra_len, const u8 *da, + const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, + u32 tx_flags) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; @@ -1063,7 +1135,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, WARN_ON(err); } - IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | + tx_flags; ieee80211_tx_skb(sdata, skb); } @@ -1277,7 +1350,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, bool no_cck, + u32 ratemask, bool directed, u32 tx_flags, struct ieee80211_channel *channel, bool scan) { struct sk_buff *skb; @@ -1286,9 +1359,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, ssid, ssid_len, ie, ie_len, directed); if (skb) { - if (no_cck) - IEEE80211_SKB_CB(skb)->flags |= - IEEE80211_TX_CTL_NO_CCK_RATE; + IEEE80211_SKB_CB(skb)->flags |= tx_flags; if (scan) ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); else @@ -1351,6 +1422,25 @@ void ieee80211_stop_device(struct ieee80211_local *local) drv_stop(local); } +static void ieee80211_assign_chanctx(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; + + if (!local->use_chanctx) + return; + + mutex_lock(&local->chanctx_mtx); + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (conf) { + ctx = container_of(conf, struct ieee80211_chanctx, conf); + drv_assign_vif_chanctx(local, sdata, ctx); + } + mutex_unlock(&local->chanctx_mtx); +} + int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; @@ -1358,6 +1448,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) struct ieee80211_chanctx *ctx; struct sta_info *sta; int res, i; + bool reconfig_due_to_wowlan = false; #ifdef CONFIG_PM if (local->suspended) @@ -1377,6 +1468,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) * res is 1, which means the driver 
requested * to go through a regular reset on wakeup. */ + reconfig_due_to_wowlan = true; } #endif /* everything else happens only if HW was up & running */ @@ -1413,6 +1505,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* add interfaces */ sdata = rtnl_dereference(local->monitor_sdata); if (sdata) { + /* in HW restart it exists already */ + WARN_ON(local->resuming); res = drv_add_interface(local, sdata); if (WARN_ON(res)) { rcu_assign_pointer(local->monitor_sdata, NULL); @@ -1437,36 +1531,14 @@ int ieee80211_reconfig(struct ieee80211_local *local) } list_for_each_entry(sdata, &local->interfaces, list) { - struct ieee80211_chanctx_conf *ctx_conf; - if (!ieee80211_sdata_running(sdata)) continue; - - mutex_lock(&local->chanctx_mtx); - ctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (ctx_conf) { - ctx = container_of(ctx_conf, struct ieee80211_chanctx, - conf); - drv_assign_vif_chanctx(local, sdata, ctx); - } - mutex_unlock(&local->chanctx_mtx); + ieee80211_assign_chanctx(local, sdata); } sdata = rtnl_dereference(local->monitor_sdata); - if (sdata && local->use_chanctx && ieee80211_sdata_running(sdata)) { - struct ieee80211_chanctx_conf *ctx_conf; - - mutex_lock(&local->chanctx_mtx); - ctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (ctx_conf) { - ctx = container_of(ctx_conf, struct ieee80211_chanctx, - conf); - drv_assign_vif_chanctx(local, sdata, ctx); - } - mutex_unlock(&local->chanctx_mtx); - } + if (sdata && ieee80211_sdata_running(sdata)) + ieee80211_assign_chanctx(local, sdata); /* add STAs back */ mutex_lock(&local->sta_mtx); @@ -1531,6 +1603,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) changed |= BSS_CHANGED_ASSOC | BSS_CHANGED_ARP_FILTER | BSS_CHANGED_PS; + + if (sdata->u.mgd.dtim_period) + changed |= BSS_CHANGED_DTIM_PERIOD; + mutex_lock(&sdata->u.mgd.mtx); ieee80211_bss_info_change_notify(sdata, changed); mutex_unlock(&sdata->u.mgd.mtx); @@ -1550,9 +1626,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* fall through */ case NL80211_IFTYPE_MESH_POINT: - changed |= BSS_CHANGED_BEACON | - BSS_CHANGED_BEACON_ENABLED; - ieee80211_bss_info_change_notify(sdata, changed); + if (sdata->vif.bss_conf.enable_beacon) { + changed |= BSS_CHANGED_BEACON | + BSS_CHANGED_BEACON_ENABLED; + ieee80211_bss_info_change_notify(sdata, changed); + } break; case NL80211_IFTYPE_WDS: break; @@ -1618,6 +1696,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) local->in_reconfig = false; barrier(); + if (local->monitors == local->open_count && local->monitors > 0) + ieee80211_add_virtual_monitor(local); + /* * Clear the WLAN_STA_BLOCK_BA flag so new aggregation * sessions can be established after a resume. @@ -1632,24 +1713,26 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { - ieee80211_sta_tear_down_BA_sessions(sta, true); + ieee80211_sta_tear_down_BA_sessions( + sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } mutex_unlock(&local->sta_mtx); } - ieee80211_wake_queues_by_reason(hw, - IEEE80211_QUEUE_STOP_REASON_SUSPEND); + ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_SUSPEND); /* * If this is for hw restart things are still running. * We may want to change that later, however. 
*/ - if (!local->suspended) { + if (!local->suspended || reconfig_due_to_wowlan) drv_restart_complete(local); + + if (!local->suspended) return 0; - } #ifdef CONFIG_PM /* first set suspended false, then resuming */ @@ -1657,28 +1740,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mb(); local->resuming = false; - list_for_each_entry(sdata, &local->interfaces, list) { - switch(sdata->vif.type) { - case NL80211_IFTYPE_STATION: - ieee80211_sta_restart(sdata); - break; - case NL80211_IFTYPE_ADHOC: - ieee80211_ibss_restart(sdata); - break; - case NL80211_IFTYPE_MESH_POINT: - ieee80211_mesh_restart(sdata); - break; - default: - break; - } - } - mod_timer(&local->sta_cleanup, jiffies + 1); - - mutex_lock(&local->sta_mtx); - list_for_each_entry(sta, &local->sta_list, list) - mesh_plink_restart(sta); - mutex_unlock(&local->sta_mtx); #else WARN_ON(1); #endif @@ -1864,7 +1926,7 @@ u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, } u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, - u32 cap) + u32 cap) { __le32 tmp; @@ -1926,7 +1988,7 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, } void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, - struct ieee80211_ht_operation *ht_oper, + const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; @@ -2030,7 +2092,7 @@ int ieee80211_ave_rssi(struct ieee80211_vif *vif) /* non-managed type interfaces */ return 0; } - return ifmgd->ave_beacon_signal; + return ifmgd->ave_beacon_signal / 16; } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); @@ -2114,3 +2176,48 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, return ts; } + +void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + cancel_delayed_work_sync(&sdata->dfs_cac_timer_work); + + if (sdata->wdev.cac_started) { + ieee80211_vif_release_channel(sdata); + cfg80211_cac_event(sdata->dev, + NL80211_RADAR_CAC_ABORTED, + GFP_KERNEL); + } + } + mutex_unlock(&local->iflist_mtx); +} + +void ieee80211_dfs_radar_detected_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, radar_detected_work); + struct cfg80211_chan_def chandef; + + ieee80211_dfs_cac_cancel(local); + + if (local->use_chanctx) + /* currently not handled */ + WARN_ON(1); + else { + chandef = local->hw.conf.chandef; + cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); + } +} + +void ieee80211_radar_detected(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_radar_detected(local); + + ieee80211_queue_work(hw, &local->radar_detected_work); +} +EXPORT_SYMBOL(ieee80211_radar_detected); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index f311388aeedf..171344d4eb7c 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -10,26 +10,396 @@ #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" +#include "rate.h" -void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, - struct ieee80211_supported_band *sband, - struct ieee80211_vht_cap *vht_cap_ie, - struct ieee80211_sta_vht_cap *vht_cap) +static void __check_vhtcap_disable(struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta_vht_cap *vht_cap, + u32 flag) { - if (WARN_ON_ONCE(!vht_cap)) + __le32 le_flag = 
cpu_to_le32(flag); + + if (sdata->u.mgd.vht_capa_mask.vht_cap_info & le_flag && + !(sdata->u.mgd.vht_capa.vht_cap_info & le_flag)) + vht_cap->cap &= ~flag; +} + +void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta_vht_cap *vht_cap) +{ + int i; + u16 rxmcs_mask, rxmcs_cap, rxmcs_n, txmcs_mask, txmcs_cap, txmcs_n; + + if (!vht_cap->vht_supported) return; + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return; + + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_RXLDPC); + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_SHORT_GI_80); + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_SHORT_GI_160); + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_TXSTBC); + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN); + __check_vhtcap_disable(sdata, vht_cap, + IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN); + + /* Allow user to decrease AMPDU length exponent */ + if (sdata->u.mgd.vht_capa_mask.vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK)) { + u32 cap, n; + + n = le32_to_cpu(sdata->u.mgd.vht_capa.vht_cap_info) & + IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; + n >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; + cap = vht_cap->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; + cap >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; + + if (n < cap) { + vht_cap->cap &= + ~IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; + vht_cap->cap |= + n << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; + } + } + + /* Allow the user to decrease MCSes */ + rxmcs_mask = + le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.rx_mcs_map); + rxmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.rx_mcs_map); + rxmcs_n &= rxmcs_mask; + rxmcs_cap = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); + + txmcs_mask = + le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.tx_mcs_map); + txmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.tx_mcs_map); + txmcs_n &= txmcs_mask; + txmcs_cap = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); + for (i = 0; i < 8; i++) { + u8 m, n, c; + + m = (rxmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + n = (rxmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + c = (rxmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || + n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { + rxmcs_cap &= ~(3 << 2*i); + rxmcs_cap |= (rxmcs_n & (3 << 2*i)); + } + + m = (txmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + n = (txmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + c = (txmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || + n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { + txmcs_cap &= ~(3 << 2*i); + txmcs_cap |= (txmcs_n & (3 << 2*i)); + } + } + vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(rxmcs_cap); + vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(txmcs_cap); +} + +void +ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_vht_cap *vht_cap_ie, + struct sta_info *sta) +{ + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + struct ieee80211_sta_vht_cap own_cap; + u32 cap_info, i; + memset(vht_cap, 0, sizeof(*vht_cap)); + if (!sta->sta.ht_cap.ht_supported) + return; + if 
(!vht_cap_ie || !sband->vht_cap.vht_supported) return; + /* A VHT STA must support 40 MHz */ + if (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) + return; + vht_cap->vht_supported = true; - vht_cap->cap = le32_to_cpu(vht_cap_ie->vht_cap_info); + own_cap = sband->vht_cap; + /* + * If user has specified capability overrides, take care + * of that if the station we're setting up is the AP that + * we advertised a restricted capability set to. Override + * our own capabilities and then use those below. + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + ieee80211_apply_vhtcap_overrides(sdata, &own_cap); + + /* take some capabilities as-is */ + cap_info = le32_to_cpu(vht_cap_ie->vht_cap_info); + vht_cap->cap = cap_info; + vht_cap->cap &= IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 | + IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 | + IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | + IEEE80211_VHT_CAP_RXLDPC | + IEEE80211_VHT_CAP_VHT_TXOP_PS | + IEEE80211_VHT_CAP_HTC_VHT | + IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK | + IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB | + IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB | + IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN | + IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN; + + /* and some based on our own capabilities */ + switch (own_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; + break; + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; + break; + default: + /* nothing */ + break; + } + + /* symmetric capabilities */ + vht_cap->cap |= cap_info & own_cap.cap & + (IEEE80211_VHT_CAP_SHORT_GI_80 | + IEEE80211_VHT_CAP_SHORT_GI_160); + + /* remaining ones */ + if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE) { + vht_cap->cap |= cap_info & + (IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | + IEEE80211_VHT_CAP_BEAMFORMER_ANTENNAS_MAX | + IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MAX); + } + + if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE) + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE; + + if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE) + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + + if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE; + + if (own_cap.cap & IEEE80211_VHT_CAP_TXSTBC) + vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_RXSTBC_MASK; + + if (own_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK) + vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_TXSTBC; /* Copy peer MCS info, the driver might need them. 
*/ memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, sizeof(struct ieee80211_vht_mcs_info)); + + /* but also restrict MCSes */ + for (i = 0; i < 8; i++) { + u16 own_rx, own_tx, peer_rx, peer_tx; + + own_rx = le16_to_cpu(own_cap.vht_mcs.rx_mcs_map); + own_rx = (own_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + own_tx = le16_to_cpu(own_cap.vht_mcs.tx_mcs_map); + own_tx = (own_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + peer_rx = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); + peer_rx = (peer_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + peer_tx = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); + peer_tx = (peer_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + if (peer_tx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { + if (own_rx == IEEE80211_VHT_MCS_NOT_SUPPORTED) + peer_tx = IEEE80211_VHT_MCS_NOT_SUPPORTED; + else if (own_rx < peer_tx) + peer_tx = own_rx; + } + + if (peer_rx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { + if (own_tx == IEEE80211_VHT_MCS_NOT_SUPPORTED) + peer_rx = IEEE80211_VHT_MCS_NOT_SUPPORTED; + else if (own_tx < peer_rx) + peer_rx = own_tx; + } + + vht_cap->vht_mcs.rx_mcs_map &= + ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); + vht_cap->vht_mcs.rx_mcs_map |= cpu_to_le16(peer_rx << i * 2); + + vht_cap->vht_mcs.tx_mcs_map &= + ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); + vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2); + } + + /* finally set up the bandwidth */ + switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + break; + default: + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + } + + sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); +} + +enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 cap = sta->sta.vht_cap.cap; + enum ieee80211_sta_rx_bandwidth bw; + + if (!sta->sta.vht_cap.vht_supported) { + bw = sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; + goto check_max; + } + + switch (sdata->vif.bss_conf.chandef.width) { + default: + WARN_ON_ONCE(1); + /* fall through */ + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + bw = sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; + break; + case NL80211_CHAN_WIDTH_160: + if ((cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) == + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) { + bw = IEEE80211_STA_RX_BW_160; + break; + } + /* fall through */ + case NL80211_CHAN_WIDTH_80P80: + if ((cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) == + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) { + bw = IEEE80211_STA_RX_BW_160; + break; + } + /* fall through */ + case NL80211_CHAN_WIDTH_80: + bw = IEEE80211_STA_RX_BW_80; + } + + check_max: + if (bw > sta->cur_max_bandwidth) + bw = sta->cur_max_bandwidth; + return bw; +} + +void ieee80211_sta_set_rx_nss(struct sta_info *sta) +{ + u8 ht_rx_nss = 0, vht_rx_nss = 0; + + /* if we received a notification already don't overwrite it */ + if (sta->sta.rx_nss) + return; + + if (sta->sta.ht_cap.ht_supported) { + if (sta->sta.ht_cap.mcs.rx_mask[0]) + ht_rx_nss++; + if (sta->sta.ht_cap.mcs.rx_mask[1]) + ht_rx_nss++; + if (sta->sta.ht_cap.mcs.rx_mask[2]) + ht_rx_nss++; + if (sta->sta.ht_cap.mcs.rx_mask[3]) + ht_rx_nss++; + /* FIXME: consider rx_highest? 
*/ + } + + if (sta->sta.vht_cap.vht_supported) { + int i; + u16 rx_mcs_map; + + rx_mcs_map = le16_to_cpu(sta->sta.vht_cap.vht_mcs.rx_mcs_map); + + for (i = 7; i >= 0; i--) { + u8 mcs = (rx_mcs_map >> (2 * i)) & 3; + + if (mcs != IEEE80211_VHT_MCS_NOT_SUPPORTED) { + vht_rx_nss = i + 1; + break; + } + } + /* FIXME: consider rx_highest? */ + } + + ht_rx_nss = max(ht_rx_nss, vht_rx_nss); + sta->sta.rx_nss = max_t(u8, 1, ht_rx_nss); +} + +void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 opmode, + enum ieee80211_band band, bool nss_only) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + enum ieee80211_sta_rx_bandwidth new_bw; + u32 changed = 0; + u8 nss; + + sband = local->hw.wiphy->bands[band]; + + /* ignore - no support for BF yet */ + if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) + return; + + nss = opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; + nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; + nss += 1; + + if (sta->sta.rx_nss != nss) { + sta->sta.rx_nss = nss; + changed |= IEEE80211_RC_NSS_CHANGED; + } + + if (nss_only) + goto change; + + switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { + case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; + break; + case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; + break; + case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + break; + case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + break; + } + + new_bw = ieee80211_sta_cur_vht_bw(sta); + if (new_bw != sta->sta.bandwidth) { + sta->sta.bandwidth = new_bw; + changed |= IEEE80211_RC_NSS_CHANGED; + } + + change: + if (changed) + rate_control_rate_update(local, sband, sta, changed); } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 906f00cd6d2f..afba19cb6f87 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -191,6 +191,15 @@ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, /* qos header is 2 bytes */ *p++ = ack_policy | tid; - *p = ieee80211_vif_is_mesh(&sdata->vif) ? 
- (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8) : 0; + if (ieee80211_vif_is_mesh(&sdata->vif)) { + /* preserve RSPI and Mesh PS Level bit */ + *p &= ((IEEE80211_QOS_CTL_RSPI | + IEEE80211_QOS_CTL_MESH_PS_LEVEL) >> 8); + + /* Nulls don't have a mesh header (frame body) */ + if (!ieee80211_is_qos_nullfunc(hdr->frame_control)) + *p |= (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8); + } else { + *p = 0; + } } diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index c175ee866ff4..c7c6d644486f 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -181,7 +181,6 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - unsigned long flags; unsigned int hdrlen; int len, tail; u8 *pos; @@ -216,12 +215,12 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) return 0; /* Increase IV for the frame */ - spin_lock_irqsave(&key->u.tkip.txlock, flags); + spin_lock(&key->u.tkip.txlock); key->u.tkip.tx.iv16++; if (key->u.tkip.tx.iv16 == 0) key->u.tkip.tx.iv32++; pos = ieee80211_tkip_add_iv(pos, key); - spin_unlock_irqrestore(&key->u.tkip.txlock, flags); + spin_unlock(&key->u.tkip.txlock); /* hwaccel - with software IV */ if (info->control.hw_key) diff --git a/net/mac802154/Kconfig b/net/mac802154/Kconfig index a967ddaa4e2f..b33dd76d4307 100644 --- a/net/mac802154/Kconfig +++ b/net/mac802154/Kconfig @@ -1,6 +1,6 @@ config MAC802154 tristate "Generic IEEE 802.15.4 Soft Networking Stack (mac802154)" - depends on IEEE802154 && EXPERIMENTAL + depends on IEEE802154 select CRC_CCITT ---help--- This option enables the hardware independent IEEE 802.15.4 diff --git a/net/mac802154/mac802154.h b/net/mac802154/mac802154.h index a4dcaf1dd4b6..d48422e27110 100644 --- a/net/mac802154/mac802154.h +++ b/net/mac802154/mac802154.h @@ -88,9 +88,7 @@ struct mac802154_sub_if_data { #define mac802154_to_priv(_hw) container_of(_hw, struct mac802154_priv, hw) -#define MAC802154_MAX_XMIT_ATTEMPTS 3 - -#define MAC802154_CHAN_NONE (~(u8)0) /* No channel is assigned */ +#define MAC802154_CHAN_NONE 0xff /* No channel is assigned */ extern struct ieee802154_reduced_mlme_ops mac802154_mlme_reduced; extern struct ieee802154_mlme_ops mac802154_mlme_wpan; @@ -114,5 +112,6 @@ void mac802154_dev_set_ieee_addr(struct net_device *dev); u16 mac802154_dev_get_pan_id(const struct net_device *dev); void mac802154_dev_set_pan_id(struct net_device *dev, u16 val); void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); +u8 mac802154_dev_get_dsn(const struct net_device *dev); #endif /* MAC802154_H */ diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index d8d277006089..a99910d4d52f 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -73,4 +73,5 @@ struct ieee802154_mlme_ops mac802154_mlme_wpan = { .start_req = mac802154_mlme_start_req, .get_pan_id = mac802154_dev_get_pan_id, .get_short_addr = mac802154_dev_get_short_addr, + .get_dsn = mac802154_dev_get_dsn, }; diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index f47781ab0ccc..8ded97cf1c33 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -159,6 +159,15 @@ void mac802154_dev_set_pan_id(struct net_device *dev, u16 val) } } +u8 mac802154_dev_get_dsn(const struct net_device *dev) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + return priv->dsn++; +} + static 
void phy_chan_notify(struct work_struct *work) { struct phy_chan_notify_work *nw = container_of(work, @@ -167,9 +176,15 @@ static void phy_chan_notify(struct work_struct *work) struct mac802154_sub_if_data *priv = netdev_priv(nw->dev); int res; + mutex_lock(&priv->hw->phy->pib_lock); res = hw->ops->set_channel(&hw->hw, priv->page, priv->chan); if (res) pr_debug("set_channel failed\n"); + else { + priv->hw->phy->current_channel = priv->chan; + priv->hw->phy->current_page = priv->page; + } + mutex_unlock(&priv->hw->phy->pib_lock); kfree(nw); } @@ -186,8 +201,11 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) priv->chan = chan; spin_unlock_bh(&priv->mib_lock); + mutex_lock(&priv->hw->phy->pib_lock); if (priv->hw->phy->current_channel != priv->chan || priv->hw->phy->current_page != priv->page) { + mutex_unlock(&priv->hw->phy->pib_lock); + work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; @@ -195,5 +213,6 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) INIT_WORK(&work->work, phy_chan_notify); work->dev = dev; queue_work(priv->hw->dev_workqueue, &work->work); - } + } else + mutex_unlock(&priv->hw->phy->pib_lock); } diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 4e09d070995a..6d1647399d4f 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -25,6 +25,7 @@ #include <linux/if_arp.h> #include <linux/crc-ccitt.h> +#include <net/ieee802154_netdev.h> #include <net/mac802154.h> #include <net/wpan-phy.h> @@ -39,12 +40,12 @@ struct xmit_work { struct mac802154_priv *priv; u8 chan; u8 page; - u8 xmit_attempts; }; static void mac802154_xmit_worker(struct work_struct *work) { struct xmit_work *xw = container_of(work, struct xmit_work, work); + struct mac802154_sub_if_data *sdata; int res; mutex_lock(&xw->priv->phy->pib_lock); @@ -57,21 +58,23 @@ static void mac802154_xmit_worker(struct work_struct *work) pr_debug("set_channel failed\n"); goto out; } + + xw->priv->phy->current_channel = xw->chan; + xw->priv->phy->current_page = xw->page; } res = xw->priv->ops->xmit(&xw->priv->hw, xw->skb); + if (res) + pr_debug("transmission failed\n"); out: mutex_unlock(&xw->priv->phy->pib_lock); - if (res) { - if (xw->xmit_attempts++ < MAC802154_MAX_XMIT_ATTEMPTS) { - queue_work(xw->priv->dev_workqueue, &xw->work); - return; - } else - pr_debug("transmission failed for %d times", - MAC802154_MAX_XMIT_ATTEMPTS); - } + /* Restart the netif queue on each sub_if_data object. */ + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &xw->priv->slaves, list) + netif_wake_queue(sdata->dev); + rcu_read_unlock(); dev_kfree_skb(xw->skb); @@ -82,6 +85,7 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, u8 page, u8 chan) { struct xmit_work *work; + struct mac802154_sub_if_data *sdata; if (!(priv->phy->channels_supported[page] & (1 << chan))) { WARN_ON(1); @@ -109,12 +113,17 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, return NETDEV_TX_BUSY; } + /* Stop the netif queue on each sub_if_data object. 
*/ + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &priv->slaves, list) + netif_stop_queue(sdata->dev); + rcu_read_unlock(); + INIT_WORK(&work->work, mac802154_xmit_worker); work->skb = skb; work->priv = priv; work->page = page; work->chan = chan; - work->xmit_attempts = 0; queue_work(priv->dev_workqueue, &work->work); diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c index 199b92261e94..2ca2f4dceab7 100644 --- a/net/mac802154/wpan.c +++ b/net/mac802154/wpan.c @@ -41,7 +41,7 @@ static inline int mac802154_fetch_skb_u8(struct sk_buff *skb, u8 *val) return -EINVAL; *val = skb->data[0]; - skb_pull(skb, 1); + skb_pull(skb, 1); return 0; } @@ -137,18 +137,16 @@ static int mac802154_header_create(struct sk_buff *skb, struct ieee802154_addr dev_addr; struct mac802154_sub_if_data *priv = netdev_priv(dev); int pos = 2; - u8 *head; + u8 head[MAC802154_FRAME_HARD_HEADER_LEN]; u16 fc; if (!daddr) return -EINVAL; - head = kzalloc(MAC802154_FRAME_HARD_HEADER_LEN, GFP_KERNEL); - if (head == NULL) - return -ENOMEM; - head[pos++] = mac_cb(skb)->seq; /* DSN/BSN */ fc = mac_cb_type(skb); + if (mac_cb_is_ackreq(skb)) + fc |= IEEE802154_FC_ACK_REQ; if (!saddr) { spin_lock_bh(&priv->mib_lock); @@ -210,7 +208,6 @@ static int mac802154_header_create(struct sk_buff *skb, head[1] = fc >> 8; memcpy(skb_push(skb, pos), head, pos); - kfree(head); return pos; } @@ -363,7 +360,7 @@ void mac802154_wpan_setup(struct net_device *dev) dev->header_ops = &mac802154_header_ops; dev->needed_tailroom = 2; /* FCS */ dev->mtu = IEEE802154_MTU; - dev->tx_queue_len = 10; + dev->tx_queue_len = 300; dev->type = ARPHRD_IEEE802154; dev->flags = IFF_NOARP | IFF_BROADCAST; dev->watchdog_timeo = 0; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 49e96df5fbc4..56d22cae5906 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -124,9 +124,14 @@ config NF_CONNTRACK_TIMESTAMP If unsure, say `N'. +config NF_CONNTRACK_LABELS + bool + help + This option enables support for assigning user-defined flag bits + to connection tracking entries. It is selected by the connlabel match. + config NF_CT_PROTO_DCCP - tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate 'DCCP protocol connection tracking support' depends on NETFILTER_ADVANCED default IP_DCCP help @@ -139,8 +144,7 @@ config NF_CT_PROTO_GRE tristate config NF_CT_PROTO_SCTP - tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default IP_SCTP help @@ -281,8 +285,7 @@ config NF_CONNTRACK_PPTP To compile it as a module, choose M here. If unsure, say N. config NF_CONNTRACK_SANE - tristate "SANE protocol support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "SANE protocol support" depends on NETFILTER_ADVANCED help SANE is a protocol for remote access to scanners as implemented @@ -409,8 +412,7 @@ endif # NF_CONNTRACK # transparent proxy support config NETFILTER_TPROXY - tristate "Transparent proxying support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "Transparent proxying support" depends on IP_NF_MANGLE depends on NETFILTER_ADVANCED help @@ -718,8 +720,7 @@ config NETFILTER_XT_TARGET_TEE this clone be rerouted to another nexthop. 
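A note on what the NF_CONNTRACK_LABELS help text above means by user-defined flag bits: conceptually, each conntrack entry carries a bit vector, the kernel only stores and tests the bits, and the name-for-each-bit mapping lives entirely in userspace. The sketch below illustrates that model only; conn_labels, label_set and label_test are invented names, not the kernel's nf_conntrack_labels API.

#include <stdint.h>

#define CONN_LABEL_BITS 128	/* unlike connmark, not limited to 32 bits */

struct conn_labels {
	uint32_t bits[CONN_LABEL_BITS / 32];
};

/* set a user-defined flag bit on a connection */
static void label_set(struct conn_labels *l, unsigned int bit)
{
	l->bits[bit / 32] |= 1u << (bit % 32);
}

/* test a flag bit; which name maps to which bit is a userspace convention */
static int label_test(const struct conn_labels *l, unsigned int bit)
{
	return !!(l->bits[bit / 32] & (1u << (bit % 32)));
}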
config NETFILTER_XT_TARGET_TPROXY - tristate '"TPROXY" target support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate '"TPROXY" target support' depends on NETFILTER_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED @@ -783,8 +784,7 @@ config NETFILTER_XT_TARGET_TCPMSS To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_TARGET_TCPOPTSTRIP - tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate '"TCPOPTSTRIP" target support' depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED help @@ -805,6 +805,15 @@ config NETFILTER_XT_MATCH_ADDRTYPE If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_BPF + tristate '"bpf" match support' + depends on NETFILTER_ADVANCED + help + BPF matching applies a Linux socket filter to each packet and + accepts those for which the filter returns non-zero. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_CLUSTER tristate '"cluster" match support' depends on NF_CONNTRACK @@ -842,6 +851,19 @@ config NETFILTER_XT_MATCH_CONNBYTES If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_CONNLABEL + tristate '"connlabel" match support' + select NF_CONNTRACK_LABELS + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This match allows you to test and assign userspace-defined label names + to a connection. The kernel only stores bit values - mapping + names to bits is done by userspace. + + Unlike connmark, more than 32 flag bits may be assigned to a + connection simultaneously. + config NETFILTER_XT_MATCH_CONNLIMIT tristate '"connlimit" match support' depends on NF_CONNTRACK @@ -1145,8 +1167,7 @@ config NETFILTER_XT_MATCH_RECENT Official Website: <http://snowman.net/projects/ipt_recent/> config NETFILTER_XT_MATCH_SCTP - tristate '"sctp" protocol match support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate '"sctp" protocol match support' depends on NETFILTER_ADVANCED default IP_SCTP help @@ -1158,8 +1179,7 @@ config NETFILTER_XT_MATCH_SCTP <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
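The "bpf" match added above takes a pre-compiled classic BPF program from userspace and accepts every packet for which the program returns non-zero. As a rough sketch, not part of this patch set, the following standalone program builds a four-instruction filter matching IPv4 UDP and prints it in the "length,code jt jf k,..." text form that the iptables side of the match consumes; the offset-9 load assumes the filter sees the packet starting at the IP header, which is how the userspace match documents it:

#include <linux/filter.h>	/* BPF_STMT/BPF_JUMP, struct sock_filter */
#include <stddef.h>
#include <stdio.h>

/* Classic BPF: A = ip->protocol (byte at offset 9 from the IP header);
 * if A == IPPROTO_UDP (17) return non-zero (match), else return 0. */
static const struct sock_filter udp_only[] = {
	BPF_STMT(BPF_LD  | BPF_B   | BPF_ABS, 9),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 17, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* match */
	BPF_STMT(BPF_RET | BPF_K, 0),		/* no match */
};

int main(void)
{
	size_t n = sizeof(udp_only) / sizeof(udp_only[0]);
	size_t i;

	/* Emit "len,code jt jf k,..." for iptables -m bpf --bytecode. */
	printf("%zu", n);
	for (i = 0; i < n; i++)
		printf(",%u %u %u %u",
		       (unsigned)udp_only[i].code, (unsigned)udp_only[i].jt,
		       (unsigned)udp_only[i].jf, (unsigned)udp_only[i].k);
	putchar('\n');
	return 0;
}

A rule such as iptables -A INPUT -m bpf --bytecode "$(./bpf_udp)" -j ACCEPT would then run the filter on each packet. Since the kernel side only interprets bytecode, a classic BPF compiler such as the nfbpf_compile utility shipped with iptables can generate the same string instead of hand-written instruction arrays.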
config NETFILTER_XT_MATCH_SOCKET - tristate '"socket" match support (EXPERIMENTAL)' - depends on EXPERIMENTAL + tristate '"socket" match support' depends on NETFILTER_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 32596978df1d..a1abf87d43bf 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -4,6 +4,7 @@ nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_exp nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o obj-$(CONFIG_NETFILTER) = netfilter.o @@ -98,9 +99,11 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_BPF) += xt_bpf.o obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLABEL) += xt_connlabel.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a9c488b6c50d..07c865a31a3d 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -5,6 +5,7 @@ * way. * * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012 */ #include <linux/kernel.h> #include <linux/netfilter.h> @@ -276,10 +277,30 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif +static int __net_init netfilter_net_init(struct net *net) +{ #ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_net_netfilter; -EXPORT_SYMBOL(proc_net_netfilter); + net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", + net->proc_net); + if (!net->nf.proc_netfilter) { + if (!net_eq(net, &init_net)) + pr_err("cannot create netfilter proc entry"); + + return -ENOMEM; + } #endif + return 0; +} + +static void __net_exit netfilter_net_exit(struct net *net) +{ + remove_proc_entry("netfilter", net->proc_net); +} + +static struct pernet_operations netfilter_net_ops = { + .init = netfilter_net_init, + .exit = netfilter_net_exit, +}; void __init netfilter_init(void) { @@ -289,11 +310,8 @@ void __init netfilter_init(void) INIT_LIST_HEAD(&nf_hooks[i][h]); } -#ifdef CONFIG_PROC_FS - proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net); - if (!proc_net_netfilter) + if (register_pernet_subsys(&netfilter_net_ops) < 0) panic("cannot create netfilter proc entry"); -#endif if (netfilter_log_init() < 0) panic("cannot initialize nf_log"); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h new file mode 100644 index 000000000000..25243379b887 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -0,0 +1,277 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __IP_SET_BITMAP_IP_GEN_H +#define __IP_SET_BITMAP_IP_GEN_H + +#define CONCAT(a, b) a##b +#define TOKEN(a,b) CONCAT(a, b) + +#define mtype_do_test TOKEN(MTYPE, _do_test) +#define mtype_gc_test TOKEN(MTYPE, _gc_test) +#define mtype_is_filled TOKEN(MTYPE, _is_filled) +#define mtype_do_add TOKEN(MTYPE, _do_add) +#define mtype_do_del TOKEN(MTYPE, _do_del) +#define mtype_do_list TOKEN(MTYPE, _do_list) +#define mtype_do_head TOKEN(MTYPE, _do_head) +#define mtype_adt_elem TOKEN(MTYPE, _adt_elem) +#define mtype_add_timeout TOKEN(MTYPE, _add_timeout) +#define mtype_gc_init TOKEN(MTYPE, _gc_init) +#define mtype_kadt TOKEN(MTYPE, _kadt) +#define mtype_uadt TOKEN(MTYPE, _uadt) +#define mtype_destroy TOKEN(MTYPE, _destroy) +#define mtype_flush TOKEN(MTYPE, _flush) +#define mtype_head TOKEN(MTYPE, _head) +#define mtype_same_set TOKEN(MTYPE, _same_set) +#define mtype_elem TOKEN(MTYPE, _elem) +#define mtype_test TOKEN(MTYPE, _test) +#define mtype_add TOKEN(MTYPE, _add) +#define mtype_del TOKEN(MTYPE, _del) +#define mtype_list TOKEN(MTYPE, _list) +#define mtype_gc TOKEN(MTYPE, _gc) +#define mtype MTYPE + +#define ext_timeout(e, m) \ + (unsigned long *)((e) + (m)->offset[IPSET_OFFSET_TIMEOUT]) +#define ext_counter(e, m) \ + (struct ip_set_counter *)((e) + (m)->offset[IPSET_OFFSET_COUNTER]) +#define get_ext(map, id) ((map)->extensions + (map)->dsize * (id)) + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct mtype *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +mtype_destroy(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + if (map->dsize) + ip_set_free(map->extensions); + kfree(map); + + set->data = NULL; +} + +static void +mtype_flush(struct ip_set *set) +{ + struct mtype *map = set->data; + + memset(map->members, 0, map->memsize); +} + +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct mtype *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (mtype_do_head(skb, map) || + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + + map->memsize + + map->dsize * map->elements)) || + (SET_WITH_TIMEOUT(set) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))) || + (SET_WITH_COUNTER(set) && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, + htonl(IPSET_FLAG_WITH_COUNTERS)))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = get_ext(map, e->id); + int ret = mtype_do_test(e, map); + + if (ret <= 0) + return ret; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, map))) + return 0; + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(x, map), ext, mext, flags); + return 1; +} + +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + void *x = 
get_ext(map, e->id); + int ret = mtype_do_add(e, map, flags); + + if (ret == IPSET_ADD_FAILED) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, map))) + ret = 0; + else if (!(flags & IPSET_FLAG_EXIST)) + return -IPSET_ERR_EXIST; + } + + if (SET_WITH_TIMEOUT(set)) +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_add_timeout(ext_timeout(x, map), e, ext, map, ret); +#else + ip_set_timeout_set(ext_timeout(x, map), ext->timeout); +#endif + + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(x, map), ext); + return 0; +} + +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct mtype *map = set->data; + const struct mtype_adt_elem *e = value; + const void *x = get_ext(map, e->id); + + if (mtype_do_del(e, map) || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(x, map)))) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mtype *map = set->data; + struct nlattr *adt, *nested; + void *x; + u32 id, first = cb->args[2]; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[2] < map->elements; cb->args[2]++) { + id = cb->args[2]; + x = get_ext(map, id); + if (!test_bit(id, map->members) || + (SET_WITH_TIMEOUT(set) && +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + mtype_is_filled((const struct mtype_elem *) x) && +#endif + ip_set_timeout_expired(ext_timeout(x, map)))) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_do_list(skb, map, id)) + goto nla_put_failure; + if (SET_WITH_TIMEOUT(set)) { +#ifdef IP_SET_BITMAP_STORED_TIMEOUT + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_stored(map, id, + ext_timeout(x, map))))) + goto nla_put_failure; +#else + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get( + ext_timeout(x, map))))) + goto nla_put_failure; +#endif + } + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(x, map))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, adt); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct mtype *map = set->data; + const void *x; + u32 id; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id < map->elements; id++) + if (mtype_gc_test(id, map)) { + x = get_ext(map, id); + if (ip_set_timeout_expired(ext_timeout(x, map))) + clear_bit(id, map->members); + } + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static const struct ip_set_type_variant mtype = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .same_set = mtype_same_set, +}; + +#endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git 
a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 4a92fd47bd4c..f1a8128bef01 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -24,31 +24,37 @@ #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> -#define IP_SET_BITMAP_TIMEOUT -#include <linux/netfilter/ipset/ip_set_timeout.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counter support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("bitmap:ip", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); +#define MTYPE bitmap_ip + /* Type structure */ struct bitmap_ip { void *members; /* the set members */ + void *extensions; /* data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ u32 hosts; /* number of hosts in a subnet */ size_t memsize; /* members size */ + size_t dsize; /* extensions struct size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ u8 netmask; /* subnet netmask */ u32 timeout; /* timeout parameter */ struct timer_list gc; /* garbage collection */ }; -/* Base variant */ +/* ADT structure for generic function args */ +struct bitmap_ip_adt_elem { + u16 id; +}; static inline u32 ip_to_id(const struct bitmap_ip *m, u32 ip) @@ -56,188 +62,67 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; } -static int -bitmap_ip_test(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - const struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - return !!test_bit(id, map->members); -} +/* Common functions */ -static int -bitmap_ip_add(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { - struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - if (test_and_set_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; + return !!test_bit(e->id, map->members); } -static int -bitmap_ip_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map) { - struct bitmap_ip *map = set->data; - u16 id = *(u16 *)value; - - if (!test_and_clear_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; -} - -static int -bitmap_ip_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) -{ - const struct bitmap_ip *map = set->data; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] < map->elements; cb->args[2]++) { - id = cb->args[2]; - if (!test_bit(id, map->members)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if 
(nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id * map->hosts))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return !!test_bit(id, map->members); } -/* Timeout variant */ - -static int -bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, + u32 flags) { - const struct bitmap_ip *map = set->data; - const unsigned long *members = map->members; - u16 id = *(u16 *)value; - - return ip_set_timeout_test(members[id]); + return !!test_and_set_bit(e->id, map->members); } -static int -bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { - struct bitmap_ip *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - - if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) - return -IPSET_ERR_EXIST; - - members[id] = ip_set_timeout_set(timeout); - - return 0; + return !test_and_clear_bit(e->id, map->members); } -static int -bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id) { - struct bitmap_ip *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - int ret = -IPSET_ERR_EXIST; - - if (ip_set_timeout_test(members[id])) - ret = 0; - - members[id] = IPSET_ELEM_UNSET; - return ret; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); } -static int -bitmap_ip_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { - const struct bitmap_ip *map = set->data; - struct nlattr *adt, *nested; - u32 id, first = cb->args[2]; - const unsigned long *members = map->members; - - adt = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!adt) - return -EMSGSIZE; - for (; cb->args[2] < map->elements; cb->args[2]++) { - id = cb->args[2]; - if (!ip_set_timeout_test(members[id])) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id * map->hosts)) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(members[id])))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, adt); - - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, adt); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || + (map->netmask != 32 && + nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)); } static int bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ip *map 
= set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); u32 ip; ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; - ip = ip_to_id(map, ip); + e.id = ip_to_id(map, ip); - return adtfn(set, &ip, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -246,33 +131,31 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 timeout = map->timeout; - u32 ip, ip_to, id; + u32 ip, ip_to; + struct bitmap_ip_adt_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - if (adt == IPSET_TEST) { - id = ip_to_id(map, ip); - return adtfn(set, &id, timeout, flags); + e.id = ip_to_id(map, ip); + return adtfn(set, &e, &ext, &ext, flags); } if (tb[IPSET_ATTR_IP_TO]) { @@ -297,8 +180,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { - id = ip_to_id(map, ip); - ret = adtfn(set, &id, timeout, flags); + e.id = ip_to_id(map, ip); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -308,54 +191,6 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static void -bitmap_ip_destroy(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_ip_flush(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; - - memset(map->members, 0, map->memsize); -} - -static int -bitmap_ip_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_ip *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || - (map->netmask != 32 && - nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->memsize)) || - (with_timeout(map->timeout) && - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -365,70 +200,35 @@ bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) return 
x->first_ip == y->first_ip && x->last_ip == y->last_ip && x->netmask == y->netmask && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_ip = { - .kadt = bitmap_ip_kadt, - .uadt = bitmap_ip_uadt, - .adt = { - [IPSET_ADD] = bitmap_ip_add, - [IPSET_DEL] = bitmap_ip_del, - [IPSET_TEST] = bitmap_ip_test, - }, - .destroy = bitmap_ip_destroy, - .flush = bitmap_ip_flush, - .head = bitmap_ip_head, - .list = bitmap_ip_list, - .same_set = bitmap_ip_same_set, +/* Plain variant */ + +struct bitmap_ip_elem { }; -static const struct ip_set_type_variant bitmap_tip = { - .kadt = bitmap_ip_kadt, - .uadt = bitmap_ip_uadt, - .adt = { - [IPSET_ADD] = bitmap_ip_tadd, - [IPSET_DEL] = bitmap_ip_tdel, - [IPSET_TEST] = bitmap_ip_ttest, - }, - .destroy = bitmap_ip_destroy, - .flush = bitmap_ip_flush, - .head = bitmap_ip_head, - .list = bitmap_ip_tlist, - .same_set = bitmap_ip_same_set, +/* Timeout variant */ + +struct bitmap_ipt_elem { + unsigned long timeout; }; -static void -bitmap_ip_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_ip *map = set->data; - unsigned long *table = map->members; - u32 id; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id < map->elements; id++) - if (ip_set_timeout_expired(table[id])) - table[id] = IPSET_ELEM_UNSET; - read_unlock_bh(&set->lock); - - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Plain variant with counter */ -static void -bitmap_ip_gc_init(struct ip_set *set) -{ - struct bitmap_ip *map = set->data; +struct bitmap_ipc_elem { + struct ip_set_counter counter; +}; - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_ip_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Timeout variant with counter */ + +struct bitmap_ipct_elem { + unsigned long timeout; + struct ip_set_counter counter; +}; + +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip type of sets */ @@ -440,6 +240,13 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; + if (map->dsize) { + map->extensions = ip_set_alloc(map->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -457,13 +264,14 @@ static int bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { struct bitmap_ip *map; - u32 first_ip, last_ip, hosts; + u32 first_ip, last_ip, hosts, cadt_flags = 0; u64 elements; u8 netmask = 32; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); @@ -526,8 +334,45 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) if (!map) return -ENOMEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - map->memsize = elements * sizeof(unsigned long); + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ip; + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if 
(tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipct_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipct_elem, counter); + + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget( + tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + + bitmap_ip_gc_init(set, bitmap_ip_gc); + } else { + map->dsize = sizeof(struct bitmap_ipc_elem); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipc_elem, counter); + + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipt_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipt_elem, timeout); if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { @@ -536,19 +381,16 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) } map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - set->variant = &bitmap_tip; + set->extensions |= IPSET_EXT_TIMEOUT; - bitmap_ip_gc_init(set); + bitmap_ip_gc_init(set, bitmap_ip_gc); } else { - map->memsize = bitmap_bytes(0, elements - 1); - + map->dsize = 0; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { kfree(map); return -ENOMEM; } - - set->variant = &bitmap_ip; } return 0; } @@ -568,6 +410,7 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -575,6 +418,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 0f92dc24cb89..3b30e0bef890 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -23,340 +23,208 @@ #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counter support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("bitmap:ip,mac", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); +#define MTYPE bitmap_ipmac +#define IP_SET_BITMAP_STORED_TIMEOUT + enum { - MAC_EMPTY, /* element is not set */ - MAC_FILLED, /* element is set with MAC */ MAC_UNSET, /* element is set, without MAC */ + MAC_FILLED, /* 
element is set with MAC */ }; /* Type structure */ struct bitmap_ipmac { void *members; /* the set members */ + void *extensions; /* MAC + data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ u32 timeout; /* timeout value */ struct timer_list gc; /* garbage collector */ + size_t memsize; /* members size */ size_t dsize; /* size of element */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ }; /* ADT structure for generic function args */ -struct ipmac { - u32 id; /* id in array */ - unsigned char *ether; /* ethernet address */ +struct bitmap_ipmac_adt_elem { + u16 id; + unsigned char *ether; }; -/* Member element without and with timeout */ - -struct ipmac_elem { +struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; - unsigned char match; + unsigned char filled; } __attribute__ ((aligned)); -struct ipmac_telem { - unsigned char ether[ETH_ALEN]; - unsigned char match; - unsigned long timeout; -} __attribute__ ((aligned)); - -static inline void * -bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id) +static inline u32 +ip_to_id(const struct bitmap_ipmac *m, u32 ip) { - return (void *)((char *)map->members + id * map->dsize); + return ip - m->first_ip; } -static inline bool -bitmap_timeout(const struct bitmap_ipmac *map, u32 id) +static inline struct bitmap_ipmac_elem * +get_elem(void *extensions, u16 id, size_t dsize) { - const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); - - return ip_set_timeout_test(elem->timeout); + return (struct bitmap_ipmac_elem *)(extensions + id * dsize); } -static inline bool -bitmap_expired(const struct bitmap_ipmac *map, u32 id) -{ - const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); - - return ip_set_timeout_expired(elem->timeout); -} +/* Common functions */ static inline int -bitmap_ipmac_exist(const struct ipmac_telem *elem) +bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, + const struct bitmap_ipmac *map) { - return elem->match == MAC_UNSET || - (elem->match == MAC_FILLED && - !ip_set_timeout_expired(elem->timeout)); -} + const struct bitmap_ipmac_elem *elem; -/* Base variant */ - -static int -bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - const struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - /* Trigger kernel to fill out the ethernet address */ - return -EAGAIN; - case MAC_FILLED: - return data->ether == NULL || - ether_addr_equal(data->ether, elem->ether); - } - return 0; -} - -static int -bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - if (!data->ether) - /* Already added without ethernet address */ - return -IPSET_ERR_EXIST; - /* Fill the MAC address */ - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - break; - case MAC_FILLED: - return -IPSET_ERR_EXIST; - case MAC_EMPTY: - if (data->ether) { - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - } else - elem->match = MAC_UNSET; - } - - return 0; -} - -static int -bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_ipmac *map = set->data; - const struct 
ipmac *data = value; - struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - if (elem->match == MAC_EMPTY) - return -IPSET_ERR_EXIST; - - elem->match = MAC_EMPTY; - - return 0; + if (!test_bit(e->id, map->members)) + return 0; + elem = get_elem(map->extensions, e->id, map->dsize); + if (elem->filled == MAC_FILLED) + return e->ether == NULL || + ether_addr_equal(e->ether, elem->ether); + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; } -static int -bitmap_ipmac_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac_elem *elem; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - u32 last = map->last_ip - map->first_ip; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - elem = bitmap_ipmac_elem(map, id); - if (elem->match == MAC_EMPTY) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id)) || - (elem->match == MAC_FILLED && - nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, - elem->ether))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - - return 0; + const struct bitmap_ipmac_elem *elem; -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + if (!test_bit(id, map->members)) + return 0; + elem = get_elem(map->extensions, id, map->dsize); + /* Timer not started for the incomplete elements */ + return elem->filled == MAC_FILLED; } -/* Timeout variant */ - -static int -bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); - - switch (elem->match) { - case MAC_UNSET: - /* Trigger kernel to fill out the ethernet address */ - return -EAGAIN; - case MAC_FILLED: - return (data->ether == NULL || - ether_addr_equal(data->ether, elem->ether)) && - !bitmap_expired(map, data->id); - } - return 0; + return elem->filled == MAC_FILLED; } -static int -bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_add_timeout(unsigned long *timeout, + const struct bitmap_ipmac_adt_elem *e, + const struct ip_set_ext *ext, + struct bitmap_ipmac *map, int mode) { - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); - bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 t = ext->timeout; - switch (elem->match) { - case MAC_UNSET: - if (!(data->ether || flag_exist)) - /* Already added without ethernet address */ - return -IPSET_ERR_EXIST; - /* Fill the MAC address and activate the timer */ - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - if (timeout == map->timeout) + if (mode == IPSET_ADD_START_STORED_TIMEOUT) { + if (t == map->timeout) /* Timeout was not specified, get stored one */ - timeout = 
elem->timeout; - elem->timeout = ip_set_timeout_set(timeout); - break; - case MAC_FILLED: - if (!(bitmap_expired(map, data->id) || flag_exist)) - return -IPSET_ERR_EXIST; - /* Fall through */ - case MAC_EMPTY: - if (data->ether) { - memcpy(elem->ether, data->ether, ETH_ALEN); - elem->match = MAC_FILLED; - } else - elem->match = MAC_UNSET; + t = *timeout; + ip_set_timeout_set(timeout, t); + } else { /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, * possibly by the kernel */ - elem->timeout = data->ether ? ip_set_timeout_set(timeout) - : timeout; - break; + if (e->ether) + ip_set_timeout_set(timeout, t); + else + *timeout = t; } - return 0; } -static int -bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map, u32 flags) { - struct bitmap_ipmac *map = set->data; - const struct ipmac *data = value; - struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); + struct bitmap_ipmac_elem *elem; + + elem = get_elem(map->extensions, e->id, map->dsize); + if (test_and_set_bit(e->id, map->members)) { + if (elem->filled == MAC_FILLED) { + if (e->ether && (flags & IPSET_FLAG_EXIST)) + memcpy(elem->ether, e->ether, ETH_ALEN); + return IPSET_ADD_FAILED; + } else if (!e->ether) + /* Already added without ethernet address */ + return IPSET_ADD_FAILED; + /* Fill the MAC address and trigger the timer activation */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return IPSET_ADD_START_STORED_TIMEOUT; + } else if (e->ether) { + /* We can store MAC too */ + memcpy(elem->ether, e->ether, ETH_ALEN); + elem->filled = MAC_FILLED; + return 0; + } else { + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; + } +} - if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id)) - return -IPSET_ERR_EXIST; +static inline int +bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, + struct bitmap_ipmac *map) +{ + return !test_and_clear_bit(e->id, map->members); +} - elem->match = MAC_EMPTY; +static inline unsigned long +ip_set_timeout_stored(struct bitmap_ipmac *map, u32 id, unsigned long *timeout) +{ + const struct bitmap_ipmac_elem *elem = + get_elem(map->extensions, id, map->dsize); - return 0; + return elem->filled == MAC_FILLED ? ip_set_timeout_get(timeout) : + *timeout; } -static int -bitmap_ipmac_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, + u32 id) { - const struct bitmap_ipmac *map = set->data; - const struct ipmac_telem *elem; - struct nlattr *atd, *nested; - u32 id, first = cb->args[2]; - u32 timeout, last = map->last_ip - map->first_ip; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - elem = bitmap_ipmac_elem(map, id); - if (!bitmap_ipmac_exist(elem)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, - htonl(map->first_ip + id)) || - (elem->match == MAC_FILLED && - nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, - elem->ether))) - goto nla_put_failure; - timeout = elem->match == MAC_UNSET ? 
elem->timeout - : ip_set_timeout_get(elem->timeout); - if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; + const struct bitmap_ipmac_elem *elem = + get_elem(map->extensions, id, map->dsize); - return 0; + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)) || + (elem->filled == MAC_FILLED && + nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); +} -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - return -EMSGSIZE; +static inline int +bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) +{ + return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || + nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); } static int bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct ipmac data; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); + u32 ip; /* MAC can be src only */ if (!(opt->flags & IPSET_DIM_TWO_SRC)) return 0; - data.id = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); - if (data.id < map->first_ip || data.id > map->last_ip) + ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); + if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; /* Backward compatibility: we don't check the second flag */ @@ -364,10 +232,10 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - data.id -= map->first_ip; - data.ether = eth_hdr(skb)->h_source; + e.id = ip_to_id(map, ip); + e.ether = eth_hdr(skb)->h_source; - return adtfn(set, &data, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -376,91 +244,39 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct ipmac data; - u32 timeout = map->timeout; + struct bitmap_ipmac_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); + u32 ip; int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (data.id < map->first_ip || data.id > map->last_ip) + if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; + e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) - data.ether = nla_data(tb[IPSET_ATTR_ETHER]); + e.ether = nla_data(tb[IPSET_ATTR_ETHER]); else - data.ether = NULL; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + e.ether = NULL; - data.id -= map->first_ip; - - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); 
return ip_set_eexist(ret, flags) ? 0 : ret; } -static void -bitmap_ipmac_destroy(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_ipmac_flush(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; - - memset(map->members, 0, - (map->last_ip - map->first_ip + 1) * map->dsize); -} - -static int -bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_ipmac *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) || - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + - ((map->last_ip - map->first_ip + 1) * - map->dsize))) || - (with_timeout(map->timeout) && - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -469,85 +285,64 @@ bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) return x->first_ip == y->first_ip && x->last_ip == y->last_ip && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_ipmac = { - .kadt = bitmap_ipmac_kadt, - .uadt = bitmap_ipmac_uadt, - .adt = { - [IPSET_ADD] = bitmap_ipmac_add, - [IPSET_DEL] = bitmap_ipmac_del, - [IPSET_TEST] = bitmap_ipmac_test, - }, - .destroy = bitmap_ipmac_destroy, - .flush = bitmap_ipmac_flush, - .head = bitmap_ipmac_head, - .list = bitmap_ipmac_list, - .same_set = bitmap_ipmac_same_set, -}; +/* Plain variant */ -static const struct ip_set_type_variant bitmap_tipmac = { - .kadt = bitmap_ipmac_kadt, - .uadt = bitmap_ipmac_uadt, - .adt = { - [IPSET_ADD] = bitmap_ipmac_tadd, - [IPSET_DEL] = bitmap_ipmac_tdel, - [IPSET_TEST] = bitmap_ipmac_ttest, - }, - .destroy = bitmap_ipmac_destroy, - .flush = bitmap_ipmac_flush, - .head = bitmap_ipmac_head, - .list = bitmap_ipmac_tlist, - .same_set = bitmap_ipmac_same_set, +/* Timeout variant */ + +struct bitmap_ipmact_elem { + struct { + unsigned char ether[ETH_ALEN]; + unsigned char filled; + } __attribute__ ((aligned)); + unsigned long timeout; }; -static void -bitmap_ipmac_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_ipmac *map = set->data; - struct ipmac_telem *elem; - u32 id, last = map->last_ip - map->first_ip; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id <= last; id++) { - elem = bitmap_ipmac_elem(map, id); - if (elem->match == MAC_FILLED && - ip_set_timeout_expired(elem->timeout)) - elem->match = MAC_EMPTY; - } - read_unlock_bh(&set->lock); +/* Plain variant with counter */ - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +struct bitmap_ipmacc_elem { + struct { + unsigned char ether[ETH_ALEN]; + unsigned char filled; + } __attribute__ ((aligned)); + struct ip_set_counter counter; +}; -static void -bitmap_ipmac_gc_init(struct ip_set *set) -{ - struct bitmap_ipmac *map = set->data; +/* Timeout variant with 
counter */ - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_ipmac_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +struct bitmap_ipmacct_elem { + struct { + unsigned char ether[ETH_ALEN]; + unsigned char filled; + } __attribute__ ((aligned)); + unsigned long timeout; + struct ip_set_counter counter; +}; + +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip,mac type of sets */ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, - u32 first_ip, u32 last_ip) + u32 first_ip, u32 last_ip, u32 elements) { map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize); if (!map->members) return false; + if (map->dsize) { + map->extensions = ip_set_alloc(map->dsize * elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_ip = first_ip; map->last_ip = last_ip; + map->elements = elements; map->timeout = IPSET_NO_TIMEOUT; set->data = map; @@ -560,13 +355,14 @@ static int bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { - u32 first_ip, last_ip; + u32 first_ip, last_ip, cadt_flags = 0; u64 elements; struct bitmap_ipmac *map; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); @@ -601,28 +397,59 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - map->dsize = sizeof(struct ipmac_telem); + map->memsize = bitmap_bytes(0, elements - 1); + set->variant = &bitmap_ipmac; + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipmacct_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipmacct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipmacct_elem, counter); + + if (!init_map_ipmac(set, map, first_ip, last_ip, + elements)) { + kfree(map); + return -ENOMEM; + } + map->timeout = ip_set_timeout_uget( + tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); + } else { + map->dsize = sizeof(struct bitmap_ipmacc_elem); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_ipmacc_elem, counter); + + if (!init_map_ipmac(set, map, first_ip, last_ip, + elements)) { + kfree(map); + return -ENOMEM; + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_ipmact_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_ipmact_elem, timeout); - if (!init_map_ipmac(set, map, first_ip, last_ip)) { + if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; } - map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = &bitmap_tipmac; - - bitmap_ipmac_gc_init(set); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); } else { - map->dsize = sizeof(struct ipmac_elem); + map->dsize = sizeof(struct bitmap_ipmac_elem); - if (!init_map_ipmac(set, map, first_ip, last_ip)) { + if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; } set->variant = &bitmap_ipmac; - } return 0; } @@ -641,6 
+468,7 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -648,6 +476,8 @@ static struct ip_set_type bitmap_ipmac_type = { .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index e6b2db76f4c3..8207d1fda528 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,205 +19,94 @@ #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #include <linux/netfilter/ipset/ip_set_getport.h> -#define IP_SET_BITMAP_TIMEOUT -#include <linux/netfilter/ipset/ip_set_timeout.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counter support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("bitmap:port", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_bitmap:port"); +#define MTYPE bitmap_port + /* Type structure */ struct bitmap_port { void *members; /* the set members */ + void *extensions; /* data extensions */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ + size_t dsize; /* extensions struct size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ u32 timeout; /* timeout parameter */ struct timer_list gc; /* garbage collection */ }; -/* Base variant */ +/* ADT structure for generic function args */ +struct bitmap_port_adt_elem { + u16 id; +}; -static int -bitmap_port_test(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline u16 +port_to_id(const struct bitmap_port *m, u16 port) { - const struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - return !!test_bit(id, map->members); + return port - m->first_port; } -static int -bitmap_port_add(struct ip_set *set, void *value, u32 timeout, u32 flags) -{ - struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - if (test_and_set_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; -} +/* Common functions */ -static int -bitmap_port_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_test(const struct bitmap_port_adt_elem *e, + const struct bitmap_port *map) { - struct bitmap_port *map = set->data; - u16 id = *(u16 *)value; - - if (!test_and_clear_bit(id, map->members)) - return -IPSET_ERR_EXIST; - - return 0; + return !!test_bit(e->id, map->members); } -static int -bitmap_port_list(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_port_gc_test(u16 id, const struct bitmap_port *map) { - const struct bitmap_port *map = set->data; - struct nlattr *atd, *nested; - u16 id, first = 
cb->args[2]; - u16 last = map->last_port - map->first_port; - - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!atd) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - if (!test_bit(id, map->members)) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_net16(skb, IPSET_ATTR_PORT, - htons(map->first_port + id))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, atd); - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, atd); - if (unlikely(id == first)) { - cb->args[2] = 0; - return -EMSGSIZE; - } - return 0; + return !!test_bit(id, map->members); } -/* Timeout variant */ - -static int -bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_add(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map, u32 flags) { - const struct bitmap_port *map = set->data; - const unsigned long *members = map->members; - u16 id = *(u16 *)value; - - return ip_set_timeout_test(members[id]); + return !!test_and_set_bit(e->id, map->members); } -static int -bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_del(const struct bitmap_port_adt_elem *e, + struct bitmap_port *map) { - struct bitmap_port *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - - if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) - return -IPSET_ERR_EXIST; - - members[id] = ip_set_timeout_set(timeout); - - return 0; + return !test_and_clear_bit(e->id, map->members); } -static int -bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +static inline int +bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id) { - struct bitmap_port *map = set->data; - unsigned long *members = map->members; - u16 id = *(u16 *)value; - int ret = -IPSET_ERR_EXIST; - - if (ip_set_timeout_test(members[id])) - ret = 0; - - members[id] = IPSET_ELEM_UNSET; - return ret; + return nla_put_net16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); } -static int -bitmap_port_tlist(const struct ip_set *set, - struct sk_buff *skb, struct netlink_callback *cb) +static inline int +bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { - const struct bitmap_port *map = set->data; - struct nlattr *adt, *nested; - u16 id, first = cb->args[2]; - u16 last = map->last_port - map->first_port; - const unsigned long *members = map->members; - - adt = ipset_nest_start(skb, IPSET_ATTR_ADT); - if (!adt) - return -EMSGSIZE; - for (; cb->args[2] <= last; cb->args[2]++) { - id = cb->args[2]; - if (!ip_set_timeout_test(members[id])) - continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (id == first) { - nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; - } - if (nla_put_net16(skb, IPSET_ATTR_PORT, - htons(map->first_port + id)) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(members[id])))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - } - ipset_nest_end(skb, adt); - - /* Set listing finished */ - cb->args[2] = 0; - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nested); - ipset_nest_end(skb, adt); - if (unlikely(id == first)) { - cb->args[2] = 0; - return 
-EMSGSIZE; - } - return 0; + return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || + nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); __be16 __port; u16 port = 0; @@ -230,9 +119,9 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; - port -= map->first_port; + e.id = port_to_id(map, port); - return adtfn(set, &port, opt_timeout(opt, map), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int @@ -241,14 +130,17 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 timeout = map->timeout; + struct bitmap_port_adt_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); u32 port; /* wraparound */ - u16 id, port_to; + u16 port_to; int ret = 0; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -257,16 +149,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(map->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; if (adt == IPSET_TEST) { - id = port - map->first_port; - return adtfn(set, &id, timeout, flags); + e.id = port_to_id(map, port); + return adtfn(set, &e, &ext, &ext, flags); } if (tb[IPSET_ATTR_PORT_TO]) { @@ -283,8 +172,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; for (; port <= port_to; port++) { - id = port - map->first_port; - ret = adtfn(set, &id, timeout, flags); + e.id = port_to_id(map, port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -294,52 +183,6 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static void -bitmap_port_destroy(struct ip_set *set) -{ - struct bitmap_port *map = set->data; - - if (with_timeout(map->timeout)) - del_timer_sync(&map->gc); - - ip_set_free(map->members); - kfree(map); - - set->data = NULL; -} - -static void -bitmap_port_flush(struct ip_set *set) -{ - struct bitmap_port *map = set->data; - - memset(map->members, 0, map->memsize); -} - -static int -bitmap_port_head(struct ip_set *set, struct sk_buff *skb) -{ - const struct bitmap_port *map = set->data; - struct nlattr *nested; - - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) - goto nla_put_failure; - if (nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || - nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)) || - nla_put_net32(skb, 
IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->memsize)) || - (with_timeout(map->timeout) && - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)))) - goto nla_put_failure; - ipset_nest_end(skb, nested); - - return 0; -nla_put_failure: - return -EMSGSIZE; -} - static bool bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -348,71 +191,35 @@ bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) return x->first_port == y->first_port && x->last_port == y->last_port && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant bitmap_port = { - .kadt = bitmap_port_kadt, - .uadt = bitmap_port_uadt, - .adt = { - [IPSET_ADD] = bitmap_port_add, - [IPSET_DEL] = bitmap_port_del, - [IPSET_TEST] = bitmap_port_test, - }, - .destroy = bitmap_port_destroy, - .flush = bitmap_port_flush, - .head = bitmap_port_head, - .list = bitmap_port_list, - .same_set = bitmap_port_same_set, +/* Plain variant */ + +struct bitmap_port_elem { }; -static const struct ip_set_type_variant bitmap_tport = { - .kadt = bitmap_port_kadt, - .uadt = bitmap_port_uadt, - .adt = { - [IPSET_ADD] = bitmap_port_tadd, - [IPSET_DEL] = bitmap_port_tdel, - [IPSET_TEST] = bitmap_port_ttest, - }, - .destroy = bitmap_port_destroy, - .flush = bitmap_port_flush, - .head = bitmap_port_head, - .list = bitmap_port_tlist, - .same_set = bitmap_port_same_set, +/* Timeout variant */ + +struct bitmap_portt_elem { + unsigned long timeout; }; -static void -bitmap_port_gc(unsigned long ul_set) -{ - struct ip_set *set = (struct ip_set *) ul_set; - struct bitmap_port *map = set->data; - unsigned long *table = map->members; - u32 id; /* wraparound */ - u16 last = map->last_port - map->first_port; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (id = 0; id <= last; id++) - if (ip_set_timeout_expired(table[id])) - table[id] = IPSET_ELEM_UNSET; - read_unlock_bh(&set->lock); - - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Plain variant with counter */ -static void -bitmap_port_gc_init(struct ip_set *set) -{ - struct bitmap_port *map = set->data; +struct bitmap_portc_elem { + struct ip_set_counter counter; +}; - init_timer(&map->gc); - map->gc.data = (unsigned long) set; - map->gc.function = bitmap_port_gc; - map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; - add_timer(&map->gc); -} +/* Timeout variant with counter */ + +struct bitmap_portct_elem { + unsigned long timeout; + struct ip_set_counter counter; +}; + +#include "ip_set_bitmap_gen.h" /* Create bitmap:ip type of sets */ @@ -423,6 +230,13 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; + if (map->dsize) { + map->extensions = ip_set_alloc(map->dsize * map->elements); + if (!map->extensions) { + kfree(map->members); + return false; + } + } map->first_port = first_port; map->last_port = last_port; map->timeout = IPSET_NO_TIMEOUT; @@ -434,15 +248,16 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, } static int -bitmap_port_create(struct ip_set *set, struct nlattr *tb[], - u32 flags) +bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { struct bitmap_port *map; u16 first_port, last_port; + u32 cadt_flags = 0; if (unlikely(!ip_set_attr_netorder(tb, 
IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); @@ -458,28 +273,56 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - map->memsize = (last_port - first_port + 1) - * sizeof(unsigned long); - + map->elements = last_port - first_port + 1; + map->memsize = map->elements * sizeof(unsigned long); + set->variant = &bitmap_port; + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_portct_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_portct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_portct_elem, counter); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = + ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_port_gc_init(set, bitmap_port_gc); + } else { + map->dsize = sizeof(struct bitmap_portc_elem); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct bitmap_portc_elem, counter); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct bitmap_portt_elem); + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct bitmap_portt_elem, timeout); if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; } map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - set->variant = &bitmap_tport; - - bitmap_port_gc_init(set); + set->extensions |= IPSET_EXT_TIMEOUT; + bitmap_port_gc_init(set, bitmap_port_gc); } else { - map->memsize = bitmap_bytes(0, last_port - first_port); - pr_debug("memsize: %zu\n", map->memsize); + map->dsize = 0; if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; } - set->variant = &bitmap_port; } return 0; } @@ -497,12 +340,15 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 6d6d8f2b033e..f77139007983 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -15,7 +15,6 @@ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/spinlock.h> -#include 
<linux/netlink.h> #include <linux/rculist.h> #include <net/netlink.h> @@ -88,14 +87,14 @@ find_set_type(const char *name, u8 family, u8 revision) static bool load_settype(const char *name) { - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warning("Can't find ip_set type %s\n", name); - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); return false; } - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); return true; } @@ -316,6 +315,29 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +int +ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], + struct ip_set_ext *ext) +{ + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!(set->extensions & IPSET_EXT_TIMEOUT)) + return -IPSET_ERR_TIMEOUT; + ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { + if (!(set->extensions & IPSET_EXT_COUNTER)) + return -IPSET_ERR_COUNTER; + if (tb[IPSET_ATTR_BYTES]) + ext->bytes = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_BYTES])); + if (tb[IPSET_ATTR_PACKETS]) + ext->packets = be64_to_cpu(nla_get_be64( + tb[IPSET_ATTR_PACKETS])); + } + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_extensions); + /* * Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace @@ -366,8 +388,7 @@ ip_set_rcu_get(ip_set_id_t index) int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(index); int ret = 0; @@ -392,7 +413,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, ret = 1; } else { /* --return-nomatch: invert matched element */ - if ((opt->flags & IPSET_RETURN_NOMATCH) && + if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) && (set->type->features & IPSET_TYPE_NOMATCH) && (ret > 0 || ret == -ENOTEMPTY)) ret = -ret; @@ -405,8 +426,7 @@ EXPORT_SYMBOL_GPL(ip_set_test); int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(index); int ret; @@ -428,8 +448,7 @@ EXPORT_SYMBOL_GPL(ip_set_add); int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, - const struct xt_action_param *par, - const struct ip_set_adt_opt *opt) + const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(index); int ret = 0; @@ -532,7 +551,7 @@ ip_set_nfnl_get(const char *name) ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); for (i = 0; i < ip_set_max; i++) { s = nfnl_set(i); if (s != NULL && STREQ(s->name, name)) { @@ -541,7 +560,7 @@ ip_set_nfnl_get(const char *name) break; } } - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); return index; } @@ -561,13 +580,13 @@ ip_set_nfnl_get_byindex(ip_set_id_t index) if (index > ip_set_max) return IPSET_INVALID_ID; - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); set = nfnl_set(index); if (set) __ip_set_get(set); else index = IPSET_INVALID_ID; - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); return index; } @@ -584,11 +603,11 @@ void ip_set_nfnl_put(ip_set_id_t index) { struct ip_set *set; - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); set = nfnl_set(index); if (set != 
NULL) __ip_set_put(set); - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); @@ -1085,7 +1104,7 @@ static int dump_init(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; @@ -1301,7 +1320,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct sk_buff *skb2; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *cmdattr; u32 *errline; @@ -1470,7 +1489,8 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, if (ret == -EAGAIN) ret = 1; - return ret < 0 ? ret : ret > 0 ? 0 : -IPSET_ERR_EXIST; + return (ret < 0 && ret != -ENOTEMPTY) ? ret : + ret > 0 ? 0 : -IPSET_ERR_EXIST; } /* Get headed data of a set */ @@ -1763,10 +1783,10 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); find_set_and_id(req_get->set.name, &id); req_get->set.index = id; - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_BYINDEX: { @@ -1778,11 +1798,11 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) ret = -EINVAL; goto done; } - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_IPSET); set = nfnl_set(req_get->set.index); strncpy(req_get->set.name, set ? set->name : "", IPSET_MAXNAMELEN); - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } default: diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h new file mode 100644 index 000000000000..57beb1762b2d --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -0,0 +1,1100 @@ +/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _IP_SET_HASH_GEN_H +#define _IP_SET_HASH_GEN_H + +#include <linux/rcupdate.h> +#include <linux/jhash.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#ifndef rcu_dereference_bh +#define rcu_dereference_bh(p) rcu_dereference(p) +#endif + +#define CONCAT(a, b) a##b +#define TOKEN(a, b) CONCAT(a, b) + +/* Hashing which uses arrays to resolve clashing. The hash table is resized + * (doubled) when searching becomes too long. + * Internally jhash is used with the assumption that the size of the + * stored data is a multiple of sizeof(u32). If storage supports timeout, + * the timeout field must be the last one in the data structure - that field + * is ignored when computing the hash key. + * + * Readers and resizing + * + * Resizing can be triggered by userspace command only, and those + * are serialized by the nfnl mutex. During resizing the set is + * read-locked, so the only possible concurrent operations are + * the kernel side readers. Those must be protected by proper RCU locking. 
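+ *
+ * As an illustrative sketch (editorial addition, not part of the
+ * original comment), the reader side follows the usual RCU pattern,
+ * using the helpers defined later in this file:
+ *
+ *	rcu_read_lock_bh();
+ *	t = rcu_dereference_bh(h->table);
+ *	n = hbucket(t, HKEY(d, h->initval, t->htable_bits));
+ *	... scan slots 0 .. n->pos-1 of n->value for a match ...
+ *	rcu_read_unlock_bh();
+ *
+ * mtype_resize() publishes the doubled table with rcu_assign_pointer()
+ * and waits in synchronize_rcu_bh() before destroying the old table,
+ * so a reader always sees either the old or the new table in full.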
+ */ + +/* Number of elements to store in an initial array block */ +#define AHASH_INIT_SIZE 4 +/* Max number of elements to store in an array block */ +#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) + +/* Max number of elements can be tuned */ +#ifdef IP_SET_HASH_WITH_MULTI +#define AHASH_MAX(h) ((h)->ahash_max) + +static inline u8 +tune_ahash_max(u8 curr, u32 multi) +{ + u32 n; + + if (multi < curr) + return curr; + + n = curr + AHASH_INIT_SIZE; + /* Currently, at listing one hash bucket must fit into a message. + * Therefore we have a hard limit here. + */ + return n > curr && n <= 64 ? n : curr; +} +#define TUNE_AHASH_MAX(h, multi) \ + ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) +#else +#define AHASH_MAX(h) AHASH_MAX_SIZE +#define TUNE_AHASH_MAX(h, multi) +#endif + +/* A hash bucket */ +struct hbucket { + void *value; /* the array of the values */ + u8 size; /* size of the array */ + u8 pos; /* position of the first free entry */ +}; + +/* The hash table: the table size stored here in order to make resizing easy */ +struct htable { + u8 htable_bits; /* size of hash table == 2^htable_bits */ + struct hbucket bucket[0]; /* hashtable buckets */ +}; + +#define hbucket(h, i) (&((h)->bucket[i])) + +/* Book-keeping of the prefixes added to the set */ +struct net_prefixes { + u8 cidr; /* the different cidr values in the set */ + u32 nets; /* number of elements per cidr */ +}; + +/* Compute the hash table size */ +static size_t +htable_size(u8 hbits) +{ + size_t hsize; + + /* We must fit both into u32 in jhash and size_t */ + if (hbits > 31) + return 0; + hsize = jhash_size(hbits); + if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + < hsize) + return 0; + + return hsize * sizeof(struct hbucket) + sizeof(struct htable); +} + +/* Compute htable_bits from the user input parameter hashsize */ +static u8 +htable_bits(u32 hashsize) +{ + /* Assume that hashsize == 2^htable_bits */ + u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) + /* Round up to the first 2^n value */ + bits = fls(hashsize); + + return bits; +} + +/* Destroy the hashtable part of the set */ +static void +ahash_destroy(struct htable *t) +{ + struct hbucket *n; + u32 i; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) + /* FIXME: use slab cache */ + kfree(n->value); + } + + ip_set_free(t); +} + +static int +hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) +{ + if (n->pos >= n->size) { + void *tmp; + + if (n->size >= ahash_max) + /* Trigger rehashing */ + return -EAGAIN; + + tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (n->size) { + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + } + n->value = tmp; + n->size += AHASH_INIT_SIZE; + } + return 0; +} + +#ifdef IP_SET_HASH_WITH_NETS +#ifdef IP_SET_HASH_WITH_NETS_PACKED +/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */ +#define CIDR(cidr) (cidr + 1) +#else +#define CIDR(cidr) (cidr) +#endif + +#define SET_HOST_MASK(family) (family == AF_INET ? 
32 : 128) + +#ifdef IP_SET_HASH_WITH_MULTI +#define NETS_LENGTH(family) (SET_HOST_MASK(family) + 1) +#else +#define NETS_LENGTH(family) SET_HOST_MASK(family) +#endif + +#else +#define NETS_LENGTH(family) 0 +#endif /* IP_SET_HASH_WITH_NETS */ + +#define ext_timeout(e, h) \ +(unsigned long *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_TIMEOUT]) +#define ext_counter(e, h) \ +(struct ip_set_counter *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_COUNTER]) + +#endif /* _IP_SET_HASH_GEN_H */ + +/* Family dependent templates */ + +#undef ahash_data +#undef mtype_data_equal +#undef mtype_do_data_match +#undef mtype_data_set_flags +#undef mtype_data_reset_flags +#undef mtype_data_netmask +#undef mtype_data_list +#undef mtype_data_next +#undef mtype_elem + +#undef mtype_add_cidr +#undef mtype_del_cidr +#undef mtype_ahash_memsize +#undef mtype_flush +#undef mtype_destroy +#undef mtype_gc_init +#undef mtype_same_set +#undef mtype_kadt +#undef mtype_uadt +#undef mtype + +#undef mtype_add +#undef mtype_del +#undef mtype_test_cidrs +#undef mtype_test +#undef mtype_expire +#undef mtype_resize +#undef mtype_head +#undef mtype_list +#undef mtype_gc +#undef mtype_gc_init +#undef mtype_variant +#undef mtype_data_match + +#undef HKEY + +#define mtype_data_equal TOKEN(MTYPE, _data_equal) +#ifdef IP_SET_HASH_WITH_NETS +#define mtype_do_data_match TOKEN(MTYPE, _do_data_match) +#else +#define mtype_do_data_match(d) 1 +#endif +#define mtype_data_set_flags TOKEN(MTYPE, _data_set_flags) +#define mtype_data_reset_flags TOKEN(MTYPE, _data_reset_flags) +#define mtype_data_netmask TOKEN(MTYPE, _data_netmask) +#define mtype_data_list TOKEN(MTYPE, _data_list) +#define mtype_data_next TOKEN(MTYPE, _data_next) +#define mtype_elem TOKEN(MTYPE, _elem) +#define mtype_add_cidr TOKEN(MTYPE, _add_cidr) +#define mtype_del_cidr TOKEN(MTYPE, _del_cidr) +#define mtype_ahash_memsize TOKEN(MTYPE, _ahash_memsize) +#define mtype_flush TOKEN(MTYPE, _flush) +#define mtype_destroy TOKEN(MTYPE, _destroy) +#define mtype_gc_init TOKEN(MTYPE, _gc_init) +#define mtype_same_set TOKEN(MTYPE, _same_set) +#define mtype_kadt TOKEN(MTYPE, _kadt) +#define mtype_uadt TOKEN(MTYPE, _uadt) +#define mtype MTYPE + +#define mtype_elem TOKEN(MTYPE, _elem) +#define mtype_add TOKEN(MTYPE, _add) +#define mtype_del TOKEN(MTYPE, _del) +#define mtype_test_cidrs TOKEN(MTYPE, _test_cidrs) +#define mtype_test TOKEN(MTYPE, _test) +#define mtype_expire TOKEN(MTYPE, _expire) +#define mtype_resize TOKEN(MTYPE, _resize) +#define mtype_head TOKEN(MTYPE, _head) +#define mtype_list TOKEN(MTYPE, _list) +#define mtype_gc TOKEN(MTYPE, _gc) +#define mtype_variant TOKEN(MTYPE, _variant) +#define mtype_data_match TOKEN(MTYPE, _data_match) + +#ifndef HKEY_DATALEN +#define HKEY_DATALEN sizeof(struct mtype_elem) +#endif + +#define HKEY(data, initval, htable_bits) \ +(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ + & jhash_mask(htable_bits)) + +#ifndef htype +#define htype HTYPE + +/* The generic hash structure */ +struct htype { + struct htable *table; /* the hash table */ + u32 maxelem; /* max elements in the hash */ + u32 elements; /* current element (vs timeout) */ + u32 initval; /* random jhash init value */ + u32 timeout; /* timeout value, if enabled */ + size_t dsize; /* data struct size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ + struct timer_list gc; /* garbage collection when timeout enabled */ + struct mtype_elem next; /* temporary storage for uadd */ +#ifdef IP_SET_HASH_WITH_MULTI + u8 ahash_max; /* max elements in an array block */ +#endif 
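+	/* Editorial sketch, not in the original patch: dsize and offset[]
+	 * above are what the ext_timeout()/ext_counter() macros use to
+	 * reach the extension fields of the per-variant element structs.
+	 * It mirrors what TOKEN(HTYPE, _create)() below sets up, e.g. for
+	 * the counter+timeout variant:
+	 *
+	 *	h->dsize = sizeof(struct hash_ip4ct_elem);
+	 *	h->offset[IPSET_OFFSET_TIMEOUT] =
+	 *		offsetof(struct hash_ip4ct_elem, timeout);
+	 *	h->offset[IPSET_OFFSET_COUNTER] =
+	 *		offsetof(struct hash_ip4ct_elem, counter);
+	 */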
+#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; /* netmask value for subnets to store */ +#endif +#ifdef IP_SET_HASH_WITH_RBTREE + struct rb_root rbtree; +#endif +#ifdef IP_SET_HASH_WITH_NETS + struct net_prefixes nets[0]; /* book-keeping of prefixes */ +#endif +}; +#endif + +#ifdef IP_SET_HASH_WITH_NETS +/* Network cidr size book keeping when the hash stores different + * sized networks */ +static void +mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length) +{ + int i, j; + + /* Add in increasing prefix order, so larger cidr first */ + for (i = 0, j = -1; i < nets_length && h->nets[i].nets; i++) { + if (j != -1) + continue; + else if (h->nets[i].cidr < cidr) + j = i; + else if (h->nets[i].cidr == cidr) { + h->nets[i].nets++; + return; + } + } + if (j != -1) { + for (; i > j; i--) { + h->nets[i].cidr = h->nets[i - 1].cidr; + h->nets[i].nets = h->nets[i - 1].nets; + } + } + h->nets[i].cidr = cidr; + h->nets[i].nets = 1; +} + +static void +mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length) +{ + u8 i, j; + + for (i = 0; i < nets_length - 1 && h->nets[i].cidr != cidr; i++) + ; + h->nets[i].nets--; + + if (h->nets[i].nets != 0) + return; + + for (j = i; j < nets_length - 1 && h->nets[j].nets; j++) { + h->nets[j].cidr = h->nets[j + 1].cidr; + h->nets[j].nets = h->nets[j + 1].nets; + } +} +#endif + +/* Calculate the actual memory size of the set data */ +static size_t +mtype_ahash_memsize(const struct htype *h, u8 nets_length) +{ + u32 i; + struct htable *t = h->table; + size_t memsize = sizeof(*h) + + sizeof(*t) +#ifdef IP_SET_HASH_WITH_NETS + + sizeof(struct net_prefixes) * nets_length +#endif + + jhash_size(t->htable_bits) * sizeof(struct hbucket); + + for (i = 0; i < jhash_size(t->htable_bits); i++) + memsize += t->bucket[i].size * h->dsize; + + return memsize; +} + +/* Flush a hash type of set: destroy all elements */ +static void +mtype_flush(struct ip_set *set) +{ + struct htype *h = set->data; + struct htable *t = h->table; + struct hbucket *n; + u32 i; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + if (n->size) { + n->size = n->pos = 0; + /* FIXME: use slab cache */ + kfree(n->value); + } + } +#ifdef IP_SET_HASH_WITH_NETS + memset(h->nets, 0, sizeof(struct net_prefixes) + * NETS_LENGTH(set->family)); +#endif + h->elements = 0; +} + +/* Destroy a hash type of set */ +static void +mtype_destroy(struct ip_set *set) +{ + struct htype *h = set->data; + + if (set->extensions & IPSET_EXT_TIMEOUT) + del_timer_sync(&h->gc); + + ahash_destroy(h->table); +#ifdef IP_SET_HASH_WITH_RBTREE + rbtree_destroy(&h->rbtree); +#endif + kfree(h); + + set->data = NULL; +} + +static void +mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +{ + struct htype *h = set->data; + + init_timer(&h->gc); + h->gc.data = (unsigned long) set; + h->gc.function = gc; + h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ; + add_timer(&h->gc); + pr_debug("gc initialized, run in every %u\n", + IPSET_GC_PERIOD(h->timeout)); +} + +static bool +mtype_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct htype *x = a->data; + const struct htype *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout && +#ifdef IP_SET_HASH_WITH_NETMASK + x->netmask == y->netmask && +#endif + a->extensions == b->extensions; +} + +/* Get the ith element from the array block n */ +#define ahash_data(n, i, dsize) \ + ((struct mtype_elem *)((n)->value + ((i) * (dsize)))) + +/* Delete expired 
elements from the hashtable */ +static void +mtype_expire(struct htype *h, u8 nets_length, size_t dsize) +{ + struct htable *t = h->table; + struct hbucket *n; + struct mtype_elem *data; + u32 i; + int j; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = hbucket(t, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, dsize); + if (ip_set_timeout_expired(ext_timeout(data, h))) { + pr_debug("expired %u/%u\n", i, j); +#ifdef IP_SET_HASH_WITH_NETS + mtype_del_cidr(h, CIDR(data->cidr), + nets_length); +#endif + if (j != n->pos - 1) + /* Not last one */ + memcpy(data, + ahash_data(n, n->pos - 1, dsize), + dsize); + n->pos--; + h->elements--; + } + } + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!tmp) + /* Still try to delete expired elements */ + continue; + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * dsize); + kfree(n->value); + n->value = tmp; + } + } +} + +static void +mtype_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct htype *h = set->data; + + pr_debug("called\n"); + write_lock_bh(&set->lock); + mtype_expire(h, NETS_LENGTH(set->family), h->dsize); + write_unlock_bh(&set->lock); + + h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ; + add_timer(&h->gc); +} + +/* Resize a hash: create a new hash table with doubling the hashsize + * and inserting the elements to it. Repeat until we succeed or + * fail due to memory pressures. */ +static int +mtype_resize(struct ip_set *set, bool retried) +{ + struct htype *h = set->data; + struct htable *t, *orig = h->table; + u8 htable_bits = orig->htable_bits; +#ifdef IP_SET_HASH_WITH_NETS + u8 flags; +#endif + struct mtype_elem *data; + struct mtype_elem *d; + struct hbucket *n, *m; + u32 i, j; + int ret; + + /* Try to cleanup once */ + if (SET_WITH_TIMEOUT(set) && !retried) { + i = h->elements; + write_lock_bh(&set->lock); + mtype_expire(set->data, NETS_LENGTH(set->family), + h->dsize); + write_unlock_bh(&set->lock); + if (h->elements < i) + return 0; + } + +retry: + ret = 0; + htable_bits++; + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); + if (!htable_bits) { + /* In case we have plenty of memory :-) */ + pr_warning("Cannot increase the hashsize of set %s further\n", + set->name); + return -IPSET_ERR_HASH_FULL; + } + t = ip_set_alloc(sizeof(*t) + + jhash_size(htable_bits) * sizeof(struct hbucket)); + if (!t) + return -ENOMEM; + t->htable_bits = htable_bits; + + read_lock_bh(&set->lock); + for (i = 0; i < jhash_size(orig->htable_bits); i++) { + n = hbucket(orig, i); + for (j = 0; j < n->pos; j++) { + data = ahash_data(n, j, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + flags = 0; + mtype_data_reset_flags(data, &flags); +#endif + m = hbucket(t, HKEY(data, h->initval, htable_bits)); + ret = hbucket_elem_add(m, AHASH_MAX(h), h->dsize); + if (ret < 0) { +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(data, &flags); +#endif + read_unlock_bh(&set->lock); + ahash_destroy(t); + if (ret == -EAGAIN) + goto retry; + return ret; + } + d = ahash_data(m, m->pos++, h->dsize); + memcpy(d, data, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_reset_flags(d, &flags); +#endif + } + } + + rcu_assign_pointer(h->table, t); + read_unlock_bh(&set->lock); + + /* Give time to other readers of the set */ + synchronize_rcu_bh(); + + pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, + orig->htable_bits, orig, t->htable_bits, t); + 
ahash_destroy(orig); + + return 0; +} + +/* Add an element to a hash and update the internal counters when succeeded, + * otherwise report the proper error code. */ +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i, ret = 0; + int j = AHASH_MAX(h) + 1; + bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 key, multi = 0; + + if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) + /* FIXME: when set is full, we slow down here */ + mtype_expire(h, NETS_LENGTH(set->family), h->dsize); + + if (h->elements >= h->maxelem) { + if (net_ratelimit()) + pr_warning("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } + + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (mtype_data_equal(data, d, &multi)) { + if (flag_exist || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h)))) { + /* Just the extensions could be overwritten */ + j = i; + goto reuse_slot; + } else { + ret = -IPSET_ERR_EXIST; + goto out; + } + } + /* Reuse first timed out entry */ + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h)) && + j != AHASH_MAX(h) + 1) + j = i; + } +reuse_slot: + if (j != AHASH_MAX(h) + 1) { + /* Fill out reused slot */ + data = ahash_data(n, j, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_del_cidr(h, CIDR(data->cidr), NETS_LENGTH(set->family)); + mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +#endif + } else { + /* Use/create a new slot */ + TUNE_AHASH_MAX(h, multi); + ret = hbucket_elem_add(n, AHASH_MAX(h), h->dsize); + if (ret != 0) { + if (ret == -EAGAIN) + mtype_data_next(&h->next, d); + goto out; + } + data = ahash_data(n, n->pos++, h->dsize); +#ifdef IP_SET_HASH_WITH_NETS + mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +#endif + h->elements++; + } + memcpy(data, d, sizeof(struct mtype_elem)); +#ifdef IP_SET_HASH_WITH_NETS + mtype_data_set_flags(data, flags); +#endif + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, h), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(data, h), ext); + +out: + rcu_read_unlock_bh(); + return ret; +} + +/* Delete an element from the hash: swap it with the last element + * and free up space if possible. 
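+ *
+ * Worked example (editorial addition, not in the original comment):
+ * a bucket holding [A, B, C, D] with pos == 4 deletes B by copying D
+ * over B, giving [A, D, C] with pos == 3; once pos + AHASH_INIT_SIZE
+ * drops below size, the value array is reallocated AHASH_INIT_SIZE
+ * entries smaller.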
+ */ +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = h->table; + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; + int i; + u32 key, multi = 0; + + key = HKEY(value, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h))) + return -IPSET_ERR_EXIST; + if (i != n->pos - 1) + /* Not last one */ + memcpy(data, ahash_data(n, n->pos - 1, h->dsize), + h->dsize); + + n->pos--; + h->elements--; +#ifdef IP_SET_HASH_WITH_NETS + mtype_del_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family)); +#endif + if (n->pos + AHASH_INIT_SIZE < n->size) { + void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) + * h->dsize, + GFP_ATOMIC); + if (!tmp) + return 0; + n->size -= AHASH_INIT_SIZE; + memcpy(tmp, n->value, n->size * h->dsize); + kfree(n->value); + n->value = tmp; + } + return 0; + } + + return -IPSET_ERR_EXIST; +} + +static inline int +mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, + struct ip_set_ext *mext, struct ip_set *set, u32 flags) +{ + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(data, + (struct htype *)(set->data)), + ext, mext, flags); + return mtype_do_data_match(data); +} + +#ifdef IP_SET_HASH_WITH_NETS +/* Special test function which takes into account the different network + * sizes added to the set */ +static int +mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = h->table; + struct hbucket *n; + struct mtype_elem *data; + int i, j = 0; + u32 key, multi = 0; + u8 nets_length = NETS_LENGTH(set->family); + + pr_debug("test by nets\n"); + for (; j < nets_length && h->nets[j].nets && !multi; j++) { + mtype_data_netmask(d, h->nets[j].cidr); + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; + if (SET_WITH_TIMEOUT(set)) { + if (!ip_set_timeout_expired( + ext_timeout(data, h))) + return mtype_data_match(data, ext, + mext, set, + flags); +#ifdef IP_SET_HASH_WITH_MULTI + multi = 0; +#endif + } else + return mtype_data_match(data, ext, + mext, set, flags); + } + } + return 0; +} +#endif + +/* Test whether the element is added to the set */ +static int +mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + struct htype *h = set->data; + struct htable *t = h->table; + struct mtype_elem *d = value; + struct hbucket *n; + struct mtype_elem *data; + int i; + u32 key, multi = 0; + +#ifdef IP_SET_HASH_WITH_NETS + /* If we test an IP address and not a network address, + * try all possible network sizes */ + if (CIDR(d->cidr) == SET_HOST_MASK(set->family)) + return mtype_test_cidrs(set, d, ext, mext, flags); +#endif + + key = HKEY(d, h->initval, t->htable_bits); + n = hbucket(t, key); + for (i = 0; i < n->pos; i++) { + data = ahash_data(n, i, h->dsize); + if (mtype_data_equal(data, d, &multi) && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, h)))) + return mtype_data_match(data, ext, mext, set, flags); + } + return 0; +} + +/* Reply a HEADER request: fill out the 
header part of the set */ +static int +mtype_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct htype *h = set->data; + struct nlattr *nested; + size_t memsize; + + read_lock_bh(&set->lock); + memsize = mtype_ahash_memsize(h, NETS_LENGTH(set->family)); + read_unlock_bh(&set->lock); + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, + htonl(jhash_size(h->table->htable_bits))) || + nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) + goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_NETMASK + if (h->netmask != HOST_MASK && + nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + goto nla_put_failure; +#endif + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || + ((set->extensions & IPSET_EXT_TIMEOUT) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout))) || + ((set->extensions & IPSET_EXT_COUNTER) && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, + htonl(IPSET_FLAG_WITH_COUNTERS)))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +/* Reply a LIST/SAVE request: dump the elements of the specified set */ +static int +mtype_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct htype *h = set->data; + const struct htable *t = h->table; + struct nlattr *atd, *nested; + const struct hbucket *n; + const struct mtype_elem *e; + u32 first = cb->args[2]; + /* We assume that one hash bucket fills into one page */ + void *incomplete; + int i; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); + for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) { + incomplete = skb_tail_pointer(skb); + n = hbucket(t, cb->args[2]); + pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n); + for (i = 0; i < n->pos; i++) { + e = ahash_data(n, i, h->dsize); + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, h))) + continue; + pr_debug("list hash %lu hbucket %p i %u, data %p\n", + cb->args[2], n, i, e); + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (cb->args[2] == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + if (mtype_data_list(skb, e)) + goto nla_put_failure; + if (SET_WITH_TIMEOUT(set) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get( + ext_timeout(e, h))))) + goto nla_put_failure; + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, h))) + goto nla_put_failure; + ipset_nest_end(skb, nested); + } + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nlmsg_trim(skb, incomplete); + ipset_nest_end(skb, atd); + if (unlikely(first == cb->args[2])) { + pr_warning("Can't list set %s: one bucket does not fit into " + "a message. 
Please report it!\n", set->name); + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static int +TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); + +static int +TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + +static const struct ip_set_type_variant mtype_variant = { + .kadt = mtype_kadt, + .uadt = mtype_uadt, + .adt = { + [IPSET_ADD] = mtype_add, + [IPSET_DEL] = mtype_del, + [IPSET_TEST] = mtype_test, + }, + .destroy = mtype_destroy, + .flush = mtype_flush, + .head = mtype_head, + .list = mtype_list, + .resize = mtype_resize, + .same_set = mtype_same_set, +}; + +#ifdef IP_SET_EMIT_CREATE +static int +TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u32 cadt_flags = 0; + u8 hbits; +#ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; +#endif + size_t hsize; + struct HTYPE *h; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; +#ifdef IP_SET_HASH_WITH_NETMASK + netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + pr_debug("Create set %s with family %s\n", + set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); +#endif + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + +#ifdef IP_SET_HASH_WITH_NETMASK + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if ((set->family == NFPROTO_IPV4 && netmask > 32) || + (set->family == NFPROTO_IPV6 && netmask > 128) || + netmask == 0) + return -IPSET_ERR_INVALID_NETMASK; + } +#endif + + hsize = sizeof(*h); +#ifdef IP_SET_HASH_WITH_NETS + hsize += sizeof(struct net_prefixes) * + (set->family == NFPROTO_IPV4 ? 
32 : 128); +#endif + h = kzalloc(hsize, GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; +#ifdef IP_SET_HASH_WITH_NETMASK + h->netmask = netmask; +#endif + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + if (set->family == NFPROTO_IPV4) + set->variant = &TOKEN(HTYPE, 4_variant); + else + set->variant = &TOKEN(HTYPE, 6_variant); + + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = + ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + if (set->family == NFPROTO_IPV4) { + h->dsize = + sizeof(struct TOKEN(HTYPE, 4ct_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 4ct_elem), + timeout); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 4ct_elem), + counter); + TOKEN(HTYPE, 4_gc_init)(set, + TOKEN(HTYPE, 4_gc)); + } else { + h->dsize = + sizeof(struct TOKEN(HTYPE, 6ct_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 6ct_elem), + timeout); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 6ct_elem), + counter); + TOKEN(HTYPE, 6_gc_init)(set, + TOKEN(HTYPE, 6_gc)); + } + } else { + if (set->family == NFPROTO_IPV4) { + h->dsize = + sizeof(struct TOKEN(HTYPE, 4c_elem)); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 4c_elem), + counter); + } else { + h->dsize = + sizeof(struct TOKEN(HTYPE, 6c_elem)); + h->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct TOKEN(HTYPE, 6c_elem), + counter); + } + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->extensions |= IPSET_EXT_TIMEOUT; + if (set->family == NFPROTO_IPV4) { + h->dsize = sizeof(struct TOKEN(HTYPE, 4t_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 4t_elem), + timeout); + TOKEN(HTYPE, 4_gc_init)(set, TOKEN(HTYPE, 4_gc)); + } else { + h->dsize = sizeof(struct TOKEN(HTYPE, 6t_elem)); + h->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct TOKEN(HTYPE, 6t_elem), + timeout); + TOKEN(HTYPE, 6_gc_init)(set, TOKEN(HTYPE, 6_gc)); + } + } else { + if (set->family == NFPROTO_IPV4) + h->dsize = sizeof(struct TOKEN(HTYPE, 4_elem)); + else + h->dsize = sizeof(struct TOKEN(HTYPE, 6_elem)); + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} +#endif /* IP_SET_EMIT_CREATE */ diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 5c0b78528e55..c74e6e14cd93 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,11 +21,10 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include 
<linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counters support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -33,58 +32,47 @@ IP_SET_MODULE_DESC("hash:ip", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ -#define TYPE hash_ip - -static bool -hash_ip_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_ip4_same_set hash_ip_same_set -#define hash_ip6_same_set hash_ip_same_set +#define HTYPE hash_ip +#define IP_SET_HASH_WITH_NETMASK -/* The type variant functions: IPv4 */ +/* IPv4 variants */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ip4_elem { + /* Zero valued IP addresses cannot be stored */ __be32 ip; }; -/* Member elements with timeout support */ -struct hash_ip4_telem { +struct hash_ip4t_elem { __be32 ip; unsigned long timeout; }; -static inline bool -hash_ip4_data_equal(const struct hash_ip4_elem *ip1, - const struct hash_ip4_elem *ip2, - u32 *multi) -{ - return ip1->ip == ip2->ip; -} +struct hash_ip4c_elem { + __be32 ip; + struct ip_set_counter counter; +}; -static inline bool -hash_ip4_data_isnull(const struct hash_ip4_elem *elem) -{ - return elem->ip == 0; -} +struct hash_ip4ct_elem { + __be32 ip; + struct ip_set_counter counter; + unsigned long timeout; +}; -static inline void -hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src) -{ - dst->ip = src->ip; -} +/* Common functions */ -/* Zero valued IP addresses cannot be stored */ -static inline void -hash_ip4_data_zero_out(struct hash_ip4_elem *elem) +static inline bool +hash_ip4_data_equal(const struct hash_ip4_elem *e1, + const struct hash_ip4_elem *e2, + u32 *multi) { - elem->ip = 0; + return e1->ip == e2->ip; } static inline bool -hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data) +hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip)) + if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; return 0; @@ -92,41 +80,26 @@ nla_put_failure: return 1; } -static bool -hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data) +static inline void +hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { - const struct hash_ip4_telem *tdata = - (const struct hash_ip4_telem *)data; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return 1; + next->ip = e->ip; } -#define IP_SET_HASH_WITH_NETMASK +#define MTYPE hash_ip4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ip4_data_next(struct ip_set_hash *h, const struct hash_ip4_elem *d) -{ - h->next.ip = d->ip; -} +#include "ip_set_hash_gen.h" static int hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); @@ -134,43 +107,42 @@ 
hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, if (ip == 0) return -EINVAL; - return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); + e.ip = ip; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - u32 ip, ip_to, hosts, timeout = h->timeout; - __be32 nip; + struct hash_ip4_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); + u32 ip, ip_to, hosts; int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ip &= ip_set_hostmask(h->netmask); - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - if (adt == IPSET_TEST) { - nip = htonl(ip); - if (nip == 0) + e.ip = htonl(ip); + if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; - return adtfn(set, &nip, timeout, flags); + return adtfn(set, &e, &ext, &ext, flags); } ip_to = ip; @@ -193,10 +165,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip += hosts) { - nip = htonl(ip); - if (nip == 0) + e.ip = htonl(ip); + if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; - ret = adtfn(set, &nip, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -206,68 +178,49 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ip_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout && - x->netmask == y->netmask; -} +/* Member elements */ +struct hash_ip6_elem { + union nf_inet_addr ip; +}; -/* The type variant functions: IPv6 */ +struct hash_ip6t_elem { + union nf_inet_addr ip; + unsigned long timeout; +}; -struct hash_ip6_elem { +struct hash_ip6c_elem { union nf_inet_addr ip; + struct ip_set_counter counter; }; -struct hash_ip6_telem { +struct hash_ip6ct_elem { union nf_inet_addr ip; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0; -} - -static inline bool -hash_ip6_data_isnull(const struct hash_ip6_elem *elem) -{ - return ipv6_addr_any(&elem->ip.in6); + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } static inline void -hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src) +hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { - dst->ip.in6 = src->ip.in6; -} - -static inline void -hash_ip6_data_zero_out(struct hash_ip6_elem *elem) -{ - ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0); -} - -static 
inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + ip6_netmask(ip, prefix); } static bool -hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data) +hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6)) + if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; return 0; @@ -275,69 +228,55 @@ nla_put_failure: return 1; } -static bool -hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data) +static inline void +hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) { - const struct hash_ip6_telem *e = - (const struct hash_ip6_telem *)data; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_ip6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> -static inline void -hash_ip6_data_next(struct ip_set_hash *h, const struct hash_ip6_elem *d) -{ -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - union nf_inet_addr ip; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip.in6); - ip6_netmask(&ip, h->netmask); - if (ipv6_addr_any(&ip.in6)) + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; - return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } -static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = { - [IPSET_ATTR_IP] = { .type = NLA_NESTED }, - [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, - [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, -}; - static int hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - union nf_inet_addr ip; - u32 timeout = h->timeout; + struct hash_ip6_elem e = {}; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -345,110 +284,20 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ip6_netmask(&ip, h->netmask); - if (ipv6_addr_any(&ip.in6)) + 
hash_ip6_netmask(&e.ip, h->netmask); + if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - ret = adtfn(set, &ip, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } -/* Create hash:ip type of sets */ - -static int -hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 netmask, hbits; - size_t hsize; - struct ip_set_hash *h; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - netmask = set->family == NFPROTO_IPV4 ? 32 : 128; - pr_debug("Create set %s with family %s\n", - set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - if (tb[IPSET_ATTR_NETMASK]) { - netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); - - if ((set->family == NFPROTO_IPV4 && netmask > 32) || - (set->family == NFPROTO_IPV6 && netmask > 128) || - netmask == 0) - return -IPSET_ERR_INVALID_NETMASK; - } - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - h->netmask = netmask; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ip4_tvariant : &hash_ip6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ip4_gc_init(set); - else - hash_ip6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
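The create routine deleted here (and in each later file) is what IP_SET_EMIT_CREATE now generates; its one sizing decision was rounding the requested hashsize up through htable_bits(). A rough stand-in for that rounding, assuming a power-of-two bucket table and using MIN_HASHSIZE as a placeholder for the kernel's floor constant:

#include <stdio.h>
#include <stdint.h>

#define MIN_HASHSIZE 64u        /* placeholder for the real floor */

static uint8_t htable_bits(uint32_t hashsize)
{
        uint8_t bits = 0;

        if (hashsize < MIN_HASHSIZE)
                hashsize = MIN_HASHSIZE;
        while (bits < 31 && (1u << bits) < hashsize)
                bits++;         /* smallest power of two >= hashsize */
        return bits;
}

int main(void)
{
        printf("hashsize 1000 -> %u bits = %u buckets\n",
               htable_bits(1000), 1u << htable_bits(1000));
        return 0;
}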
&hash_ip4_variant : &hash_ip6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ip_type __read_mostly = { .name = "hash:ip", .protocol = IPSET_PROTOCOL, @@ -465,6 +314,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -472,6 +322,8 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 6283351f4eeb..7a2d2bd98d04 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,12 +21,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 -#define REVISION_MAX 1 /* SCTP and UDPLITE support added */ +/* 1 SCTP and UDPLITE support added */ +#define REVISION_MAX 2 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -34,33 +34,45 @@ IP_SET_MODULE_DESC("hash:ip,port", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ -#define TYPE hash_ipport +#define HTYPE hash_ipport -static bool -hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b); +/* IPv4 variants */ -#define hash_ipport4_same_set hash_ipport_same_set -#define hash_ipport6_same_set hash_ipport_same_set +/* Member elements */ +struct hash_ipport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv4 */ +struct hash_ipport4t_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -/* Member elements without timeout */ -struct hash_ipport4_elem { +struct hash_ipport4c_elem { __be32 ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -/* Member elements with timeout support */ -struct hash_ipport4_telem { +struct hash_ipport4ct_elem { __be32 ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, @@ -71,27 +83,6 @@ hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipport4_data_copy(struct hash_ipport4_elem 
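The policy tables above grow IPSET_ATTR_BYTES and IPSET_ATTR_PACKETS as NLA_U64 on the ADT side and IPSET_ATTR_CADT_FLAGS on create; netlink rejects malformed attributes from such a table before any handler runs. A toy version of that table-driven length check, with illustrative attribute names and sizes:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

enum { ATTR_TIMEOUT, ATTR_BYTES, ATTR_PACKETS, ATTR_MAX };

static const size_t policy_len[ATTR_MAX] = {
        [ATTR_TIMEOUT] = sizeof(uint32_t),
        [ATTR_BYTES]   = sizeof(uint64_t),      /* new with counters */
        [ATTR_PACKETS] = sizeof(uint64_t),      /* new with counters */
};

static int validate(int type, size_t len)
{
        return type >= 0 && type < ATTR_MAX && len == policy_len[type]
                ? 0 : -1;
}

int main(void)
{
        printf("BYTES as u32: %s\n", validate(ATTR_BYTES, 4) ? "rejected" : "ok");
        printf("BYTES as u64: %s\n", validate(ATTR_BYTES, 8) ? "rejected" : "ok");
        return 0;
}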
*dst, - const struct hash_ipport4_elem *src) -{ - dst->ip = src->ip; - dst->port = src->port; - dst->proto = src->proto; -} - -static inline void -hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipport4_data_list(struct sk_buff *skb, const struct hash_ipport4_elem *data) @@ -106,111 +97,91 @@ nla_put_failure: return 1; } -static bool -hash_ipport4_data_tlist(struct sk_buff *skb, - const struct hash_ipport4_elem *data) -{ - const struct hash_ipport4_telem *tdata = - (const struct hash_ipport4_telem *)data; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; -} - -#define PF 4 -#define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - static inline void -hash_ipport4_data_next(struct ip_set_hash *h, +hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { - h->next.ip = d->ip; - h->next.port = d->port; + next->ip = d->ip; + next->port = d->port; } +#define MTYPE hash_ipport4 +#define PF 4 +#define HOST_MASK 32 +#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) +#include "ip_set_hash_gen.h" + static int hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem data = { }; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem data = { }; + struct hash_ipport4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip, ip_to, p = 0, port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = 
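hash_ipport4_data_next() now copies the resume point into a caller-supplied next element instead of poking the hash struct directly, but the purpose is unchanged: when an add over a range collides with an internal resize, the command is retried and the loop restarts at h->next rather than re-walking the whole range. A compact model of that resume path, with the failure simulated:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

static uint32_t next_ip;                /* stands in for h->next.ip */

static int add_elem(uint32_t ip, bool *fail_once)
{
        if (ip == 5 && *fail_once) {    /* simulate -EAGAIN on resize */
                *fail_once = false;
                next_ip = ip;
                return -1;
        }
        printf("added %u\n", ip);
        return 0;
}

int main(void)
{
        bool fail_once = true, retried = false;
        uint32_t ip, from = 3, to = 7;

retry:
        /* mirrors: if (retried) ip = ntohl(h->next.ip); */
        ip = retried ? next_ip : from;
        for (; ip <= to; ip++)
                if (add_elem(ip, &fail_once)) {
                        retried = true;
                        goto retry;
                }
        return 0;
}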
nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - ip_to = ip = ntohl(data.ip); + ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -225,7 +196,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -238,9 +209,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.ip = htonl(ip); - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -251,63 +222,52 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_ipport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv6 */ +struct hash_ipport6t_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -struct hash_ipport6_elem { +struct hash_ipport6c_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -struct hash_ipport6_telem { +struct hash_ipport6ct_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } -static inline bool -hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipport6_data_copy(struct hash_ipport6_elem *dst, - const struct hash_ipport6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipport6_data_list(struct sk_buff *skb, const struct hash_ipport6_elem *data) @@ -322,66 +282,52 @@ nla_put_failure: return 1; } -static bool 
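The protocol checks survive the rename from data to e unchanged: a port is only stored for port-carrying protocols, or kept for ICMP where the slot encodes type/code; anything else gets port 0 so equal entries hash equal. Sketch using the usual IANA protocol numbers (assumed here, the diff only names the macros):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

static bool proto_with_ports(uint8_t proto)
{
        switch (proto) {
        case 6:                 /* TCP      */
        case 17:                /* UDP      */
        case 132:               /* SCTP     */
        case 136:               /* UDP-Lite */
                return true;
        }
        return false;
}

int main(void)
{
        uint8_t proto = 47;     /* GRE: no port field */
        uint16_t port = 8080;

        /* mirrors: if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; */
        if (!(proto_with_ports(proto) || proto == 1))
                port = 0;
        printf("stored port: %u\n", port);
        return 0;
}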
-hash_ipport6_data_tlist(struct sk_buff *skb, - const struct hash_ipport6_elem *data) +static inline void +hash_ipport6_data_next(struct hash_ipport4_elem *next, + const struct hash_ipport6_elem *d) { - const struct hash_ipport6_telem *e = - (const struct hash_ipport6_telem *)data; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_ipport6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipport6_data_next(struct ip_set_hash *h, - const struct hash_ipport6_elem *d) -{ - h->next.port = d->port; -} +#define HKEY_DATALEN sizeof(struct hash_ipport6_elem) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem data = { }; + struct hash_ipport6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem data = { }; + struct hash_ipport6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; @@ -389,6 +335,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -396,39 +344,34 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return 
-IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -436,8 +379,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -447,78 +390,6 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ipport4_tvariant : &hash_ipport6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ipport4_gc_init(set); - else - hash_ipport6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
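Reversed port ranges are accepted on input; both address families normalise them with swap() before the insertion loop. Spelled out, with swap() as the usual GNU C temp-variable macro:

#include <stdio.h>

#define swap(a, b) do { typeof(a) t__ = (a); (a) = (b); (b) = t__; } while (0)

int main(void)
{
        unsigned port = 1024, port_to = 80, n = 0;

        if (port > port_to)
                swap(port, port_to);    /* same guard as the diff */
        for (unsigned p = port; p <= port_to; p++)
                n++;                    /* the kernel adds htons(p) here */
        printf("would add %u ports (%u..%u)\n", n, port, port_to);
        return 0;
}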
&hash_ipport4_variant : &hash_ipport6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipport_type __read_mostly = { .name = "hash:ip,port", .protocol = IPSET_PROTOCOL, @@ -535,6 +406,7 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -545,6 +417,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 6a21271c8d5a..34e8a1acce42 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,12 +21,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 -#define REVISION_MAX 1 /* SCTP and UDPLITE support added */ +/* 1 SCTP and UDPLITE support added */ +#define REVISION_MAX 2 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -34,32 +34,44 @@ IP_SET_MODULE_DESC("hash:ip,port,ip", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ -#define TYPE hash_ipportip +#define HTYPE hash_ipportip -static bool -hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b); +/* IPv4 variants */ -#define hash_ipportip4_same_set hash_ipportip_same_set -#define hash_ipportip6_same_set hash_ipportip_same_set +/* Member elements */ +struct hash_ipportip4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv4 */ +struct hash_ipportip4t_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -/* Member elements without timeout */ -struct hash_ipportip4_elem { +struct hash_ipportip4c_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -/* Member elements with timeout support */ -struct hash_ipportip4_telem { +struct hash_ipportip4ct_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; @@ -74,25 +86,6 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst, - const 
struct hash_ipportip4_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) @@ -108,117 +101,96 @@ nla_put_failure: return 1; } -static bool -hash_ipportip4_data_tlist(struct sk_buff *skb, - const struct hash_ipportip4_elem *data) +static inline void +hash_ipportip4_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip4_elem *d) { - const struct hash_ipportip4_telem *tdata = - (const struct hash_ipportip4_telem *)data; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP2, tdata->ip2) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; } +/* Common functions */ +#define MTYPE hash_ipportip4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportip4_data_next(struct ip_set_hash *h, - const struct hash_ipportip4_elem *d) -{ - h->next.ip = d->ip; - h->next.port = d->port; -} +#include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem data = { }; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem data = { }; + struct hash_ipportip4_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip, ip_to, p = 0, port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = 
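The data_copy()/data_zero_out() pairs vanish across the whole patch: elements are now flat fixed-size structs that the generated code can copy and clear generically (bounded by HKEY_DATALEN where a type defines it, as hash:ip,port does above). A minimal illustration with an invented layout:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct elem { uint32_t ip, ip2; uint16_t port; uint8_t proto, padding; };

#define HKEY_DATALEN sizeof(struct elem)        /* as the diff #defines */

int main(void)
{
        struct elem src = { .ip = 1, .port = 80, .proto = 6 }, dst;

        memcpy(&dst, &src, HKEY_DATALEN);       /* replaces *_data_copy()     */
        printf("copied proto %u port %u\n",
               (unsigned)dst.proto, (unsigned)dst.port);
        memset(&dst, 0, HKEY_DATALEN);          /* replaces *_data_zero_out() */
        return 0;
}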
ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - ip_to = ip = ntohl(data.ip); + ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) @@ -233,7 +205,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -246,9 +218,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.ip = htonl(ip); - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -259,66 +231,57 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_ipportip6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; +}; -/* The type variant functions: IPv6 */ +struct hash_ipportip6t_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; -struct hash_ipportip6_elem { +struct hash_ipportip6c_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; }; -struct hash_ipportip6_telem { +struct hash_ipportip6ct_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && - ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } 
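Every IPv6 data_equal helper switches from ipv6_addr_cmp(a, b) == 0 to ipv6_addr_equal(a, b): a hash lookup needs equality only, and an XOR/OR fold is cheaper than an ordering comparison. A userspace model of the fold:

#include <stdio.h>
#include <stdint.h>

struct in6 { uint32_t s6_addr32[4]; };

static int addr_equal(const struct in6 *a, const struct in6 *b)
{
        return ((a->s6_addr32[0] ^ b->s6_addr32[0]) |
                (a->s6_addr32[1] ^ b->s6_addr32[1]) |
                (a->s6_addr32[2] ^ b->s6_addr32[2]) |
                (a->s6_addr32[3] ^ b->s6_addr32[3])) == 0;
}

int main(void)
{
        struct in6 a = { { 0x20010db8, 0, 0, 1 } }, b = a;

        printf("equal: %d\n", addr_equal(&a, &b));
        return 0;
}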
-static inline bool -hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst, - const struct hash_ipportip6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) @@ -334,68 +297,51 @@ nla_put_failure: return 1; } -static bool -hash_ipportip6_data_tlist(struct sk_buff *skb, - const struct hash_ipportip6_elem *data) +static inline void +hash_ipportip6_data_next(struct hash_ipportip4_elem *next, + const struct hash_ipportip6_elem *d) { - const struct hash_ipportip6_telem *e = - (const struct hash_ipportip6_telem *)data; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_ipportip6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportip6_data_next(struct ip_set_hash *h, - const struct hash_ipportip6_elem *d) -{ - h->next.port = d->port; -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem data = { }; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); - - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem data = { }; + struct hash_ipportip6_elem e = { }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; int ret; @@ -403,6 +349,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -410,43 +358,38 @@ hash_ipportip6_uadt(struct ip_set *set, 
struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -454,8 +397,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -465,78 +408,6 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? 
&hash_ipportip4_tvariant : &hash_ipportip6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ipportip4_gc_init(set); - else - hash_ipportip6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ipportip4_variant : &hash_ipportip6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, @@ -552,6 +423,7 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -563,6 +435,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 2d5cd4ee30eb..c6a525373be4 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,14 +21,14 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ -#define REVISION_MAX 3 /* nomatch flag support added */ +/* 3 nomatch flag support added */ +#define REVISION_MAX 4 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -36,23 +36,19 @@ IP_SET_MODULE_DESC("hash:ip,port,net", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ -#define TYPE hash_ipportnet - -static bool -hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_ipportnet4_same_set hash_ipportnet_same_set -#define hash_ipportnet6_same_set hash_ipportnet_same_set - -/* The type variant functions: IPv4 */ +#define HTYPE hash_ipportnet /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, * dancing back and forth. 
*/ #define IP_SET_HASH_WITH_NETS_PACKED +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +/* IPv4 variants */ -/* Member elements without timeout */ +/* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; @@ -62,8 +58,7 @@ struct hash_ipportnet4_elem { u8 proto; }; -/* Member elements with timeout support */ -struct hash_ipportnet4_telem { +struct hash_ipportnet4t_elem { __be32 ip; __be32 ip2; __be16 port; @@ -73,6 +68,29 @@ struct hash_ipportnet4_telem { unsigned long timeout; }; +struct hash_ipportnet4c_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + struct ip_set_counter counter; +}; + +struct hash_ipportnet4ct_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + struct ip_set_counter counter; + unsigned long timeout; +}; + +/* Common functions */ + static inline bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, @@ -85,29 +103,22 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline bool -hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem) +static inline int +hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { - return elem->proto == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst, - const struct hash_ipportnet4_elem *src) +hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { - memcpy(dst, src, sizeof(*dst)); + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -hash_ipportnet4_data_flags(struct hash_ipportnet4_elem *dst, u32 flags) +hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - -static inline int -hash_ipportnet4_data_match(const struct hash_ipportnet4_elem *elem) -{ - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -117,12 +128,6 @@ hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) elem->cidr = cidr - 1; } -static inline void -hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) @@ -143,81 +148,56 @@ nla_put_failure: return 1; } -static bool -hash_ipportnet4_data_tlist(struct sk_buff *skb, - const struct hash_ipportnet4_elem *data) +static inline void +hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet4_elem *d) { - const struct hash_ipportnet4_telem *tdata = - (const struct hash_ipportnet4_telem *)data; - u32 flags = data->nomatch ? 
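hash:ip,port,net keeps the trick described in the comment above: nomatch steals the top bit of the cidr byte, and the prefix is stored minus one so /1../128 fits in 7 bits (a /0 net is unsupported anyway). The pack/unpack round trip:

#include <stdio.h>

struct elem {
        unsigned char cidr:7;   /* stores real_cidr - 1, so 1..128 fits */
        unsigned char nomatch:1;
};

int main(void)
{
        struct elem e = { 0 };

        e.cidr = 128 - 1;       /* pack a /128 */
        e.nomatch = 1;
        printf("real cidr %d, nomatch %d, bytes %zu\n",
               e.cidr + 1, e.nomatch, sizeof(e));
        return 0;
}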
IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_ipaddr4(skb, IPSET_ATTR_IP2, tdata->ip2) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; + next->ip2 = d->ip2; } -#define IP_SET_HASH_WITH_PROTO -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_ipportnet4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportnet4_data_next(struct ip_set_hash *h, - const struct hash_ipportnet4_elem *d) -{ - h->next.ip = d->ip; - h->next.port = d->port; - h->next.ip2 = d->ip2; -} +#include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet4_elem data = { + struct hash_ipportnet4_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); - data.ip2 &= ip_set_netmask(data.cidr + 1); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); + e.ip2 &= ip_set_netmask(e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet4_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip, ip_to, p = 0, port, port_to; u32 ip2_from, ip2_to, ip2_last, ip2; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -226,13 +206,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -244,46 +227,41 @@ 
hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { - data.ip = htonl(ip); - data.ip2 = htonl(ip2_from & ip_set_hostmask(data.cidr + 1)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip); + e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; @@ -301,7 +279,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - port_to = port = ntohs(data.port); + port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) @@ -317,28 +295,27 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip2_from, ip2_to, data.cidr + 1); - } + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); if (retried) ip = ntohl(h->next.ip); for (; !before(ip_to, ip); ip++) { - data.ip = htonl(ip); + e.ip = htonl(ip); p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.port = htons(p); + e.port = htons(p); ip2 = retried && ip == ntohl(h->next.ip) && p == ntohs(h->next.port) ? 
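Note the fix folded into the conversion: the old code or'ed the whole cadt_flags word shifted by 16 into flags, while the new code promotes exactly IPSET_FLAG_NOMATCH, keeping the upper half of the flags word a clean per-element flag area (the lower half stays command flags). A sketch with an illustrative flag value:

#include <stdio.h>
#include <stdint.h>

#define FLAG_NOMATCH 0x8        /* illustrative, not the real constant */

int main(void)
{
        uint32_t cadt_flags = FLAG_NOMATCH | 0x4;       /* from userspace */
        uint32_t flags = 0;                             /* command flags  */

        /* new code: promote exactly the one known bit */
        if (cadt_flags & FLAG_NOMATCH)
                flags |= (FLAG_NOMATCH << 16);
        printf("element flags: 0x%x\n", flags >> 16);
        return 0;
}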
ntohl(h->next.ip2) : ip2_from; while (!after(ip2, ip2_to)) { - data.ip2 = htonl(ip2); + e.ip2 = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &cidr); - data.cidr = cidr - 1; - ret = adtfn(set, &data, timeout, flags); + e.cidr = cidr - 1; + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -351,88 +328,78 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_ipportnet6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; +}; -/* The type variant functions: IPv6 */ +struct hash_ipportnet6t_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + unsigned long timeout; +}; -struct hash_ipportnet6_elem { +struct hash_ipportnet6c_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; + struct ip_set_counter counter; }; -struct hash_ipportnet6_telem { +struct hash_ipportnet6ct_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && - ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && + ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } -static inline bool -hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst, - const struct hash_ipportnet6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_ipportnet6_data_flags(struct hash_ipportnet6_elem *dst, u32 flags) -{ - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - static inline int -hash_ipportnet6_data_match(const struct hash_ipportnet6_elem *elem) +hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? 
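The inner ip2 loop covers an arbitrary address range with CIDR blocks via ip_set_range_to_cidr(): each step takes the largest prefix that starts at ip2, stays aligned, and fits inside the range. A freestanding version of that decomposition (overflow guards omitted for brevity):

#include <stdio.h>
#include <stdint.h>

static uint32_t range_to_cidr(uint32_t ip, uint32_t last, uint8_t *cidr)
{
        uint8_t bits = 32;
        uint32_t size = 1;

        /* grow while the block stays aligned and inside the range */
        while (bits > 0 && !(ip & size) && ip + (size << 1) - 1 <= last) {
                size <<= 1;
                bits--;
        }
        *cidr = bits;
        return ip + size - 1;   /* last address of the chosen block */
}

int main(void)
{
        uint32_t ip = 10, last = 100, end;
        uint8_t cidr;

        do {
                end = range_to_cidr(ip, last, &cidr);
                printf("%u/%u covers %u..%u\n", ip, (unsigned)cidr, ip, end);
                ip = end + 1;
        } while (end < last);
        return 0;
}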
-ENOTEMPTY : 1; } static inline void -hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem) +hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { - elem->proto = 0; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -462,78 +429,58 @@ nla_put_failure: return 1; } -static bool -hash_ipportnet6_data_tlist(struct sk_buff *skb, - const struct hash_ipportnet6_elem *data) +static inline void +hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, + const struct hash_ipportnet6_elem *d) { - const struct hash_ipportnet6_telem *e = - (const struct hash_ipportnet6_telem *)data; - u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_ipportnet6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_ipportnet6_data_next(struct ip_set_hash *h, - const struct hash_ipportnet6_elem *d) -{ - h->next.port = d->port; -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet6_elem data = { + struct hash_ipportnet6_elem e = { .cidr = h->nets[0].cidr ? 
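do_data_match() is how a present-but-excluded entry reports itself: -ENOTEMPTY for a nomatch element, 1 for a real member, so the generated lookup can distinguish "absent" from "present but negated". A minimal model:

#include <stdio.h>
#include <errno.h>

struct elem { unsigned nomatch:1; };

static int do_data_match(const struct elem *e)
{
        return e->nomatch ? -ENOTEMPTY : 1;     /* same shape as the diff */
}

int main(void)
{
        struct elem member = { 0 }, excluded = { 1 };

        printf("member matches: %d, excluded matches: %d\n",
               do_data_match(&member) > 0, do_data_match(&excluded) > 0);
        return 0;
}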
h->nets[0].cidr - 1 : HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); - ip6_netmask(&data.ip2, data.cidr + 1); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); + ip6_netmask(&e.ip2, e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet6_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -543,6 +490,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -552,11 +501,12 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; @@ -564,46 +514,41 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } - ip6_netmask(&data.ip2, data.cidr + 1); + ip6_netmask(&e.ip2, e.cidr + 1); if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST 
|| !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -611,8 +556,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -622,81 +567,6 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_ipportnet4_tvariant - : &hash_ipportnet6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_ipportnet4_gc_init(set); - else - hash_ipportnet6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
&hash_ipportnet4_variant : &hash_ipportnet6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, @@ -713,6 +583,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -727,6 +598,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 29e94b981f3f..da740ceb56ae 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,12 +20,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 Range as input support for IPv4 added */ -#define REVISION_MAX 2 /* nomatch flag support added */ +/* 2 nomatch flag support added */ +#define REVISION_MAX 3 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -33,33 +33,46 @@ IP_SET_MODULE_DESC("hash:net", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:net"); /* Type specific function prefix */ -#define TYPE hash_net +#define HTYPE hash_net +#define IP_SET_HASH_WITH_NETS -static bool -hash_net_same_set(const struct ip_set *a, const struct ip_set *b); +/* IPv4 variants */ -#define hash_net4_same_set hash_net_same_set -#define hash_net6_same_set hash_net_same_set +/* Member elements */ +struct hash_net4_elem { + __be32 ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; -/* The type variant functions: IPv4 */ +struct hash_net4t_elem { + __be32 ip; + u16 padding0; + u8 nomatch; + u8 cidr; + unsigned long timeout; +}; -/* Member elements without timeout */ -struct hash_net4_elem { +struct hash_net4c_elem { __be32 ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; }; -/* Member elements with timeout support */ -struct hash_net4_telem { +struct hash_net4ct_elem { __be32 ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, @@ -69,31 +82,22 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline bool -hash_net4_data_isnull(const struct hash_net4_elem *elem) +static inline int +hash_net4_do_data_match(const struct hash_net4_elem *elem) { - return elem->cidr == 0; + return 
elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_net4_data_copy(struct hash_net4_elem *dst, - const struct hash_net4_elem *src) +hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { - dst->ip = src->ip; - dst->cidr = src->cidr; - dst->nomatch = src->nomatch; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -hash_net4_data_flags(struct hash_net4_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - -static inline int -hash_net4_data_match(const struct hash_net4_elem *elem) +hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -103,13 +107,6 @@ hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) elem->cidr = cidr; } -/* Zero CIDR values cannot be stored */ -static inline void -hash_net4_data_zero_out(struct hash_net4_elem *elem) -{ - elem->cidr = 0; -} - static bool hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) { @@ -126,106 +123,84 @@ nla_put_failure: return 1; } -static bool -hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data) +static inline void +hash_net4_data_next(struct hash_net4_elem *next, + const struct hash_net4_elem *d) { - const struct hash_net4_telem *tdata = - (const struct hash_net4_telem *)data; - u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_u8(skb, IPSET_ATTR_CIDR, tdata->cidr) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; } -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_net4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_net4_data_next(struct ip_set_hash *h, - const struct hash_net4_elem *d) -{ - h->next.ip = d->ip; -} +#include "ip_set_hash_gen.h" static int hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net4_elem data = { + struct hash_net4_elem e = { .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net4_elem data = { .cidr = HOST_MASK }; - u32 timeout = h->timeout; + struct hash_net4_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip = 0, ip_to, last; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr || data.cidr > HOST_MASK) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } ip_to = ip; @@ -241,9 +216,9 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; else @@ -253,83 +228,67 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_net_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_net6_elem { + union nf_inet_addr ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; -/* The type variant functions: IPv6 */ +struct hash_net6t_elem { + union nf_inet_addr ip; + u16 padding0; + u8 nomatch; + u8 cidr; + unsigned long timeout; +}; -struct hash_net6_elem { +struct hash_net6c_elem { union nf_inet_addr ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; }; -struct hash_net6_telem { +struct hash_net6ct_elem { union nf_inet_addr ip; u16 padding0; u8 nomatch; u8 cidr; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->cidr == ip2->cidr; } -static inline bool -hash_net6_data_isnull(const struct hash_net6_elem *elem) -{ - return elem->cidr == 0; -} - -static inline void -hash_net6_data_copy(struct hash_net6_elem *dst, - const struct hash_net6_elem *src) -{ - dst->ip.in6 = src->ip.in6; - dst->cidr = src->cidr; - dst->nomatch = src->nomatch; -} - -static inline void -hash_net6_data_flags(struct hash_net6_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - static inline int -hash_net6_data_match(const struct hash_net6_elem *elem) +hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_net6_data_zero_out(struct hash_net6_elem *elem) +hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { - elem->cidr = 0; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -355,74 +314,60 @@ nla_put_failure: return 1; } -static bool -hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data) +static inline void +hash_net6_data_next(struct hash_net4_elem *next, + const struct hash_net6_elem *d) { - const struct hash_net6_telem *e = - (const struct hash_net6_telem *)data; - u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_u8(skb, IPSET_ATTR_CIDR, e->cidr) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_net6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_net6_data_next(struct ip_set_hash *h, - const struct hash_net6_elem *d) -{ -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net6_elem data = { + struct hash_net6_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net6_elem data = { .cidr = HOST_MASK }; - u32 timeout = h->timeout; + struct hash_net6_elem e = { .cidr = HOST_MASK }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -430,107 +375,29 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!data.cidr || data.cidr > HOST_MASK) + if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - ip6_netmask(&data.ip, data.cidr); - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ip6_netmask(&e.ip, e.cidr); - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); - } - - ret = adtfn(set, &data, timeout, flags); - - return 
ip_set_eexist(ret, flags) ? 0 : ret; -} - -/* Create hash:ip type of sets */ - -static int -hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - struct ip_set_hash *h; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; + flags |= (IPSET_FLAG_NOMATCH << 16); } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + ret = adtfn(set, &e, &ext, &ext, flags); - set->variant = set->family == NFPROTO_IPV4 - ? &hash_net4_tvariant : &hash_net6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_net4_gc_init(set); - else - hash_net6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? &hash_net4_variant : &hash_net6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_net_type __read_mostly = { @@ -548,6 +415,7 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -555,6 +423,8 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 45a101439bc5..84ae6f6ce624 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,12 +21,12 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 nomatch flag support added */ -#define REVISION_MAX 2 /* /0 support added */ +/* 2 /0 support added */ +#define REVISION_MAX 3 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -127,17 +127,14 @@ iface_add(struct rb_root *root, const char **iface) } /* Type specific function prefix */ -#define TYPE hash_netiface - -static bool -hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_netiface4_same_set hash_netiface_same_set -#define hash_netiface6_same_set hash_netiface_same_set +#define HTYPE hash_netiface +#define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_RBTREE +#define IP_SET_HASH_WITH_MULTI #define STREQ(a, b) (strcmp(a, b) == 0) -/* The type variant functions: IPv4 */ +/* IPv4 variants */ struct hash_netiface4_elem_hashed { __be32 ip; @@ -147,8 +144,6 @@ struct hash_netiface4_elem_hashed { u8 elem; }; -#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) - /* Member elements without timeout */ struct hash_netiface4_elem { __be32 ip; @@ -159,17 +154,39 @@ struct hash_netiface4_elem { const char *iface; }; -/* Member elements with timeout support */ -struct hash_netiface4_telem { +struct hash_netiface4t_elem { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + unsigned long timeout; +}; + +struct hash_netiface4c_elem { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + struct ip_set_counter counter; +}; + +struct hash_netiface4ct_elem { __be32 ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; const char *iface; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, @@ -182,29 +199,22 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->iface == ip2->iface; } -static inline bool -hash_netiface4_data_isnull(const struct hash_netiface4_elem *elem) +static inline int 
+hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { - return elem->elem == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_netiface4_data_copy(struct hash_netiface4_elem *dst, - const struct hash_netiface4_elem *src) +hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { - memcpy(dst, src, sizeof(*dst)); + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -hash_netiface4_data_flags(struct hash_netiface4_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - -static inline int -hash_netiface4_data_match(const struct hash_netiface4_elem *elem) +hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -214,12 +224,6 @@ hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) elem->cidr = cidr; } -static inline void -hash_netiface4_data_zero_out(struct hash_netiface4_elem *elem) -{ - elem->elem = 0; -} - static bool hash_netiface4_data_list(struct sk_buff *skb, const struct hash_netiface4_elem *data) @@ -240,66 +244,40 @@ nla_put_failure: return 1; } -static bool -hash_netiface4_data_tlist(struct sk_buff *skb, - const struct hash_netiface4_elem *data) +static inline void +hash_netiface4_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface4_elem *d) { - const struct hash_netiface4_telem *tdata = - (const struct hash_netiface4_telem *)data; - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; - - if (data->nomatch) - flags |= IPSET_FLAG_NOMATCH; - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || - nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout)))) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; } -#define IP_SET_HASH_WITH_NETS -#define IP_SET_HASH_WITH_RBTREE -#define IP_SET_HASH_WITH_MULTI - +#define MTYPE hash_netiface4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netiface4_data_next(struct ip_set_hash *h, - const struct hash_netiface4_elem *d) -{ - h->next.ip = d->ip; -} +#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) +#include "ip_set_hash_gen.h" static int hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface4_elem data = { + struct hash_netiface4_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK, .elem = 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); int ret; - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr); #define IFACE(dir) (par->dir ? par->dir->name : NULL) #define PHYSDEV(dir) (nf_bridge->dir ? 
nf_bridge->dir->name : NULL) @@ -311,72 +289,69 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (!nf_bridge) return -EINVAL; - data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); - data.physdev = 1; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; #else - data.iface = NULL; + e.iface = NULL; #endif } else - data.iface = SRCDIR ? IFACE(in) : IFACE(out); + e.iface = SRCDIR ? IFACE(in) : IFACE(out); - if (!data.iface) + if (!e.iface) return -EINVAL; - ret = iface_test(&h->rbtree, &data.iface); + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } } else if (!ret) return ret; - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface4_elem data = { .cidr = HOST_MASK, .elem = 1 }; + struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 ip = 0, ip_to, last; - u32 timeout = h->timeout; char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (data.cidr > HOST_MASK) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - data.iface = iface; - ret = iface_test(&h->rbtree, &data.iface); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } @@ -386,14 +361,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) - data.physdev = 1; - if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH)) - flags |= (cadt_flags << 16); + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } if (tb[IPSET_ATTR_IP_TO]) { @@ -404,16 +380,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip, ip_to, data.cidr); - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr); if (retried) ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); - ret = adtfn(set, &data, timeout, flags); + e.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -424,18 +399,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; - - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} - -/* The type variant functions: IPv6 */ +/* IPv6 variants */ struct hash_netiface6_elem_hashed { union nf_inet_addr ip; @@ -445,8 +409,6 @@ struct hash_netiface6_elem_hashed { u8 elem; }; -#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) - struct hash_netiface6_elem { union nf_inet_addr ip; u8 physdev; @@ -456,66 +418,67 @@ struct hash_netiface6_elem { const char *iface; }; -struct hash_netiface6_telem { +struct hash_netiface6t_elem { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + unsigned long timeout; +}; + +struct hash_netiface6c_elem { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 elem; + const char *iface; + struct ip_set_counter counter; +}; + +struct hash_netiface6ct_elem { union nf_inet_addr ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; const char *iface; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && ip1->iface == ip2->iface; } -static inline bool -hash_netiface6_data_isnull(const struct hash_netiface6_elem *elem) -{ - return elem->elem == 0; -} - -static inline void -hash_netiface6_data_copy(struct hash_netiface6_elem *dst, - const struct hash_netiface6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_netiface6_data_flags(struct hash_netiface6_elem *dst, u32 flags) -{ - dst->nomatch = flags & IPSET_FLAG_NOMATCH; -} - static inline int -hash_netiface6_data_match(const struct hash_netiface6_elem *elem) +hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } static inline void -hash_netiface6_data_zero_out(struct hash_netiface6_elem *elem) +hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { - elem->elem = 0; + elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -545,63 +508,45 @@ nla_put_failure: return 1; } -static bool -hash_netiface6_data_tlist(struct sk_buff *skb, - const struct hash_netiface6_elem *data) +static inline void +hash_netiface6_data_next(struct hash_netiface4_elem *next, + const struct hash_netiface6_elem *d) { - const struct hash_netiface6_telem *e = - (const struct hash_netiface6_telem *)data; - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; - - if (data->nomatch) - flags |= IPSET_FLAG_NOMATCH; - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || - nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; } +#undef MTYPE #undef PF #undef HOST_MASK +#undef HKEY_DATALEN +#define MTYPE hash_netiface6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netiface6_data_next(struct ip_set_hash *h, - const struct hash_netiface6_elem *d) -{ -} +#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface6_elem data = { + struct hash_netiface6_elem e = { .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK, .elem = 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); int ret; - if (data.cidr == 0) + if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) - data.cidr = HOST_MASK; + e.cidr = HOST_MASK; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr); if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #ifdef CONFIG_BRIDGE_NETFILTER @@ -609,44 +554,46 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (!nf_bridge) return -EINVAL; - data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); - data.physdev = 1; + e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + e.physdev = 1; #else - data.iface = NULL; + e.iface = NULL; #endif } else - data.iface = SRCDIR ? IFACE(in) : IFACE(out); + e.iface = SRCDIR ? 
IFACE(in) : IFACE(out); - if (!data.iface) + if (!e.iface) return -EINVAL; - ret = iface_test(&h->rbtree, &data.iface); + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } } else if (!ret) return ret; - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct ip_set_hash *h = set->data; + struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netiface6_elem data = { .cidr = HOST_MASK, .elem = 1 }; - u32 timeout = h->timeout; + struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); char iface[IFNAMSIZ]; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -654,28 +601,23 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) - data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (data.cidr > HOST_MASK) + e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - ip6_netmask(&data.ip, data.cidr); - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + ip6_netmask(&e.ip, e.cidr); strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - data.iface = iface; - ret = iface_test(&h->rbtree, &data.iface); + e.iface = iface; + ret = iface_test(&h->rbtree, &e.iface); if (adt == IPSET_ADD) { if (!ret) { - ret = iface_add(&h->rbtree, &data.iface); + ret = iface_add(&h->rbtree, &e.iface); if (ret) return ret; } @@ -685,90 +627,15 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) - data.physdev = 1; - if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH)) - flags |= (cadt_flags << 16); + e.physdev = 1; + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (IPSET_FLAG_NOMATCH << 16); } - ret = adtfn(set, &data, timeout, flags); + ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; -} - -/* Create hash:ip type of sets */ - -static int -hash_netiface_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - h->ahash_max = AHASH_MAX_SIZE; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - h->rbtree = RB_ROOT; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_netiface4_tvariant : &hash_netiface6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_netiface4_gc_init(set); - else - hash_netiface6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? &hash_netiface4_variant : &hash_netiface6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_netiface_type __read_mostly = { @@ -788,6 +655,7 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -798,6 +666,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 7ef700de596c..9a0869853be5 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,14 +20,14 @@ #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define REVISION_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ -#define REVISION_MAX 3 /* nomatch flag support added */ +/* 3 nomatch flag support added */ +#define REVISION_MAX 4 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -35,15 +35,9 @@ IP_SET_MODULE_DESC("hash:net,port", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_hash:net,port"); /* Type specific function prefix */ -#define TYPE hash_netport - -static bool -hash_netport_same_set(const struct ip_set *a, const struct ip_set *b); - -#define hash_netport4_same_set hash_netport_same_set -#define hash_netport6_same_set hash_netport_same_set - -/* The type variant functions: IPv4 */ +#define HTYPE hash_netport +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, @@ -51,7 +45,9 @@ hash_netport_same_set(const struct ip_set *a, const struct ip_set *b); */ #define IP_SET_HASH_WITH_NETS_PACKED -/* Member elements without timeout */ +/* IPv4 variants */ + +/* Member elements */ struct hash_netport4_elem { __be32 ip; __be16 port; @@ -60,8 +56,7 @@ struct hash_netport4_elem { u8 nomatch:1; }; -/* Member elements with timeout support */ -struct hash_netport4_telem { +struct hash_netport4t_elem { __be32 ip; __be16 port; u8 proto; @@ -70,6 +65,27 @@ struct hash_netport4_telem { unsigned long timeout; }; +struct hash_netport4c_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + struct ip_set_counter counter; +}; + +struct hash_netport4ct_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + struct ip_set_counter counter; + unsigned long timeout; +}; + +/* Common functions */ + static inline bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, @@ -81,33 +97,22 @@ 
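
[Note on the MTYPE/PF/HOST_MASK pattern seen in each file above: the per-family defines are set up and then "ip_set_hash_gen.h" is included once per variant, so one generic template expands into family-specific functions by pasting MTYPE into their names. Below is a cut-down sketch of that token-pasting idiom only — the toy element types, the CONCAT/MFUNC helpers, and the demo main() are invented for illustration and are not the kernel's actual macros.]

#include <stdio.h>

#define CONCAT2(a, b) a##b
#define CONCAT(a, b) CONCAT2(a, b)
/* Derive a per-variant symbol name from the current MTYPE. */
#define MFUNC(name) CONCAT(MTYPE, CONCAT(_, name))

struct toy4_elem { unsigned int ip; unsigned char cidr; };
struct toy6_elem { unsigned int ip6[4]; unsigned char cidr; };

/* First "inclusion" of the generic template, for the IPv4 variant. */
#define MTYPE toy4
static int MFUNC(match)(const struct toy4_elem *e) { return e->cidr != 0; }
#undef MTYPE

/* Second "inclusion", after redefining MTYPE for the IPv6 variant. */
#define MTYPE toy6
static int MFUNC(match)(const struct toy6_elem *e) { return e->cidr != 0; }
#undef MTYPE

int main(void)
{
	struct toy4_elem e4 = { .ip = 0x0a000001, .cidr = 24 };
	struct toy6_elem e6 = { .ip6 = { 0 }, .cidr = 64 };

	/* toy4_match() and toy6_match() were both generated from MFUNC(match). */
	printf("%d %d\n", toy4_match(&e4), toy6_match(&e6));
	return 0;
}

[End of note; the hash_netport4 hunk continues.]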
hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline bool -hash_netport4_data_isnull(const struct hash_netport4_elem *elem) +static inline int +hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { - return elem->proto == 0; + return elem->nomatch ? -ENOTEMPTY : 1; } static inline void -hash_netport4_data_copy(struct hash_netport4_elem *dst, - const struct hash_netport4_elem *src) +hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { - dst->ip = src->ip; - dst->port = src->port; - dst->proto = src->proto; - dst->cidr = src->cidr; - dst->nomatch = src->nomatch; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -hash_netport4_data_flags(struct hash_netport4_elem *dst, u32 flags) +hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - -static inline int -hash_netport4_data_match(const struct hash_netport4_elem *elem) -{ - return elem->nomatch ? -ENOTEMPTY : 1; + swap(*flags, elem->nomatch); } static inline void @@ -117,12 +122,6 @@ hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) elem->cidr = cidr - 1; } -static inline void -hash_netport4_data_zero_out(struct hash_netport4_elem *elem) -{ - elem->proto = 0; -} - static bool hash_netport4_data_list(struct sk_buff *skb, const struct hash_netport4_elem *data) @@ -142,77 +141,53 @@ nla_put_failure: return 1; } -static bool -hash_netport4_data_tlist(struct sk_buff *skb, - const struct hash_netport4_elem *data) +static inline void +hash_netport4_data_next(struct hash_netport4_elem *next, + const struct hash_netport4_elem *d) { - const struct hash_netport4_telem *tdata = - (const struct hash_netport4_telem *)data; - u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) || - nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(tdata->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->ip = d->ip; + next->port = d->port; } -#define IP_SET_HASH_WITH_PROTO -#define IP_SET_HASH_WITH_NETS - +#define MTYPE hash_netport4 #define PF 4 #define HOST_MASK 32 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netport4_data_next(struct ip_set_hash *h, - const struct hash_netport4_elem *d) -{ - h->next.ip = d->ip; - h->next.port = d->port; -} +#include "ip_set_hash_gen.h" static int hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport4_elem data = { + struct hash_netport4_elem e = { .cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); - data.ip &= ip_set_netmask(data.cidr + 1); + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= ip_set_netmask(e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport4_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to, p = 0, ip = 0, ip_to, last; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -221,13 +196,16 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -235,47 +213,42 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMP)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMP)) + e.port = 0; with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { - data.ip = htonl(ip & ip_set_hostmask(data.cidr + 1)); - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; + e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = port_to = ntohs(data.port); + port = port_to = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port_to < port) @@ -289,21 +262,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else { - ip_set_mask_from_to(ip, ip_to, data.cidr + 1); - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr + 1); if (retried) ip = ntohl(h->next.ip); while (!after(ip, ip_to)) { - data.ip = htonl(ip); + e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &cidr); - data.cidr = cidr - 1; + e.cidr = cidr - 1; p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { - data.port = htons(p); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(p); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -315,85 +287,73 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -static bool -hash_netport_same_set(const struct ip_set *a, const struct ip_set *b) -{ - const struct ip_set_hash *x = a->data; - const struct ip_set_hash *y = b->data; +/* IPv6 variants */ - /* Resizing changes htable_bits, so we ignore it */ - return x->maxelem == y->maxelem && - x->timeout == y->timeout; -} +struct hash_netport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; +}; -/* The type variant functions: IPv6 */ +struct hash_netport6t_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + unsigned long timeout; +}; -struct hash_netport6_elem { +struct hash_netport6c_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; + struct ip_set_counter counter; }; -struct hash_netport6_telem { +struct hash_netport6ct_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; + struct ip_set_counter counter; unsigned long timeout; }; +/* Common functions */ + static inline bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) { - return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto && ip1->cidr == ip2->cidr; } -static inline bool -hash_netport6_data_isnull(const struct hash_netport6_elem *elem) -{ - return elem->proto == 0; -} - -static inline void -hash_netport6_data_copy(struct hash_netport6_elem *dst, - const struct hash_netport6_elem *src) -{ - memcpy(dst, src, sizeof(*dst)); -} - -static inline void -hash_netport6_data_flags(struct hash_netport6_elem *dst, u32 flags) -{ - dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); -} - static inline int -hash_netport6_data_match(const struct hash_netport6_elem *elem) +hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } static inline void -hash_netport6_data_zero_out(struct hash_netport6_elem *elem) +hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { - elem->proto = 0; + elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static inline void -ip6_netmask(union nf_inet_addr *ip, u8 prefix) +hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { - ip->ip6[0] &= ip_set_netmask6(prefix)[0]; - ip->ip6[1] &= ip_set_netmask6(prefix)[1]; - ip->ip6[2] &= ip_set_netmask6(prefix)[2]; - ip->ip6[3] &= ip_set_netmask6(prefix)[3]; + swap(*flags, elem->nomatch); } static inline void @@ -422,76 +382,57 @@ nla_put_failure: return 1; } -static bool -hash_netport6_data_tlist(struct sk_buff *skb, - const struct hash_netport6_elem *data) +static inline void +hash_netport6_data_next(struct hash_netport4_elem *next, + const struct hash_netport6_elem *d) { - const struct hash_netport6_telem *e = - (const struct hash_netport6_telem *)data; - u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; - - if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || - nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || - nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || - nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || - nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(ip_set_timeout_get(e->timeout))) || - (flags && - nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return 1; + next->port = d->port; } +#undef MTYPE #undef PF #undef HOST_MASK +#define MTYPE hash_netport6 #define PF 6 #define HOST_MASK 128 -#include <linux/netfilter/ipset/ip_set_ahash.h> - -static inline void -hash_netport6_data_next(struct ip_set_hash *h, - const struct hash_netport6_elem *d) -{ - h->next.port = d->port; -} +#define IP_SET_EMIT_CREATE +#include "ip_set_hash_gen.h" static int hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + enum ipset_adt adt, struct ip_set_adt_opt *opt) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport6_elem data = { + struct hash_netport6_elem e = { .cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1, }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); if (adt == IPSET_TEST) - data.cidr = HOST_MASK - 1; + e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, - &data.port, &data.proto)) + &e.port, &e.proto)) return -EINVAL; - ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); - ip6_netmask(&data.ip, data.cidr + 1); + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + ip6_netmask(&e.ip, e.cidr + 1); - return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct ip_set_hash *h = set->data; + const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netport6_elem data = { .cidr = HOST_MASK - 1 }; + struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(h); u32 port, port_to; - u32 timeout = h->timeout; bool with_ports = false; u8 cidr; int ret; @@ -500,7 +441,9 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -508,7 +451,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || + ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -516,45 +460,40 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - data.cidr = cidr - 1; + e.cidr = cidr - 1; } - ip6_netmask(&data.ip, data.cidr + 1); + ip6_netmask(&e.ip, e.cidr + 1); if (tb[IPSET_ATTR_PORT]) - data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); else return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_PROTO]) { - data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); - with_ports = ip_set_proto_with_ports(data.proto); + e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(e.proto); - if (data.proto == 0) + if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - if (!(with_ports || data.proto == IPPROTO_ICMPV6)) - data.port = 0; - - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout(h->timeout)) - return -IPSET_ERR_TIMEOUT; - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - } + if (!(with_ports || e.proto == IPPROTO_ICMPV6)) + e.port = 0; - if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) - flags |= (cadt_flags << 16); + flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { - ret = adtfn(set, &data, timeout, flags); - return ip_set_eexist(ret, flags) ? 
0 : ret; + ret = adtfn(set, &e, &ext, &ext, flags); + return ip_set_enomatch(ret, flags, adt) ? 1 : + ip_set_eexist(ret, flags) ? 0 : ret; } - port = ntohs(data.port); + port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); @@ -562,8 +501,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { - data.port = htons(port); - ret = adtfn(set, &data, timeout, flags); + e.port = htons(port); + ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -573,80 +512,6 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; } -/* Create hash:ip type of sets */ - -static int -hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) -{ - struct ip_set_hash *h; - u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; - u8 hbits; - size_t hsize; - - if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) - return -IPSET_ERR_INVALID_FAMILY; - - if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) - return -IPSET_ERR_PROTOCOL; - - if (tb[IPSET_ATTR_HASHSIZE]) { - hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); - if (hashsize < IPSET_MIMINAL_HASHSIZE) - hashsize = IPSET_MIMINAL_HASHSIZE; - } - - if (tb[IPSET_ATTR_MAXELEM]) - maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); - - h = kzalloc(sizeof(*h) - + sizeof(struct ip_set_hash_nets) - * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); - if (!h) - return -ENOMEM; - - h->maxelem = maxelem; - get_random_bytes(&h->initval, sizeof(h->initval)); - h->timeout = IPSET_NO_TIMEOUT; - - hbits = htable_bits(hashsize); - hsize = htable_size(hbits); - if (hsize == 0) { - kfree(h); - return -ENOMEM; - } - h->table = ip_set_alloc(hsize); - if (!h->table) { - kfree(h); - return -ENOMEM; - } - h->table->htable_bits = hbits; - - set->data = h; - - if (tb[IPSET_ATTR_TIMEOUT]) { - h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); - - set->variant = set->family == NFPROTO_IPV4 - ? &hash_netport4_tvariant : &hash_netport6_tvariant; - - if (set->family == NFPROTO_IPV4) - hash_netport4_gc_init(set); - else - hash_netport6_gc_init(set); - } else { - set->variant = set->family == NFPROTO_IPV4 - ? 
&hash_netport4_variant : &hash_netport6_variant; - } - - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", - set->name, jhash_size(h->table->htable_bits), - h->table->htable_bits, h->maxelem, set->data, h->table); - - return 0; -} - static struct ip_set_type hash_netport_type __read_mostly = { .name = "hash:net,port", .protocol = IPSET_PROTOCOL, @@ -663,6 +528,7 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, @@ -674,6 +540,8 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 8371c2bac2e4..979b8c90e422 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -13,30 +13,53 @@ #include <linux/errno.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <linux/netfilter/ipset/ip_set_list.h> #define REVISION_MIN 0 -#define REVISION_MAX 0 +#define REVISION_MAX 1 /* Counters support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("list:set", REVISION_MIN, REVISION_MAX); MODULE_ALIAS("ip_set_list:set"); -/* Member elements without and with timeout */ +/* Member elements */ struct set_elem { ip_set_id_t id; }; -struct set_telem { - ip_set_id_t id; +struct sett_elem { + struct { + ip_set_id_t id; + } __attribute__ ((aligned)); + unsigned long timeout; +}; + +struct setc_elem { + struct { + ip_set_id_t id; + } __attribute__ ((aligned)); + struct ip_set_counter counter; +}; + +struct setct_elem { + struct { + ip_set_id_t id; + } __attribute__ ((aligned)); + struct ip_set_counter counter; unsigned long timeout; }; +struct set_adt_elem { + ip_set_id_t id; + ip_set_id_t refid; + int before; +}; + /* Type structure */ struct list_set { size_t dsize; /* element size */ + size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */ u32 size; /* size of set list array */ u32 timeout; /* timeout value */ struct timer_list gc; /* garbage collection */ @@ -49,175 +72,311 @@ list_set_elem(const struct list_set *map, u32 id) return (struct set_elem *)((void *)map->members + id * map->dsize); } -static inline struct set_telem * -list_set_telem(const struct list_set *map, u32 id) -{ - return (struct set_telem *)((void *)map->members + id * map->dsize); -} +#define ext_timeout(e, m) \ +(unsigned long *)((void *)(e) + (m)->offset[IPSET_OFFSET_TIMEOUT]) +#define ext_counter(e, m) \ +(struct ip_set_counter *)((void *)(e) + (m)->offset[IPSET_OFFSET_COUNTER]) -static inline bool -list_set_timeout(const struct list_set *map, u32 id) +static int +list_set_ktest(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct 
ip_set_adt_opt *opt, const struct ip_set_ext *ext) { - const struct set_telem *elem = list_set_telem(map, id); + struct list_set *map = set->data; + struct set_elem *e; + u32 i, cmdflags = opt->cmdflags; + int ret; - return ip_set_timeout_test(elem->timeout); + /* Don't lookup sub-counters at all */ + opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; + if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) + opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + ret = ip_set_test(e->id, skb, par, opt); + if (ret > 0) { + if (SET_WITH_COUNTER(set)) + ip_set_update_counter(ext_counter(e, map), + ext, &opt->ext, + cmdflags); + return ret; + } + } + return 0; } -static inline bool -list_set_expired(const struct list_set *map, u32 id) +static int +list_set_kadd(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { - const struct set_telem *elem = list_set_telem(map, id); + struct list_set *map = set->data; + struct set_elem *e; + u32 i; + int ret; - return ip_set_timeout_expired(elem->timeout); + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + return 0; + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + ret = ip_set_add(e->id, skb, par, opt); + if (ret == 0) + return ret; + } + return 0; } -/* Set list without and with timeout */ - static int -list_set_kadt(struct ip_set *set, const struct sk_buff *skb, +list_set_kdel(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, - enum ipset_adt adt, const struct ip_set_adt_opt *opt) + struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; - struct set_elem *elem; + struct set_elem *e; u32 i; int ret; for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) return 0; - if (with_timeout(map->timeout) && list_set_expired(map, i)) + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) continue; - switch (adt) { - case IPSET_TEST: - ret = ip_set_test(elem->id, skb, par, opt); - if (ret > 0) - return ret; - break; - case IPSET_ADD: - ret = ip_set_add(elem->id, skb, par, opt); - if (ret == 0) - return ret; - break; - case IPSET_DEL: - ret = ip_set_del(elem->id, skb, par, opt); - if (ret == 0) - return ret; - break; - default: - break; - } + ret = ip_set_del(e->id, skb, par, opt); + if (ret == 0) + return ret; + } + return 0; +} + +static int +list_set_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + struct list_set *map = set->data; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map); + + switch (adt) { + case IPSET_TEST: + return list_set_ktest(set, skb, par, opt, &ext); + case IPSET_ADD: + return list_set_kadd(set, skb, par, opt, &ext); + case IPSET_DEL: + return list_set_kdel(set, skb, par, opt, &ext); + default: + break; } return -EINVAL; } static bool -id_eq(const struct list_set *map, u32 i, ip_set_id_t id) +id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) { - const struct set_elem *elem; + const struct list_set *map = set->data; + const struct set_elem *e; + + if (i >= map->size) + 
return 0; - if (i < map->size) { - elem = list_set_elem(map, i); - return elem->id == id; + e = list_set_elem(map, i); + return !!(e->id == id && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map)))); +} + +static int +list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, + const struct ip_set_ext *ext) +{ + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(map, i); + + if (e->id != IPSET_INVALID_ID) { + if (i == map->size - 1) + /* Last element replaced: e.g. add new,before,last */ + ip_set_put_byindex(e->id); + else { + struct set_elem *x = list_set_elem(map, map->size - 1); + + /* Last element pushed off */ + if (x->id != IPSET_INVALID_ID) + ip_set_put_byindex(x->id); + memmove(list_set_elem(map, i + 1), e, + map->dsize * (map->size - (i + 1))); + } } + e->id = d->id; + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, map), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, map), ext); return 0; } -static bool -id_eq_timeout(const struct list_set *map, u32 i, ip_set_id_t id) +static int +list_set_del(struct ip_set *set, u32 i) { - const struct set_elem *elem; + struct list_set *map = set->data; + struct set_elem *e = list_set_elem(map, i); - if (i < map->size) { - elem = list_set_elem(map, i); - return !!(elem->id == id && - !(with_timeout(map->timeout) && - list_set_expired(map, i))); - } + ip_set_put_byindex(e->id); + + if (i < map->size - 1) + memmove(e, list_set_elem(map, i + 1), + map->dsize * (map->size - (i + 1))); + /* Last element */ + e = list_set_elem(map, map->size - 1); + e->id = IPSET_INVALID_ID; return 0; } static void -list_elem_add(struct list_set *map, u32 i, ip_set_id_t id) +set_cleanup_entries(struct ip_set *set) { + struct list_set *map = set->data; struct set_elem *e; + u32 i; - for (; i < map->size; i++) { + for (i = 0; i < map->size; i++) { e = list_set_elem(map, i); - swap(e->id, id); - if (e->id == IPSET_INVALID_ID) - break; + if (e->id != IPSET_INVALID_ID && + ip_set_timeout_expired(ext_timeout(e, map))) + list_set_del(set, i); } } -static void -list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id, - unsigned long timeout) +static int +list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - struct set_telem *e; + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; + u32 i; + int ret; - for (; i < map->size; i++) { - e = list_set_telem(map, i); - swap(e->id, id); - swap(e->timeout, timeout); + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); if (e->id == IPSET_INVALID_ID) - break; + return 0; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return 1; + else if (d->before > 0) + ret = id_eq(set, i + 1, d->refid); + else + ret = i > 0 && id_eq(set, i - 1, d->refid); + return ret; } + return 0; } + static int -list_set_add(struct list_set *map, u32 i, ip_set_id_t id, - unsigned long timeout) +list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - const struct set_elem *e = list_set_elem(map, i); - - if (i == map->size - 1 && e->id != IPSET_INVALID_ID) - /* Last element replaced: e.g. 
add new,before,last */ - ip_set_put_byindex(e->id); - if (with_timeout(map->timeout)) - list_elem_tadd(map, i, id, ip_set_timeout_set(timeout)); - else - list_elem_add(map, i, id); + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; + bool flag_exist = flags & IPSET_FLAG_EXIST; + u32 i, ret = 0; - return 0; -} + /* Check already added element */ + for (i = 0; i < map->size; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + goto insert; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + else if (e->id != d->id) + continue; -static int -list_set_del(struct list_set *map, u32 i) -{ - struct set_elem *a = list_set_elem(map, i), *b; - - ip_set_put_byindex(a->id); - - for (; i < map->size - 1; i++) { - b = list_set_elem(map, i + 1); - a->id = b->id; - if (with_timeout(map->timeout)) - ((struct set_telem *)a)->timeout = - ((struct set_telem *)b)->timeout; - a = b; - if (a->id == IPSET_INVALID_ID) - break; + if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) || + (d->before < 0 && + (i == 0 || !id_eq(set, i - 1, d->refid)))) + /* Before/after doesn't match */ + return -IPSET_ERR_REF_EXIST; + if (!flag_exist) + /* Can't re-add */ + return -IPSET_ERR_EXIST; + /* Update extensions */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, map), ext->timeout); + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, map), ext); + /* Set is already added to the list */ + ip_set_put_byindex(d->id); + return 0; } - /* Last element */ - a->id = IPSET_INVALID_ID; - return 0; +insert: + ret = -IPSET_ERR_LIST_FULL; + for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + ret = d->before != 0 ? -IPSET_ERR_REF_EXIST + : list_set_add(set, i, d, ext); + else if (e->id != d->refid) + continue; + else if (d->before > 0) + ret = list_set_add(set, i, d, ext); + else if (i + 1 < map->size) + ret = list_set_add(set, i + 1, d, ext); + } + + return ret; } -static void -cleanup_entries(struct list_set *map) +static int +list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) { - struct set_telem *e; + struct list_set *map = set->data; + struct set_adt_elem *d = value; + struct set_elem *e; u32 i; for (i = 0; i < map->size; i++) { - e = list_set_telem(map, i); - if (e->id != IPSET_INVALID_ID && list_set_expired(map, i)) - list_set_del(map, i); + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + return d->before != 0 ? 
-IPSET_ERR_REF_EXIST + : -IPSET_ERR_EXIST; + else if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) + continue; + else if (e->id != d->id) + continue; + + if (d->before == 0) + return list_set_del(set, i); + else if (d->before > 0) { + if (!id_eq(set, i + 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + return list_set_del(set, i); + } else if (i == 0 || !id_eq(set, i - 1, d->refid)) + return -IPSET_ERR_REF_EXIST; + else + return list_set_del(set, i); } + return -IPSET_ERR_EXIST; } static int @@ -225,26 +384,27 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct list_set *map = set->data; - bool with_timeout = with_timeout(map->timeout); - bool flag_exist = flags & IPSET_FLAG_EXIST; - int before = 0; - u32 timeout = map->timeout; - ip_set_id_t id, refid = IPSET_INVALID_ID; - const struct set_elem *elem; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(map); struct ip_set *s; - u32 i; int ret = 0; if (unlikely(!tb[IPSET_ATTR_NAME] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); - if (id == IPSET_INVALID_ID) + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + e.id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); + if (e.id == IPSET_INVALID_ID) return -IPSET_ERR_NAME; /* "Loop detection" */ if (s->type->features & IPSET_TYPE_NAME) { @@ -254,115 +414,34 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); - before = f & IPSET_FLAG_BEFORE; + e.before = f & IPSET_FLAG_BEFORE; } - if (before && !tb[IPSET_ATTR_NAMEREF]) { + if (e.before && !tb[IPSET_ATTR_NAMEREF]) { ret = -IPSET_ERR_BEFORE; goto finish; } if (tb[IPSET_ATTR_NAMEREF]) { - refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), - &s); - if (refid == IPSET_INVALID_ID) { + e.refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), + &s); + if (e.refid == IPSET_INVALID_ID) { ret = -IPSET_ERR_NAMEREF; goto finish; } - if (!before) - before = -1; - } - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!with_timeout) { - ret = -IPSET_ERR_TIMEOUT; - goto finish; - } - timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + if (!e.before) + e.before = -1; } - if (with_timeout && adt != IPSET_TEST) - cleanup_entries(map); + if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set)) + set_cleanup_entries(set); - switch (adt) { - case IPSET_TEST: - for (i = 0; i < map->size && !ret; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID || - (before != 0 && i + 1 >= map->size)) - break; - else if (with_timeout && list_set_expired(map, i)) - continue; - else if (before > 0 && elem->id == id) - ret = id_eq_timeout(map, i + 1, refid); - else if (before < 0 && elem->id == refid) - ret = id_eq_timeout(map, i + 1, id); - else if (before == 0 && elem->id == id) - ret = 1; - } - break; - case IPSET_ADD: - for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id != id) - continue; - if (!(with_timeout && flag_exist)) { - ret = 
-IPSET_ERR_EXIST; - goto finish; - } else { - struct set_telem *e = list_set_telem(map, i); - - if ((before > 1 && - !id_eq(map, i + 1, refid)) || - (before < 0 && - (i == 0 || !id_eq(map, i - 1, refid)))) { - ret = -IPSET_ERR_EXIST; - goto finish; - } - e->timeout = ip_set_timeout_set(timeout); - ip_set_put_byindex(id); - ret = 0; - goto finish; - } - } - ret = -IPSET_ERR_LIST_FULL; - for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) - ret = before != 0 ? -IPSET_ERR_REF_EXIST - : list_set_add(map, i, id, timeout); - else if (elem->id != refid) - continue; - else if (before > 0) - ret = list_set_add(map, i, id, timeout); - else if (i + 1 < map->size) - ret = list_set_add(map, i + 1, id, timeout); - } - break; - case IPSET_DEL: - ret = -IPSET_ERR_EXIST; - for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) { - elem = list_set_elem(map, i); - if (elem->id == IPSET_INVALID_ID) { - ret = before != 0 ? -IPSET_ERR_REF_EXIST - : -IPSET_ERR_EXIST; - break; - } else if (elem->id == id && - (before == 0 || - (before > 0 && id_eq(map, i + 1, refid)))) - ret = list_set_del(map, i); - else if (elem->id == refid && - before < 0 && id_eq(map, i + 1, id)) - ret = list_set_del(map, i + 1); - } - break; - default: - break; - } + ret = adtfn(set, &e, &ext, &ext, flags); finish: - if (refid != IPSET_INVALID_ID) - ip_set_put_byindex(refid); + if (e.refid != IPSET_INVALID_ID) + ip_set_put_byindex(e.refid); if (adt != IPSET_ADD || ret) - ip_set_put_byindex(id); + ip_set_put_byindex(e.id); return ip_set_eexist(ret, flags) ? 0 : ret; } @@ -371,14 +450,14 @@ static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *elem; + struct set_elem *e; u32 i; for (i = 0; i < map->size; i++) { - elem = list_set_elem(map, i); - if (elem->id != IPSET_INVALID_ID) { - ip_set_put_byindex(elem->id); - elem->id = IPSET_INVALID_ID; + e = list_set_elem(map, i); + if (e->id != IPSET_INVALID_ID) { + ip_set_put_byindex(e->id); + e->id = IPSET_INVALID_ID; } } } @@ -388,7 +467,7 @@ list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - if (with_timeout(map->timeout)) + if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); list_set_flush(set); kfree(map); @@ -406,8 +485,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || - (with_timeout(map->timeout) && + (SET_WITH_TIMEOUT(set) && nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))) || + (SET_WITH_COUNTER(set) && + nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, + htonl(IPSET_FLAG_WITH_COUNTERS))) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + map->size * map->dsize))) @@ -436,7 +518,8 @@ list_set_list(const struct ip_set *set, e = list_set_elem(map, i); if (e->id == IPSET_INVALID_ID) goto finish; - if (with_timeout(map->timeout) && list_set_expired(map, i)) + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, map))) continue; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { @@ -449,13 +532,14 @@ list_set_list(const struct ip_set *set, if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(e->id))) goto nla_put_failure; - if (with_timeout(map->timeout)) { - const struct set_telem *te = - (const struct set_telem *) e; - __be32 to = htonl(ip_set_timeout_get(te->timeout)); - if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, 
to)) - goto nla_put_failure; - } + if (SET_WITH_TIMEOUT(set) && + nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get( + ext_timeout(e, map))))) + goto nla_put_failure; + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, map))) + goto nla_put_failure; ipset_nest_end(skb, nested); } finish: @@ -481,12 +565,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) const struct list_set *y = b->data; return x->size == y->size && - x->timeout == y->timeout; + x->timeout == y->timeout && + a->extensions == b->extensions; } -static const struct ip_set_type_variant list_set = { +static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, + .adt = { + [IPSET_ADD] = list_set_uadd, + [IPSET_DEL] = list_set_udel, + [IPSET_TEST] = list_set_utest, + }, .destroy = list_set_destroy, .flush = list_set_flush, .head = list_set_head, @@ -501,7 +591,7 @@ list_set_gc(unsigned long ul_set) struct list_set *map = set->data; write_lock_bh(&set->lock); - cleanup_entries(map); + set_cleanup_entries(set); write_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; @@ -509,20 +599,20 @@ list_set_gc(unsigned long ul_set) } static void -list_set_gc_init(struct ip_set *set) +list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) { struct list_set *map = set->data; init_timer(&map->gc); map->gc.data = (unsigned long) set; - map->gc.function = list_set_gc; + map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; add_timer(&map->gc); } /* Create list:set type of sets */ -static bool +static struct list_set * init_list_set(struct ip_set *set, u32 size, size_t dsize, unsigned long timeout) { @@ -532,7 +622,7 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize, map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL); if (!map) - return false; + return NULL; map->size = size; map->dsize = dsize; @@ -544,16 +634,19 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize, e->id = IPSET_INVALID_ID; } - return true; + return map; } static int list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) { - u32 size = IP_SET_LIST_DEFAULT_SIZE; + struct list_set *map; + u32 size = IP_SET_LIST_DEFAULT_SIZE, cadt_flags = 0; + unsigned long timeout = 0; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_SIZE]) @@ -561,18 +654,46 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; - if (tb[IPSET_ATTR_TIMEOUT]) { - if (!init_list_set(set, size, sizeof(struct set_telem), - ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]))) + if (tb[IPSET_ATTR_CADT_FLAGS]) + cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (tb[IPSET_ATTR_TIMEOUT]) + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->variant = &set_variant; + if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { + set->extensions |= IPSET_EXT_COUNTER; + if (tb[IPSET_ATTR_TIMEOUT]) { + map = init_list_set(set, size, + sizeof(struct setct_elem), timeout); + if (!map) + return -ENOMEM; + set->extensions |= IPSET_EXT_TIMEOUT; + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct setct_elem, timeout); + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct setct_elem, counter); + list_set_gc_init(set, 
list_set_gc); + } else { + map = init_list_set(set, size, + sizeof(struct setc_elem), 0); + if (!map) + return -ENOMEM; + map->offset[IPSET_OFFSET_COUNTER] = + offsetof(struct setc_elem, counter); + } + } else if (tb[IPSET_ATTR_TIMEOUT]) { + map = init_list_set(set, size, + sizeof(struct sett_elem), timeout); + if (!map) return -ENOMEM; - - list_set_gc_init(set); + set->extensions |= IPSET_EXT_TIMEOUT; + map->offset[IPSET_OFFSET_TIMEOUT] = + offsetof(struct sett_elem, timeout); + list_set_gc_init(set, list_set_gc); } else { - if (!init_list_set(set, size, sizeof(struct set_elem), - IPSET_NO_TIMEOUT)) + map = init_list_set(set, size, sizeof(struct set_elem), 0); + if (!map) return -ENOMEM; } - set->variant = &list_set; return 0; } @@ -588,6 +709,7 @@ static struct ip_set_type list_set_type __read_mostly = { .create_policy = { [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_NAME] = { .type = NLA_STRING, @@ -597,6 +719,8 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 9713e6e86d47..dfd7b65b3d2a 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -58,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app) module_put(app->module); } +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ + kfree(inc->timeout_table); + kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + + ip_vs_app_inc_destroy(inc); +} /* * Allocate/initialize app incarnation and register it in proto apps. 
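[Editor's note] The ip_vs_app.c hunk just above converts incarnation teardown to the standard RCU deferred-free idiom: the object is unlinked from all reader-visible lists first, and the actual kfree() runs via call_rcu() only after every pre-existing RCU reader has finished. A minimal sketch of the idiom follows; the struct and function names are illustrative, not part of the patch:

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct item {
            int payload;
            struct rcu_head rcu_head;       /* storage for the RCU callback */
    };

    static void item_rcu_free(struct rcu_head *head)
    {
            struct item *p = container_of(head, struct item, rcu_head);

            kfree(p);                       /* safe: grace period has elapsed */
    }

    static void item_release(struct item *p)
    {
            /* caller has already unlinked p from all RCU-visible lists */
            call_rcu(&p->rcu_head, item_rcu_free);
    }

The matching rcu_barrier() added to ip_vs_conn_cleanup() further down is the other half of the contract: a module must wait for its outstanding call_rcu() callbacks to run before its code and caches can go away.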
@@ -106,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, return 0; out: - kfree(inc->timeout_table); - kfree(inc); + ip_vs_app_inc_destroy(inc); return ret; } @@ -131,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) list_del(&inc->a_list); - kfree(inc->timeout_table); - kfree(inc); + call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); } @@ -144,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) { int result; - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); + result = ip_vs_app_get(inc->app); + if (result) + atomic_inc(&inc->usecnt); return result; } @@ -156,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc) */ void ip_vs_app_inc_put(struct ip_vs_app *inc) { - ip_vs_app_put(inc->app); atomic_dec(&inc->usecnt); + ip_vs_app_put(inc->app); } @@ -218,6 +228,7 @@ out_unlock: /* * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services + * Caller should use synchronize_rcu() or rcu_barrier() */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { @@ -341,14 +352,14 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, unsigned int flag, __u32 seq, int diff) { /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { vseq->previous_delta = vseq->delta; vseq->delta += diff; vseq->init_seq = seq; cp->flags |= flag; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, @@ -605,12 +616,12 @@ int __net_init ip_vs_app_net_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); INIT_LIST_HEAD(&ipvs->app_list); - proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); + proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); return 0; } void __net_exit ip_vs_app_net_cleanup(struct net *net) { unregister_ip_vs_app(net, NULL /* all */); - proc_net_remove(net, "ip_vs_app"); + remove_proc_entry("ip_vs_app", net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 30e764ad021f..a083bda322b6 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly; struct ip_vs_aligned_lock { - rwlock_t l; + spinlock_t l; } __attribute__((__aligned__(SMP_CACHE_BYTES))); /* lock array for conn table */ static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; -static inline void ct_read_lock(unsigned int key) -{ - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock(unsigned int key) -{ - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock(unsigned int key) -{ - write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock(unsigned int key) -{ - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_lock_bh(unsigned int key) -{ - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock_bh(unsigned int key) -{ - read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - static inline void ct_write_lock_bh(unsigned int key) { - 
write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } static inline void ct_write_unlock_bh(unsigned int key) { - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } @@ -197,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* Hash by protocol, client address and port */ hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]); cp->flags |= IP_VS_CONN_F_HASHED; atomic_inc(&cp->refcnt); + hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pF\n", @@ -212,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); return ret; } @@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) /* * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. + * returns bool success. Caller should hold conn reference. */ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) { @@ -230,11 +200,11 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) /* unhash it and decrease its reference counter */ hash = ip_vs_conn_hashkey_conn(cp); - ct_write_lock(hash); + ct_write_lock_bh(hash); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del(&cp->c_list); + hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; atomic_dec(&cp->refcnt); ret = 1; @@ -242,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) ret = 0; spin_unlock(&cp->lock); - ct_write_unlock(hash); + ct_write_unlock_bh(hash); + + return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ + unsigned int hash; + bool ret; + + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + ret = false; + /* Decrease refcnt and unlink conn only if we are last user */ + if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + ret = true; + } + } else + ret = atomic_read(&cp->refcnt) ? 
false : true; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); return ret; } @@ -259,28 +259,28 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { unsigned int hash; struct ip_vs_conn *cp; - struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->cport == cp->cport && p->vport == cp->vport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->cport == cp->cport && p->vport == cp->vport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); return cp; } } - ct_read_unlock(hash); + rcu_read_unlock(); return NULL; } @@ -344,18 +344,19 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { unsigned int hash; struct ip_vs_conn *cp; - struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (!ip_vs_conn_net_eq(cp, p->net)) - continue; - if (p->pe_data && p->pe->ct_match) { - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) - goto out; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { + if (__ip_vs_conn_get(cp)) + goto out; + } continue; } @@ -365,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) * p->vaddr is a fwmark */ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : p->af, p->vaddr, &cp->vaddr) && - p->cport == cp->cport && p->vport == cp->vport && + p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol) - goto out; + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } cp = NULL; out: - if (cp) - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -394,30 +396,30 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { unsigned int hash; struct ip_vs_conn *cp, *ret=NULL; - struct hlist_node *n; /* * Check for "full" addressed entries */ hash = ip_vs_conn_hashkey_param(p, true); - ct_read_lock(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == p->af && - p->vport == cp->cport && p->cport == cp->dport && + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->vport == cp->cport && p->cport == cp->dport && + cp->af == p->af && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && p->protocol == cp->protocol && ip_vs_conn_net_eq(cp, p->net)) { + if (!__ip_vs_conn_get(cp)) + continue; /* HIT */ - atomic_inc(&cp->refcnt); ret = cp; break; } } - ct_read_unlock(hash); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), @@ -460,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (cp->flags & IP_VS_CONN_F_NO_CPORT) { atomic_dec(&ip_vs_conn_no_cport_cnt); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); /* hash on new dport */ ip_vs_conn_hash(cp); @@ -552,7 +554,7 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) return; /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); + ip_vs_dest_hold(dest); conn_flags = atomic_read(&dest->conn_flags); if (cp->protocol != IPPROTO_UDP) @@ -609,20 +611,22 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) * Check if there is a destination for the connection, if so * bind the connection to the destination. 
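[Editor's note] The connection-table hunks above replace the per-bucket reader/writer locks with RCU, and two idioms carry the conversion: a lookup may only take a reference via an atomic_inc_not_zero()-style helper (a connection whose refcnt already hit zero is on its way to being freed), while the expiry path unlinks an entry only if it can atomically drop the last reference, which is what the atomic_cmpxchg(&cp->refcnt, 1, 0) in ip_vs_conn_unlink() does. A hedged sketch of the lookup half, with illustrative names:

    #include <linux/rculist.h>
    #include <linux/atomic.h>

    struct conn {
            atomic_t refcnt;
            struct hlist_node node;
            int key;
    };

    static struct conn *conn_lookup(struct hlist_head *bucket, int key)
    {
            struct conn *c;

            rcu_read_lock();
            hlist_for_each_entry_rcu(c, bucket, node) {
                    if (c->key != key)
                            continue;
                    /* A racing expiry may have dropped the last ref;
                     * only a 0 -> nonzero transition is forbidden. */
                    if (!atomic_inc_not_zero(&c->refcnt))
                            continue;
                    rcu_read_unlock();
                    return c;       /* caller now owns one reference */
            }
            rcu_read_unlock();
            return NULL;
    }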
*/ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +void ip_vs_try_bind_dest(struct ip_vs_conn *cp) { struct ip_vs_dest *dest; + rcu_read_lock(); dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { struct ip_vs_proto_data *pd; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if (cp->dest) { - spin_unlock(&cp->lock); - return dest; + spin_unlock_bh(&cp->lock); + rcu_read_unlock(); + return; } /* Applications work depending on the forwarding method @@ -631,7 +635,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) ip_vs_unbind_app(cp); ip_vs_bind_dest(cp, dest); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); /* Update its packet transmitter */ cp->packet_xmit = NULL; @@ -646,7 +650,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) if (pd && atomic_read(&pd->appcnt)) ip_vs_bind_app(cp, pd->pp); } - return dest; + rcu_read_unlock(); } @@ -698,12 +702,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; } - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); + ip_vs_dest_put(dest); } static int expire_quiescent_template(struct netns_ipvs *ipvs, @@ -760,44 +759,38 @@ int ip_vs_check_template(struct ip_vs_conn *ct) * Simply decrease the refcnt of the template, * don't restart its timer. */ - atomic_dec(&ct->refcnt); + __ip_vs_conn_put(ct); return 0; } return 1; } +static void ip_vs_conn_rcu_free(struct rcu_head *head) +{ + struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, + rcu_head); + + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + kmem_cache_free(ip_vs_conn_cachep, cp); +} + static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; struct net *net = ip_vs_conn_net(cp); struct netns_ipvs *ipvs = net_ipvs(net); - cp->timeout = 60*HZ; - - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); - /* * do I control anybody? */ if (atomic_read(&cp->n_control)) goto expire_later; - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { + /* Unlink conn if not referenced anymore */ + if (likely(ip_vs_conn_unlink(cp))) { /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); + del_timer(&cp->timer); /* does anybody control me? 
*/ if (cp->control) @@ -814,38 +807,41 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_conn_drop_conntrack(cp); } - ip_vs_pe_put(cp->pe); - kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); return; } - /* hash it back to the table */ - ip_vs_conn_hash(cp); - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, + IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt), atomic_read(&cp->n_control)); + atomic_inc(&cp->refcnt); + cp->timeout = 60*HZ; + if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); ip_vs_conn_put(cp); } - +/* Modify timer, so that it expires as soon as possible. + * Can be called without reference only if under RCU lock. + */ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); + /* Using mod_timer_pending will ensure the timer is not + * modified after the final del_timer in ip_vs_conn_expire. + */ + if (timer_pending(&cp->timer) && + time_after(cp->timer.expires, jiffies)) + mod_timer_pending(&cp->timer, jiffies); } @@ -862,7 +858,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, p->protocol); - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { IP_VS_ERR_RL("%s(): no memory\n", __func__); return NULL; @@ -873,13 +869,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_conn_net_set(cp, p->net); cp->af = p->af; cp->protocol = p->protocol; - ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; - ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr); cp->vport = p->vport; /* proto should only be IPPROTO_IP if d_addr is a fwmark */ - ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, - &cp->daddr, daddr); + ip_vs_addr_set(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : p->af, + &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; @@ -888,6 +884,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, cp->pe = p->pe; cp->pe_data = p->pe_data; cp->pe_data_len = p->pe_data_len; + } else { + cp->pe = NULL; + cp->pe_data = NULL; + cp->pe_data_len = 0; } spin_lock_init(&cp->lock); @@ -898,18 +898,28 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, */ atomic_set(&cp->refcnt, 1); + cp->control = NULL; atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); + cp->packet_xmit = NULL; + cp->app = NULL; + cp->app_data = NULL; + /* reset struct ip_vs_seq */ + cp->in_seq.delta = 0; + cp->out_seq.delta = 0; + atomic_inc(&ipvs->conn_count); if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); /* Bind the connection with a destination server */ + cp->dest = NULL; ip_vs_bind_dest(cp, dest); /* Set its state and timeout */ cp->state = 0; + cp->old_state = 0; cp->timeout = 3*HZ; cp->sync_endtime = jiffies & ~3UL; @@ -954,27 +964,31 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) int idx; struct ip_vs_conn *cp; struct ip_vs_iter_state *iter = seq->private; - struct hlist_node *n; for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + /* __ip_vs_conn_get() is not needed by + * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show + */ if (pos-- == 0) { iter->l = &ip_vs_conn_tab[idx]; return cp; } } - ct_read_unlock_bh(idx); + rcu_read_unlock(); + rcu_read_lock(); } return NULL; } static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; iter->l = NULL; + rcu_read_lock(); return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } @@ -991,31 +1005,27 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ip_vs_conn_array(seq, 0); /* more on same hash chain? */ - if ((e = cp->c_list.next)) + e = rcu_dereference(hlist_next_rcu(&cp->c_list)); + if (e) return hlist_entry(e, struct ip_vs_conn, c_list); idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - while (++idx < ip_vs_conn_tab_size) { - ct_read_lock_bh(idx); - hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { iter->l = &ip_vs_conn_tab[idx]; return cp; } - ct_read_unlock_bh(idx); + rcu_read_unlock(); + rcu_read_lock(); } iter->l = NULL; return NULL; } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct ip_vs_iter_state *iter = seq->private; - struct hlist_head *l = iter->l; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); + rcu_read_unlock(); } static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) @@ -1194,21 +1204,20 @@ static inline int todrop_entry(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; /* * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { unsigned int hash = net_random() & ip_vs_conn_tab_mask; - struct hlist_node *n; /* * Lock is actually needed in this loop. 
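[Editor's note] ip_vs_conn_expire_now(), rewritten above, switches from del_timer()+mod_timer() to mod_timer_pending(). The distinction matters once droppers such as this loop run under plain rcu_read_lock() without a connection reference: mod_timer_pending() only advances a timer that is still pending, so it can never re-arm the timer after ip_vs_conn_expire() has executed its final del_timer() and queued the connection for RCU freeing. A sketch of the guarded fast-expire, with an illustrative struct:

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    struct conn {
            struct timer_list timer;
    };

    /* Safe under rcu_read_lock() even without holding a reference. */
    static void conn_expire_now(struct conn *c)
    {
            if (timer_pending(&c->timer) &&
                time_after(c->timer.expires, jiffies))
                    mod_timer_pending(&c->timer, jiffies);
    }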
*/ - ct_write_lock_bh(hash); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; @@ -1235,12 +1244,15 @@ void ip_vs_random_dropentry(struct net *net) IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(hash); + rcu_read_unlock(); } } @@ -1251,29 +1263,30 @@ void ip_vs_random_dropentry(struct net *net) static void ip_vs_conn_flush(struct net *net) { int idx; - struct ip_vs_conn *cp; + struct ip_vs_conn *cp, *cp_c; struct netns_ipvs *ipvs = net_ipvs(net); flush_again: for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - struct hlist_node *n; - /* * Lock is actually needed in this loop. */ - ct_write_lock_bh(idx); + rcu_read_lock(); - hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (!ip_vs_conn_net_eq(cp, net)) continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); - if (cp->control) { + cp_c = cp->control; + /* cp->control is valid only with reference to cp */ + if (cp_c && __ip_vs_conn_get(cp)) { IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); + ip_vs_conn_expire_now(cp_c); + __ip_vs_conn_put(cp); } } - ct_write_unlock_bh(idx); + rcu_read_unlock(); } /* the counter may be not NULL, because maybe some conn entries @@ -1292,8 +1305,8 @@ int __net_init ip_vs_conn_net_init(struct net *net) atomic_set(&ipvs->conn_count, 0); - proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); + proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); return 0; } @@ -1301,8 +1314,8 @@ void __net_exit ip_vs_conn_net_cleanup(struct net *net) { /* flush all the connection entries first */ ip_vs_conn_flush(net); - proc_net_remove(net, "ip_vs_conn"); - proc_net_remove(net, "ip_vs_conn_sync"); + remove_proc_entry("ip_vs_conn", net->proc_net); + remove_proc_entry("ip_vs_conn_sync", net->proc_net); } int __init ip_vs_conn_init(void) @@ -1340,7 +1353,7 @@ int __init ip_vs_conn_init(void) INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); } /* calculate the random value for connection hash */ @@ -1351,6 +1364,8 @@ int __init ip_vs_conn_init(void) void ip_vs_conn_cleanup(void) { + /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ + rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 47edf5a40a59..085b5880ab0d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); EXPORT_SYMBOL(ip_vs_get_debug_level); #endif -int ip_vs_net_id __read_mostly; -#ifdef IP_VS_GENERIC_NETNS -EXPORT_SYMBOL(ip_vs_net_id); -#endif +static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = 
ATOMIC_INIT(0); @@ -206,7 +203,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, { ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, vport, p); - p->pe = svc->pe; + p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); @@ -238,7 +235,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, svc->netmask); + ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + (__force __u32) svc->netmask); else #endif snet.ip = iph->saddr.ip & svc->netmask; @@ -299,12 +297,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Check if a template already exists */ ct = ip_vs_ct_in_get(¶m); if (!ct || !ip_vs_check_template(ct)) { + struct ip_vs_scheduler *sched; + /* * No template found or the dest of the connection * template is not available. * return *ignored=0 i.e. ICMP and NF_DROP */ - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -394,6 +395,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, { struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; + struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr; unsigned int flags; @@ -449,7 +451,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return NULL; } - dest = svc->scheduler->schedule(svc, skb); + sched = rcu_dereference(svc->scheduler); + dest = sched->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; @@ -507,7 +510,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) { - ip_vs_service_put(svc); return NF_DROP; } @@ -533,8 +535,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - ip_vs_service_put(svc); - /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { @@ -571,12 +571,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. 
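[Editor's note] In the ip_vs_core.c hunks the per-call ip_vs_service_put() pairs disappear because service lookups now happen inside an RCU read-side section, and fields that can be replaced at runtime (svc->scheduler, svc->pe) are fetched with rcu_dereference(), paired with rcu_assign_pointer() on the update side. A generic sketch of that read path, with illustrative names and assuming the pointer is never NULL for an active service:

    #include <linux/rcupdate.h>

    struct sched_ops {
            int (*schedule)(void *ctx);
    };

    struct service {
            struct sched_ops __rcu *scheduler;
    };

    static int service_schedule(struct service *svc, void *ctx)
    {
            struct sched_ops *ops;
            int ret;

            rcu_read_lock();
            /* paired with rcu_assign_pointer() when the
             * scheduler is swapped on a live service */
            ops = rcu_dereference(svc->scheduler);
            ret = ops->schedule(ctx);
            rcu_read_unlock();
            return ret;
    }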
*/ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) return NF_ACCEPT; - } - - ip_vs_service_put(svc); /* * Notify the client that the destination is unreachable, and @@ -588,9 +584,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net_ = dev_net(skb_dst(skb)->dev); - skb->dev = net->loopback_dev; + skb->dev = net_->loopback_dev; } icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else @@ -643,8 +639,11 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { - int err = ip_defrag(skb, user); + int err; + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); @@ -1164,9 +1163,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) sizeof(_ports), _ports, &iph); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(net, af, iph.protocol, - &iph.saddr, - pptr[0])) { + if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST @@ -1181,9 +1179,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - struct net *net = - dev_net(skb_dst(skb)->dev); - if (!skb->dev) skb->dev = net->loopback_dev; icmpv6_send(skb, @@ -1226,13 +1221,7 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_out(hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1259,13 +1248,7 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_out(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_out(hooknum, skb, AF_INET6); } #endif @@ -1394,19 +1377,20 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); - rcu_read_lock(); ipv4_update_pmtu(skb, dev_net(skb->dev), mtu, 0, 0, 0, 0); - rcu_read_unlock(); /* Client uses PMTUD? */ if (!(cih->frag_off & htons(IP_DF))) goto ignore_ipip; /* Prefer the resulting PMTU */ if (dest) { - spin_lock(&dest->dst_lock); - if (dest->dst_cache) - mtu = dst_mtu(dest->dst_cache); - spin_unlock(&dest->dst_lock); + struct ip_vs_dest_dst *dest_dst; + + rcu_read_lock(); + dest_dst = rcu_dereference(dest->dest_dst); + if (dest_dst) + mtu = dst_mtu(dest_dst->dst_cache); + rcu_read_unlock(); } if (mtu > 68 + sizeof(struct iphdr)) mtu -= sizeof(struct iphdr); @@ -1577,7 +1561,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } /* ipvs enabled in this netns ? 
*/ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, &iph); @@ -1654,7 +1639,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1722,13 +1706,7 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET); - local_bh_enable(); - return verdict; + return ip_vs_in(hooknum, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1787,13 +1765,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - unsigned int verdict; - - /* Disable BH in LOCAL_OUT until all places are fixed */ - local_bh_disable(); - verdict = ip_vs_in(hooknum, skb, AF_INET6); - local_bh_enable(); - return verdict; + return ip_vs_in(hooknum, skb, AF_INET6); } #endif @@ -1815,13 +1787,15 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, { int r; struct net *net; + struct netns_ipvs *ipvs; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; return ip_vs_in_icmp(skb, &r, hooknum); @@ -1835,6 +1809,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, { int r; struct net *net; + struct netns_ipvs *ipvs; struct ip_vs_iphdr iphdr; ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); @@ -1843,7 +1818,8 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ec664cbb119f..5b142fb16480 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -55,9 +55,6 @@ /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. 
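The enable check above is extended so that a node configured as sync backup passes packets through untouched; fetching the per-netns state once and folding both tests into a single unlikely() keeps the fast path cheap. A sketch under assumed names (struct netns_state and get_state() stand in for netns_ipvs and net_ipvs(), which take a struct net in the real code):

#include <linux/netfilter.h>
#include <linux/skbuff.h>

struct netns_state {
	int enable;
	int backup_only;
};

struct netns_state *get_state(const struct sk_buff *skb);	/* assumed */

static unsigned int hook_func(struct sk_buff *skb)
{
	struct netns_state *s = get_state(skb);

	if (unlikely(s->backup_only || !s->enable))
		return NF_ACCEPT;	/* not ours: pass through untouched */

	/* ... normal load-balancing path ... */
	return NF_ACCEPT;
}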
*/ static DEFINE_MUTEX(__ip_vs_mutex); -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG @@ -71,7 +68,7 @@ int ip_vs_get_debug_level(void) /* Protos */ -static void __ip_vs_del_service(struct ip_vs_service *svc); +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 @@ -257,9 +254,9 @@ ip_vs_use_count_dec(void) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* @@ -271,16 +268,18 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, { register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; + __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - addr_fold ^= ((size_t)net>>8); + ahash = ntohl(addr_fold); + ahash ^= ((size_t) net >> 8); - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; + return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & + IP_VS_SVC_TAB_MASK; } /* @@ -312,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, &svc->addr, svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); + hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -342,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* Remove it from the svc_table table */ - list_del(&svc->s_list); + hlist_del_rcu(&svc->s_list); } else { /* Remove it from the svc_fwm_table table */ - list_del(&svc->f_list); + hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; @@ -367,7 +366,7 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) @@ -394,7 +393,7 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(net, fwmark); - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af && net_eq(svc->net, net)) { /* HIT */ @@ -405,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) return NULL; } +/* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) +ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; struct netns_ipvs *ipvs = net_ipvs(net); - 
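The table conversion above swaps list_head buckets for hlist_head so lookups can run locklessly: writers (serialized by __ip_vs_mutex) insert with hlist_add_head_rcu(), readers walk with hlist_for_each_entry_rcu() under rcu_read_lock(). A self-contained sketch of the same shape, with illustrative struct svc and table names:

#include <linux/rculist.h>
#include <linux/mutex.h>
#include <linux/types.h>

#define TAB_BITS	8
#define TAB_SIZE	(1 << TAB_BITS)
#define TAB_MASK	(TAB_SIZE - 1)

struct svc {
	struct hlist_node node;
	u32 key;
};

static struct hlist_head svc_table[TAB_SIZE];
static DEFINE_MUTEX(svc_mutex);			/* serializes all writers */

static void svc_hash(struct svc *s)		/* called with svc_mutex held */
{
	hlist_add_head_rcu(&s->node, &svc_table[s->key & TAB_MASK]);
}

static struct svc *svc_find(u32 key)		/* called under rcu_read_lock() */
{
	struct svc *s;

	hlist_for_each_entry_rcu(s, &svc_table[key & TAB_MASK], node)
		if (s->key == key)
			return s;
	return NULL;
}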
read_lock(&__ip_vs_svc_lock); - /* * Check the table hashed by fwmark first */ @@ -449,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, } out: - if (svc) - atomic_inc(&svc->usecnt); - read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), @@ -469,6 +463,13 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) dest->svc = svc; } +static void ip_vs_service_free(struct ip_vs_service *svc) +{ + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); +} + static void __ip_vs_unbind_svc(struct ip_vs_dest *dest) { @@ -476,12 +477,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) dest->svc = NULL; if (atomic_dec_and_test(&svc->refcnt)) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); + ntohs(svc->port)); + ip_vs_service_free(svc); } } @@ -506,17 +506,13 @@ static inline unsigned int ip_vs_rs_hashkey(int af, & IP_VS_RTAB_MASK; } -/* - * Hashes ip_vs_dest in rs_table by <proto,addr,port>. - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; - if (!list_empty(&dest->d_list)) { - return 0; - } + if (dest->in_rs_table) + return; /* * Hash by proto,addr,port, @@ -524,64 +520,51 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - list_add(&dest->d_list, &ipvs->rs_table[hash]); - - return 1; + hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); + dest->in_rs_table = 1; } -/* - * UNhashes ip_vs_dest from rs_table. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ - if (!list_empty(&dest->d_list)) { - list_del_init(&dest->d_list); + if (dest->in_rs_table) { + hlist_del_rcu(&dest->d_list); + dest->in_rs_table = 0; } - - return 1; } -/* - * Lookup real service by <proto,addr,port> in the real service table. 
- */ -struct ip_vs_dest * -ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) +/* Check if real service by <proto,addr,port> is present */ +bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport) { struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash; struct ip_vs_dest *dest; - /* - * Check for "full" addressed entries - * Return the first found entry - */ + /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); - read_lock(&ipvs->rs_lock); - list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { + rcu_read_lock(); + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { /* HIT */ - read_unlock(&ipvs->rs_lock); - return dest; + rcu_read_unlock(); + return true; } } - read_unlock(&ipvs->rs_lock); + rcu_read_unlock(); - return NULL; + return false; } -/* - * Lookup destination by {addr,port} in the given service +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, @@ -592,7 +575,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find the destination for the given service */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == svc->af) && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && (dest->port == dport)) { @@ -606,13 +589,11 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, /* * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in + * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. + * Called under RCU lock, no refcnt is returned. 
*/ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, @@ -625,7 +606,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) @@ -633,12 +614,31 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, dest = ip_vs_lookup_dest(svc, daddr, port); if (!dest) dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); return dest; } +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_dst *dest_dst = container_of(head, + struct ip_vs_dest_dst, + rcu_head); + + dst_release(dest_dst->dst_cache); + kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, 1); + if (old) { + RCU_INIT_POINTER(dest->dest_dst, NULL); + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); + } +} + /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed @@ -653,19 +653,25 @@ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, __be16 dport) { - struct ip_vs_dest *dest, *nxt; + struct ip_vs_dest *dest; struct netns_ipvs *ipvs = net_ipvs(svc->net); /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); + /* We can not reuse dest while in grace period + * because conns still can use dest->svc + */ + if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) + continue; if (dest->af == svc->af && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && dest->port == dport && @@ -675,29 +681,27 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + list_del(&dest->t_list); + ip_vs_dest_hold(dest); + goto out; } } - return NULL; + dest = NULL; + +out: + spin_unlock_bh(&ipvs->dest_trash_lock); + + return dest; } +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ + __ip_vs_dst_cache_reset(dest); + __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); + kfree(dest); +} /* * Clean up all the destinations in the trash @@ -706,19 +710,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, 
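The dest_dst hunks above show the canonical RCU replace-and-defer-free: the writer detaches the cached entry with RCU_INIT_POINTER() and hands the old one to call_rcu(), so a reader that fetched the pointer just before the swap can keep using it for the rest of its read-side section. A minimal sketch with illustrative struct names (the real callback also drops the dst refcount before kfree):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cache {
	struct rcu_head rcu_head;
	/* ... cached route, cookie, ... */
};

struct holder {
	struct cache __rcu *cache;
};

static void cache_rcu_free(struct rcu_head *head)
{
	struct cache *c = container_of(head, struct cache, rcu_head);

	kfree(c);			/* runs only after all readers left */
}

static void cache_reset(struct holder *h)	/* writer side, serialized */
{
	struct cache *old = rcu_dereference_protected(h->cache, 1);

	if (old) {
		RCU_INIT_POINTER(h->cache, NULL);
		call_rcu(&old->rcu_head, cache_rcu_free);
	}
}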
and the refcnt of each destination in the trash must - * be 1, so we simply release them here. + * be 0, so we simply release them here. */ static void ip_vs_trash_cleanup(struct net *net) { struct ip_vs_dest *dest, *nxt; struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - free_percpu(dest->stats.cpustats); - kfree(dest); + del_timer_sync(&ipvs->dest_trash_timer); + /* No need to use dest_trash_lock */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { + list_del(&dest->t_list); + ip_vs_dest_free(dest); } } @@ -768,6 +771,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_scheduler *sched; int conn_flags; /* set the weight and the flags */ @@ -783,9 +787,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * Put the real service in rs_table if not present. * For now only for NAT! */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_hash(ipvs, dest); - write_unlock_bh(&ipvs->rs_lock); } atomic_set(&dest->conn_flags, conn_flags); @@ -809,27 +811,20 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->l_threshold = udest->l_threshold; spin_lock_bh(&dest->dst_lock); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - if (add) - ip_vs_start_estimator(svc->net, &dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - + sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { - list_add(&dest->n_list, &svc->destinations); + ip_vs_start_estimator(svc->net, &dest->stats); + list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; + if (sched->add_dest) + sched->add_dest(svc, dest); + } else { + if (sched->upd_dest) + sched->upd_dest(svc, dest); } - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); } @@ -881,7 +876,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atomic_set(&dest->persistconns, 0); atomic_set(&dest->refcnt, 1); - INIT_LIST_HEAD(&dest->d_list); + INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); @@ -923,10 +918,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Check if the dest already exists in the list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); @@ -948,11 +943,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - __ip_vs_update_dest(svc, dest, udest, 1); ret = 0; } else { @@ -992,10 +982,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - /* - * Lookup the destination list - */ + /* We use function that requires RCU lock */ + rcu_read_lock(); 
dest = ip_vs_lookup_dest(svc, &daddr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); @@ -1008,11 +998,21 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return 0; } +static void ip_vs_dest_wait_readers(struct rcu_head *head) +{ + struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest, + rcu_head); + + /* End of grace period after unlinking */ + clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); +} + /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, + bool cleanup) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1021,38 +1021,24 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) /* * Remove it from the d-linked list with the real services. */ - write_lock_bh(&ipvs->rs_lock); ip_vs_rs_unhash(dest); - write_unlock_bh(&ipvs->rs_lock); - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", - dest->vfwmark, - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port)); - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - free_percpu(dest->stats.cpustats); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ipvs->dest_trash); - atomic_inc(&dest->refcnt); + if (!cleanup) { + set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); + call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers); } + + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + IP_VS_DEST_TRASH_PERIOD); + /* dest lives in trash without reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_dest_put(dest); } @@ -1068,14 +1054,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, /* * Remove it from the d-linked destination list. 
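ip_vs_dest_wait_readers() above is a grace-period marker rather than a destructor: a dest entering the trash sets a REMOVING bit and queues the callback, and the bit is cleared only once no RCU reader can still see the object, which is what lets ip_vs_trash_get_dest() and the expire timer skip entries that are not yet reusable. The shape of that pattern, with illustrative names:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/bitops.h>

#define OBJ_REMOVING	0		/* bit number in obj->state */

struct obj {
	unsigned long state;
	struct rcu_head rcu_head;
};

static void obj_wait_readers(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu_head);

	clear_bit(OBJ_REMOVING, &o->state);	/* grace period is over */
}

static void obj_unlink(struct obj *o)
{
	/* ... remove o from all RCU-visible lists first ... */
	set_bit(OBJ_REMOVING, &o->state);
	call_rcu(&o->rcu_head, obj_wait_readers);
}

static bool obj_reusable(const struct obj *o)	/* trash/reuse side */
{
	return !test_bit(OBJ_REMOVING, &o->state);
}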
*/ - list_del(&dest->n_list); + list_del_rcu(&dest->n_list); svc->num_dests--; - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); + if (svcupd) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched->del_dest) + sched->del_dest(svc, dest); + } } @@ -1090,37 +1078,56 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); + /* We use function that requires RCU lock */ + rcu_read_lock(); dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); - write_unlock_bh(&__ip_vs_svc_lock); - /* * Delete the destination */ - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, false); LeaveFunction(2); return 0; } +static void ip_vs_dest_trash_expire(unsigned long data) +{ + struct net *net = (struct net *) data; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest *dest, *next; + + spin_lock(&ipvs->dest_trash_lock); + list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { + /* Skip if dest is in grace period */ + if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) + continue; + if (atomic_read(&dest->refcnt) > 0) + continue; + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->svc->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } + if (!list_empty(&ipvs->dest_trash)) + mod_timer(&ipvs->dest_trash_timer, + jiffies + IP_VS_DEST_TRASH_PERIOD); + spin_unlock(&ipvs->dest_trash_lock); +} /* * Add a service into the service hash table @@ -1157,9 +1164,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out_err; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out_err; + } } #endif @@ -1176,7 +1187,6 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, } /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 0); atomic_set(&svc->refcnt, 0); svc->af = u->af; @@ -1190,7 +1200,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, svc->net = net; INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->sched_lock); spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ @@ -1200,7 +1210,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, sched = NULL; /* Bind the ct retriever */ - ip_vs_bind_pe(svc, pe); + RCU_INIT_POINTER(svc->pe, pe); pe = NULL; /* Update the virtual service counters */ @@ -1216,9 +1226,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ipvs->num_services++; /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; /* Now there is a service - full throttle */ @@ -1228,15 +1236,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, out_err: if (svc != NULL) { - 
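The netmask validation above exists because the field is typed __be32 but, for IPv6 services, actually carries a host-order prefix length; the __force cast strips the endianness annotation (satisfying sparse) before the 1..128 range check. A sketch of that check in isolation:

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/errno.h>

static int validate_netmask(int af, __be32 netmask)
{
	if (af == AF_INET6) {
		__u32 plen = (__force __u32) netmask;	/* prefix length */

		if (plen < 1 || plen > 128)
			return -EINVAL;
	}
	return 0;	/* AF_INET: any 32-bit mask is acceptable */
}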
ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - if (svc->stats.cpustats) - free_percpu(svc->stats.cpustats); - kfree(svc); + ip_vs_unbind_scheduler(svc, sched); + ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); @@ -1280,18 +1281,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } #ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; - goto out; + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out; + } } #endif - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched != old_sched) { + /* Bind the new scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + old_sched = sched; + goto out; + } + /* Unbind the old scheduler on success */ + ip_vs_unbind_scheduler(svc, old_sched); + } /* * Set the flags and timeout value @@ -1300,57 +1310,30 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - old_pe = svc->pe; - if (pe != old_pe) { - ip_vs_unbind_pe(svc); - ip_vs_bind_pe(svc, pe); - } + old_pe = rcu_dereference_protected(svc->pe, 1); + if (pe != old_pe) + rcu_assign_pointer(svc->pe, pe); -out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } +static void ip_vs_service_rcu_free(struct rcu_head *head) +{ + struct ip_vs_service *svc; + + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! 
* - We are called under _bh lock */ -static void __ip_vs_del_service(struct ip_vs_service *svc) +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; @@ -1366,27 +1349,20 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) ip_vs_stop_estimator(svc->net, &svc->stats); /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); + old_sched = rcu_dereference_protected(svc->scheduler, 1); + ip_vs_unbind_scheduler(svc, old_sched); ip_vs_scheduler_put(old_sched); - /* Unbind persistence engine */ - old_pe = svc->pe; - ip_vs_unbind_pe(svc); + /* Unbind persistence engine, keep svc->pe */ + old_pe = rcu_dereference_protected(svc->pe, 1); ip_vs_pe_put(old_pe); - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(svc->net, dest); + __ip_vs_del_dest(svc->net, dest, cleanup); } /* @@ -1400,13 +1376,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Free the service if nobody refers to it */ - if (atomic_read(&svc->refcnt) == 0) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port), atomic_read(&svc->usecnt)); - free_percpu(svc->stats.cpustats); - kfree(svc); + ntohs(svc->port)); + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } /* decrease the module use count */ @@ -1416,23 +1391,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Unlink a service from list and try to delete it if its refcnt reached 0 */ -static void ip_vs_unlink_service(struct ip_vs_service *svc) +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { + /* Hold svc to avoid double release from dest_trash */ + atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. 
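The free path above changes from "free if refcnt happens to be 0" to the safer atomic_dec_and_test() plus an RCU-deferred kfree, so a lockless lookup that raced with deletion can finish reading the service before the memory goes away. A minimal sketch with illustrative names:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/atomic.h>

struct svc {
	atomic_t refcnt;
	struct rcu_head rcu_head;
	/* ... */
};

static void svc_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct svc, rcu_head));
}

static void svc_put(struct svc *s)
{
	if (atomic_dec_and_test(&s->refcnt))		/* last reference */
		call_rcu(&s->rcu_head, svc_rcu_free);	/* free after GP */
}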
- */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); + __ip_vs_del_service(svc, cleanup); } /* @@ -1442,7 +1410,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) { if (svc == NULL) return -EEXIST; - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, false); return 0; } @@ -1451,19 +1419,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(struct net *net) +static int ip_vs_flush(struct net *net, bool cleanup) { int idx; - struct ip_vs_service *svc, *nxt; + struct ip_vs_service *svc; + struct hlist_node *n; /* * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], - s_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + s_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1471,10 +1440,10 @@ static int ip_vs_flush(struct net *net) * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + f_list) { if (net_eq(svc->net, net)) - ip_vs_unlink_service(svc); + ip_vs_unlink_service(svc, cleanup); } } @@ -1490,32 +1459,32 @@ void ip_vs_service_net_cleanup(struct net *net) EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(net); + ip_vs_flush(net, true); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } -/* - * Release dst hold by dst_cache - */ + +/* Put all references for device (dst_cache) */ static inline void -__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { + struct ip_vs_dest_dst *dest_dst; + spin_lock_bh(&dest->dst_lock); - if (dest->dst_cache && dest->dst_cache->dev == dev) { + dest_dst = rcu_dereference_protected(dest->dest_dst, 1); + if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - ip_vs_dst_reset(dest); + __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } -/* - * Netdev event receiver - * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to - * a device that is "unregister" it must be released. 
+/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1527,35 +1496,37 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct ip_vs_dest *dest; unsigned int idx; - if (event != NETDEV_UNREGISTER || !ipvs) + if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); EnterFunction(2); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) { list_for_each_entry(dest, &svc->destinations, n_list) { - __ip_vs_dev_reset(dest, dev); + ip_vs_forget_dev(dest, dev); } } } } - list_for_each_entry(dest, &ipvs->dest_trash, n_list) { - __ip_vs_dev_reset(dest, dev); + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + ip_vs_forget_dev(dest, dev); } + spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); return NOTIFY_DONE; @@ -1568,12 +1539,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; - write_lock_bh(&__ip_vs_svc_lock); list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); return 0; } @@ -1583,14 +1552,14 @@ static int ip_vs_zero_all(struct net *net) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (net_eq(svc->net, net)) ip_vs_zero_service(svc); } @@ -1808,6 +1777,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -1912,7 +1887,7 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - struct list_head *table; + struct hlist_head *table; int bucket; }; @@ -1945,7 +1920,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; @@ -1956,7 +1931,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + f_list) { if 
(net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; @@ -1969,17 +1945,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) + __acquires(RCU) { - - read_lock_bh(&__ip_vs_svc_lock); + rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *e; + struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; @@ -1992,13 +1967,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_table[iter->bucket], + s_list) { return svc; } } @@ -2009,13 +1985,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); + e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_fwm_table[iter->bucket], + f_list) return svc; } @@ -2023,9 +2001,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) + __releases(RCU) { - read_unlock_bh(&__ip_vs_svc_lock); + rcu_read_unlock(); } @@ -2043,6 +2021,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -2051,18 +2030,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - svc->scheduler->name); + sched->name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - svc->scheduler->name, + sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, svc->scheduler->name, + svc->fwmark, sched->name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2073,7 +2052,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) else seq_putc(seq, '\n'); - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, @@ -2167,7 +2146,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; - struct ip_vs_cpu_stats *cpustats = 
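The seq_file hunks above swap the read-locked walk for an RCU one: ->start takes rcu_read_lock(), ->stop drops it, and the __acquires(RCU)/__releases(RCU) sparse annotations document that the lock is held across ->next and ->show. A compilable skeleton of that start/stop pairing over a trivial static array (the real iterator walks the RCU hash tables instead):

#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>

static const char *items[] = { "one", "two", "three" };

static void *demo_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();	/* protects the whole start..stop walk */
	return *pos < ARRAY_SIZE(items) ? (void *)&items[*pos] : NULL;
}

static void *demo_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return *pos < ARRAY_SIZE(items) ? (void *)&items[*pos] : NULL;
}

static void demo_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int demo_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%s\n", *(const char **)v);
	return 0;
}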
tot_stats->cpustats; + struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_stats_user rates; int i; @@ -2383,7 +2362,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(net); + ret = ip_vs_flush(net, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ @@ -2418,11 +2397,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) } /* Lookup the exact service by <protocol, addr, port> or fwmark */ + rcu_read_lock(); if (usvc.fwmark == 0) svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2474,11 +2455,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(src->scheduler, 1); dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2497,7 +2481,7 @@ __ip_vs_get_service_entries(struct net *net, int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2516,7 +2500,7 @@ __ip_vs_get_service_entries(struct net *net, } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; @@ -2545,11 +2529,13 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; + rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); else svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, get->port); + rcu_read_unlock(); if (svc) { int count = 0; @@ -2732,12 +2718,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; + rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(net, AF_INET, entry->protocol, &addr, entry->port); + rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2894,6 +2882,8 @@ nla_put_failure: static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { + struct ip_vs_scheduler *sched; + struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; @@ -2910,16 +2900,17 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } else { if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || nla_put(skb, IPVS_SVC_ATTR_ADDR, 
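The one-line change above adds the __percpu address-space annotation to the cpustats pointer, which lets sparse flag any direct dereference; accesses must go through per_cpu_ptr() or the this_cpu_* helpers. A sketch of a reader summing such stats (it ignores the u64_stats seqcount the real IPVS code needs to avoid torn reads on 32-bit):

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/types.h>

struct cpu_stats {
	u64 packets;
};

static u64 sum_packets(struct cpu_stats __percpu *stats)
{
	u64 total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += per_cpu_ptr(stats, cpu)->packets;
	return total;
}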
sizeof(svc->addr), &svc->addr) || - nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) goto nla_put_failure; } - if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) || - (svc->pe && - nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) || + sched = rcu_dereference_protected(svc->scheduler, 1); + pe = rcu_dereference_protected(svc->pe, 1); + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || - nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) goto nla_put_failure; @@ -2965,7 +2956,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -2976,7 +2967,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3032,15 +3023,17 @@ static int ip_vs_genl_parse_service(struct net *net, } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); + usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } + rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, &usvc->addr, usvc->port); + rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ @@ -3070,7 +3063,7 @@ static int ip_vs_genl_parse_service(struct net *net, usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? 
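The nla_put_u16 to nla_put_be16 (and u32 to be32) conversions above are type hygiene, not wire-format changes: the emitted bytes are identical, but the be-typed helpers accept __be16/__be32 directly so sparse endianness checking stays intact. A sketch with an illustrative attribute enum:

#include <net/netlink.h>

enum {
	DEMO_ATTR_UNSPEC,
	DEMO_ATTR_PORT,		/* __be16, network order */
	DEMO_ATTR_NETMASK,	/* __be32 */
};

static int demo_fill(struct sk_buff *skb, __be16 port, __be32 mask)
{
	if (nla_put_be16(skb, DEMO_ATTR_PORT, port) ||
	    nla_put_be32(skb, DEMO_ATTR_NETMASK, mask))
		return -EMSGSIZE;
	return 0;
}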
nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); + usvc->netmask = nla_get_be32(nla_netmask); } return 0; @@ -3096,7 +3089,7 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) return -EMSGSIZE; if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || - nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, (atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK)) || @@ -3205,7 +3198,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); + udest->port = nla_get_be16(nla_port); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -3230,8 +3223,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, return 0; } -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid) { struct nlattr *nl_daemon; @@ -3252,8 +3245,8 @@ nla_put_failure: return -EMSGSIZE; } -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid, +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, + const char *mcast_ifn, __u32 syncid, struct netlink_callback *cb) { void *hdr; @@ -3392,7 +3385,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(net); + ret = ip_vs_flush(net, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(net, info->attrs); @@ -3741,6 +3734,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); @@ -3783,13 +3777,14 @@ int __net_init ip_vs_control_net_init(struct net *net) int idx; struct netns_ipvs *ipvs = net_ipvs(net); - rwlock_init(&ipvs->rs_lock); - /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) - INIT_LIST_HEAD(&ipvs->rs_table[idx]); + INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); + spin_lock_init(&ipvs->dest_trash_lock); + setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, + (unsigned long) net); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); @@ -3800,10 +3795,10 @@ int __net_init ip_vs_control_net_init(struct net *net) spin_lock_init(&ipvs->tot_stats.lock); - proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); - proc_net_fops_create(net, "ip_vs_stats_percpu", 0, - &ip_vs_stats_percpu_fops); + proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, net->proc_net, + &ip_vs_stats_percpu_fops); if (ip_vs_control_net_init_sysctl(net)) goto err; @@ -3819,12 +3814,16 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + /* Some dest can be in grace period even before cleanup, we 
have to + * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called. + */ + rcu_barrier(); ip_vs_trash_cleanup(net); ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); - proc_net_remove(net, "ip_vs_stats_percpu"); - proc_net_remove(net, "ip_vs_stats"); - proc_net_remove(net, "ip_vs"); + remove_proc_entry("ip_vs_stats_percpu", net->proc_net); + remove_proc_entry("ip_vs_stats", net->proc_net); + remove_proc_entry("ip_vs", net->proc_net); free_percpu(ipvs->tot_stats.cpustats); } @@ -3864,10 +3863,10 @@ int __init ip_vs_control_init(void) EnterFunction(2); - /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ + /* Initialize svc_table, ip_vs_svc_fwm_table */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); } smp_wmb(); /* Do we really need it now ? */ diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 7f3b0cc00b7a..ccab120df45e 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -51,7 +51,7 @@ * IPVS DH bucket */ struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -64,6 +64,10 @@ struct ip_vs_dh_bucket { #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) +struct ip_vs_dh_state { + struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; + struct rcu_head rcu_head; +}; /* * Returns hash value for IPVS DH entry @@ -85,10 +89,9 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { - return (tbl[ip_vs_dh_hashkey(af, addr)]).dest; + return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } @@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); p = p->next; } @@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
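The rcu_barrier() added above is the complement of all the call_rcu() users introduced in this patch: synchronize_rcu() only waits for readers, while rcu_barrier() also waits for every already-queued callback to execute, so ip_vs_dest_wait_readers() cannot fire after the trash has been torn down. The generic teardown shape:

#include <linux/rcupdate.h>

static void demo_cleanup(void)
{
	/* 1. stop anything that can queue new call_rcu() callbacks */

	/* 2. wait for readers AND for all pending callbacks to run;
	 *    a plain synchronize_rcu() would only cover the readers
	 */
	rcu_barrier();

	/* 3. now the remaining objects can be freed synchronously */
}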
*/ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -145,51 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl; + struct ip_vs_dh_state *s; /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_KERNEL); - if (tbl == NULL) + s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - svc->sched_data = tbl; + svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_dh_reassign(s, svc); return 0; } -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; + struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + ip_vs_dh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; } -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); + struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); + ip_vs_dh_reassign(s, svc); return 0; } @@ -212,19 +217,20 @@ static struct ip_vs_dest * ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; + struct ip_vs_dh_state *s; struct ip_vs_iphdr iph; ip_vs_fill_iph_addr_only(svc->af, skb, &iph); IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); + s = (struct ip_vs_dh_state *) svc->sched_data; + dest = ip_vs_dh_get(svc->af, s, &iph.daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } @@ -248,7 +254,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, + .add_dest = ip_vs_dh_dest_changed, + .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; @@ -262,6 +269,7 @@ static int __init ip_vs_dh_init(void) static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 0fac6017b6fb..6bee6d0c73a5 100644 --- a/net/netfilter/ipvs/ip_vs_est.c 
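The DH scheduler above keeps its per-service state in one kzalloc'ed blob and retires it with kfree_rcu(), which covers the common "free after grace period, nothing else to do" case without writing an explicit callback; the module exit additionally runs synchronize_rcu() so a CPU still inside ->schedule() finishes before the module text can vanish. A condensed sketch with illustrative names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct sched_state {
	struct rcu_head rcu_head;
	/* ... hash buckets holding RCU pointers to dests ... */
};

static void sched_done_svc(struct sched_state *s)
{
	/* drop bucket references first, then free after a grace period */
	kfree_rcu(s, rcu_head);
}

static void sched_module_exit(void)
{
	/* unregister the scheduler, then wait out in-flight callers */
	synchronize_rcu();
}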
+++ b/net/netfilter/ipvs/ip_vs_est.c @@ -56,7 +56,7 @@ * Make a summary from each cpu */ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, - struct ip_vs_cpu_stats *stats) + struct ip_vs_cpu_stats __percpu *stats) { int i; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 4f53a5f04437..77c173282f38 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -267,10 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * hopefully it will succeed on the retransmitted * packet. */ + rcu_read_lock(); ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, iph->ihl * 4, start-data, end-start, buf, buf_len); + rcu_read_unlock(); if (ret) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); @@ -480,6 +482,7 @@ static int __init ip_vs_ftp_init(void) int rv; rv = register_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns on error */ return rv; } @@ -489,6 +492,7 @@ static int __init ip_vs_ftp_init(void) static void __exit ip_vs_ftp_exit(void) { unregister_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns */ } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index fdd89b9564ea..5ea26bd87743 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -90,11 +90,12 @@ * IP address and its destination server */ struct ip_vs_lblc_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -102,12 +103,14 @@ struct ip_vs_lblc_entry { * IPVS lblc hash table */ struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct timer_list periodic_timer; /* collect stale entries */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -129,13 +132,16 @@ static ctl_table vs_vars_table[] = { static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) { - list_del(&en->list); + struct ip_vs_dest *dest; + + hlist_del_rcu(&en->list); /* * We don't kfree dest because it is referred either by its service * or the trash dest list. */ - atomic_dec(&en->dest->refcnt); - kfree(en); + dest = rcu_dereference_protected(en->dest, 1); + ip_vs_dest_put(dest); + kfree_rcu(en, rcu_head); } @@ -165,15 +171,12 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ +/* Get ip_vs_lblc_entry associated with supplied parameters. 
*/ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) @@ -181,7 +184,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -191,7 +194,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, /* * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. + * address to a server. Called under spin lock. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, @@ -209,14 +212,20 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, ip_vs_addr_copy(dest->af, &en->addr, daddr); en->lastuse = jiffies; - atomic_inc(&dest->refcnt); - en->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(en->dest, dest); ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; + } else { + struct ip_vs_dest *old_dest; + + old_dest = rcu_dereference_protected(en->dest, 1); + if (old_dest != dest) { + ip_vs_dest_put(old_dest); + ip_vs_dest_hold(dest); + /* No ordering constraints for refcnt */ + RCU_INIT_POINTER(en->dest, dest); + } } return en; @@ -226,17 +235,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +static void ip_vs_lblc_flush(struct ip_vs_service *svc) { - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; int i; + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblc_expiration(struct ip_vs_service *svc) @@ -252,15 +266,16 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc) static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; unsigned long now = jiffies; int i, j; for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_lblc_expiration(svc))) @@ -269,7 +284,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -293,7 +308,8 @@ static void ip_vs_lblc_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblc_entry *en, *nxt; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -314,8 +330,8 @@ static void 
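With the hlist conversion above, ip_vs_lblc_get() no longer takes any lock on the fast path: hlist_for_each_entry_rcu() is safe against concurrent hlist_add_head_rcu()/hlist_del_rcu() provided the caller sits inside an RCU read-side critical section, which the IPVS packet paths already do. Reader-side sketch matching the patched lookup (kernel fragment, illustrative):

/* Lockless lookup; the returned entry is only guaranteed to exist
 * until the surrounding rcu_read_unlock(). */
static struct ip_vs_lblc_entry *
lblc_lookup_sketch(struct ip_vs_lblc_table *tbl, int af,
                   const union nf_inet_addr *addr)
{
    unsigned int hash = ip_vs_lblc_hashkey(af, addr);
    struct ip_vs_lblc_entry *en;

    hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
        if (ip_vs_addr_equal(af, &en->addr, addr))
            return en;
    return NULL;
}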
ip_vs_lblc_check_expire(unsigned long data) for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; @@ -323,7 +339,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -354,11 +370,12 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) * Initialize the hash buckets */ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -371,7 +388,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; @@ -379,14 +396,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); + ip_vs_lblc_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -408,7 +423,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { @@ -423,7 +438,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -457,7 +472,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -484,7 +499,6 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); if (en) { /* We only hold a read lock, but this is atomic */ @@ -499,14 +513,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * free up entries from the trash at any time. 
*/ - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; + dest = rcu_dereference(en->dest); + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); @@ -516,9 +527,10 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblc_new(tbl, &iph.daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", @@ -621,6 +633,7 @@ static void __exit ip_vs_lblc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); unregister_pernet_subsys(&ip_vs_lblc_ops); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index c03b6a3ade2f..50123c2ab484 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -89,40 +89,44 @@ */ struct ip_vs_dest_set_elem { struct list_head list; /* list link */ - struct ip_vs_dest *dest; /* destination server */ + struct ip_vs_dest __rcu *dest; /* destination server */ + struct rcu_head rcu_head; }; struct ip_vs_dest_set { atomic_t size; /* set size */ unsigned long lastmod; /* last modified time */ struct list_head list; /* destination list */ - rwlock_t lock; /* lock for this list */ }; -static struct ip_vs_dest_set_elem * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, + struct ip_vs_dest *dest, bool check) { struct ip_vs_dest_set_elem *e; - list_for_each_entry(e, &set->list, list) { - if (e->dest == dest) - /* already existed */ - return NULL; + if (check) { + list_for_each_entry(e, &set->list, list) { + struct ip_vs_dest *d; + + d = rcu_dereference_protected(e->dest, 1); + if (d == dest) + /* already existed */ + return; + } } e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) - return NULL; + return; - atomic_inc(&dest->refcnt); - e->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(e->dest, dest); - list_add(&e->list, &set->list); + list_add_rcu(&e->list, &set->list); atomic_inc(&set->size); set->lastmod = jiffies; - return e; } static void @@ -131,13 +135,16 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) struct ip_vs_dest_set_elem *e; list_for_each_entry(e, &set->list, list) { - if (e->dest == dest) { + struct ip_vs_dest *d; + + d = rcu_dereference_protected(e->dest, 1); + if (d == dest) { /* HIT */ atomic_dec(&set->size); set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + ip_vs_dest_put(dest); + list_del_rcu(&e->list); + kfree_rcu(e, rcu_head); break; } } @@ -147,17 +154,18 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e, *ep; - write_lock(&set->lock); list_for_each_entry_safe(e, ep, &set->list, list) { + struct ip_vs_dest *d; + + d = rcu_dereference_protected(e->dest, 1); /* * We don't kfree dest because it is referred either * by its service or by the trash dest list. 
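The new dead flag above closes the race between service teardown and a packet that already looked up the old table: ip_vs_lblc_flush() sets dead = 1 under sched_lock before freeing entries, and the scheduler re-checks the flag under the same lock before calling ip_vs_lblc_new(), so nothing can be inserted into a table that is about to be reclaimed by kfree_rcu(). A runnable pthread rendering of that handshake (all names ours):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct cache {
    pthread_mutex_t lock;
    bool dead;        /* set once teardown has started */
    int entries;
};

static void cache_flush(struct cache *c)
{
    pthread_mutex_lock(&c->lock);
    c->dead = true;   /* late inserters must observe this */
    c->entries = 0;
    pthread_mutex_unlock(&c->lock);
}

static bool cache_insert(struct cache *c)
{
    bool ok = false;

    pthread_mutex_lock(&c->lock);
    if (!c->dead) {   /* re-check under the lock, as the patch does */
        c->entries++;
        ok = true;
    }
    pthread_mutex_unlock(&c->lock);
    return ok;
}

int main(void)
{
    struct cache c = { PTHREAD_MUTEX_INITIALIZER, false, 0 };

    printf("insert before flush: %d\n", cache_insert(&c));
    cache_flush(&c);
    printf("insert after flush:  %d\n", cache_insert(&c));
    return 0;
}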
*/ - atomic_dec(&e->dest->refcnt); - list_del(&e->list); - kfree(e); + ip_vs_dest_put(d); + list_del_rcu(&e->list); + kfree_rcu(e, rcu_head); } - write_unlock(&set->lock); } /* get weighted least-connection node in the destination set */ @@ -171,8 +179,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) return NULL; /* select the first destination server, whose weight > 0 */ - list_for_each_entry(e, &set->list, list) { - least = e->dest; + list_for_each_entry_rcu(e, &set->list, list) { + least = rcu_dereference(e->dest); if (least->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -186,8 +194,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* find the destination with the weighted least load */ nextstage: - list_for_each_entry(e, &set->list, list) { - dest = e->dest; + list_for_each_entry_continue_rcu(e, &set->list, list) { + dest = rcu_dereference(e->dest); if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -224,7 +232,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) /* select the first destination server, whose weight > 0 */ list_for_each_entry(e, &set->list, list) { - most = e->dest; + most = rcu_dereference_protected(e->dest, 1); if (atomic_read(&most->weight) > 0) { moh = ip_vs_dest_conn_overhead(most); goto nextstage; @@ -234,8 +242,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) /* find the destination with the weighted most load */ nextstage: - list_for_each_entry(e, &set->list, list) { - dest = e->dest; + list_for_each_entry_continue(e, &set->list, list) { + dest = rcu_dereference_protected(e->dest, 1); doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ if ((moh * atomic_read(&dest->weight) < @@ -262,11 +270,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) * IP address and its destination server set */ struct ip_vs_lblcr_entry { - struct list_head list; + struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest_set set; /* destination server set */ unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; }; @@ -274,12 +283,14 @@ struct ip_vs_lblcr_entry { * IPVS lblcr hash table */ struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ + bool dead; }; @@ -302,9 +313,9 @@ static ctl_table vs_vars_table[] = { static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { - list_del(&en->list); + hlist_del_rcu(&en->list); ip_vs_dest_set_eraseall(&en->set); - kfree(en); + kfree_rcu(en, rcu_head); } @@ -334,15 +345,12 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); - list_add(&en->list, &tbl->bucket[hash]); + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ +/* Get ip_vs_lblcr_entry associated with supplied parameters. 
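Both ip_vs_dest_set_min() and ip_vs_dest_set_max() above compare load-to-weight ratios by cross-multiplying: for positive weights, loh/lw < doh/dw exactly when loh*dw < doh*lw, which avoids integer division and its truncation. The selection loop boiled down to runnable C (types are ours; the widening cast guards the multiply):

#include <stdio.h>

struct dst { int overhead, weight; };

/* Pick the smallest overhead/weight ratio without dividing,
 * mirroring the comparison in ip_vs_dest_set_min()/_max(). */
static int pick_least(const struct dst *d, int n)
{
    int least = -1, i;

    for (i = 0; i < n; i++) {
        if (d[i].weight <= 0)
            continue;    /* weight 0 means quiesced */
        if (least < 0 ||
            (long long)d[i].overhead * d[least].weight <
            (long long)d[least].overhead * d[i].weight)
            least = i;
    }
    return least;
}

int main(void)
{
    struct dst d[] = { { 10, 1 }, { 30, 5 }, { 8, 0 } };

    printf("least loaded: index %d\n", pick_least(d, 3));  /* 1 */
    return 0;
}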
*/ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) @@ -350,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; - list_for_each_entry(en, &tbl->bucket[hash], list) + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; @@ -360,7 +368,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, /* * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. + * IP address to a server. Called under spin lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, @@ -381,14 +389,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* initialize its dest set */ atomic_set(&(en->set.size), 0); INIT_LIST_HEAD(&en->set.list); - rwlock_init(&en->set.lock); + + ip_vs_dest_set_insert(&en->set, dest, false); ip_vs_lblcr_hash(tbl, en); + return en; } - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest, true); return en; } @@ -397,17 +405,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, /* * Flush all the entries of the specified table. */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +static void ip_vs_lblcr_flush(struct ip_vs_service *svc) { + struct ip_vs_lblcr_table *tbl = svc->sched_data; int i; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; - /* No locking required, only called during cleanup. 
*/ + spin_lock_bh(&svc->sched_lock); + tbl->dead = 1; for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } } + spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblcr_expiration(struct ip_vs_service *svc) @@ -425,13 +437,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_after(en->lastuse + sysctl_lblcr_expiration(svc), now)) continue; @@ -439,7 +452,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -463,7 +476,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data) unsigned long now = jiffies; int goal; int i, j; - struct ip_vs_lblcr_entry *en, *nxt; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ @@ -484,8 +498,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data) for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - write_lock(&svc->sched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; @@ -493,7 +507,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&svc->sched_lock); + spin_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -523,11 +537,12 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) * Initialize the hash buckets */ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); + INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; + tbl->dead = 0; /* * Hook periodic timer for garbage collection @@ -540,7 +555,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; @@ -548,14 +563,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); + ip_vs_lblcr_flush(svc); /* release the table itself */ - kfree(tbl); + kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", sizeof(*tbl)); - - return 0; } @@ -577,7 +590,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * The server with weight=0 is quiesced and will not receive any * new connection. 
*/ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -593,7 +606,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) * Find the destination with the least load. */ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; @@ -627,7 +640,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; - list_for_each_entry(d, &svc->destinations, n_list) { + list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; @@ -646,7 +659,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_lblcr_table *tbl = svc->sched_data; struct ip_vs_iphdr iph; - struct ip_vs_dest *dest = NULL; + struct ip_vs_dest *dest; struct ip_vs_lblcr_entry *en; ip_vs_fill_iph_addr_only(svc->af, skb, &iph); @@ -654,53 +667,46 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ - read_lock(&svc->sched_lock); en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); if (en) { - /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; /* Get the least loaded destination */ - read_lock(&en->set.lock); dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + + time_after(jiffies, en->set.lastmod + sysctl_lblcr_expiration(svc))) { - struct ip_vs_dest *m; + spin_lock_bh(&svc->sched_lock); + if (atomic_read(&en->set.size) > 1) { + struct ip_vs_dest *m; - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + spin_unlock_bh(&svc->sched_lock); } /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); + if (dest && !is_overloaded(dest, svc)) goto out; - } /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); - read_unlock(&svc->sched_lock); return NULL; } /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_dest_set_insert(&en->set, dest, true); + spin_unlock_bh(&svc->sched_lock); goto out; + } /* No cache entry, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); @@ -710,9 +716,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, &iph.daddr, dest); - write_unlock(&svc->sched_lock); + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblcr_new(tbl, &iph.daddr, dest); + spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", @@ -814,6 +821,7 @@ static void __exit 
ip_vs_lblcr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); unregister_pernet_subsys(&ip_vs_lblcr_ops); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index f391819c0cca..5128e338a749 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -42,7 +42,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * served, but no new connection is assigned to the server. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || atomic_read(&dest->weight) == 0) continue; @@ -84,6 +84,7 @@ static int __init ip_vs_lc_init(void) static void __exit ip_vs_lc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); + synchronize_rcu(); } module_init(ip_vs_lc_init); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 984d9c137d84..646cfd4baa73 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -75,7 +75,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD || !atomic_read(&dest->weight)) @@ -133,6 +133,7 @@ static int __init ip_vs_nq_init(void) static void __exit ip_vs_nq_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); + synchronize_rcu(); } module_init(ip_vs_nq_init); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 5cf859ccb31b..1a82b29ce8ea 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -13,20 +13,8 @@ /* IPVS pe list */ static LIST_HEAD(ip_vs_pe); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_pe_lock); - -/* Bind a service with a pe */ -void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) -{ - svc->pe = pe; -} - -/* Unbind a service from its pe */ -void ip_vs_unbind_pe(struct ip_vs_service *svc) -{ - svc->pe = NULL; -} +/* semaphore for IPVS PEs. */ +static DEFINE_MUTEX(ip_vs_pe_mutex); /* Get pe in the pe list by name */ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) @@ -36,9 +24,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); - spin_lock_bh(&ip_vs_pe_lock); - - list_for_each_entry(pe, &ip_vs_pe, n_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { /* Test and get the modules atomically */ if (pe->module && !try_module_get(pe->module)) { @@ -47,14 +34,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) } if (strcmp(pe_name, pe->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_pe_lock); + rcu_read_unlock(); return pe; } if (pe->module) module_put(pe->module); } + rcu_read_unlock(); - spin_unlock_bh(&ip_vs_pe_lock); return NULL; } @@ -83,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_pe_lock); - - if (!list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - ip_vs_use_count_dec(); - pr_err("%s(): [%s] pe already linked\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist * in the pe list. 
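ip_vs_pe.c above splits what the old spinlock did in two: writers (register/unregister) serialize on a mutex and publish with list_add_rcu()/list_del_rcu(), while __ip_vs_pe_getbyname() walks the list under rcu_read_lock() and pins each candidate with try_module_get() before the name compare, so a PE module cannot be unloaded between the match and the return. The lookup shape, condensed from the hunks above (kernel fragment, illustrative):

rcu_read_lock();
list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
    /* pin the module first; skip it if it is mid-unload */
    if (pe->module && !try_module_get(pe->module))
        continue;
    if (strcmp(pe_name, pe->name) == 0) {
        rcu_read_unlock();
        return pe;            /* caller now owns a module ref */
    }
    if (pe->module)
        module_put(pe->module);
}
rcu_read_unlock();
return NULL;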
*/ list_for_each_entry(tmp, &ip_vs_pe, n_list) { if (strcmp(tmp->name, pe->name) == 0) { - spin_unlock_bh(&ip_vs_pe_lock); + mutex_unlock(&ip_vs_pe_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] pe already existed " "in the system\n", __func__, pe->name); @@ -106,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) } } /* Add it into the d-linked pe list */ - list_add(&pe->n_list, &ip_vs_pe); - spin_unlock_bh(&ip_vs_pe_lock); + list_add_rcu(&pe->n_list, &ip_vs_pe); + mutex_unlock(&ip_vs_pe_mutex); pr_info("[%s] pe registered.\n", pe->name); @@ -118,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe); /* Unregister a pe from the pe list */ int unregister_ip_vs_pe(struct ip_vs_pe *pe) { - spin_lock_bh(&ip_vs_pe_lock); - if (list_empty(&pe->n_list)) { - spin_unlock_bh(&ip_vs_pe_lock); - pr_err("%s(): [%s] pe is not in the list. failed\n", - __func__, pe->name); - return -EINVAL; - } - + mutex_lock(&ip_vs_pe_mutex); /* Remove it from the d-linked pe list */ - list_del(&pe->n_list); - spin_unlock_bh(&ip_vs_pe_lock); + list_del_rcu(&pe->n_list); + mutex_unlock(&ip_vs_pe_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 12475ef88daf..9ef22bdce9f1 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, const char *callid, size_t callid_len, int *idx) { - size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); + size_t max_len = 64; + size_t len = min3(max_len, callid_len, buf_len - *idx - 1); memcpy(buf + *idx, callid, len); buf[*idx+len] = '\0'; *idx += len + 1; @@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff, if (ret > 0) break; if (!ret) - return 0; + return -EINVAL; dataoff += *matchoff; } - /* Empty callid is useless */ - if (!*matchlen) - return -EINVAL; - /* Too large is useless */ if (*matchlen > IP_VS_PEDATA_MAXLEN) return -EINVAL; @@ -172,6 +169,7 @@ static int __init ip_vs_sip_init(void) static void __exit ip_vs_sip_cleanup(void) { unregister_ip_vs_pe(&ip_vs_sip_pe); + synchronize_rcu(); } module_init(ip_vs_sip_init); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index ae8ec6f27688..86464881cd20 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -27,9 +27,10 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (sch == NULL) return 0; net = skb_net(skb); + rcu_read_lock(); if ((sch->type == SCTP_CID_INIT) && - (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol, - &iph->daddr, sh->dest))) { + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, sh->dest))) { int ignored; if (ip_vs_todrop(net_ipvs(net))) { @@ -37,7 +38,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. 
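The ip_vs_pe_sip.c cleanup above replaces nested min() calls with min3() and gives the 64-byte cap a typed variable, dropping the (size_t)64 cast; the bound reads as "at most 64 bytes, at most the Call-ID length, at most the room left before the terminating NUL". The same clamp as a runnable userspace snippet (MIN3 is our macro; the kernel takes min3() from linux/kernel.h):

#include <stdio.h>
#include <string.h>

#define MIN(a, b)     ((a) < (b) ? (a) : (b))
#define MIN3(a, b, c) MIN(MIN(a, b), c)

/* Copy a bounded, NUL-terminated Call-ID into buf at *idx and
 * advance *idx past the NUL -- the clamp ip_vs_dbg_callid() uses. */
static const char *dbg_callid(char *buf, size_t buf_len,
                              const char *callid, size_t callid_len,
                              size_t *idx)
{
    size_t max_len = 64;
    size_t len = MIN3(max_len, callid_len, buf_len - *idx - 1);
    const char *start = buf + *idx;

    memcpy(buf + *idx, callid, len);
    buf[*idx + len] = '\0';
    *idx += len + 1;
    return start;
}

int main(void)
{
    char buf[32];
    size_t idx = 0;

    puts(dbg_callid(buf, sizeof(buf), "abcdef1234@host", 15, &idx));
    return 0;
}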
* We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -49,14 +50,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); - else { - ip_vs_service_put(svc); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -208,7 +208,7 @@ enum ipvs_sctp_event_t { IP_VS_SCTP_EVE_LAST }; -static enum ipvs_sctp_event_t sctp_events[255] = { +static enum ipvs_sctp_event_t sctp_events[256] = { IP_VS_SCTP_EVE_DATA_CLI, IP_VS_SCTP_EVE_INIT_CLI, IP_VS_SCTP_EVE_INIT_ACK_CLI, @@ -906,7 +906,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, sctp_chunkhdr_t _sctpch, *sch; unsigned char chunk_type; int event, next_state; - int ihl; + int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -914,8 +914,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, ihl = ip_hdrlen(skb); #endif - sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), - sizeof(_sctpch), &_sctpch); + cofs = ihl + sizeof(sctp_sctphdr_t); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) return; @@ -933,10 +933,12 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { - sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + - sch->length), sizeof(_sctpch), &_sctpch); - if (sch) { - if (sch->type == SCTP_CID_ABORT) + int clen = ntohs(sch->length); + + if (clen >= sizeof(sctp_chunkhdr_t)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } @@ -992,9 +994,9 @@ static void sctp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); set_sctp_state(pd, cp, direction, skb); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline __u16 sctp_app_hashkey(__be16 port) @@ -1014,30 +1016,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc) hash = sctp_app_hashkey(port); - spin_lock_bh(&ipvs->sctp_app_lock); list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->sctp_app_lock); return ret; } static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); - spin_lock_bh(&ipvs->sctp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->sctp_app_lock); + list_del_rcu(&inc->p_list); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) @@ -1053,12 +1050,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); - spin_lock(&ipvs->sctp_app_lock); - list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - 
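Two plain bug fixes sit among the RCU conversion above: sctp_events[] grows from 255 to 256 slots because it is indexed by the 8-bit chunk type, whose range is 0..255, and the cookie/abort follow-up chunk is now located with ALIGN(clen, 4), since SCTP chunks are padded to 4-byte boundaries on the wire. Both points in a runnable illustration (constants ours):

#include <stdio.h>

#define ALIGN4(x) (((x) + 3U) & ~3U)

int main(void)
{
    /* An 8-bit index spans 0..255, so the lookup table needs 256
     * entries; with only 255, events[255] reads out of bounds. */
    unsigned char events[256] = { 0 };
    unsigned char type = 255;

    events[type] = 1;

    /* A chunk of length 17 is followed by padding to 20, which is
     * where skb_header_pointer() must look for the next chunk. */
    printf("events[255]=%u, next chunk offset after len 17: %u\n",
           events[type], ALIGN4(17));
    return 0;
}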
spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -1074,7 +1071,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->sctp_app_lock); + rcu_read_unlock(); out: return result; } @@ -1088,7 +1085,6 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); - spin_lock_init(&ipvs->sctp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); if (!pd->timeout_table) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 9af653a75825..50a15944c6c1 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -47,9 +47,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, } net = skb_net(skb); /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ + rcu_read_lock(); if (th->syn && - (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol, - &iph->daddr, th->dest))) { + (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, th->dest))) { int ignored; if (ip_vs_todrop(net_ipvs(net))) { @@ -57,7 +58,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -70,14 +71,13 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); - else { - ip_vs_service_put(svc); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -557,9 +557,9 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, if (th == NULL) return; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) @@ -580,18 +580,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) hash = tcp_app_hashkey(port); - spin_lock_bh(&ipvs->tcp_app_lock); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->tcp_app_lock); return ret; } @@ -599,13 +597,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) static void tcp_unregister_app(struct net *net, struct ip_vs_app *inc) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock_bh(&ipvs->tcp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->tcp_app_lock); + list_del_rcu(&inc->p_list); } @@ -624,12 +619,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - spin_lock(&ipvs->tcp_app_lock); - list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - 
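The conn_schedule paths for SCTP and TCP above (UDP follows the same pattern below) trade per-packet reference counting for RCU pinning: the ip_vs_service_get()/ip_vs_service_put() pairs become a single ip_vs_service_find() inside one rcu_read_lock() section, and every early exit now only needs the unlock. The resulting shape (kernel fragment, condensed and illustrative):

rcu_read_lock();
svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
                         &iph->daddr, th->dest);
if (svc) {
    /* schedule the connection; svc is pinned only by the RCU
     * read-side section and must not be used past the unlock */
}
rcu_read_unlock();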
spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -646,7 +641,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->tcp_app_lock); + rcu_read_unlock(); out: return result; @@ -660,11 +655,11 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); - spin_unlock(&cp->lock); + spin_unlock_bh(&cp->lock); } /* --------------------------------------------- @@ -676,7 +671,6 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); - spin_lock_init(&ipvs->tcp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); if (!pd->timeout_table) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 503a842c90d2..b62a3c0ff9bf 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -44,8 +44,9 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, return 0; } net = skb_net(skb); - svc = ip_vs_service_get(net, af, skb->mark, iph->protocol, - &iph->daddr, uh->dest); + rcu_read_lock(); + svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, + &iph->daddr, uh->dest); if (svc) { int ignored; @@ -54,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * It seems that we are very loaded. * We have to drop this packet :( */ - ip_vs_service_put(svc); + rcu_read_unlock(); *verdict = NF_DROP; return 0; } @@ -67,14 +68,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); - else { - ip_vs_service_put(svc); + else *verdict = NF_DROP; - } + rcu_read_unlock(); return 0; } - ip_vs_service_put(svc); } + rcu_read_unlock(); /* NF_ACCEPT */ return 1; } @@ -359,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc) hash = udp_app_hashkey(port); - - spin_lock_bh(&ipvs->udp_app_lock); list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &ipvs->udp_apps[hash]); + list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&ipvs->udp_app_lock); return ret; } @@ -380,12 +377,9 @@ static void udp_unregister_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); - struct netns_ipvs *ipvs = net_ipvs(net); - spin_lock_bh(&ipvs->udp_app_lock); atomic_dec(&pd->appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&ipvs->udp_app_lock); + list_del_rcu(&inc->p_list); } @@ -403,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - spin_lock(&ipvs->udp_app_lock); - list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); IP_VS_DBG_BUF(9, 
"%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -425,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&ipvs->udp_app_lock); + rcu_read_unlock(); out: return result; @@ -467,7 +461,6 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); - spin_lock_init(&ipvs->udp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); if (!pd->timeout_table) diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index c49b388d1085..c35986c793d9 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest) { - svc->sched_data = &svc->destinations; + struct list_head *p; + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + /* dest is already unlinked, so p->prev is not valid but + * p->next is valid, use it to reach previous entry. + */ + if (p == &dest->n_list) + svc->sched_data = p->next->prev; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -48,36 +57,41 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc) static struct ip_vs_dest * ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct list_head *p, *q; - struct ip_vs_dest *dest; + struct list_head *p; + struct ip_vs_dest *dest, *last; + int pass = 0; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + last = dest = list_entry(p, struct ip_vs_dest, n_list); + do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + if (dest == last) + goto stop; } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); + pass++; + /* Previous dest could be unlinked, do not loop forever. + * If we stay at head there is no need for 2nd pass. 
+ */ + } while (pass < 2 && p != &svc->destinations); + +stop: + spin_unlock_bh(&svc->sched_lock); ip_vs_scheduler_err(svc, "no destination available"); return NULL; out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); + svc->sched_data = &dest->n_list; + spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), @@ -94,7 +108,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, + .add_dest = NULL, + .del_dest = ip_vs_rr_del_dest, .schedule = ip_vs_rr_schedule, }; @@ -106,6 +121,7 @@ static int __init ip_vs_rr_init(void) static void __exit ip_vs_rr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); + synchronize_rcu(); } module_init(ip_vs_rr_init); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index d6bf20d6cdbe..4dbcda6258bc 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err); */ static LIST_HEAD(ip_vs_schedulers); -/* lock for service table */ -static DEFINE_SPINLOCK(ip_vs_sched_lock); +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex); /* @@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, { int ret; - svc->scheduler = scheduler; - if (scheduler->init_service) { ret = scheduler->init_service(svc); if (ret) { @@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, return ret; } } - + rcu_assign_pointer(svc->scheduler, scheduler); return 0; } @@ -64,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, /* * Unbind a service with its scheduler */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *sched) { - struct ip_vs_scheduler *sched = svc->scheduler; + struct ip_vs_scheduler *cur_sched; - if (!sched) - return 0; + cur_sched = rcu_dereference_protected(svc->scheduler, 1); + /* This check proves that old 'sched' was installed */ + if (!cur_sched) + return; - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - pr_err("%s(): done error\n", __func__); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; + if (sched->done_service) + sched->done_service(svc); + /* svc->scheduler can not be set to NULL */ } @@ -92,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -106,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return sched; } if (sched->module) module_put(sched->module); } - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); return NULL; } @@ -153,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - svc->scheduler->name, svc->fwmark, - svc->fwmark, msg); + sched->name, 
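The ip_vs_rr.c rework above deserves a close look. The old update_service callback reset the cursor to the list head on every change; the new del_dest callback only repairs the cursor when the destination being removed is the current one, using p->next->prev. That works because list_del_rcu() has already relinked the neighbours but deliberately keeps the removed entry's next pointer valid for concurrent readers, so from the dead node, next->prev lands on its live predecessor. A runnable miniature of that property (our own list type, unlinking the way list_del_rcu() does):

#include <stdio.h>

struct node { struct node *prev, *next; const char *name; };

/* Unlink n but keep n->next usable, as list_del_rcu() does
 * (the kernel poisons n->prev instead of clearing it). */
static void del_rcu_style(struct node *n)
{
    n->next->prev = n->prev;
    n->prev->next = n->next;
    n->prev = NULL;
}

int main(void)
{
    struct node head = { &head, &head, "head" };
    struct node a, b, c;
    struct node *nodes[3] = { &a, &b, &c };
    const char *names[3] = { "a", "b", "c" };
    struct node *p = &head;
    int i;

    for (i = 0; i < 3; i++) {    /* build head<->a<->b<->c */
        nodes[i]->name = names[i];
        nodes[i]->prev = p;
        nodes[i]->next = p->next;
        p->next->prev = nodes[i];
        p->next = nodes[i];
        p = nodes[i];
    }
    del_rcu_style(&b);
    /* from the removed node, next->prev is the live predecessor */
    printf("b->next->prev is %s\n", b.next->prev->name);    /* a */
    return 0;
}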
svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - svc->scheduler->name, - ip_vs_proto_name(svc->protocol), + sched->name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } @@ -192,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (!list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); @@ -208,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); @@ -219,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -237,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } - spin_lock_bh(&ip_vs_sched_lock); + mutex_lock(&ip_vs_sched_mutex); if (list_empty(&scheduler->n_list)) { - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); pr_err("%s(): [%s] scheduler is not in the list. failed\n", __func__, scheduler->name); return -EINVAL; @@ -249,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - spin_unlock_bh(&ip_vs_sched_lock); + mutex_unlock(&ip_vs_sched_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 89ead246ed3d..f3205925359a 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -79,7 +79,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -94,7 +94,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. 
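In ip_vs_sched.c above, ip_vs_bind_scheduler() now runs init_service() first and publishes last through rcu_assign_pointer(), whose release semantics guarantee that any reader who sees the new svc->scheduler also sees the fully initialized sched_data behind it. The same publish/observe shape in portable C11 atomics (a userspace analogue, not the kernel primitives):

#include <stdatomic.h>
#include <stdio.h>

struct sched { int ready; };

static _Atomic(struct sched *) current_sched;

/* Publisher: initialize everything, then release-store the pointer,
 * like init_service() followed by rcu_assign_pointer(). */
static void bind_scheduler(struct sched *s)
{
    s->ready = 1;
    atomic_store_explicit(&current_sched, s, memory_order_release);
}

/* Reader: if the acquire load sees the pointer, ready is 1. */
static int reader(void)
{
    struct sched *s =
        atomic_load_explicit(&current_sched, memory_order_acquire);
    return s ? s->ready : -1;
}

int main(void)
{
    static struct sched s;

    bind_scheduler(&s);
    printf("reader sees ready=%d\n", reader());
    return 0;
}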
*/ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_sed_dest_overhead(dest); @@ -134,6 +134,7 @@ static int __init ip_vs_sed_init(void) static void __exit ip_vs_sed_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); + synchronize_rcu(); } module_init(ip_vs_sed_init); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index e33126994628..0df269d7c99f 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -53,7 +53,7 @@ * IPVS SH bucket */ struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ + struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* @@ -66,6 +66,10 @@ struct ip_vs_sh_bucket { #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) +struct ip_vs_sh_state { + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; + struct rcu_head rcu_head; +}; /* * Returns hash value for IPVS SH entry @@ -87,10 +91,9 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, - const union nf_inet_addr *addr) +ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr) { - return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; + return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest); } @@ -98,27 +101,32 @@ ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, * Assign all the hash buckets of the specified table with the service. */ static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_sh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; int d_count; + bool empty; - b = tbl; + b = &s->buckets[0]; p = &svc->destinations; + empty = list_empty(p); d_count = 0; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { + dest = rcu_dereference_protected(b->dest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", i, IP_VS_DBG_ADDR(svc->af, &dest->addr), @@ -140,16 +148,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) /* * Flush all the hash buckets of the specified table. 
*/ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +static void ip_vs_sh_flush(struct ip_vs_sh_state *s) { int i; struct ip_vs_sh_bucket *b; + struct ip_vs_dest *dest; - b = tbl; + b = &s->buckets[0]; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; + dest = rcu_dereference_protected(b->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); } b++; } @@ -158,51 +168,46 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) static int ip_vs_sh_init_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl; + struct ip_vs_sh_state *s; /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_KERNEL); - if (tbl == NULL) + s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + if (s == NULL) return -ENOMEM; - svc->sched_data = tbl; + svc->sched_data = s; IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + /* assign the hash buckets with current dests */ + ip_vs_sh_reassign(s, svc); return 0; } -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +static void ip_vs_sh_done_svc(struct ip_vs_service *svc) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; + struct ip_vs_sh_state *s = svc->sched_data; /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + ip_vs_sh_flush(s); /* release the table itself */ - kfree(svc->sched_data); + kfree_rcu(s, rcu_head); IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; } -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); + struct ip_vs_sh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); + ip_vs_sh_reassign(s, svc); return 0; } @@ -225,15 +230,15 @@ static struct ip_vs_dest * ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; + struct ip_vs_sh_state *s; struct ip_vs_iphdr iph; ip_vs_fill_iph_addr_only(svc->af, skb, &iph); IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); + s = (struct ip_vs_sh_state *) svc->sched_data; + dest = ip_vs_sh_get(svc->af, s, &iph.saddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 @@ -262,7 +267,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, + .add_dest = ip_vs_sh_dest_changed, + .del_dest = ip_vs_sh_dest_changed, + .upd_dest = ip_vs_sh_dest_changed, .schedule = ip_vs_sh_schedule, }; @@ -276,6 +283,7 @@ static int __init ip_vs_sh_init(void) static void __exit ip_vs_sh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); + synchronize_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 44fd10c539ac..f6046d9af8d3 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -246,7 
+246,7 @@ struct ip_vs_sync_thread_data { struct ip_vs_sync_mesg_v0 { __u8 nr_conns; __u8 syncid; - __u16 size; + __be16 size; /* ip_vs_sync_conn entries start here */ }; @@ -255,7 +255,7 @@ struct ip_vs_sync_mesg_v0 { struct ip_vs_sync_mesg { __u8 reserved; /* must be zero */ __u8 syncid; - __u16 size; + __be16 size; __u8 nr_conns; __s8 version; /* SYNC_PROTO_VER */ __u16 spare; @@ -335,7 +335,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs) sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; sb->mesg->syncid = ipvs->master_syncid; - sb->mesg->size = sizeof(struct ip_vs_sync_mesg); + sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); @@ -418,7 +418,7 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; mesg->syncid = ipvs->master_syncid; - mesg->size = sizeof(struct ip_vs_sync_mesg_v0); + mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; sb->firstuse = jiffies; @@ -531,9 +531,9 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) return; - spin_lock(&ipvs->sync_buff_lock); + spin_lock_bh(&ipvs->sync_buff_lock); if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); return; } @@ -552,7 +552,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, if (!buff) { buff = ip_vs_sync_buff_create_v0(ipvs); if (!buff) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } @@ -582,7 +582,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, } m->nr_conns++; - m->size += len; + m->size = htons(ntohs(m->size) + len); buff->head += len; /* check if there is a space for next one */ @@ -590,7 +590,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, sb_queue_tail(ipvs, ms); ms->sync_buff = NULL; } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ cp = cp->control; @@ -641,9 +641,9 @@ sloop: pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); } - spin_lock(&ipvs->sync_buff_lock); + spin_lock_bh(&ipvs->sync_buff_lock); if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); return; } @@ -683,7 +683,7 @@ sloop: if (!buff) { buff = ip_vs_sync_buff_create(ipvs); if (!buff) { - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } @@ -693,7 +693,7 @@ sloop: p = buff->head; buff->head += pad + len; - m->size += pad + len; + m->size = htons(ntohs(m->size) + pad + len); /* Add ev. padding from prev. 
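The ip_vs_sync.c hunks above stop flipping the message size between byte orders in place: the field is declared __be16, set with htons() at buffer creation, and grown with m->size = htons(ntohs(m->size) + len) on every append, so the header is always wire-ready and the receive path can validate without mutating the buffer. The accounting in a runnable sketch (the struct is a simplified stand-in, not the real ip_vs_sync_mesg):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct sync_mesg {
    uint8_t  nr_conns;
    uint16_t size;    /* big-endian in memory and on the wire */
};

static void mesg_init(struct sync_mesg *m)
{
    m->nr_conns = 0;
    m->size = htons(sizeof(*m));
}

static void mesg_append(struct sync_mesg *m, unsigned int len)
{
    m->nr_conns++;
    m->size = htons(ntohs(m->size) + len);  /* stays big-endian */
}

int main(void)
{
    struct sync_mesg m;

    mesg_init(&m);
    mesg_append(&m, 36);
    printf("size in host order: %u\n", (unsigned)ntohs(m.size));
    return 0;
}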
sync_conn */ while (pad--) *(p++) = 0; @@ -750,7 +750,7 @@ sloop: } } - spin_unlock(&ipvs->sync_buff_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); control: /* synchronize its controller if it has */ @@ -843,7 +843,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, kfree(param->pe_data); dest = cp->dest; - spin_lock(&cp->lock); + spin_lock_bh(&cp->lock); if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { if (flags & IP_VS_CONN_F_INACTIVE) { @@ -857,24 +857,21 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; cp->flags = flags; - spin_unlock(&cp->lock); - if (!dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } + spin_unlock_bh(&cp->lock); + if (!dest) + ip_vs_try_bind_dest(cp); } else { /* * Find the appropriate destination for the connection. * If it is not found the connection will remain unbound * but still handled. */ + rcu_read_lock(); dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); - if (dest) - atomic_dec(&dest->refcnt); + rcu_read_unlock(); if (!cp) { if (param->pe_data) kfree(param->pe_data); @@ -1178,10 +1175,8 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, IP_VS_DBG(2, "BACKUP, message header too short\n"); return; } - /* Convert size back to host byte order */ - m2->size = ntohs(m2->size); - if (buflen != m2->size) { + if (buflen != ntohs(m2->size)) { IP_VS_DBG(2, "BACKUP, bogus message size\n"); return; } @@ -1547,10 +1542,7 @@ ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) int msize; int ret; - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); + msize = ntohs(msg->size); ret = ip_vs_send_async(sock, (char *)msg, msize); if (ret >= 0 || ret == -EAGAIN) @@ -1692,11 +1684,7 @@ static int sync_thread_backup(void *data) break; } - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); ip_vs_process_message(tinfo->net, tinfo->buf, len); - local_bh_enable(); } } diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index bc1bfc48a17f..c60a81c4ce9a 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -51,7 +51,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * new connections. */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; @@ -66,7 +66,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Find the destination with the least load. 
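[Editor's note] The sync-message hunks above switch ip_vs_sync_mesg.size to __be16 and keep it in network byte order at all times, updating it with htons(ntohs(m->size) + len) instead of fixing the byte order once just before transmission. A minimal userspace sketch of that discipline (toy struct and lengths, not the kernel's):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct mesg_hdr {
	uint8_t  nr_conns;
	uint8_t  syncid;
	uint16_t size;			/* always network byte order */
};

/* Grow the on-wire size field without ever leaving it in host order,
 * mirroring m->size = htons(ntohs(m->size) + len) in the hunks above. */
static void append_conn(struct mesg_hdr *m, uint16_t len)
{
	m->size = htons(ntohs(m->size) + len);
	m->nr_conns++;
}

int main(void)
{
	struct mesg_hdr m = { .size = htons(sizeof(m)) };

	append_conn(&m, 36);
	printf("on-wire size: %u bytes\n", (unsigned)ntohs(m.size));
	return 0;
}

The payoff is that the field is valid for transmission at every point, which is why ip_vs_send_sync_msg() above loses its convert-before-send step and ip_vs_process_message() stops converting in place.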
*/ nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); @@ -106,6 +106,7 @@ static int __init ip_vs_wlc_init(void) static void __exit ip_vs_wlc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); + synchronize_rcu(); } module_init(ip_vs_wlc_init); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 231be7dd547a..0e68555bceb9 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -29,14 +29,45 @@ #include <net/ip_vs.h> +/* The WRR algorithm depends on some calculations: + * - mw: maximum weight + * - di: weight step, greatest common divisor from all weights + * - cw: current required weight + * As a result, all weights are in the [di..mw] range with a step=di. + * + * First, we start with cw = mw and select dests with weight >= cw. + * Then cw is reduced by di and all dests are checked again. + * Last pass should be with cw = di. We have mw/di passes in total: + * + * pass 1: cw = max weight + * pass 2: cw = max weight - di + * pass 3: cw = max weight - 2 * di + * ... + * last pass: cw = di + * + * Weights are supposed to be >= di but we run in parallel with + * weight changes, so it is possible for some dest weight to be reduced + * below di, which is bad if it is the only available dest. + * + * So, we modify how mw is calculated, now it is reduced by (di - 1), + * so that the last cw is 1, to catch such dests with weight below di: + * pass 1: cw = max weight - (di - 1) + * pass 2: cw = max weight - di - (di - 1) + * pass 3: cw = max weight - 2 * di - (di - 1) + * ... + * last pass: cw = 1 + * + */ + /* * current destination pointer for weighted round-robin scheduling */ struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ + struct ip_vs_dest *cl; /* current dest or head */ int cw; /* current weight */ int mw; /* maximum weight */ int di; /* decreasing interval */ + struct rcu_head rcu_head; }; @@ -88,36 +119,41 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) if (mark == NULL) return -ENOMEM; - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + mark->cw = mark->mw; svc->sched_data = mark; return 0; } -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) { + struct ip_vs_wrr_mark *mark = svc->sched_data; + /* * Release the mark variable */ - kfree(svc->sched_data); - - return 0; + kfree_rcu(mark, rcu_head); } -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) { struct ip_vs_wrr_mark *mark = svc->sched_data; - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); + spin_lock_bh(&svc->sched_lock); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + if (mark->cw > mark->mw || !mark->cw) + mark->cw = mark->mw; + else if (mark->di > 1) + mark->cw = (mark->cw / mark->di) * mark->di + 1; + spin_unlock_bh(&svc->sched_lock); return 0; } @@ -128,80 +164,79 @@ static int ip_vs_wrr_update_svc(struct
ip_vs_service *svc) static struct ip_vs_dest * ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct ip_vs_dest *dest; + struct ip_vs_dest *dest, *last, *stop = NULL; struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; + bool last_pass = false, restarted = false; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; + spin_lock_bh(&svc->sched_lock); + dest = mark->cl; + /* No available dests? */ + if (mark->mw == 0) + goto err_noavail; + last = dest; + /* Stop only after all dests were checked for weight >= 1 (last pass) */ while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - ip_vs_scheduler_err(svc, - "no destination available: " - "no destinations present"); - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - ip_vs_scheduler_err(svc, - "no destination available"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } + atomic_read(&dest->weight) >= mark->cw) + goto found; + if (dest == stop) + goto err_over; } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - ip_vs_scheduler_err(svc, - "no destination available: " - "all destinations are overloaded"); - goto out; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* Stop if we tried last pass from first dest: + * 1. last_pass: we started checks when cw > di but + * then all dests were checked for w >= 1 + * 2. last was head: the first and only traversal + * was for weight >= 1, for all dests. + */ + if (last_pass || + &last->n_list == &svc->destinations) + goto err_over; + restarted = true; + } + last_pass = mark->cw <= mark->di; + if (last_pass && restarted && + &last->n_list != &svc->destinations) { + /* First traversal was for w >= 1 but only + * for dests after 'last', now do the same + * for all dests up to 'last'. 
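[Editor's note] The comment block in the ip_vs_wrr.c hunk above describes the pass structure in prose; here is a small standalone sketch that prints which destinations each pass would serve under the reduced-mw rule. gcd(), the weight array and the printing are stand-ins, not kernel code:

#include <stdio.h>

static int gcd(int a, int b)
{
	while (b) {
		int t = a % b;
		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	int w[] = { 4, 3, 2 };		/* destination weights */
	int n = 3, i, pass = 1, di, mw = 0, cw;

	di = w[0];			/* ip_vs_wrr_gcd_weight() analogue */
	for (i = 1; i < n; i++)
		di = gcd(di, w[i]);
	for (i = 0; i < n; i++)		/* ip_vs_wrr_max_weight() analogue */
		if (w[i] > mw)
			mw = w[i];
	mw -= di - 1;			/* so the last pass runs with cw = 1 */

	for (cw = mw; cw >= 1; cw -= di, pass++) {
		printf("pass %d (cw=%d):", pass, cw);
		for (i = 0; i < n; i++)
			if (w[i] >= cw)
				printf(" w=%d", w[i]);
		printf("\n");
	}
	return 0;
}

With weights 4:3:2 this prints four passes serving 1, 2, 3 and 3 destinations respectively, i.e. the weight-4 dest is picked in every pass and the 4:3:2 service ratio falls out of the pass structure.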
+ */ + stop = last; } } +found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + mark->cl = dest; out: - write_unlock(&svc->sched_lock); + spin_unlock_bh(&svc->sched_lock); return dest; + +err_noavail: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available"); + goto out; + +err_over: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available: " + "all destinations are overloaded"); + goto out; } @@ -212,7 +247,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, + .add_dest = ip_vs_wrr_dest_changed, + .del_dest = ip_vs_wrr_dest_changed, + .upd_dest = ip_vs_wrr_dest_changed, .schedule = ip_vs_wrr_schedule, }; @@ -224,6 +261,7 @@ static int __init ip_vs_wrr_init(void) static void __exit ip_vs_wrr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); + synchronize_rcu(); } module_init(ip_vs_wrr_init); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index ee6b7a9f1ec2..b75ff6429a04 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -17,6 +17,8 @@ * - not all connections have destination server, for example, * connections in backup server when fwmark is used * - bypass connections use daddr from packet + * - we can use dst without ref while sending in RCU section, we use + * ref when returning NF_ACCEPT for NAT-ed packet via loopback * LOCAL_OUT rules: * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) * - skb->pkt_type is not set yet @@ -51,39 +53,54 @@ enum { */ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ + IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ }; +static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) +{ + return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); +} + +static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) +{ + kfree(dest_dst); +} + /* * Destination cache to speed up outgoing route lookup */ static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, - u32 dst_cookie) +__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, + struct dst_entry *dst, u32 dst_cookie) { - struct dst_entry *old_dst; + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, + lockdep_is_held(&dest->dst_lock)); - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dest->dst_cookie = dst_cookie; - dst_release(old_dst); + if (dest_dst) { + dest_dst->dst_cache = dst; + dest_dst->dst_cookie = dst_cookie; + } + rcu_assign_pointer(dest->dest_dst, dest_dst); + + if (old) + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) +static inline struct ip_vs_dest_dst * +__ip_vs_dst_check(struct ip_vs_dest *dest) { - struct dst_entry *dst = dest->dst_cache; + struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); + struct dst_entry *dst; - if (!dst) + if (!dest_dst) return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && - dst->ops->check(dst, dest->dst_cookie) == NULL) { - dest->dst_cache = 
NULL; - dst_release(dst); + dst = dest_dst->dst_cache; + if (dst->obsolete && + dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; - } - dst_hold(dst); - return dst; + return dest_dst; } static inline bool @@ -104,7 +121,7 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) /* Get route to daddr, update *saddr, optionally bind route to saddr */ static struct rtable *do_output_route4(struct net *net, __be32 daddr, - u32 rtos, int rt_mode, __be32 *saddr) + int rt_mode, __be32 *saddr) { struct flowi4 fl4; struct rtable *rt; @@ -113,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; - fl4.flowi4_tos = rtos; fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? FLOWI_FLAG_KNOWN_NH : 0; @@ -124,7 +140,7 @@ retry: if (PTR_ERR(rt) == -EINVAL && *saddr && rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { *saddr = 0; - flowi4_update_output(&fl4, 0, rtos, daddr, 0); + flowi4_update_output(&fl4, 0, 0, daddr, 0); goto retry; } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); @@ -132,7 +148,7 @@ retry: } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); *saddr = fl4.saddr; - flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr); + flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); loop++; goto retry; } @@ -141,113 +157,140 @@ retry: } /* Get route to destination or remote server */ -static struct rtable * +static int __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, - __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr) + __be32 daddr, int rt_mode, __be32 *ret_saddr) { struct net *net = dev_net(skb_dst(skb)->dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ struct rtable *ort; /* Original route */ - int local; + struct iphdr *iph; + __be16 df; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos))) { - rt = do_output_route4(net, dest->addr.ip, rtos, - rt_mode, &dest->dst_saddr.ip); + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rtable *) dest_dst->dst_cache; + else { + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + rt = do_output_route4(net, dest->addr.ip, rt_mode, + &dest_dst->dst_saddr.ip); if (!rt) { - spin_unlock(&dest->dst_lock); - return NULL; + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); - IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " - "rtos=%X\n", - &dest->addr.ip, &dest->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt), rtos); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt)); } daddr = dest->addr.ip; if (ret_saddr) - *ret_saddr = dest->dst_saddr.ip; - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.ip; } else { __be32 saddr = htonl(INADDR_ANY); + noref = 0; + /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = 
do_output_route4(net, daddr, rtos, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, &saddr); if (!rt) - return NULL; + goto err_unreach; if (ret_saddr) *ret_saddr = saddr; } - local = rt->rt_flags & RTCF_LOCAL; + local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", (rt->rt_flags & RTCF_LOCAL) ? "local":"non-local", &daddr); - ip_rt_put(rt); - return NULL; - } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) { - IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " - "requires NAT method, dest: %pI4\n", - &ip_hdr(skb)->daddr, &daddr); - ip_rt_put(rt); - return NULL; + goto err_put; } - if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " - "to non-local address, dest: %pI4\n", - &ip_hdr(skb)->saddr, &daddr); - ip_rt_put(rt); - return NULL; + iph = ip_hdr(skb); + if (likely(!local)) { + if (unlikely(ipv4_is_loopback(iph->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI4 to non-local address, dest: %pI4\n", + &iph->saddr, &daddr); + goto err_put; + } + } else { + ort = skb_rtable(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !(ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to " + "local requires NAT method, dest: %pI4\n", + &iph->daddr, &daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + ip_rt_put(rt); + return local; } - return rt; -} - -/* Reroute packet to local IPv4 stack after DNAT */ -static int -__ip_vs_reroute_locally(struct sk_buff *skb) -{ - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - struct net *net = dev_net(dev); - struct iphdr *iph = ip_hdr(skb); - - if (rt_is_input_route(rt)) { - unsigned long orefdst = skb->_skb_refdst; - - if (ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) - return 0; - refdst_drop(orefdst); + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { + mtu = dst_mtu(&rt->dst); + df = iph->frag_off & htons(IP_DF); } else { - struct flowi4 fl4 = { - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), - .flowi4_mark = skb->mark, - }; - - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return 0; - if (!(rt->rt_flags & RTCF_LOCAL)) { - ip_rt_put(rt); - return 0; + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto err_put; } - /* Drop old route. */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ort = skb_rtable(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + /* MTU check allowed? */ + df = sysctl_pmtu_disc(ipvs) ? 
iph->frag_off & htons(IP_DF) : 0; } - return 1; + + /* MTU checking */ + if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + ip_rt_put(rt); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #ifdef CONFIG_IP_VS_IPV6 @@ -294,44 +337,57 @@ out_err: /* * Get route to destination or remote server */ -static struct rt6_info * +static int __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, - int do_xfrm, int rt_mode) + struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { struct net *net = dev_net(skb_dst(skb)->dev); + struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ struct rt6_info *ort; /* Original route */ struct dst_entry *dst; - int local; + int mtu; + int local, noref = 1; if (dest) { - spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); - if (!rt) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rt6_info *) dest_dst->dst_cache; + else { u32 cookie; + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, - &dest->dst_saddr.in6, + &dest_dst->dst_saddr.in6, do_xfrm); if (!dst) { - spin_unlock(&dest->dst_lock); - return NULL; + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; } rt = (struct rt6_info *) dst; cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; - __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", - &dest->addr.in6, &dest->dst_saddr.in6, + &dest->addr.in6, &dest_dst->dst_saddr.in6, atomic_read(&rt->dst.__refcnt)); } if (ret_saddr) - *ret_saddr = dest->dst_saddr.in6; - spin_unlock(&dest->dst_lock); + *ret_saddr = dest_dst->dst_saddr.in6; } else { + noref = 0; dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) - return NULL; + goto err_unreach; rt = (struct rt6_info *) dst; } @@ -340,86 +396,137 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", local ? 
"local":"non-local", daddr); - dst_release(&rt->dst); - return NULL; + goto err_put; } - if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && - !((ort = (struct rt6_info *) skb_dst(skb)) && - __ip_vs_is_local_route6(ort))) { - IP_VS_DBG_RL("Redirect from non-local address %pI6c to local " - "requires NAT method, dest: %pI6c\n", - &ipv6_hdr(skb)->daddr, daddr); - dst_release(&rt->dst); - return NULL; + if (likely(!local)) { + if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address " + "%pI6c to non-local address, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err_put; + } + } else { + ort = (struct rt6_info *) skb_dst(skb); + if (!(rt_mode & IP_VS_RT_MODE_RDR) && + !__ip_vs_is_local_route6(ort)) { + IP_VS_DBG_RL("Redirect from non-local address %pI6c " + "to local requires NAT method, " + "dest: %pI6c\n", + &ipv6_hdr(skb)->daddr, daddr); + goto err_put; + } + /* skb to local stack, preserve old route */ + if (!noref) + dst_release(&rt->dst); + return local; } - if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&ipv6_hdr(skb)->saddr) & - IPV6_ADDR_LOOPBACK)) { - IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c " - "to non-local address, dest: %pI6c\n", - &ipv6_hdr(skb)->saddr, daddr); - dst_release(&rt->dst); - return NULL; + + /* MTU checking */ + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) + mtu = dst_mtu(&rt->dst); + else { + struct sock *sk = skb->sk; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto err_put; + } + ort = (struct rt6_info *) skb_dst(skb); + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } - return rt; + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + goto err_put; + } + + skb_dst_drop(skb); + if (noref) { + if (!local) + skb_dst_set_noref_force(skb, &rt->dst); + else + skb_dst_set(skb, dst_clone(&rt->dst)); + } else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + dst_release(&rt->dst); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; } #endif -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, + struct ip_vs_conn *cp) { - struct dst_entry *old_dst; + int ret = NF_ACCEPT; + + skb->ipvs_property = 1; + if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) + ret = ip_vs_confirm_conntrack(skb); + if (ret == NF_ACCEPT) { + nf_reset(skb); + skb_forward_csum(skb); + } + return ret; +} + +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); - dest->dst_saddr.ip = 0; + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 1); + if (!local) { + skb_forward_csum(skb); + 
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; } -#define IP_VS_XMIT_TUNNEL(skb, cp) \ -({ \ - int __ret = NF_ACCEPT; \ - \ - (skb)->ipvs_property = 1; \ - if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ - __ret = ip_vs_confirm_conntrack(skb); \ - if (__ret == NF_ACCEPT) { \ - nf_reset(skb); \ - skb_forward_csum(skb); \ - } \ - __ret; \ -}) - -#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - else \ - ip_vs_update_conntrack(skb, cp, 1); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) - -#define IP_VS_XMIT(pf, skb, cp, local) \ -do { \ - (skb)->ipvs_property = 1; \ - if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - ip_vs_notrack(skb); \ - if (local) \ - return NF_ACCEPT; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - skb_dst(skb)->dev, dst_output); \ -} while (0) +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + if (!local) { + skb_forward_csum(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, + dst_output); + } else + ret = NF_ACCEPT; + return ret; +} /* @@ -430,7 +537,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { /* we do not touch skb and do not need pskb ptr */ - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } @@ -443,52 +550,29 @@ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - int mtu; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos), - IP_VS_RT_MODE_NON_LOCAL, NULL))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, + NULL) < 0) goto tx_error; - } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
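[Editor's note] The hunks above replace the IP_VS_XMIT* statement macros with inline functions that return the Netfilter verdict. The macros hid a return NF_ACCEPT inside their expansion, which skips any cleanup the caller still owes; in this patch every xmit path is newly bracketed by rcu_read_lock()/rcu_read_unlock(), so the verdict must come back to the caller. A toy illustration of the difference, with stand-in types and verdict values:

#include <stdio.h>

#define NF_ACCEPT 1
#define NF_STOLEN 2

/* old style: a hidden early return in whoever expands the macro */
#define XMIT_MACRO(local)		\
do {					\
	if (local)			\
		return NF_ACCEPT;	\
	/* ... transmit ... */		\
} while (0)

/* new style: the verdict is an ordinary return value */
static int xmit_func(int local)
{
	if (local)
		return NF_ACCEPT;
	/* ... transmit ... */
	return NF_STOLEN;
}

static int caller(int local)
{
	int rc = xmit_func(local);

	/* cleanup (e.g. unlocking) reliably runs before returning */
	return rc;
}

int main(void)
{
	printf("local=1 -> %d, local=0 -> %d\n", caller(1), caller(0));
	return 0;
}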
- */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); + ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -496,60 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - int mtu; - EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr.in6, NULL, 0, - IP_VS_RT_MODE_NON_LOCAL); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -564,29 +615,30 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - int local; + int local, rc, was_input; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + was_input = rt_is_input_route(skb_rtable(skb)); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -600,57 +652,31 @@ 
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, - "ip_vs_nat_xmit(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) - goto tx_error_put; + goto tx_error; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - if (!local) { - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -660,49 +686,48 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; - int local; + int local, rc; EnterFunction(10); + rcu_read_lock(); /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph->fragoffs)) { + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; - p = skb_header_pointer(skb, iph->len, sizeof(_pt), &_pt); + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR)))) - goto tx_error_icmp; - local = __ip_vs_is_local_route6(rt); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is 
sync-ed @@ -716,7 +741,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); - goto tx_error_put; + goto tx_error; } } #endif @@ -727,46 +752,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, iph)) + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? */ - dst_release(&rt->dst); - } - IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -776,20 +776,17 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); LeaveFunction(10); - return NF_STOLEN; + return rc; -tx_error_icmp: - dst_link_failure(skb); tx_error: LeaveFunction(10); kfree_skb(skb); + rcu_read_unlock(); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -826,56 +823,40 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, __be16 df; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(tos), IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_CONNECT, - &saddr))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_CONNECT | + IP_VS_RT_MODE_TUNNEL, &saddr); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } + rt = skb_rtable(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - if (mtu < 68) { - IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); - goto tx_error_put; - } - if (rt_is_output_route(skb_rtable(skb))) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - /* Copy DF, reset fragment offset and MF */ df = sysctl_pmtu_disc(ipvs) ? 
old_iph->frag_off & htons(IP_DF) : 0; - if (df && mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } - /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } + + if (!new_skb) + goto tx_error; consume_skb(skb); skb = new_skb; old_iph = ip_hdr(skb); @@ -890,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. */ @@ -911,25 +888,22 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 @@ -943,60 +917,37 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *old_iph = ipv6_hdr(skb); struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ - int mtu; - int ret; + int ret, local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, - &saddr, 1, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, ipvsh, 1, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_TUNNEL); + if (local < 0) + goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } + rt = (struct rt6_info *) skb_dst(skb); tdev = rt->dst.dev; - mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); - if (mtu < IPV6_MIN_MTU) { - IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, - IPV6_MIN_MTU); - goto tx_error_put; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - - /* MTU checking: Notice that 'mtu' have been adjusted before hand */ - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; - } - /* * Okay, now see if we can stuff it in the buffer as-is. 
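[Editor's note] The tunnel hunks above drop the per-caller MTU checks; __ip_vs_get_out_rt() now performs them once, subtracting the outer header in tunnel mode and honouring sysctl_pmtu_disc for the DF bit. A standalone sketch of that arithmetic (the constants and the fit test are illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

#define IPHDR_LEN    20	/* outer IPv4 header, no options */
#define IPV4_MIN_MTU 68	/* the "mtu less than 68" check above */

/* Would an IPIP-encapsulated packet of 'len' bytes fit the path MTU? */
static bool tunnel_fits(unsigned int len, unsigned int path_mtu,
			bool df_set, bool gso)
{
	unsigned int mtu = path_mtu - IPHDR_LEN;	/* room for encap */

	if (mtu < IPV4_MIN_MTU)
		return false;
	if (df_set && len > mtu && !gso)
		return false;	/* caller sends ICMP_FRAG_NEEDED */
	return true;
}

int main(void)
{
	printf("1480 over 1500: %d\n", tunnel_fits(1480, 1500, true, false));
	printf("1490 over 1500: %d\n", tunnel_fits(1490, 1500, true, false));
	return 0;
}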
*/ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->dst); - kfree_skb(skb); - IP_VS_ERR_RL("%s(): no memory\n", __func__); - return NF_STOLEN; - } + + if (!new_skb) + goto tx_error; consume_skb(skb); skb = new_skb; old_iph = ipv6_hdr(skb); @@ -1008,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* * Push down and install the IPIP header. */ @@ -1029,25 +976,22 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ret = IP_VS_XMIT_TUNNEL(skb, cp); + ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) ip6_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif @@ -1060,59 +1004,36 @@ int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), - IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_KNOWN_NH, NULL))) - goto tx_error_icmp; - if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? 
- */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } ip_send_check(ip_hdr(skb)); - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; - tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1120,64 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, #ifdef CONFIG_IP_VS_IPV6 int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph) + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct rt6_info *rt; /* Route to the other host */ - int mtu; + int local; EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, (IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL)))) - goto tx_error_icmp; - if (__ip_vs_is_local_route6(rt)) { - dst_release(&rt->dst); - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL); + if (local < 0) goto tx_error; + if (local) { + rcu_read_unlock(); + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); } - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->dst); - return NF_STOLEN; - } - - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; -tx_error_icmp: - dst_link_failure(skb); tx_error: kfree_skb(skb); + rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; } @@ -1194,10 +1087,9 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct rtable *rt; /* Route to the other host */ - int mtu; int rc; int local; - int rt_mode; + int rt_mode, was_input; EnterFunction(10); @@ -1217,16 +1109,17 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* * mangle and send the packet here (only for VS/NAT) */ + was_input = rt_is_input_route(skb_rtable(skb)); /* LOCALNODE from FORWARD hook is not supported */ rt_mode = (hooknum != NF_INET_FORWARD) ? 
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(ip_hdr(skb)->tos), - rt_mode, NULL))) - goto tx_error_icmp; - local = rt->rt_flags & RTCF_LOCAL; + rcu_read_lock(); + local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic @@ -1241,82 +1134,51 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; + goto tx_error; } } #endif /* From world but DNAT to loopback address? */ - if (local && ipv4_is_loopback(cp->daddr.ip) && - rt_is_input_route(skb_rtable(skb))) { + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI4\n", __func__, &cp->daddr.ip); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && - !skb_is_gso(skb)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp(skb, pp, cp, 0); - if (!local) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - ip_rt_put(rt); - /* - * Some IPv4 replies get local address from routes, - * not from iph, so while we DNAT after routing - * we need this second input/output route. - */ - if (!__ip_vs_reroute_locally(skb)) - goto tx_error; - } - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + rcu_read_unlock(); goto out; - tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, - struct ip_vs_iphdr *iph) + struct ip_vs_iphdr *ipvsh) { struct rt6_info *rt; /* Route to the other host */ - int mtu; int rc; int local; int rt_mode; @@ -1328,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp, iph); + rc = cp->packet_xmit(skb, cp, pp, ipvsh); else rc = NF_ACCEPT; /* do not touch skb anymore */ @@ -1344,11 +1206,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rt_mode = (hooknum != NF_INET_FORWARD) ? 
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - 0, rt_mode))) - goto tx_error_icmp; - - local = __ip_vs_is_local_route6(rt); + rcu_read_lock(); + local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + ipvsh, 0, rt_mode); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1362,7 +1225,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; + goto tx_error; } } #endif @@ -1373,60 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); - goto tx_error_put; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->dst); - if (__mtu_check_toobig_v6(skb, mtu)) { - if (!skb->dev) { - struct net *net = dev_net(skb_dst(skb)->dev); - - skb->dev = net->loopback_dev; - } - /* only send ICMP too big on first fragment */ - if (!iph->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error_put; + goto tx_error; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) - goto tx_error_put; + goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) - goto tx_error_put; + goto tx_error; ip_vs_nat_icmp_v6(skb, pp, cp, 0); - if (!local || !skb->dev) { - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* destined to loopback, do we need to change route? 
*/ - dst_release(&rt->dst); - } - /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); - - rc = NF_STOLEN; + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + rcu_read_unlock(); goto out; -tx_error_icmp: - dst_link_failure(skb); tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); + rcu_read_unlock(); rc = NF_STOLEN; out: LeaveFunction(10); return rc; -tx_error_put: - dst_release(&rt->dst); - goto tx_error; } #endif diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 7df424e2d10c..2d3030ab5b61 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -106,36 +106,26 @@ static void nf_conntrack_acct_fini_sysctl(struct net *net) } #endif -int nf_conntrack_acct_init(struct net *net) +int nf_conntrack_acct_pernet_init(struct net *net) { - int ret; - net->ct.sysctl_acct = nf_ct_acct; + return nf_conntrack_acct_init_sysctl(net); +} - if (net_eq(net, &init_net)) { - ret = nf_ct_extend_register(&acct_extend); - if (ret < 0) { - printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); - goto out_extend_register; - } - } +void nf_conntrack_acct_pernet_fini(struct net *net) +{ + nf_conntrack_acct_fini_sysctl(net); +} - ret = nf_conntrack_acct_init_sysctl(net); +int nf_conntrack_acct_init(void) +{ + int ret = nf_ct_extend_register(&acct_extend); if (ret < 0) - goto out_sysctl; - - return 0; - -out_sysctl: - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&acct_extend); -out_extend_register: + pr_err("nf_conntrack_acct: Unable to register extension\n"); return ret; } -void nf_conntrack_acct_fini(struct net *net) +void nf_conntrack_acct_fini(void) { - nf_conntrack_acct_fini_sysctl(net); - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&acct_extend); + nf_ct_extend_unregister(&acct_extend); } diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index c514fe6033d2..b8b95f4027ca 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -2,6 +2,7 @@ * * (C) 2002 by Brian J. 
Murrell <netfilter@interlinx.bc.ca> * based on HW's ip_conntrack_irc.c as well as other modules + * (C) 2006 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -145,6 +146,7 @@ static int amanda_help(struct sk_buff *skb, exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -158,8 +160,10 @@ static int amanda_help(struct sk_buff *skb, if (nf_nat_amanda && ct->status & IPS_NAT_MASK) ret = nf_nat_amanda(skb, ctinfo, protoff, off - dataoff, len, exp); - else if (nf_ct_expect_related(exp) != 0) + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e4a0c4fb3a7c..0283baedcdfb 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -5,6 +5,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -45,8 +46,10 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_helper.h> #define NF_CONNTRACK_VERSION "0.5.0" @@ -263,7 +266,7 @@ static void death_by_event(unsigned long ul_conntrack) if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { /* bad luck, let's retry again */ ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); + (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); return; } @@ -282,7 +285,7 @@ void nf_ct_dying_timeout(struct nf_conn *ct) /* set a new timer to retry event delivery */ setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); + (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); } EXPORT_SYMBOL_GPL(nf_ct_dying_timeout); @@ -763,6 +766,7 @@ void nf_conntrack_free(struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_conntrack_free); + /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ static struct nf_conntrack_tuple_hash * @@ -809,6 +813,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + nf_ct_labels_ext_add(ct); ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; nf_ct_ecache_ext_add(ct, ecache ? 
ecache->ctmask : 0, @@ -1256,7 +1261,7 @@ void nf_ct_iterate_cleanup(struct net *net, EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); struct __nf_ct_flush_report { - u32 pid; + u32 portid; int report; }; @@ -1271,7 +1276,7 @@ static int kill_report(struct nf_conn *i, void *data) /* If we fail to deliver the event, death_by_timeout() will retry */ if (nf_conntrack_event_report(IPCT_DESTROY, i, - fr->pid, fr->report) < 0) + fr->portid, fr->report) < 0) return 1; /* Avoid the delivery of the destroy event in death_by_timeout(). */ @@ -1294,10 +1299,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size) } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +void nf_conntrack_flush_report(struct net *net, u32 portid, int report) { struct __nf_ct_flush_report fr = { - .pid = pid, + .portid = portid, .report = report, }; nf_ct_iterate_cleanup(net, kill_report, &fr); @@ -1331,57 +1336,78 @@ static int untrack_refs(void) return cnt; } -static void nf_conntrack_cleanup_init_net(void) +void nf_conntrack_cleanup_start(void) +{ + RCU_INIT_POINTER(ip_ct_attach, NULL); +} + +void nf_conntrack_cleanup_end(void) { + RCU_INIT_POINTER(nf_ct_destroy, NULL); while (untrack_refs() > 0) schedule(); #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); #endif + nf_conntrack_proto_fini(); + nf_conntrack_labels_fini(); + nf_conntrack_helper_fini(); + nf_conntrack_timeout_fini(); + nf_conntrack_ecache_fini(); + nf_conntrack_tstamp_fini(); + nf_conntrack_acct_fini(); + nf_conntrack_expect_fini(); } -static void nf_conntrack_cleanup_net(struct net *net) +/* + * Mishearing the voices in his head, our hero wonders how he's + * supposed to kill the mall. + */ +void nf_conntrack_cleanup_net(struct net *net) { - i_see_dead_people: - nf_ct_iterate_cleanup(net, kill_all, NULL); - nf_ct_release_dying_list(net); - if (atomic_read(&net->ct.count) != 0) { - schedule(); - goto i_see_dead_people; - } + LIST_HEAD(single); - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); - nf_conntrack_helper_fini(net); - nf_conntrack_timeout_fini(net); - nf_conntrack_ecache_fini(net); - nf_conntrack_tstamp_fini(net); - nf_conntrack_acct_fini(net); - nf_conntrack_expect_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); - free_percpu(net->ct.stat); + list_add(&net->exit_list, &single); + nf_conntrack_cleanup_net_list(&single); } -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void nf_conntrack_cleanup(struct net *net) +void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) { - if (net_eq(net, &init_net)) - RCU_INIT_POINTER(ip_ct_attach, NULL); + int busy; + struct net *net; - /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module - delete... */ + /* + * This makes sure all current packets have passed through + * netfilter framework. Roll on, two-stage module + * delete... 
+ */ synchronize_net(); - nf_conntrack_proto_fini(net); - nf_conntrack_cleanup_net(net); -} +i_see_dead_people: + busy = 0; + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_release_dying_list(net); + if (atomic_read(&net->ct.count) != 0) + busy = 1; + } + if (busy) { + schedule(); + goto i_see_dead_people; + } -void nf_conntrack_cleanup_end(void) -{ - RCU_INIT_POINTER(nf_ct_destroy, NULL); - nf_conntrack_cleanup_init_net(); + list_for_each_entry(net, net_exit_list, exit_list) { + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); + nf_conntrack_proto_pernet_fini(net); + nf_conntrack_helper_pernet_fini(net); + nf_conntrack_ecache_pernet_fini(net); + nf_conntrack_tstamp_pernet_fini(net); + nf_conntrack_acct_pernet_fini(net); + nf_conntrack_expect_pernet_fini(net); + kmem_cache_destroy(net->ct.nf_conntrack_cachep); + kfree(net->ct.slabname); + free_percpu(net->ct.stat); + } } void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) @@ -1474,7 +1500,7 @@ void nf_ct_untracked_status_or(unsigned long bits) } EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); -static int nf_conntrack_init_init_net(void) +int nf_conntrack_init_start(void) { int max_factor = 8; int ret, cpu; @@ -1501,11 +1527,44 @@ static int nf_conntrack_init_init_net(void) printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); + + ret = nf_conntrack_expect_init(); + if (ret < 0) + goto err_expect; + + ret = nf_conntrack_acct_init(); + if (ret < 0) + goto err_acct; + + ret = nf_conntrack_tstamp_init(); + if (ret < 0) + goto err_tstamp; + + ret = nf_conntrack_ecache_init(); + if (ret < 0) + goto err_ecache; + + ret = nf_conntrack_timeout_init(); + if (ret < 0) + goto err_timeout; + + ret = nf_conntrack_helper_init(); + if (ret < 0) + goto err_helper; + + ret = nf_conntrack_labels_init(); + if (ret < 0) + goto err_labels; + #ifdef CONFIG_NF_CONNTRACK_ZONES ret = nf_ct_extend_register(&nf_ct_zone_extend); if (ret < 0) goto err_extend; #endif + ret = nf_conntrack_proto_init(); + if (ret < 0) + goto err_proto; + /* Set up fake conntrack: to never be deleted, not in any hashes */ for_each_possible_cpu(cpu) { struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); @@ -1516,12 +1575,38 @@ static int nf_conntrack_init_init_net(void) nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); return 0; +err_proto: #ifdef CONFIG_NF_CONNTRACK_ZONES + nf_ct_extend_unregister(&nf_ct_zone_extend); err_extend: #endif + nf_conntrack_labels_fini(); +err_labels: + nf_conntrack_helper_fini(); +err_helper: + nf_conntrack_timeout_fini(); +err_timeout: + nf_conntrack_ecache_fini(); +err_ecache: + nf_conntrack_tstamp_fini(); +err_tstamp: + nf_conntrack_acct_fini(); +err_acct: + nf_conntrack_expect_fini(); +err_expect: return ret; } +void nf_conntrack_init_end(void) +{ + /* For use by REJECT target */ + RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); + RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); + + /* Howto get NAT offsets */ + RCU_INIT_POINTER(nf_ct_nat_offset, NULL); +} + /* * We need to use special "null" values, not used in hash table */ @@ -1529,7 +1614,7 @@ err_extend: #define DYING_NULLS_VAL ((1<<30)+1) #define TEMPLATE_NULLS_VAL ((1<<30)+2) -static int nf_conntrack_init_net(struct net *net) +int nf_conntrack_init_net(struct net *net) { int ret; @@ -1565,35 +1650,36 @@ static int nf_conntrack_init_net(struct net *net) printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_hash; } - 
ret = nf_conntrack_expect_init(net); + ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; - ret = nf_conntrack_acct_init(net); + ret = nf_conntrack_acct_pernet_init(net); if (ret < 0) goto err_acct; - ret = nf_conntrack_tstamp_init(net); + ret = nf_conntrack_tstamp_pernet_init(net); if (ret < 0) goto err_tstamp; - ret = nf_conntrack_ecache_init(net); + ret = nf_conntrack_ecache_pernet_init(net); if (ret < 0) goto err_ecache; - ret = nf_conntrack_timeout_init(net); - if (ret < 0) - goto err_timeout; - ret = nf_conntrack_helper_init(net); + ret = nf_conntrack_helper_pernet_init(net); if (ret < 0) goto err_helper; + ret = nf_conntrack_proto_pernet_init(net); + if (ret < 0) + goto err_proto; return 0; + +err_proto: + nf_conntrack_helper_pernet_fini(net); err_helper: - nf_conntrack_timeout_fini(net); -err_timeout: - nf_conntrack_ecache_fini(net); + nf_conntrack_ecache_pernet_fini(net); err_ecache: - nf_conntrack_tstamp_fini(net); + nf_conntrack_tstamp_pernet_fini(net); err_tstamp: - nf_conntrack_acct_fini(net); + nf_conntrack_acct_pernet_fini(net); err_acct: - nf_conntrack_expect_fini(net); + nf_conntrack_expect_pernet_fini(net); err_expect: nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); err_hash: @@ -1610,38 +1696,3 @@ s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); EXPORT_SYMBOL_GPL(nf_ct_nat_offset); - -int nf_conntrack_init(struct net *net) -{ - int ret; - - if (net_eq(net, &init_net)) { - ret = nf_conntrack_init_init_net(); - if (ret < 0) - goto out_init_net; - } - ret = nf_conntrack_proto_init(net); - if (ret < 0) - goto out_proto; - ret = nf_conntrack_init_net(net); - if (ret < 0) - goto out_net; - - if (net_eq(net, &init_net)) { - /* For use by REJECT target */ - RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); - RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); - - /* Howto get NAT offsets */ - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); - } - return 0; - -out_net: - nf_conntrack_proto_fini(net); -out_proto: - if (net_eq(net, &init_net)) - nf_conntrack_cleanup_init_net(); -out_init_net: - return ret; -} diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index faa978f1714b..1df176146567 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -1,8 +1,10 @@ /* Event cache for netfilter. 
*/ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> +/* + * (C) 2005 Harald Welte <laforge@gnumonks.org> + * (C) 2005 Patrick McHardy <kaber@trash.net> + * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -233,38 +235,27 @@ static void nf_conntrack_event_fini_sysctl(struct net *net) } #endif /* CONFIG_SYSCTL */ -int nf_conntrack_ecache_init(struct net *net) +int nf_conntrack_ecache_pernet_init(struct net *net) { - int ret; - net->ct.sysctl_events = nf_ct_events; net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + return nf_conntrack_event_init_sysctl(net); +} - if (net_eq(net, &init_net)) { - ret = nf_ct_extend_register(&event_extend); - if (ret < 0) { - printk(KERN_ERR "nf_ct_event: Unable to register " - "event extension.\n"); - goto out_extend_register; - } - } +void nf_conntrack_ecache_pernet_fini(struct net *net) +{ + nf_conntrack_event_fini_sysctl(net); +} - ret = nf_conntrack_event_init_sysctl(net); +int nf_conntrack_ecache_init(void) +{ + int ret = nf_ct_extend_register(&event_extend); if (ret < 0) - goto out_sysctl; - - return 0; - -out_sysctl: - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&event_extend); -out_extend_register: + pr_err("nf_ct_event: Unable to register event extension.\n"); return ret; } -void nf_conntrack_ecache_fini(struct net *net) +void nf_conntrack_ecache_fini(void) { - nf_conntrack_event_fini_sysctl(net); - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&event_extend); + nf_ct_extend_unregister(&event_extend); } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 527651a53a45..c63b618cd619 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (c) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -40,7 +41,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, - u32 pid, int report) + u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); @@ -54,7 +55,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, hlist_del(&exp->lnode); master_help->expecting[exp->class]--; - nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); + nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); @@ -90,14 +91,13 @@ __nf_ct_expect_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; - struct hlist_node *n; unsigned int h; if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && nf_ct_zone(i->master) == zone) 
return i; @@ -130,14 +130,13 @@ nf_ct_find_expectation(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i, *exp = NULL; - struct hlist_node *n; unsigned int h; if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && nf_ct_zone(i->master) == zone) { @@ -172,13 +171,13 @@ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; - struct hlist_node *n, *next; + struct hlist_node *next; /* Optimization: most connection never expect any others. */ if (!help) return; - hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); @@ -348,9 +347,8 @@ static void evict_oldest_expect(struct nf_conn *master, { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; - struct hlist_node *n; - hlist_for_each_entry(exp, n, &master_help->expectations, lnode) { + hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } @@ -369,7 +367,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); - struct hlist_node *n, *next; + struct hlist_node *next; unsigned int h; int ret = 1; @@ -378,7 +376,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry_safe(i, n, next, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { if (del_timer(&i->timeout)) { nf_ct_unlink_expect(i); @@ -415,7 +413,7 @@ out: } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, - u32 pid, int report) + u32 portid, int report) { int ret; @@ -428,7 +426,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, if (ret < 0) goto out; spin_unlock_bh(&nf_conntrack_lock); - nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); + nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return ret; out: spin_unlock_bh(&nf_conntrack_lock); @@ -571,7 +569,8 @@ static int exp_proc_init(struct net *net) #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; - proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops); + proc = proc_create("nf_conntrack_expect", 0440, net->proc_net, + &exp_file_ops); if (!proc) return -ENOMEM; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ @@ -581,59 +580,56 @@ static int exp_proc_init(struct net *net) static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS - proc_net_remove(net, "nf_conntrack_expect"); + remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); -int nf_conntrack_expect_init(struct net *net) +int nf_conntrack_expect_pernet_init(struct net *net) { int err = -ENOMEM; - if (net_eq(net, &init_net)) { - if (!nf_ct_expect_hsize) { - nf_ct_expect_hsize = net->ct.htable_size / 256; - if 
(!nf_ct_expect_hsize) - nf_ct_expect_hsize = 1; - } - nf_ct_expect_max = nf_ct_expect_hsize * 4; - } - net->ct.expect_count = 0; net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (net->ct.expect_hash == NULL) goto err1; - if (net_eq(net, &init_net)) { - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", - sizeof(struct nf_conntrack_expect), - 0, 0, NULL); - if (!nf_ct_expect_cachep) - goto err2; - } - err = exp_proc_init(net); if (err < 0) - goto err3; + goto err2; return 0; - -err3: - if (net_eq(net, &init_net)) - kmem_cache_destroy(nf_ct_expect_cachep); err2: nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); err1: return err; } -void nf_conntrack_expect_fini(struct net *net) +void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); - if (net_eq(net, &init_net)) { - rcu_barrier(); /* Wait for call_rcu() before destroy */ - kmem_cache_destroy(nf_ct_expect_cachep); - } nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } + +int nf_conntrack_expect_init(void) +{ + if (!nf_ct_expect_hsize) { + nf_ct_expect_hsize = nf_conntrack_htable_size / 256; + if (!nf_ct_expect_hsize) + nf_ct_expect_hsize = 1; + } + nf_ct_expect_max = nf_ct_expect_hsize * 4; + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL); + if (!nf_ct_expect_cachep) + return -ENOMEM; + return 0; +} + +void nf_conntrack_expect_fini(void) +{ + rcu_barrier(); /* Wait for call_rcu() before destroy */ + kmem_cache_destroy(nf_ct_expect_cachep); +} diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 1ce3befb7c8a..6b217074237b 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -435,8 +436,8 @@ skip_nl_seq: connection tracking, not packet filtering. However, it is necessary for accurate tracking in this case. */ - pr_debug("conntrack_ftp: partial %s %u+%u\n", - search[dir][i].pattern, ntohl(th->seq), datalen); + nf_ct_helper_log(skb, ct, "partial matching of `%s'", + search[dir][i].pattern); ret = NF_DROP; goto out; } else if (found == 0) { /* No match */ @@ -450,6 +451,7 @@ skip_nl_seq: exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -500,9 +502,10 @@ skip_nl_seq: protoff, matchoff, matchlen, exp); else { /* Can't expect this? Best to drop packet now. */ - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - else + } else ret = NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 962795e839ab..bdebd03bc8cd 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -2,6 +2,7 @@ * H.323 connection tracking helper * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net> * * This source code is licensed under General Public License version 2. 
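[Editor's aside — not part of the diff.] The mechanical hlist_for_each_entry*() conversions running through the expect and helper hunks above all come from the iterator API dropping its explicit struct hlist_node cursor; the entry pointer is now derived internally via container_of(). A minimal sketch of the before/after, using a hypothetical demo_entry table (none of the names below are from this patch set):

	#include <linux/list.h>

	struct demo_entry {
		struct hlist_node hnode;
		int key;
	};

	static struct hlist_head demo_hash[16];

	static struct demo_entry *demo_find(int key)
	{
		struct demo_entry *e;

		/*
		 * Old API threaded a cursor:
		 *	struct hlist_node *n;
		 *	hlist_for_each_entry(e, n, &demo_hash[key & 15], hnode)
		 * New API derives the entry from the node internally:
		 */
		hlist_for_each_entry(e, &demo_hash[key & 15], hnode) {
			if (e->key == key)
				return e;
		}
		return NULL;
	}

The same transformation applies to the _rcu and _safe variants seen in the hunks; only the cursor argument disappears, the list heads and member names are unchanged.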
* @@ -623,7 +624,7 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - net_info_ratelimited("nf_ct_h245: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process H.245 message"); return NF_DROP; } @@ -1197,7 +1198,7 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - net_info_ratelimited("nf_ct_q931: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process Q.931 message"); return NF_DROP; } @@ -1795,7 +1796,7 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff, drop: spin_unlock_bh(&nf_h323_lock); - net_info_ratelimited("nf_ct_ras: packet dropped\n"); + nf_ct_helper_log(skb, ct, "cannot process RAS message"); return NF_DROP; } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 884f2b39319a..974a2a4adefa 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -28,6 +29,7 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_log.h> static DEFINE_MUTEX(nf_ct_helper_mutex); struct hlist_head *nf_ct_helper_hash __read_mostly; @@ -115,14 +117,13 @@ __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_helper *helper; struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; - struct hlist_node *n; unsigned int h; if (!nf_ct_helper_count) return NULL; h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) { + hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) return helper; } @@ -133,11 +134,10 @@ struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; - struct hlist_node *n; unsigned int i; for (i = 0; i < nf_ct_helper_hsize; i++) { - hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) { + hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { if (!strcmp(h->name, name) && h->tuple.src.l3num == l3num && h->tuple.dst.protonum == protonum) @@ -236,7 +236,9 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, /* We only allow helper re-assignment of the same sort since * we cannot reallocate the helper extension area. */ - if (help->helper != helper) { + struct nf_conntrack_helper *tmp = rcu_dereference(help->helper); + + if (tmp && tmp->help != helper->help) { RCU_INIT_POINTER(help->helper, NULL); goto out; } @@ -332,11 +334,37 @@ nf_ct_helper_expectfn_find_by_symbol(const void *symbol) } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); +__printf(3, 4) +void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, + const char *fmt, ...) 
+{ + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + /* Called from the helper function, this call never fails */ + help = nfct_help(ct); + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + + nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, + "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_log); + int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { int ret = 0; struct nf_conntrack_helper *cur; - struct hlist_node *n; unsigned int h = helper_hash(&me->tuple); BUG_ON(me->expect_policy == NULL); @@ -344,7 +372,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); mutex_lock(&nf_ct_helper_mutex); - hlist_for_each_entry(cur, n, &nf_ct_helper_hash[h], hnode) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 && cur->tuple.src.l3num == me->tuple.src.l3num && cur->tuple.dst.protonum == me->tuple.dst.protonum) { @@ -365,13 +393,13 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; - const struct hlist_node *n, *next; + const struct hlist_node *next; const struct hlist_nulls_node *nn; unsigned int i; /* Get rid of expectations */ for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, n, next, + hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( @@ -423,44 +451,41 @@ static struct nf_ct_ext_type helper_extend __read_mostly = { .id = NF_CT_EXT_HELPER, }; -int nf_conntrack_helper_init(struct net *net) +int nf_conntrack_helper_pernet_init(struct net *net) { - int err; - net->ct.auto_assign_helper_warned = false; net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; + return nf_conntrack_helper_init_sysctl(net); +} - if (net_eq(net, &init_net)) { - nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ - nf_ct_helper_hash = - nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); - if (!nf_ct_helper_hash) - return -ENOMEM; +void nf_conntrack_helper_pernet_fini(struct net *net) +{ + nf_conntrack_helper_fini_sysctl(net); +} - err = nf_ct_extend_register(&helper_extend); - if (err < 0) - goto err1; +int nf_conntrack_helper_init(void) +{ + int ret; + nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ + nf_ct_helper_hash = + nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); + if (!nf_ct_helper_hash) + return -ENOMEM; + + ret = nf_ct_extend_register(&helper_extend); + if (ret < 0) { + pr_err("nf_ct_helper: Unable to register helper extension.\n"); + goto out_extend; } - err = nf_conntrack_helper_init_sysctl(net); - if (err < 0) - goto out_sysctl; - return 0; - -out_sysctl: - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&helper_extend); -err1: +out_extend: nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); - return err; + return ret; } -void nf_conntrack_helper_fini(struct net *net) +void nf_conntrack_helper_fini(void) { - nf_conntrack_helper_fini_sysctl(net); - if (net_eq(net, &init_net)) { - nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); - } + nf_ct_extend_unregister(&helper_extend); + 
nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); } diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 3b20aa77cfc8..0fd2976db7ee 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -1,6 +1,7 @@ /* IRC extension for IP connection tracking, Version 1.21 * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> * based on RR's ip_conntrack_ftp.c + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -194,6 +195,8 @@ static int help(struct sk_buff *skb, unsigned int protoff, exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, + "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -210,8 +213,11 @@ static int help(struct sk_buff *skb, unsigned int protoff, addr_beg_p - ib_ptr, addr_end_p - addr_beg_p, exp); - else if (nf_ct_expect_related(exp) != 0) + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, + "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); goto out; } diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c new file mode 100644 index 000000000000..8fe2e99428b7 --- /dev/null +++ b/net/netfilter/nf_conntrack_labels.c @@ -0,0 +1,112 @@ +/* + * test/set flag bits stored in conntrack extension area. + * + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ctype.h> +#include <linux/export.h> +#include <linux/jhash.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/slab.h> + +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_labels.h> + +static unsigned int label_bits(const struct nf_conn_labels *l) +{ + unsigned int longs = l->words; + return longs * BITS_PER_LONG; +} + +bool nf_connlabel_match(const struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return false; + + return bit < label_bits(labels) && test_bit(bit, labels->bits); +} +EXPORT_SYMBOL_GPL(nf_connlabel_match); + +int nf_connlabel_set(struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels || bit >= label_bits(labels)) + return -ENOSPC; + + if (test_bit(bit, labels->bits)) + return 0; + + if (test_and_set_bit(bit, labels->bits)) + nf_conntrack_event_cache(IPCT_LABEL, ct); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabel_set); + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) +static void replace_u32(u32 *address, u32 mask, u32 new) +{ + u32 old, tmp; + + do { + old = *address; + tmp = (old & mask) ^ new; + } while (cmpxchg(address, old, tmp) != old); +} + +int nf_connlabels_replace(struct nf_conn *ct, + const u32 *data, + const u32 *mask, unsigned int words32) +{ + struct nf_conn_labels *labels; + unsigned int size, i; + u32 *dst; + + labels = nf_ct_labels_find(ct); + if (!labels) + return -ENOSPC; + + size = labels->words * sizeof(long); + if (size < (words32 * sizeof(u32))) + words32 = size / sizeof(u32); + + dst = (u32 *) labels->bits; + if (words32) { + for (i = 0; i < words32; i++) + replace_u32(&dst[i], mask ? 
~mask[i] : 0, data[i]); + } + + size /= sizeof(u32); + for (i = words32; i < size; i++) /* pad */ + replace_u32(&dst[i], 0, 0); + + nf_conntrack_event_cache(IPCT_LABEL, ct); + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_replace); +#endif + +static struct nf_ct_ext_type labels_extend __read_mostly = { + .len = sizeof(struct nf_conn_labels), + .align = __alignof__(struct nf_conn_labels), + .id = NF_CT_EXT_LABELS, +}; + +int nf_conntrack_labels_init(void) +{ + return nf_ct_extend_register(&labels_extend); +} + +void nf_conntrack_labels_fini(void) +{ + nf_ct_extend_unregister(&labels_extend); +} diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 627b0e50b238..6d0f8a17c5b7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -43,6 +43,7 @@ #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_labels.h> #ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l4proto.h> @@ -323,6 +324,40 @@ nla_put_failure: #define ctnetlink_dump_secctx(a, b) (0) #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS +static int ctnetlink_label_size(const struct nf_conn *ct) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return 0; + return nla_total_size(labels->words * sizeof(long)); +} + +static int +ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + unsigned int len, i; + + if (!labels) + return 0; + + len = labels->words * sizeof(long); + i = 0; + do { + if (labels->bits[i] != 0) + return nla_put(skb, CTA_LABELS, len, labels->bits); + i++; + } while (i < labels->words); + + return 0; +} +#else +#define ctnetlink_dump_labels(a, b) (0) +#define ctnetlink_label_size(a) (0) +#endif + #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) static inline int @@ -463,6 +498,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || @@ -561,6 +597,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif + ctnetlink_proto_size(ct) + + ctnetlink_label_size(ct) ; } @@ -662,6 +699,9 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif + if (events & (1 << IPCT_LABEL) && + ctnetlink_dump_labels(skb, ct) < 0) + goto nla_put_failure; if (events & (1 << IPCT_RELATED) && ctnetlink_dump_master(skb, ct) < 0) @@ -921,6 +961,7 @@ ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, return 0; } +#define __CTA_LABELS_MAX_LENGTH ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE) static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, @@ -937,6 +978,10 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED }, [CTA_ZONE] = { .type = NLA_U16 }, [CTA_MARK_MASK] = { .type = NLA_U32 }, + [CTA_LABELS] = { .type = NLA_BINARY, + .len = __CTA_LABELS_MAX_LENGTH }, + [CTA_LABELS_MASK] = { .type = NLA_BINARY, + 
.len = __CTA_LABELS_MAX_LENGTH }, }; static int @@ -1211,13 +1256,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, if (!parse_nat_setup) { #ifdef CONFIG_MODULES rcu_read_unlock(); - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); if (request_module("nf-nat") < 0) { - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); return -EOPNOTSUPP; } - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); if (nfnetlink_parse_nat_setup_hook) return -EAGAIN; @@ -1229,13 +1274,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, if (err == -EAGAIN) { #ifdef CONFIG_MODULES rcu_read_unlock(); - nfnl_unlock(); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) { - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); return -EOPNOTSUPP; } - nfnl_lock(); + nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); #else err = -EOPNOTSUPP; @@ -1465,6 +1510,31 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct, #endif static int +ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS + size_t len = nla_len(cda[CTA_LABELS]); + const void *mask = cda[CTA_LABELS_MASK]; + + if (len & (sizeof(u32)-1)) /* must be multiple of u32 */ + return -EINVAL; + + if (mask) { + if (nla_len(cda[CTA_LABELS_MASK]) == 0 || + nla_len(cda[CTA_LABELS_MASK]) != len) + return -EINVAL; + mask = nla_data(cda[CTA_LABELS_MASK]); + } + + len /= sizeof(u32); + + return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len); +#else + return -EOPNOTSUPP; +#endif +} + +static int ctnetlink_change_conntrack(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1510,6 +1580,11 @@ ctnetlink_change_conntrack(struct nf_conn *ct, return err; } #endif + if (cda[CTA_LABELS]) { + err = ctnetlink_attach_labels(ct, cda); + if (err < 0) + return err; + } return 0; } @@ -1598,6 +1673,8 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + nf_ct_labels_ext_add(ct); + /* we must add conntrack extensions before confirmation. 
*/ ct->status |= IPS_CONFIRMED; @@ -1705,6 +1782,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_CREATE) { enum ip_conntrack_events events; + if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) + return -EINVAL; + ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); if (IS_ERR(ct)) @@ -1716,6 +1796,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, else events = IPCT_NEW; + if (cda[CTA_LABELS] && + ctnetlink_attach_labels(ct, cda) == 0) + events |= (1 << IPCT_LABEL); + nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1983,6 +2067,8 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; #endif + if (ctnetlink_dump_labels(skb, ct) < 0) + goto nla_put_failure; rcu_read_unlock(); return 0; @@ -2011,6 +2097,11 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) if (err < 0) return err; } + if (cda[CTA_LABELS]) { + err = ctnetlink_attach_labels(ct, cda); + if (err < 0) + return err; + } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); @@ -2279,14 +2370,13 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - struct hlist_node *n; u_int8_t l3proto = nfmsg->nfgen_family; rcu_read_lock(); last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]], + hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; @@ -2319,6 +2409,92 @@ out: return skb->len; } +static int +ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nf_conn *ct = cb->data; + struct nf_conn_help *help = nfct_help(ct); + u_int8_t l3proto = nfmsg->nfgen_family; + + if (cb->args[0]) + return 0; + + rcu_read_lock(); + last = (struct nf_conntrack_expect *)cb->args[1]; +restart: + hlist_for_each_entry(exp, &help->expectations, lnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (cb->args[1]) { + if (exp != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { + if (!atomic_inc_not_zero(&exp->use)) + continue; + cb->args[1] = (unsigned long)exp; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + cb->args[0] = 1; +out: + rcu_read_unlock(); + if (last) + nf_ct_expect_put(last); + + return skb->len; +} + +static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + int err; + struct net *net = sock_net(ctnl); + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + u16 zone = 0; + struct netlink_dump_control c = { + .dump = ctnetlink_exp_ct_dump_table, + .done = ctnetlink_exp_done, + }; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_ZONE]) { + err = 
ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + } + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + c.data = ct; + + err = netlink_dump_start(ctnl, skb, nlh, &c); + nf_ct_put(ct); + + return err; +} + static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, @@ -2349,11 +2525,15 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .dump = ctnetlink_exp_dump_table, - .done = ctnetlink_exp_done, - }; - return netlink_dump_start(ctnl, skb, nlh, &c); + if (cda[CTA_EXPECT_MASTER]) + return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda); + else { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_dump_table, + .done = ctnetlink_exp_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } } err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); @@ -2419,7 +2599,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct hlist_node *n, *next; + struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; unsigned int i; u16 zone; @@ -2466,7 +2646,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, /* delete all expectations for this helper */ spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, n, next, + hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { m_help = nfct_help(exp->master); @@ -2484,7 +2664,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, /* This basically means we have to flush everything*/ spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, n, next, + hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index cc7669ef0b95..7bd03decd36c 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -11,10 +11,12 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + * * Limitations: * - We blindly assume that control connections are always * established in PNS->PAC direction. 
This is a violation - * of RFFC2673 + * of RFC 2637 * - We can only support one single call within each session * TODO: * - testing of incoming PPTP calls diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 51e928db48c8..0ab9636ac57e 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -3,6 +3,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -212,8 +213,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct net *net, #endif } -static int -nf_conntrack_l3proto_register_net(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; struct nf_conntrack_l3proto *old; @@ -242,8 +242,9 @@ out_unlock: return ret; } +EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); -int nf_conntrack_l3proto_register(struct net *net, +int nf_ct_l3proto_pernet_register(struct net *net, struct nf_conntrack_l3proto *proto) { int ret = 0; @@ -254,22 +255,11 @@ int nf_conntrack_l3proto_register(struct net *net, return ret; } - ret = nf_ct_l3proto_register_sysctl(net, proto); - if (ret < 0) - return ret; - - if (net == &init_net) { - ret = nf_conntrack_l3proto_register_net(proto); - if (ret < 0) - nf_ct_l3proto_unregister_sysctl(net, proto); - } - - return ret; + return nf_ct_l3proto_register_sysctl(net, proto); } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); -static void -nf_conntrack_l3proto_unregister_net(struct nf_conntrack_l3proto *proto) +void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) { BUG_ON(proto->l3proto >= AF_MAX); @@ -283,19 +273,17 @@ nf_conntrack_l3proto_unregister_net(struct nf_conntrack_l3proto *proto) synchronize_rcu(); } +EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); -void nf_conntrack_l3proto_unregister(struct net *net, +void nf_ct_l3proto_pernet_unregister(struct net *net, struct nf_conntrack_l3proto *proto) { - if (net == &init_net) - nf_conntrack_l3proto_unregister_net(proto); - nf_ct_l3proto_unregister_sysctl(net, proto); /* Remove all contrack entries for this protocol */ nf_ct_iterate_cleanup(net, kill_l3proto, proto); } -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, struct nf_conntrack_l4proto *l4proto) @@ -376,8 +364,7 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net, /* FIXME: Allow NULL functions and sub in pointers to generic for them. 
--RR */ -static int -nf_conntrack_l4proto_register_net(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto) { int ret = 0; @@ -431,8 +418,9 @@ out_unlock: mutex_unlock(&nf_ct_proto_mutex); return ret; } +EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); -int nf_conntrack_l4proto_register(struct net *net, +int nf_ct_l4proto_pernet_register(struct net *net, struct nf_conntrack_l4proto *l4proto) { int ret = 0; @@ -452,22 +440,13 @@ int nf_conntrack_l4proto_register(struct net *net, if (ret < 0) goto out; - if (net == &init_net) { - ret = nf_conntrack_l4proto_register_net(l4proto); - if (ret < 0) { - nf_ct_l4proto_unregister_sysctl(net, pn, l4proto); - goto out; - } - } - pn->users++; out: return ret; } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); -static void -nf_conntrack_l4proto_unregister_net(struct nf_conntrack_l4proto *l4proto) +void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) { BUG_ON(l4proto->l3proto >= PF_MAX); @@ -482,15 +461,13 @@ nf_conntrack_l4proto_unregister_net(struct nf_conntrack_l4proto *l4proto) synchronize_rcu(); } +EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); -void nf_conntrack_l4proto_unregister(struct net *net, +void nf_ct_l4proto_pernet_unregister(struct net *net, struct nf_conntrack_l4proto *l4proto) { struct nf_proto_net *pn = NULL; - if (net == &init_net) - nf_conntrack_l4proto_unregister_net(l4proto); - pn = nf_ct_l4proto_net(net, l4proto); if (pn == NULL) return; @@ -501,11 +478,10 @@ void nf_conntrack_l4proto_unregister(struct net *net, /* Remove all contrack entries for this protocol */ nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); } -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); +EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); -int nf_conntrack_proto_init(struct net *net) +int nf_conntrack_proto_pernet_init(struct net *net) { - unsigned int i; int err; struct nf_proto_net *pn = nf_ct_l4proto_net(net, &nf_conntrack_l4proto_generic); @@ -520,19 +496,12 @@ int nf_conntrack_proto_init(struct net *net) if (err < 0) return err; - if (net == &init_net) { - for (i = 0; i < AF_MAX; i++) - rcu_assign_pointer(nf_ct_l3protos[i], - &nf_conntrack_l3proto_generic); - } - pn->users++; return 0; } -void nf_conntrack_proto_fini(struct net *net) +void nf_conntrack_proto_pernet_fini(struct net *net) { - unsigned int i; struct nf_proto_net *pn = nf_ct_l4proto_net(net, &nf_conntrack_l4proto_generic); @@ -540,9 +509,21 @@ void nf_conntrack_proto_fini(struct net *net) nf_ct_l4proto_unregister_sysctl(net, pn, &nf_conntrack_l4proto_generic); - if (net == &init_net) { - /* free l3proto protocol tables */ - for (i = 0; i < PF_MAX; i++) - kfree(nf_ct_protos[i]); - } +} + +int nf_conntrack_proto_init(void) +{ + unsigned int i; + for (i = 0; i < AF_MAX; i++) + rcu_assign_pointer(nf_ct_l3protos[i], + &nf_conntrack_l3proto_generic); + return 0; +} + +void nf_conntrack_proto_fini(void) +{ + unsigned int i; + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + kfree(nf_ct_protos[i]); } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index a8ae287bc7af..a99b6c3427b0 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -456,7 +456,8 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, out_invalid: if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg); + nf_log_packet(net, nf_ct_l3num(ct), 0, 
skb, NULL, NULL, + NULL, msg); return false; } @@ -542,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid packet ignored "); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid state transition "); return -NF_ACCEPT; } @@ -613,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, out_invalid: if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg); + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, msg); return -NF_ACCEPT; } @@ -935,32 +936,27 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { static __net_init int dccp_net_init(struct net *net) { int ret = 0; - ret = nf_conntrack_l4proto_register(net, - &dccp_proto4); + ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4); if (ret < 0) { - pr_err("nf_conntrack_l4proto_dccp4 :protocol register failed.\n"); + pr_err("nf_conntrack_dccp4: pernet registration failed.\n"); goto out; } - ret = nf_conntrack_l4proto_register(net, - &dccp_proto6); + ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6); if (ret < 0) { - pr_err("nf_conntrack_l4proto_dccp6 :protocol register failed.\n"); + pr_err("nf_conntrack_dccp6: pernet registration failed.\n"); goto cleanup_dccp4; } return 0; cleanup_dccp4: - nf_conntrack_l4proto_unregister(net, - &dccp_proto4); + nf_ct_l4proto_pernet_unregister(net, &dccp_proto4); out: return ret; } static __net_exit void dccp_net_exit(struct net *net) { - nf_conntrack_l4proto_unregister(net, - &dccp_proto6); - nf_conntrack_l4proto_unregister(net, - &dccp_proto4); + nf_ct_l4proto_pernet_unregister(net, &dccp_proto6); + nf_ct_l4proto_pernet_unregister(net, &dccp_proto4); } static struct pernet_operations dccp_net_ops = { @@ -972,11 +968,33 @@ static struct pernet_operations dccp_net_ops = { static int __init nf_conntrack_proto_dccp_init(void) { - return register_pernet_subsys(&dccp_net_ops); + int ret; + + ret = register_pernet_subsys(&dccp_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&dccp_proto4); + if (ret < 0) + goto out_dccp4; + + ret = nf_ct_l4proto_register(&dccp_proto6); + if (ret < 0) + goto out_dccp6; + + return 0; +out_dccp6: + nf_ct_l4proto_unregister(&dccp_proto4); +out_dccp4: + unregister_pernet_subsys(&dccp_net_ops); +out_pernet: + return ret; } static void __exit nf_conntrack_proto_dccp_fini(void) { + nf_ct_l4proto_unregister(&dccp_proto6); + nf_ct_l4proto_unregister(&dccp_proto4); unregister_pernet_subsys(&dccp_net_ops); } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index b09b7af7f6f8..9d9c0dade602 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -21,6 +21,7 @@ * * Development of this code funded by Astaro AG (http://www.astaro.com/) * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> @@ -397,15 +398,15 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { static int proto_gre_net_init(struct net *net) { int ret = 0; - ret = nf_conntrack_l4proto_register(net, &nf_conntrack_l4proto_gre4); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4); if (ret < 0) - 
pr_err("nf_conntrack_l4proto_gre4 :protocol register failed.\n"); + pr_err("nf_conntrack_gre4: pernet registration failed.\n"); return ret; } static void proto_gre_net_exit(struct net *net) { - nf_conntrack_l4proto_unregister(net, &nf_conntrack_l4proto_gre4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4); nf_ct_gre_keymap_flush(net); } @@ -418,11 +419,26 @@ static struct pernet_operations proto_gre_net_ops = { static int __init nf_ct_proto_gre_init(void) { - return register_pernet_subsys(&proto_gre_net_ops); + int ret; + + ret = register_pernet_subsys(&proto_gre_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); + if (ret < 0) + goto out_gre4; + + return 0; +out_gre4: + unregister_pernet_subsys(&proto_gre_net_ops); +out_pernet: + return ret; } static void __exit nf_ct_proto_gre_fini(void) { + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4); unregister_pernet_subsys(&proto_gre_net_ops); } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index c746d61f83ed..1314d33f6bcf 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -1,6 +1,9 @@ /* * Connection tracking protocol helper module for SCTP. * + * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> + * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> + * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. * @@ -853,33 +856,28 @@ static int sctp_net_init(struct net *net) { int ret = 0; - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_sctp4); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4); if (ret < 0) { - pr_err("nf_conntrack_l4proto_sctp4 :protocol register failed.\n"); + pr_err("nf_conntrack_sctp4: pernet registration failed.\n"); goto out; } - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_sctp6); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6); if (ret < 0) { - pr_err("nf_conntrack_l4proto_sctp6 :protocol register failed.\n"); + pr_err("nf_conntrack_sctp6: pernet registration failed.\n"); goto cleanup_sctp4; } return 0; cleanup_sctp4: - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_sctp4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); out: return ret; } static void sctp_net_exit(struct net *net) { - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_sctp6); - nf_conntrack_l4proto_unregister(net, - &nf_conntrack_l4proto_sctp4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4); } static struct pernet_operations sctp_net_ops = { @@ -891,11 +889,33 @@ static struct pernet_operations sctp_net_ops = { static int __init nf_conntrack_proto_sctp_init(void) { - return register_pernet_subsys(&sctp_net_ops); + int ret; + + ret = register_pernet_subsys(&sctp_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4); + if (ret < 0) + goto out_sctp4; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6); + if (ret < 0) + goto out_sctp6; + + return 0; +out_sctp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +out_sctp4: + unregister_pernet_subsys(&sctp_net_ops); +out_pernet: + return ret; } static void __exit nf_conntrack_proto_sctp_fini(void) { + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6); + 
nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); unregister_pernet_subsys(&sctp_net_ops); } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 83876e9877f1..4d4d8f1d01fc 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,5 +1,7 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -720,7 +722,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tn->tcp_be_liberal) res = true; if (!res && LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? @@ -772,7 +774,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; } @@ -780,7 +782,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -793,7 +795,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } @@ -802,7 +804,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -949,7 +951,7 @@ static int tcp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored in " "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; @@ -959,7 +961,7 @@ static int tcp_packet(struct nf_conn *ct, dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_CLOSE: @@ -969,8 +971,8 @@ static int tcp_packet(struct nf_conn *ct, /* Invalid RST */ spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid RST "); + nf_log_packet(net, pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid RST "); return -NF_ACCEPT; } if (index == TCP_RST_SET 
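[Editor's aside — not part of the diff.] The repeated one-line conversions in the DCCP, TCP and (below) UDP/UDP-lite trackers all stem from nf_log_packet() gaining the network namespace as its first parameter, so invalid-packet logging resolves per-netns loggers. A hedged sketch of the call-site pattern; demo_error() is hypothetical, while the nf_log_packet() signature and the LOG_INVALID() guard are taken from the hunks:

	#include <linux/skbuff.h>
	#include <net/netfilter/nf_log.h>

	static int demo_error(struct net *net, struct sk_buff *skb, u_int8_t pf)
	{
		/* Old call: nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "..."); */
		if (LOG_INVALID(net, IPPROTO_UDP))
			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
				      "nf_ct_demo: malformed packet ");
		return -NF_ACCEPT;
	}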
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 59623cc56e8d..9d7721cbce4b 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -119,7 +120,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: short packet "); return -NF_ACCEPT; } @@ -127,7 +128,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -143,7 +144,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 4b66df209286..2750e6c69f82 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -131,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: short packet "); return -NF_ACCEPT; } @@ -141,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, cscov = udplen; else if (cscov < sizeof(*hdr) || cscov > udplen) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: invalid checksum coverage "); return -NF_ACCEPT; } @@ -149,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* UDPLITE mandates checksums */ if (!hdr->check) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: checksum missing "); return -NF_ACCEPT; } @@ -159,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: bad UDPLite checksum "); return -NF_ACCEPT; } @@ -336,30 +336,28 @@ static int udplite_net_init(struct net *net) { int ret = 0; - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_udplite4); + ret = nf_ct_l4proto_pernet_register(net, 
&nf_conntrack_l4proto_udplite4); if (ret < 0) { - pr_err("nf_conntrack_l4proto_udplite4 :protocol register failed.\n"); + pr_err("nf_conntrack_udplite4: pernet registration failed.\n"); goto out; } - ret = nf_conntrack_l4proto_register(net, - &nf_conntrack_l4proto_udplite6); + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6); if (ret < 0) { - pr_err("nf_conntrack_l4proto_udplite4 :protocol register failed.\n"); + pr_err("nf_conntrack_udplite6: pernet registration failed.\n"); goto cleanup_udplite4; } return 0; cleanup_udplite4: - nf_conntrack_l4proto_unregister(net, &nf_conntrack_l4proto_udplite4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); out: return ret; } static void udplite_net_exit(struct net *net) { - nf_conntrack_l4proto_unregister(net, &nf_conntrack_l4proto_udplite6); - nf_conntrack_l4proto_unregister(net, &nf_conntrack_l4proto_udplite4); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4); } static struct pernet_operations udplite_net_ops = { @@ -371,11 +369,33 @@ static struct pernet_operations udplite_net_ops = { static int __init nf_conntrack_proto_udplite_init(void) { - return register_pernet_subsys(&udplite_net_ops); + int ret; + + ret = register_pernet_subsys(&udplite_net_ops); + if (ret < 0) + goto out_pernet; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4); + if (ret < 0) + goto out_udplite4; + + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6); + if (ret < 0) + goto out_udplite6; + + return 0; +out_udplite6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +out_udplite4: + unregister_pernet_subsys(&udplite_net_ops); +out_pernet: + return ret; } static void __exit nf_conntrack_proto_udplite_exit(void) { + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); unregister_pernet_subsys(&udplite_net_ops); } diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 295429f39088..4a2134fd3fcb 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -138,6 +138,7 @@ static int help(struct sk_buff *skb, exp = nf_ct_expect_alloc(ct); if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } @@ -151,8 +152,10 @@ static int help(struct sk_buff *skb, nf_ct_dump_tuple(&exp->tuple); /* Can't expect this? Best to drop packet now. 
*/ - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index df8f4f284481..e0c4373b4747 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -855,11 +855,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct, { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; - struct hlist_node *n, *next; + struct hlist_node *next; int found = 0; spin_lock_bh(&nf_conntrack_lock); - hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if (exp->class != SIP_EXPECT_SIGNALLING || !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || exp->tuple.dst.protonum != proto || @@ -881,10 +881,10 @@ static void flush_expectations(struct nf_conn *ct, bool media) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; - struct hlist_node *n, *next; + struct hlist_node *next; spin_lock_bh(&nf_conntrack_lock); - hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) continue; if (!del_timer(&exp->timeout)) @@ -1095,8 +1095,10 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, port = simple_strtoul(*dptr + mediaoff, NULL, 10); if (port == 0) continue; - if (port < 1024 || port > 65535) + if (port < 1024 || port > 65535) { + nf_ct_helper_log(skb, ct, "wrong port %u", port); return NF_DROP; + } /* The media description overrides the session description. */ maddr_len = 0; @@ -1107,15 +1109,20 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, memcpy(&rtp_addr, &maddr, sizeof(rtp_addr)); } else if (caddr_len) memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); - else + else { + nf_ct_helper_log(skb, ct, "cannot parse SDP message"); return NF_DROP; + } ret = set_expected_rtp_rtcp(skb, protoff, dataoff, dptr, datalen, &rtp_addr, htons(port), t->class, mediaoff, medialen); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, + "cannot add expectation for voice"); return ret; + } /* Update media connection address if present */ if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) { @@ -1123,8 +1130,10 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, dptr, datalen, mediaoff, SDP_HDR_CONNECTION, SDP_HDR_MEDIA, &rtp_addr); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, ct, "cannot mangle SDP"); return ret; + } } i++; } @@ -1258,9 +1267,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, SIP_HDR_CONTACT, NULL, &matchoff, &matchlen, &daddr, &port); - if (ret < 0) + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse contact"); return NF_DROP; - else if (ret == 0) + } else if (ret == 0) return NF_ACCEPT; /* We don't support third-party registrations */ @@ -1273,8 +1283,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, if (ct_sip_parse_numerical_param(ct, *dptr, matchoff + matchlen, *datalen, - "expires=", NULL, NULL, &expires) < 0) + "expires=", NULL, NULL, &expires) < 0) { + nf_ct_helper_log(skb, ct, "cannot parse expires"); return NF_DROP; + } if (expires == 0) { ret = NF_ACCEPT; @@ -1282,8 
+1294,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, } exp = nf_ct_expect_alloc(ct); - if (!exp) + if (!exp) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); return NF_DROP; + } saddr = NULL; if (sip_direct_signalling) @@ -1300,9 +1314,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, ret = nf_nat_sip_expect(skb, protoff, dataoff, dptr, datalen, exp, matchoff, matchlen); else { - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; - else + } else ret = NF_ACCEPT; } nf_ct_expect_put(exp); @@ -1356,9 +1371,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int protoff, SIP_HDR_CONTACT, &in_contact, &matchoff, &matchlen, &addr, &port); - if (ret < 0) + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse contact"); return NF_DROP; - else if (ret == 0) + } else if (ret == 0) break; /* We don't support third-party registrations */ @@ -1373,8 +1389,10 @@ static int process_register_response(struct sk_buff *skb, unsigned int protoff, matchoff + matchlen, *datalen, "expires=", NULL, NULL, &c_expires); - if (ret < 0) + if (ret < 0) { + nf_ct_helper_log(skb, ct, "cannot parse expires"); return NF_DROP; + } if (c_expires == 0) break; if (refresh_signalling_expectation(ct, &addr, proto, port, @@ -1408,15 +1426,21 @@ static int process_sip_response(struct sk_buff *skb, unsigned int protoff, if (*datalen < strlen("SIP/2.0 200")) return NF_ACCEPT; code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10); - if (!code) + if (!code) { + nf_ct_helper_log(skb, ct, "cannot get code"); return NF_DROP; + } if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, - &matchoff, &matchlen) <= 0) + &matchoff, &matchlen) <= 0) { + nf_ct_helper_log(skb, ct, "cannot parse cseq"); return NF_DROP; + } cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq) + if (!cseq) { + nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; + } matchend = matchoff + matchlen + 1; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { @@ -1440,8 +1464,25 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int matchoff, matchlen; unsigned int cseq, i; + union nf_inet_addr addr; + __be16 port; + + /* Many Cisco IP phones use a high source port for SIP requests, but + * listen for the response on port 5060. If we are the local + * router for one of these phones, save the port number from the + * Via: header so that nf_nat_sip can redirect the responses to + * the correct port. 
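The NAT side picks this saved port up in map_addr() and nf_nat_sip_expect() (see the nf_nat_sip.c hunks further down): wherever the reply source port is chosen, forced_dport wins when it is set. Both sites use GCC's two-operand conditional; as a sketch:

	/* "a ? : b" is GNU C for "a ? a : b" with a evaluated once.
	 * forced_dport is network byte order; 0 doubles as "unset". */
	__be16 srcport = ct_sip_info->forced_dport ? :
			 ct->tuplehash[dir].tuple.src.u.udp.port;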
+ */ + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + SIP_HDR_VIA_UDP, NULL, &matchoff, + &matchlen, &addr, &port) > 0 && + port != ct->tuplehash[dir].tuple.src.u.udp.port && + nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3)) + ct_sip_info->forced_dport = port; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { const struct sip_handler *handler; @@ -1454,11 +1495,15 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, continue; if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, - &matchoff, &matchlen) <= 0) + &matchoff, &matchlen) <= 0) { + nf_ct_helper_log(skb, ct, "cannot parse cseq"); return NF_DROP; + } cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq) + if (!cseq) { + nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; + } return handler->request(skb, protoff, dataoff, dptr, datalen, cseq); @@ -1481,8 +1526,10 @@ static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct, if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { nf_nat_sip = rcu_dereference(nf_nat_sip_hook); if (nf_nat_sip && !nf_nat_sip(skb, protoff, dataoff, - dptr, datalen)) + dptr, datalen)) { + nf_ct_helper_log(skb, ct, "cannot NAT SIP message"); ret = NF_DROP; + } } return ret; @@ -1547,10 +1594,11 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, msglen = origlen = end - dptr; if (msglen > datalen) - return NF_DROP; + return NF_ACCEPT; ret = process_sip_msg(skb, ct, protoff, dataoff, &dptr, &msglen); + /* process_sip_* functions report why this packet is dropped */ if (ret != NF_ACCEPT) break; diff = msglen - origlen; diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c index 6e545e26289e..87b95a2c270c 100644 --- a/net/netfilter/nf_conntrack_snmp.c +++ b/net/netfilter/nf_conntrack_snmp.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_snmp.h> #define SNMP_PORT 161 diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e7185c684816..bd700b4013c1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,5 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -366,7 +367,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); + pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops); if (!pde) goto out_nf_conntrack; @@ -377,7 +378,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net) return 0; out_stat_nf_conntrack: - proc_net_remove(net, "nf_conntrack"); + remove_proc_entry("nf_conntrack", net->proc_net); out_nf_conntrack: return -ENOMEM; } @@ -385,7 +386,7 @@ out_nf_conntrack: static void nf_conntrack_standalone_fini_proc(struct net *net) { remove_proc_entry("nf_conntrack", net->proc_net_stat); - proc_net_remove(net, "nf_conntrack"); + remove_proc_entry("nf_conntrack", net->proc_net); } #else static int nf_conntrack_standalone_init_proc(struct net *net) @@ -472,13 +473,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct ctl_table *table; - if 
(net_eq(net, &init_net)) { - nf_ct_netfilter_header = - register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); - if (!nf_ct_netfilter_header) - goto out; - } - table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), GFP_KERNEL); if (!table) @@ -502,10 +496,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) out_unregister_netfilter: kfree(table); out_kmemdup: - if (net_eq(net, &init_net)) - unregister_net_sysctl_table(nf_ct_netfilter_header); -out: - printk(KERN_ERR "nf_conntrack: can't register to sysctl.\n"); return -ENOMEM; } @@ -513,8 +503,6 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct ctl_table *table; - if (net_eq(net, &init_net)) - unregister_net_sysctl_table(nf_ct_netfilter_header); table = net->ct.sysctl_header->ctl_table_arg; unregister_net_sysctl_table(net->ct.sysctl_header); kfree(table); @@ -530,51 +518,90 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net) } #endif /* CONFIG_SYSCTL */ -static int nf_conntrack_net_init(struct net *net) +static int nf_conntrack_pernet_init(struct net *net) { int ret; - ret = nf_conntrack_init(net); + ret = nf_conntrack_init_net(net); if (ret < 0) goto out_init; + ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; + net->ct.sysctl_checksum = 1; net->ct.sysctl_log_invalid = 0; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) goto out_sysctl; + return 0; out_sysctl: nf_conntrack_standalone_fini_proc(net); out_proc: - nf_conntrack_cleanup(net); + nf_conntrack_cleanup_net(net); out_init: return ret; } -static void nf_conntrack_net_exit(struct net *net) +static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { - nf_conntrack_standalone_fini_sysctl(net); - nf_conntrack_standalone_fini_proc(net); - nf_conntrack_cleanup(net); + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { + nf_conntrack_standalone_fini_sysctl(net); + nf_conntrack_standalone_fini_proc(net); + } + nf_conntrack_cleanup_net_list(net_exit_list); } static struct pernet_operations nf_conntrack_net_ops = { - .init = nf_conntrack_net_init, - .exit = nf_conntrack_net_exit, + .init = nf_conntrack_pernet_init, + .exit_batch = nf_conntrack_pernet_exit, }; static int __init nf_conntrack_standalone_init(void) { - return register_pernet_subsys(&nf_conntrack_net_ops); + int ret = nf_conntrack_init_start(); + if (ret < 0) + goto out_start; + +#ifdef CONFIG_SYSCTL + nf_ct_netfilter_header = + register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); + if (!nf_ct_netfilter_header) { + pr_err("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto out_sysctl; + } +#endif + + ret = register_pernet_subsys(&nf_conntrack_net_ops); + if (ret < 0) + goto out_pernet; + + nf_conntrack_init_end(); + return 0; + +out_pernet: +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(nf_ct_netfilter_header); +out_sysctl: +#endif + nf_conntrack_cleanup_end(); +out_start: + return ret; } static void __exit nf_conntrack_standalone_fini(void) { + nf_conntrack_cleanup_start(); unregister_pernet_subsys(&nf_conntrack_net_ops); +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(nf_ct_netfilter_header); +#endif nf_conntrack_cleanup_end(); } diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 81fc61c05263..e68ab4fbd71f 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -1,5 +1,5 @@ /* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> 
* This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -60,8 +60,10 @@ static int tftp_help(struct sk_buff *skb, nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); exp = nf_ct_expect_alloc(ct); - if (exp == NULL) + if (exp == NULL) { + nf_ct_helper_log(skb, ct, "cannot alloc expectation"); return NF_DROP; + } tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), @@ -74,8 +76,10 @@ static int tftp_help(struct sk_buff *skb, nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook); if (nf_nat_tftp && ct->status & IPS_NAT_MASK) ret = nf_nat_tftp(skb, ctinfo, exp); - else if (nf_ct_expect_related(exp) != 0) + else if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; + } nf_ct_expect_put(exp); break; case TFTP_OPCODE_DATA: diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index a878ce5b252c..93da609d9d29 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -37,24 +37,15 @@ static struct nf_ct_ext_type timeout_extend __read_mostly = { .id = NF_CT_EXT_TIMEOUT, }; -int nf_conntrack_timeout_init(struct net *net) +int nf_conntrack_timeout_init(void) { - int ret = 0; - - if (net_eq(net, &init_net)) { - ret = nf_ct_extend_register(&timeout_extend); - if (ret < 0) { - printk(KERN_ERR "nf_ct_timeout: Unable to register " - "timeout extension.\n"); - return ret; - } - } - - return 0; + int ret = nf_ct_extend_register(&timeout_extend); + if (ret < 0) + pr_err("nf_ct_timeout: Unable to register timeout extension.\n"); + return ret; } -void nf_conntrack_timeout_fini(struct net *net) +void nf_conntrack_timeout_fini(void) { - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&timeout_extend); + nf_ct_extend_unregister(&timeout_extend); } diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index 7ea8026f07c9..902fb0a6b38a 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -88,37 +88,28 @@ static void nf_conntrack_tstamp_fini_sysctl(struct net *net) } #endif -int nf_conntrack_tstamp_init(struct net *net) +int nf_conntrack_tstamp_pernet_init(struct net *net) { - int ret; - net->ct.sysctl_tstamp = nf_ct_tstamp; + return nf_conntrack_tstamp_init_sysctl(net); +} - if (net_eq(net, &init_net)) { - ret = nf_ct_extend_register(&tstamp_extend); - if (ret < 0) { - printk(KERN_ERR "nf_ct_tstamp: Unable to register " - "extension\n"); - goto out_extend_register; - } - } +void nf_conntrack_tstamp_pernet_fini(struct net *net) +{ + nf_conntrack_tstamp_fini_sysctl(net); + nf_ct_extend_unregister(&tstamp_extend); +} - ret = nf_conntrack_tstamp_init_sysctl(net); +int nf_conntrack_tstamp_init(void) +{ + int ret; + ret = nf_ct_extend_register(&tstamp_extend); if (ret < 0) - goto out_sysctl; - - return 0; - -out_sysctl: - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&tstamp_extend); -out_extend_register: + pr_err("nf_ct_tstamp: Unable to register extension\n"); return ret; } -void nf_conntrack_tstamp_fini(struct net *net) +void nf_conntrack_tstamp_fini(void) { - nf_conntrack_tstamp_fini_sysctl(net); - if (net_eq(net, &init_net)) - nf_ct_extend_unregister(&tstamp_extend); + nf_ct_extend_unregister(&tstamp_extend); } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 9e312695c818..388656d5a9ec 
100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,7 +16,6 @@ #define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 -static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); @@ -32,13 +31,46 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) return NULL; } +void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +{ + const struct nf_logger *log; + + if (pf == NFPROTO_UNSPEC) + return; + + mutex_lock(&nf_log_mutex); + log = rcu_dereference_protected(net->nf.nf_loggers[pf], + lockdep_is_held(&nf_log_mutex)); + if (log == NULL) + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); + + mutex_unlock(&nf_log_mutex); +} +EXPORT_SYMBOL(nf_log_set); + +void nf_log_unset(struct net *net, const struct nf_logger *logger) +{ + int i; + const struct nf_logger *log; + + mutex_lock(&nf_log_mutex); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = rcu_dereference_protected(net->nf.nf_loggers[i], + lockdep_is_held(&nf_log_mutex)); + if (log == logger) + RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL); + } + mutex_unlock(&nf_log_mutex); + synchronize_rcu(); +} +EXPORT_SYMBOL(nf_log_unset); + /* return EEXIST if the same logger is registered, 0 on success. */ int nf_log_register(u_int8_t pf, struct nf_logger *logger) { - const struct nf_logger *llog; int i; - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(logger->list); i++) @@ -52,10 +84,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) } else { /* register at end of list to honor first register win */ list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); - llog = rcu_dereference_protected(nf_loggers[pf], - lockdep_is_held(&nf_log_mutex)); - if (llog == NULL) - rcu_assign_pointer(nf_loggers[pf], logger); } mutex_unlock(&nf_log_mutex); @@ -66,49 +94,43 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { - const struct nf_logger *c_logger; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { - c_logger = rcu_dereference_protected(nf_loggers[i], - lockdep_is_held(&nf_log_mutex)); - if (c_logger == logger) - RCU_INIT_POINTER(nf_loggers[i], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) list_del(&logger->list[i]); - } mutex_unlock(&nf_log_mutex); - - synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); -int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) +int nf_log_bind_pf(struct net *net, u_int8_t pf, + const struct nf_logger *logger) { - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return -EINVAL; mutex_lock(&nf_log_mutex); if (__find_logger(pf, logger->name) == NULL) { mutex_unlock(&nf_log_mutex); return -ENOENT; } - rcu_assign_pointer(nf_loggers[pf], logger); + rcu_assign_pointer(net->nf.nf_loggers[pf], logger); mutex_unlock(&nf_log_mutex); return 0; } EXPORT_SYMBOL(nf_log_bind_pf); -void nf_log_unbind_pf(u_int8_t pf) +void nf_log_unbind_pf(struct net *net, u_int8_t pf) { - if (pf >= ARRAY_SIZE(nf_loggers)) + if (pf >= ARRAY_SIZE(net->nf.nf_loggers)) return; mutex_lock(&nf_log_mutex); - RCU_INIT_POINTER(nf_loggers[pf], NULL); + RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL); mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unbind_pf); -void nf_log_packet(u_int8_t pf, +void nf_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff 
*skb, const struct net_device *in, @@ -121,7 +143,7 @@ void nf_log_packet(u_int8_t pf, const struct nf_logger *logger; rcu_read_lock(); - logger = rcu_dereference(nf_loggers[pf]); + logger = rcu_dereference(net->nf.nf_loggers[pf]); if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); @@ -135,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet); #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); + mutex_lock(&nf_log_mutex); - if (*pos >= ARRAY_SIZE(nf_loggers)) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -145,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos) static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct net *net = seq_file_net(s); + (*pos)++; - if (*pos >= ARRAY_SIZE(nf_loggers)) + if (*pos >= ARRAY_SIZE(net->nf.nf_loggers)) return NULL; return pos; @@ -164,8 +190,9 @@ static int seq_show(struct seq_file *s, void *v) const struct nf_logger *logger; struct nf_logger *t; int ret; + struct net *net = seq_file_net(s); - logger = rcu_dereference_protected(nf_loggers[*pos], + logger = rcu_dereference_protected(net->nf.nf_loggers[*pos], lockdep_is_held(&nf_log_mutex)); if (!logger) @@ -199,7 +226,8 @@ static const struct seq_operations nflog_seq_ops = { static int nflog_open(struct inode *inode, struct file *file) { - return seq_open(file, &nflog_seq_ops); + return seq_open_net(inode, file, &nflog_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations nflog_file_ops = { @@ -207,7 +235,7 @@ static const struct file_operations nflog_file_ops = { .open = nflog_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; @@ -216,7 +244,6 @@ static const struct file_operations nflog_file_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; -static struct ctl_table_header *nf_log_dir_header; static int nf_log_proc_dostring(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -226,6 +253,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, size_t size = *lenp; int r = 0; int tindex = (unsigned long)table->extra1; + struct net *net = current->nsproxy->net_ns; if (write) { if (size > sizeof(buf)) @@ -234,7 +262,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, return -EFAULT; if (!strcmp(buf, "NONE")) { - nf_log_unbind_pf(tindex); + nf_log_unbind_pf(net, tindex); return 0; } mutex_lock(&nf_log_mutex); @@ -243,11 +271,11 @@ static int nf_log_proc_dostring(ctl_table *table, int write, mutex_unlock(&nf_log_mutex); return -ENOENT; } - rcu_assign_pointer(nf_loggers[tindex], logger); + rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); mutex_unlock(&nf_log_mutex); } else { mutex_lock(&nf_log_mutex); - logger = rcu_dereference_protected(nf_loggers[tindex], + logger = rcu_dereference_protected(net->nf.nf_loggers[tindex], lockdep_is_held(&nf_log_mutex)); if (!logger) table->data = "NONE"; @@ -260,49 +288,111 @@ static int nf_log_proc_dostring(ctl_table *table, int write, return r; } -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net) { int i; - - for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { - snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i); - nf_log_sysctl_table[i].procname = - nf_log_sysctl_fnames[i-NFPROTO_UNSPEC]; - 
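The sysctl handling continuing below uses the standard per-netns table pattern: init_net keeps the static nf_log_sysctl_table (and fills in the file names once), while every other namespace gets its own kmemdup() copy, freed again on unregister. Roughly:

	struct ctl_table *table = nf_log_sysctl_table;

	/* only non-init_net namespaces own their copy, so teardown
	 * must free exactly what was duplicated here */
	if (!net_eq(net, &init_net)) {
		table = kmemdup(nf_log_sysctl_table,
				sizeof(nf_log_sysctl_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;
	}
	net->nf.nf_log_dir_header =
		register_net_sysctl(net, "net/netfilter/nf_log", table);

This pairs with moving the logger array into struct net: nf_log_packet() and all its callers now carry a struct net so the right namespace's logger fires.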
nf_log_sysctl_table[i].data = NULL; - nf_log_sysctl_table[i].maxlen = - NFLOGGER_NAME_LEN * sizeof(char); - nf_log_sysctl_table[i].mode = 0644; - nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; - nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; + struct ctl_table *table; + + table = nf_log_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_log_sysctl_table, + sizeof(nf_log_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } else { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + snprintf(nf_log_sysctl_fnames[i], + 3, "%d", i); + nf_log_sysctl_table[i].procname = + nf_log_sysctl_fnames[i]; + nf_log_sysctl_table[i].data = NULL; + nf_log_sysctl_table[i].maxlen = + NFLOGGER_NAME_LEN * sizeof(char); + nf_log_sysctl_table[i].mode = 0644; + nf_log_sysctl_table[i].proc_handler = + nf_log_proc_dostring; + nf_log_sysctl_table[i].extra1 = + (void *)(unsigned long) i; + } } - nf_log_dir_header = register_net_sysctl(&init_net, "net/netfilter/nf_log", - nf_log_sysctl_table); - if (!nf_log_dir_header) - return -ENOMEM; + net->nf.nf_log_dir_header = register_net_sysctl(net, + "net/netfilter/nf_log", + table); + if (!net->nf.nf_log_dir_header) + goto err_reg; return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void netfilter_log_sysctl_exit(struct net *net) +{ + struct ctl_table *table; + + table = net->nf.nf_log_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_log_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); } #else -static __init int netfilter_log_sysctl_init(void) +static int netfilter_log_sysctl_init(struct net *net) { return 0; } + +static void netfilter_log_sysctl_exit(struct net *net) +{ +} #endif /* CONFIG_SYSCTL */ -int __init netfilter_log_init(void) +static int __net_init nf_log_net_init(struct net *net) { - int i, r; + int ret = -ENOMEM; + #ifdef CONFIG_PROC_FS if (!proc_create("nf_log", S_IRUGO, - proc_net_netfilter, &nflog_file_ops)) - return -1; + net->nf.proc_netfilter, &nflog_file_ops)) + return ret; #endif + ret = netfilter_log_sysctl_init(net); + if (ret < 0) + goto out_sysctl; + + return 0; - /* Errors will trigger panic, unroll on error is unnecessary. */ - r = netfilter_log_sysctl_init(); - if (r < 0) - return r; +out_sysctl: + /* For init_net: errors will trigger panic, don't unroll on error. */ + if (!net_eq(net, &init_net)) + remove_proc_entry("nf_log", net->nf.proc_netfilter); + + return ret; +} + +static void __net_exit nf_log_net_exit(struct net *net) +{ + netfilter_log_sysctl_exit(net); + remove_proc_entry("nf_log", net->nf.proc_netfilter); +} + +static struct pernet_operations nf_log_net_ops = { + .init = nf_log_net_init, + .exit = nf_log_net_exit, +}; + +int __init netfilter_log_init(void) +{ + int i, ret; + + ret = register_pernet_subsys(&nf_log_net_ops); + if (ret < 0) + return ret; for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) INIT_LIST_HEAD(&(nf_loggers_l[i])); diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index 42d337881171..eb772380a202 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -1,6 +1,7 @@ /* Amanda extension for TCP NAT alteration. * (C) 2002 by Brian J. 
Murrell <netfilter@interlinx.bc.ca> * based on a copy of HW's ip_nat_irc.c as well as other modules + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -56,15 +57,19 @@ static unsigned int help(struct sk_buff *skb, } } - if (port == 0) + if (port == 0) { + nf_ct_helper_log(skb, exp->master, "all ports in use"); return NF_DROP; + } sprintf(buffer, "%u", port); ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, protoff, matchoff, matchlen, buffer, strlen(buffer)); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, exp->master, "cannot mangle packet"); nf_ct_unexpect_related(exp); + } return ret; } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 5f2f9109f461..038eee5c8f85 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -87,9 +87,11 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) struct flowi fl; unsigned int hh_len; struct dst_entry *dst; + int err; - if (xfrm_decode_session(skb, &fl, family) < 0) - return -1; + err = xfrm_decode_session(skb, &fl, family); + if (err < 0) + return err; dst = skb_dst(skb); if (dst->xfrm) @@ -98,7 +100,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); if (IS_ERR(dst)) - return -1; + return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); @@ -107,7 +109,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) - return -1; + return -ENOMEM; return 0; } EXPORT_SYMBOL(nf_xfrm_me_harder); @@ -191,9 +193,8 @@ find_appropriate_src(struct net *net, u16 zone, unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; - const struct hlist_node *n; - hlist_for_each_entry_rcu(nat, n, &net->ct.nat_bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { /* Copy source part from reply tuple. */ @@ -468,33 +469,22 @@ EXPORT_SYMBOL_GPL(nf_nat_packet); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; - bool hash; }; -/* Clear NAT section of all conntracks, in case we're loaded again. */ -static int nf_nat_proto_clean(struct nf_conn *i, void *data) +/* kill conntracks with affected NAT section */ +static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; struct nf_conn_nat *nat = nfct_nat(i); if (!nat) return 0; - if (!(i->status & IPS_SRC_NAT_DONE)) - return 0; + if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; - if (clean->hash) { - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); - spin_unlock_bh(&nf_nat_lock); - } else { - memset(nat, 0, sizeof(*nat)); - i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | - IPS_SEQ_ADJUST); - } - return 0; + return i->status & IPS_NAT_MASK ? 
1 : 0; } static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) @@ -506,16 +496,8 @@ static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) struct net *net; rtnl_lock(); - /* Step 1 - remove from bysource hash */ - clean.hash = true; for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); - synchronize_rcu(); - - /* Step 2 - clean NAT section */ - clean.hash = false; - for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); rtnl_unlock(); } @@ -527,16 +509,9 @@ static void nf_nat_l3proto_clean(u8 l3proto) struct net *net; rtnl_lock(); - /* Step 1 - remove from bysource hash */ - clean.hash = true; - for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); - synchronize_rcu(); - /* Step 2 - clean NAT section */ - clean.hash = false; for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); rtnl_unlock(); } @@ -774,7 +749,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_clean, &clean); + nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index e839b97b2863..e84a578dbe35 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -96,8 +96,10 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, } } - if (port == 0) + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use"); return NF_DROP; + } buflen = nf_nat_ftp_fmt_cmd(ct, type, buffer, sizeof(buffer), &newaddr, port); @@ -113,6 +115,7 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, return NF_ACCEPT; out: + nf_ct_helper_log(skb, ct, "cannot mangle packet"); nf_ct_unexpect_related(exp); return NF_DROP; } diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 23c2b38676a6..5fea563afe30 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -2,6 +2,7 @@ * * (C) 2000-2002 Harald Welte <laforge@netfilter.org> * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2007-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index 1fedee6e7fb6..f02b3605823e 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -56,14 +56,18 @@ static unsigned int help(struct sk_buff *skb, } } - if (port == 0) + if (port == 0) { + nf_ct_helper_log(skb, exp->master, "all ports in use"); return NF_DROP; + } ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo, protoff, matchoff, matchlen, buffer, strlen(buffer)); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_ct_helper_log(skb, exp->master, "cannot mangle packet"); nf_ct_unexpect_related(exp); + } return ret; } diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index e64faa5ca893..396e55d46f90 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -36,7 +36,7 @@ sctp_manip_pkt(struct sk_buff *skb, { struct sk_buff *frag; sctp_sctphdr_t *hdr; - __be32 crc32; + __u32 crc32; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; @@ -55,8 +55,7 @@ sctp_manip_pkt(struct 
sk_buff *skb, skb_walk_frags(skb, frag) crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), crc32); - crc32 = sctp_end_cksum(crc32); - hdr->checksum = crc32; + hdr->checksum = sctp_end_cksum(crc32); return true; } diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 16303c752213..96ccdf78a29f 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -95,6 +95,7 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff, enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; unsigned int buflen; union nf_inet_addr newaddr; @@ -107,7 +108,8 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff, } else if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, addr) && ct->tuplehash[dir].tuple.dst.u.udp.port == port) { newaddr = ct->tuplehash[!dir].tuple.src.u3; - newport = ct->tuplehash[!dir].tuple.src.u.udp.port; + newport = ct_sip_info->forced_dport ? : + ct->tuplehash[!dir].tuple.src.u.udp.port; } else return 1; @@ -144,6 +146,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); unsigned int coff, matchoff, matchlen; enum sip_header_types hdr; union nf_inet_addr addr; @@ -156,8 +159,10 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, &matchoff, &matchlen, &addr, &port) > 0 && !map_addr(skb, protoff, dataoff, dptr, datalen, - matchoff, matchlen, &addr, port)) + matchoff, matchlen, &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle SIP message"); return NF_DROP; + } request = 1; } else request = 0; @@ -190,8 +195,10 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, olen = *datalen; if (!map_addr(skb, protoff, dataoff, dptr, datalen, - matchoff, matchlen, &addr, port)) + matchoff, matchlen, &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle Via header"); return NF_DROP; + } matchend = matchoff + matchlen + *datalen - olen; @@ -206,8 +213,10 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, &ct->tuplehash[!dir].tuple.dst.u3, true); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, - poff, plen, buffer, buflen)) + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle maddr"); return NF_DROP; + } } /* The received= parameter (RFC 2361) contains the address @@ -222,6 +231,8 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, false); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, - poff, plen, buffer, buflen)) + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle received"); return NF_DROP; + } } @@ -235,8 +245,10 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; buflen = sprintf(buffer, "%u", ntohs(p)); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, - poff, plen, buffer, buflen)) + poff, plen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle rport"); return NF_DROP; + } } } @@ -250,13 +262,36 @@ next: &addr, &port) > 0) { if (!map_addr(skb, protoff, dataoff, dptr, datalen, matchoff, matchlen, - &addr, port)) + &addr, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle contact"); return NF_DROP; + } } if

(!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) || - !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) + !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) { + nf_ct_helper_log(skb, ct, "cannot mangle SIP from/to"); return NF_DROP; + } + + /* Mangle destination port for Cisco phones, then fix up checksums */ + if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) { + struct udphdr *uh; + + if (!skb_make_writable(skb, skb->len)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + + uh = (void *)skb->data + protoff; + uh->dest = ct_sip_info->forced_dport; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, protoff, + 0, 0, NULL, 0)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + } return NF_ACCEPT; } @@ -311,8 +346,10 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct); union nf_inet_addr newaddr; u_int16_t port; + __be16 srcport; char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")]; unsigned int buflen; @@ -326,8 +363,9 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, /* If the signalling port matches the connection's source port in the * original direction, try to use the destination port in the opposite * direction. */ - if (exp->tuple.dst.u.udp.port == - ct->tuplehash[dir].tuple.src.u.udp.port) + srcport = ct_sip_info->forced_dport ? : + ct->tuplehash[dir].tuple.src.u.udp.port; + if (exp->tuple.dst.u.udp.port == srcport) port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); else port = ntohs(exp->tuple.dst.u.udp.port); @@ -351,15 +389,19 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, } } - if (port == 0) + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use for SIP"); return NF_DROP; + } if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) || exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, - matchoff, matchlen, buffer, buflen)) + matchoff, matchlen, buffer, buflen)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); goto err; + } } return NF_ACCEPT; @@ -552,14 +594,18 @@ static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff, } } - if (port == 0) + if (port == 0) { + nf_ct_helper_log(skb, ct, "all ports in use for SDP media"); goto err1; + } /* Update media port. 
*/ if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen, - mediaoff, medialen, port)) + mediaoff, medialen, port)) { + nf_ct_helper_log(skb, ct, "cannot mangle SDP message"); goto err2; + } return NF_ACCEPT; diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c index ccabbda71a3e..7f67e1d5310d 100644 --- a/net/netfilter/nf_nat_tftp.c +++ b/net/netfilter/nf_nat_tftp.c @@ -28,8 +28,10 @@ static unsigned int help(struct sk_buff *skb, = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = nf_nat_follow_master; - if (nf_ct_expect_related(exp) != 0) + if (nf_ct_expect_related(exp) != 0) { + nf_ct_helper_log(skb, exp->master, "cannot add expectation"); return NF_DROP; + } return NF_ACCEPT; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index d812c1235b30..5d24b1fdb593 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -1,3 +1,8 @@ +/* + * Rusty Russell (C)2000 -- This code is GPL. + * Patrick McHardy (c) 2006-2012 + */ + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> @@ -40,7 +45,7 @@ void nf_unregister_queue_handler(void) } EXPORT_SYMBOL(nf_unregister_queue_handler); -static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { /* Release those devices we held, or Alexey will kill me. */ if (entry->indev) @@ -60,12 +65,41 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) /* Drop reference to owner of hook which queued us. */ module_put(entry->elem->owner); } +EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + +/* Bump dev refs so they don't vanish while packet is out */ +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) +{ + if (!try_module_get(entry->elem->owner)) + return false; + + if (entry->indev) + dev_hold(entry->indev); + if (entry->outdev) + dev_hold(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + struct net_device *physdev; + + physdev = nf_bridge->physindev; + if (physdev) + dev_hold(physdev); + physdev = nf_bridge->physoutdev; + if (physdev) + dev_hold(physdev); + } +#endif + + return true; +} +EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); /* * Any packet that leaves via this function must come back * through nf_reinject(). */ -static int __nf_queue(struct sk_buff *skb, +int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf, unsigned int hook, struct net_device *indev, @@ -75,10 +109,6 @@ static int __nf_queue(struct sk_buff *skb, { int status = -ENOENT; struct nf_queue_entry *entry = NULL; -#ifdef CONFIG_BRIDGE_NETFILTER - struct net_device *physindev; - struct net_device *physoutdev; -#endif const struct nf_afinfo *afinfo; const struct nf_queue_handler *qh; @@ -109,28 +139,13 @@ static int __nf_queue(struct sk_buff *skb, .indev = indev, .outdev = outdev, .okfn = okfn, + .size = sizeof(*entry) + afinfo->route_key_size, }; - /* If it's going away, ignore hook. 
*/ - if (!try_module_get(entry->elem->owner)) { + if (!nf_queue_entry_get_refs(entry)) { status = -ECANCELED; goto err_unlock; } - /* Bump dev refs so they don't vanish while packet is out */ - if (indev) - dev_hold(indev); - if (outdev) - dev_hold(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - physindev = skb->nf_bridge->physindev; - if (physindev) - dev_hold(physindev); - physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev) - dev_hold(physoutdev); - } -#endif skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); @@ -151,87 +166,6 @@ err: return status; } -#ifdef CONFIG_BRIDGE_NETFILTER -/* When called from bridge netfilter, skb->data must point to MAC header - * before calling skb_gso_segment(). Else, original MAC header is lost - * and segmented skbs will be sent to wrong destination. - */ -static void nf_bridge_adjust_skb_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_push(skb, skb->network_header - skb->mac_header); -} - -static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_pull(skb, skb->network_header - skb->mac_header); -} -#else -#define nf_bridge_adjust_skb_data(s) do {} while (0) -#define nf_bridge_adjust_segmented_data(s) do {} while (0) -#endif - -int nf_queue(struct sk_buff *skb, - struct nf_hook_ops *elem, - u_int8_t pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum) -{ - struct sk_buff *segs; - int err = -EINVAL; - unsigned int queued; - - if (!skb_is_gso(skb)) - return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - queuenum); - - switch (pf) { - case NFPROTO_IPV4: - skb->protocol = htons(ETH_P_IP); - break; - case NFPROTO_IPV6: - skb->protocol = htons(ETH_P_IPV6); - break; - } - - nf_bridge_adjust_skb_data(skb); - segs = skb_gso_segment(skb, 0); - /* Does not use PTR_ERR to limit the number of error codes that can be - * returned by nf_queue. For instance, callers rely on -ECANCELED to mean - * 'ignore this hook'. 
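The block being removed here used to segment GSO skbs inside nf_queue() before handing them to the handler; after this change that job belongs to the queue handler itself, which can reuse the newly exported nf_queue_entry_get_refs()/nf_queue_entry_release_refs() pair per segment. A handler-side sketch; enqueue_to_userspace() is purely hypothetical:

static int queue_one(struct nf_queue_entry *entry, unsigned int queuenum)
{
	/* pin the hook's module and the devices while the packet
	 * is parked outside the stack */
	if (!nf_queue_entry_get_refs(entry))
		return -ECANCELED;	/* hook owner is unloading */

	if (enqueue_to_userspace(entry, queuenum) < 0) {	/* hypothetical */
		nf_queue_entry_release_refs(entry);
		return -ENOSPC;
	}
	return 0;
}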
- */ - if (IS_ERR(segs)) - goto out_err; - queued = 0; - err = 0; - do { - struct sk_buff *nskb = segs->next; - - segs->next = NULL; - if (err == 0) { - nf_bridge_adjust_segmented_data(segs); - err = __nf_queue(segs, elem, pf, hook, indev, - outdev, okfn, queuenum); - } - if (err == 0) - queued++; - else - kfree_skb(segs); - segs = nskb; - } while (segs); - - if (queued) { - kfree_skb(skb); - return 0; - } - out_err: - nf_bridge_adjust_segmented_data(skb); - return err; -} - void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { struct sk_buff *skb = entry->skb; @@ -271,9 +205,9 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - err = __nf_queue(skb, elem, entry->pf, entry->hook, - entry->indev, entry->outdev, entry->okfn, - verdict >> NF_VERDICT_QBITS); + err = nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ECANCELED) goto next_hook; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 58a09b7c3f6d..572d87dc116f 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -24,10 +24,9 @@ #include <linux/skbuff.h> #include <asm/uaccess.h> #include <net/sock.h> -#include <net/netlink.h> #include <linux/init.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); @@ -36,8 +35,10 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; -static const struct nfnetlink_subsystem __rcu *subsys_table[NFNL_SUBSYS_COUNT]; -static DEFINE_MUTEX(nfnl_mutex); +static struct { + struct mutex mutex; + const struct nfnetlink_subsystem __rcu *subsys; +} table[NFNL_SUBSYS_COUNT]; static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK, @@ -48,27 +49,27 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, }; -void nfnl_lock(void) +void nfnl_lock(__u8 subsys_id) { - mutex_lock(&nfnl_mutex); + mutex_lock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_lock); -void nfnl_unlock(void) +void nfnl_unlock(__u8 subsys_id) { - mutex_unlock(&nfnl_mutex); + mutex_unlock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_unlock); int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { - nfnl_lock(); - if (subsys_table[n->subsys_id]) { - nfnl_unlock(); + nfnl_lock(n->subsys_id); + if (table[n->subsys_id].subsys) { + nfnl_unlock(n->subsys_id); return -EBUSY; } - rcu_assign_pointer(subsys_table[n->subsys_id], n); - nfnl_unlock(); + rcu_assign_pointer(table[n->subsys_id].subsys, n); + nfnl_unlock(n->subsys_id); return 0; } @@ -76,9 +77,9 @@ EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) { - nfnl_lock(); - subsys_table[n->subsys_id] = NULL; - nfnl_unlock(); + nfnl_lock(n->subsys_id); + table[n->subsys_id].subsys = NULL; + nfnl_unlock(n->subsys_id); synchronize_rcu(); return 0; } @@ -91,7 +92,7 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; - return rcu_dereference(subsys_table[subsys_id]); + return rcu_dereference(table[subsys_id].subsys); } static inline const struct nfnl_callback * @@ -111,22 +112,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int 
nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, +struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ + return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); +} +EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); + +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { - return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); + return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); -int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { - return netlink_set_err(net->nfnl, pid, group, error); + return netlink_set_err(net->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, + int flags) { - return netlink_unicast(net->nfnl, skb, pid, flags); + return netlink_unicast(net->nfnl, skb, portid, flags); } EXPORT_SYMBOL_GPL(nfnetlink_unicast); @@ -142,7 +151,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EPERM; /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg))) + if (nlmsg_len(nlh) < sizeof(struct nfgenmsg)) return 0; type = nlh->nlmsg_type; @@ -170,11 +179,12 @@ replay: } { - int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; + __u8 subsys_id = NFNL_SUBSYS_ID(type); err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy); @@ -189,10 +199,9 @@ replay: rcu_read_unlock(); } else { rcu_read_unlock(); - nfnl_lock(); - if (rcu_dereference_protected( - subsys_table[NFNL_SUBSYS_ID(type)], - lockdep_is_held(&nfnl_mutex)) != ss || + nfnl_lock(subsys_id); + if (rcu_dereference_protected(table[subsys_id].subsys, + lockdep_is_held(&table[subsys_id].mutex)) != ss || nfnetlink_find_client(type, ss) != nc) err = -EAGAIN; else if (nc->call) @@ -200,7 +209,7 @@ replay: (const struct nlattr **)cda); else err = -EINVAL; - nfnl_unlock(); + nfnl_unlock(subsys_id); } if (err == -EAGAIN) goto replay; @@ -267,6 +276,11 @@ static struct pernet_operations nfnetlink_net_ops = { static int __init nfnetlink_init(void) { + int i; + + for (i=0; i<NFNL_SUBSYS_COUNT; i++) + mutex_init(&table[i].mutex); + pr_info("Netfilter messages via NETLINK v%s.\n", nfversion); return register_pernet_subsys(&nfnetlink_net_ops); } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 589d686f0b4c..dc3fd5d44464 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -49,6 +49,8 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); + if (strlen(acct_name) == 0) + return -EINVAL; list_for_each_entry(nfacct, &nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 945950a8b1f1..a191b6db657e 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -282,7 +282,6 @@ 
nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; struct nf_conntrack_tuple tuple; - struct hlist_node *n; int ret = 0, i; if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) @@ -296,7 +295,7 @@ nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, rcu_read_lock(); for (i = 0; i < nf_ct_helper_hsize && !helper; i++) { - hlist_for_each_entry_rcu(cur, n, &nf_ct_helper_hash[i], hnode) { + hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { /* skip non-userspace conntrack helpers. */ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) @@ -452,13 +451,12 @@ static int nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_conntrack_helper *cur, *last; - struct hlist_node *n; rcu_read_lock(); last = (struct nf_conntrack_helper *)cb->args[1]; for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) { restart: - hlist_for_each_entry_rcu(cur, n, + hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[cb->args[0]], hnode) { /* skip non-userspace conntrack helpers. */ @@ -495,7 +493,6 @@ nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, { int ret = -ENOENT, i; struct nf_conntrack_helper *cur; - struct hlist_node *n; struct sk_buff *skb2; char *helper_name = NULL; struct nf_conntrack_tuple tuple; @@ -520,7 +517,7 @@ nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, } for (i = 0; i < nf_ct_helper_hsize; i++) { - hlist_for_each_entry_rcu(cur, n, &nf_ct_helper_hash[i], hnode) { + hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) { /* skip non-userspace conntrack helpers. */ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) @@ -568,7 +565,7 @@ nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, { char *helper_name = NULL; struct nf_conntrack_helper *cur; - struct hlist_node *n, *tmp; + struct hlist_node *tmp; struct nf_conntrack_tuple tuple; bool tuple_set = false, found = false; int i, j = 0, ret; @@ -585,7 +582,7 @@ nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, } for (i = 0; i < nf_ct_helper_hsize; i++) { - hlist_for_each_entry_safe(cur, n, tmp, &nf_ct_helper_hash[i], + hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], hnode) { /* skip non-userspace conntrack helpers. */ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) @@ -654,13 +651,13 @@ err_out: static void __exit nfnl_cthelper_exit(void) { struct nf_conntrack_helper *cur; - struct hlist_node *n, *tmp; + struct hlist_node *tmp; int i; nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); for (i=0; i<nf_ct_helper_hsize; i++) { - hlist_for_each_entry_safe(cur, n, tmp, &nf_ct_helper_hash[i], + hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i], hnode) { /* skip non-userspace conntrack helpers. */ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 92fd8eca0d31..faf1e9300d8a 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -3,6 +3,7 @@ * nfetlink. 
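[Note on the nfnetlink_cthelper hunks above: they track the kernel-wide hlist iterator change, in which hlist_for_each_entry() and its _rcu/_safe variants dropped the separate struct hlist_node * cursor and now derive the node from the entry pointer itself. A minimal sketch of the newer calling convention; demo_item and demo_find are illustrative names, not from this patch.]

#include <linux/list.h>

struct demo_item {
        int key;
        struct hlist_node node;
};

/* old API: hlist_for_each_entry(it, pos, head, node) carried an extra
 * 'struct hlist_node *pos' cursor; the new form below drops it.
 */
static struct demo_item *demo_find(struct hlist_head *head, int key)
{
        struct demo_item *it;

        hlist_for_each_entry(it, head, node) {
                if (it->key == key)
                        return it;
        }
        return NULL;
}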
* * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ipt_ULOG.c: * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> @@ -19,7 +20,7 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> #include <linux/spinlock.h> @@ -32,6 +33,7 @@ #include <linux/slab.h> #include <net/sock.h> #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h> #include <net/netfilter/nfnetlink_log.h> #include <linux/atomic.h> @@ -56,6 +58,7 @@ struct nfulnl_instance { unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocatd skb */ struct timer_list timer; + struct net *net; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ int peer_portid; /* PORTID of the peer process */ @@ -71,27 +74,35 @@ struct nfulnl_instance { struct rcu_head rcu; }; -static DEFINE_SPINLOCK(instances_lock); -static atomic_t global_seq; - #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS]; static unsigned int hash_init; +static int nfnl_log_net_id __read_mostly; + +struct nfnl_log_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; + atomic_t global_seq; +}; + +static struct nfnl_log_net *nfnl_log_pernet(struct net *net) +{ + return net_generic(net, nfnl_log_net_id); +} + static inline u_int8_t instance_hashfn(u_int16_t group_num) { return ((group_num & 0xff) % INSTANCE_BUCKETS); } static struct nfulnl_instance * -__instance_lookup(u_int16_t group_num) +__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) { struct hlist_head *head; - struct hlist_node *pos; struct nfulnl_instance *inst; - head = &instance_table[instance_hashfn(group_num)]; - hlist_for_each_entry_rcu(inst, pos, head, hlist) { + head = &log->instance_table[instance_hashfn(group_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->group_num == group_num) return inst; } @@ -105,12 +116,12 @@ instance_get(struct nfulnl_instance *inst) } static struct nfulnl_instance * -instance_lookup_get(u_int16_t group_num) +instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) { struct nfulnl_instance *inst; rcu_read_lock_bh(); - inst = __instance_lookup(group_num); + inst = __instance_lookup(log, group_num); if (inst && !atomic_inc_not_zero(&inst->use)) inst = NULL; rcu_read_unlock_bh(); @@ -120,7 +131,11 @@ instance_lookup_get(u_int16_t group_num) static void nfulnl_instance_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct nfulnl_instance, rcu)); + struct nfulnl_instance *inst = + container_of(head, struct nfulnl_instance, rcu); + + put_net(inst->net); + kfree(inst); module_put(THIS_MODULE); } @@ -134,13 +149,15 @@ instance_put(struct nfulnl_instance *inst) static void nfulnl_timer(unsigned long data); static struct nfulnl_instance * -instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns) +instance_create(struct net *net, u_int16_t group_num, + int portid, struct user_namespace *user_ns) { struct nfulnl_instance *inst; + struct nfnl_log_net *log = nfnl_log_pernet(net); int err; - spin_lock_bh(&instances_lock); - if (__instance_lookup(group_num)) { + spin_lock_bh(&log->instances_lock); + if (__instance_lookup(log, group_num)) { err = -EEXIST; goto out_unlock; } @@ -164,6 +181,7 @@ instance_create(u_int16_t group_num, int portid, 
struct user_namespace *user_ns) setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); + inst->net = get_net(net); inst->peer_user_ns = user_ns; inst->peer_portid = portid; inst->group_num = group_num; @@ -175,14 +193,15 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns) inst->copy_range = NFULNL_COPY_RANGE_MAX; hlist_add_head_rcu(&inst->hlist, - &instance_table[instance_hashfn(group_num)]); + &log->instance_table[instance_hashfn(group_num)]); + - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); return inst; out_unlock: - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); return ERR_PTR(err); } @@ -211,11 +230,12 @@ __instance_destroy(struct nfulnl_instance *inst) } static inline void -instance_destroy(struct nfulnl_instance *inst) +instance_destroy(struct nfnl_log_net *log, + struct nfulnl_instance *inst) { - spin_lock_bh(&instances_lock); + spin_lock_bh(&log->instances_lock); __instance_destroy(inst); - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } static int @@ -299,7 +319,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) } static struct sk_buff * -nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; @@ -308,13 +328,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = alloc_skb(n, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = alloc_skb(pkt_size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, pkt_size, + peer_portid, GFP_ATOMIC); if (!skb) pr_err("nfnetlink_log: can't even alloc %u bytes\n", pkt_size); @@ -337,7 +358,7 @@ __nfulnl_send(struct nfulnl_instance *inst) if (!nlh) goto out; } - status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_portid, + status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, MSG_DONTWAIT); inst->qlen = 0; @@ -371,7 +392,8 @@ nfulnl_timer(unsigned long data) /* This is an inline function, we don't really care about a long * list of arguments */ static inline int -__build_packet_message(struct nfulnl_instance *inst, +__build_packet_message(struct nfnl_log_net *log, + struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, u_int8_t pf, @@ -537,7 +559,7 @@ __build_packet_message(struct nfulnl_instance *inst, /* global sequence number */ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) && nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL, - htonl(atomic_inc_return(&global_seq)))) + htonl(atomic_inc_return(&log->global_seq)))) goto nla_put_failure; if (data_len) { @@ -593,13 +615,15 @@ nfulnl_log_packet(u_int8_t pf, const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; + struct net *net = dev_net(in ? in : out); + struct nfnl_log_net *log = nfnl_log_pernet(net); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; - inst = instance_lookup_get(li->u.ulog.group); + inst = instance_lookup_get(log, li->u.ulog.group); if (!inst) return; @@ -610,7 +634,7 @@ nfulnl_log_packet(u_int8_t pf, /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... 
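[Note on the nfnetlink_log hunks above: the global instance table, lock and sequence counter move into state looked up per network namespace via net_generic(). The idiom reduced to a sketch; demo_net, demo_net_id and demo_pernet are hypothetical placeholders, not names from the patch.]

#include <linux/list.h>
#include <linux/spinlock.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int demo_net_id __read_mostly;

struct demo_net {
        spinlock_t lock;
        struct hlist_head table[16];
};

static struct demo_net *demo_pernet(struct net *net)
{
        /* per-namespace storage, allocated by the core using .size below */
        return net_generic(net, demo_net_id);
}

static int __net_init demo_net_init(struct net *net)
{
        struct demo_net *d = demo_pernet(net);
        int i;

        spin_lock_init(&d->lock);
        for (i = 0; i < 16; i++)
                INIT_HLIST_HEAD(&d->table[i]);
        return 0;
}

static struct pernet_operations demo_net_ops = {
        .init = demo_net_init,
        .id   = &demo_net_id,
        .size = sizeof(struct demo_net),
};

/* module init would call register_pernet_subsys(&demo_net_ops);
 * module exit, unregister_pernet_subsys(&demo_net_ops).
 */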
*/ - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -674,14 +698,15 @@ nfulnl_log_packet(u_int8_t pf, } if (!inst->skb) { - inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz, + size); if (!inst->skb) goto alloc_failure; } inst->qlen++; - __build_packet_message(inst, skb, data_len, pf, + __build_packet_message(log, inst, skb, data_len, pf, hooknum, in, out, prefix, plen); if (inst->qlen >= qthreshold) @@ -710,24 +735,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; + struct nfnl_log_net *log = nfnl_log_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ - spin_lock_bh(&instances_lock); + spin_lock_bh(&log->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp, *t2; + struct hlist_node *t2; struct nfulnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &log->instance_table[i]; - hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if ((net_eq(n->net, &init_net)) && - (n->portid == inst->peer_portid)) + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) __instance_destroy(inst); } } - spin_unlock_bh(&instances_lock); + spin_unlock_bh(&log->instances_lock); } return NOTIFY_DONE; } @@ -768,6 +793,8 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, u_int16_t group_num = ntohs(nfmsg->res_id); struct nfulnl_instance *inst; struct nfulnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_log_net *log = nfnl_log_pernet(net); int ret = 0; if (nfula[NFULA_CFG_CMD]) { @@ -777,14 +804,14 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: - return nf_log_bind_pf(pf, &nfulnl_logger); + return nf_log_bind_pf(net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: - nf_log_unbind_pf(pf); + nf_log_unbind_pf(net, pf); return 0; } } - inst = instance_lookup_get(group_num); + inst = instance_lookup_get(log, group_num); if (inst && inst->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto out_put; @@ -798,9 +825,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out_put; } - inst = instance_create(group_num, + inst = instance_create(net, group_num, NETLINK_CB(skb).portid, - sk_user_ns(NETLINK_CB(skb).ssk)); + sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { ret = PTR_ERR(inst); goto out; @@ -812,7 +839,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out; } - instance_destroy(inst); + instance_destroy(log, inst); goto out_put; default: ret = -ENOTSUPP; @@ -895,55 +922,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = { #ifdef CONFIG_PROC_FS struct iter_state { + struct seq_net_private p; unsigned int bucket; }; -static struct hlist_node *get_first(struct iter_state *st) +static struct hlist_node *get_first(struct net *net, struct iter_state *st) { + struct nfnl_log_net *log; if (!st) return NULL; + log = nfnl_log_pernet(net); + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return 
rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + struct hlist_head *head = &log->instance_table[st->bucket]; + + if (!hlist_empty(head)) + return rcu_dereference_bh(hlist_first_rcu(head)); } return NULL; } -static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) +static struct hlist_node *get_next(struct net *net, struct iter_state *st, + struct hlist_node *h) { h = rcu_dereference_bh(hlist_next_rcu(h)); while (!h) { + struct nfnl_log_net *log; + struct hlist_head *head; + if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + log = nfnl_log_pernet(net); + head = &log->instance_table[st->bucket]; + h = rcu_dereference_bh(hlist_first_rcu(head)); } return h; } -static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) +static struct hlist_node *get_idx(struct net *net, struct iter_state *st, + loff_t pos) { struct hlist_node *head; - head = get_first(st); + head = get_first(net, st); if (head) - while (pos && (head = get_next(st, head))) + while (pos && (head = get_next(net, st, head))) pos--; return pos ? NULL : head; } -static void *seq_start(struct seq_file *seq, loff_t *pos) +static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(rcu_bh) { rcu_read_lock_bh(); - return get_idx(seq->private, *pos); + return get_idx(seq_file_net(s), s->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; - return get_next(s->private, v); + return get_next(seq_file_net(s), s->private, v); } static void seq_stop(struct seq_file *s, void *v) @@ -972,8 +1012,8 @@ static const struct seq_operations nful_seq_ops = { static int nful_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &nful_seq_ops, - sizeof(struct iter_state)); + return seq_open_net(inode, file, &nful_seq_ops, + sizeof(struct iter_state)); } static const struct file_operations nful_file_ops = { @@ -981,17 +1021,43 @@ static const struct file_operations nful_file_ops = { .open = nful_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* PROC_FS */ -static int __init nfnetlink_log_init(void) +static int __net_init nfnl_log_net_init(struct net *net) { - int i, status = -ENOMEM; + unsigned int i; + struct nfnl_log_net *log = nfnl_log_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); + INIT_HLIST_HEAD(&log->instance_table[i]); + spin_lock_init(&log->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_log", 0440, + net->nf.proc_netfilter, &nful_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_log_net_exit(struct net *net) +{ + remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); +} + +static struct pernet_operations nfnl_log_net_ops = { + .init = nfnl_log_net_init, + .exit = nfnl_log_net_exit, + .id = &nfnl_log_net_id, + .size = sizeof(struct nfnl_log_net), +}; + +static int __init nfnetlink_log_init(void) +{ + int status = -ENOMEM; /* it's not really all that important to have a random value, so * we can do this from the init function, even if there hasn't @@ -1001,29 +1067,25 @@ static int __init nfnetlink_log_init(void) netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); if (status < 0) { - printk(KERN_ERR "log: failed to create netlink socket\n"); + pr_err("log: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } 
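[Note on the /proc iterator conversion above: seq_open_private()/seq_release_private become seq_open_net()/seq_release_net, which requires the iterator state to begin with struct seq_net_private so that seq_file_net() can recover the opener's namespace. A compressed, hedged sketch of that shape; all demo_* names are assumptions.]

#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>

struct demo_iter {
        struct seq_net_private p;       /* must stay the first member */
        unsigned int bucket;
};

static void *demo_seq_start(struct seq_file *s, loff_t *pos)
{
        struct net *net = seq_file_net(s);      /* opener's namespace */

        /* ...walk per-net state, e.g. net_generic(net, ...), from *pos;
         * elided here, so this stub ends iteration immediately.
         */
        return NULL;
}

static void *demo_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        (*pos)++;
        return NULL;
}

static void demo_seq_stop(struct seq_file *s, void *v)
{
}

static int demo_seq_show(struct seq_file *s, void *v)
{
        return 0;
}

static const struct seq_operations demo_seq_ops = {
        .start = demo_seq_start,
        .next  = demo_seq_next,
        .stop  = demo_seq_stop,
        .show  = demo_seq_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &demo_seq_ops,
                            sizeof(struct demo_iter));
}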
status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); if (status < 0) { - printk(KERN_ERR "log: failed to register logger\n"); + pr_err("log: failed to register logger\n"); goto cleanup_subsys; } -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_log", 0440, - proc_net_netfilter, &nful_file_ops)) { - status = -ENOMEM; + status = register_pernet_subsys(&nfnl_log_net_ops); + if (status < 0) { + pr_err("log: failed to register pernet ops\n"); goto cleanup_logger; } -#endif return status; -#ifdef CONFIG_PROC_FS cleanup_logger: nf_log_unregister(&nfulnl_logger); -#endif cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: @@ -1033,10 +1095,8 @@ cleanup_netlink_notifier: static void __exit nfnetlink_log_fini(void) { + unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_log", proc_net_netfilter); -#endif nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); } diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 3158d87b56a8..2e0e835baf72 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -30,6 +30,7 @@ #include <linux/list.h> #include <net/sock.h> #include <net/netfilter/nf_queue.h> +#include <net/netns/generic.h> #include <net/netfilter/nfnetlink_queue.h> #include <linux/atomic.h> @@ -66,25 +67,32 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); -static DEFINE_SPINLOCK(instances_lock); +static int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly; +struct nfnl_queue_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; +}; + +static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) +{ + return net_generic(net, nfnl_queue_net_id); +} static inline u_int8_t instance_hashfn(u_int16_t queue_num) { - return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; + return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } static struct nfqnl_instance * -instance_lookup(u_int16_t queue_num) +instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { struct hlist_head *head; - struct hlist_node *pos; struct nfqnl_instance *inst; - head = &instance_table[instance_hashfn(queue_num)]; - hlist_for_each_entry_rcu(inst, pos, head, hlist) { + head = &q->instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->queue_num == queue_num) return inst; } @@ -92,14 +100,15 @@ instance_lookup(u_int16_t queue_num) } static struct nfqnl_instance * -instance_create(u_int16_t queue_num, int portid) +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, + int portid) { struct nfqnl_instance *inst; unsigned int h; int err; - spin_lock(&instances_lock); - if (instance_lookup(queue_num)) { + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; } @@ -113,7 +122,7 @@ instance_create(u_int16_t queue_num, int portid) inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = 0xfffff; + inst->copy_range = 0xffff; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); @@ -124,16 +133,16 @@ instance_create(u_int16_t queue_num, int portid) } h = instance_hashfn(queue_num); - hlist_add_head_rcu(&inst->hlist, 
&instance_table[h]); + hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); return inst; out_free: kfree(inst); out_unlock: - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); return ERR_PTR(err); } @@ -159,11 +168,11 @@ __instance_destroy(struct nfqnl_instance *inst) } static void -instance_destroy(struct nfqnl_instance *inst) +instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) { - spin_lock(&instances_lock); + spin_lock(&q->instances_lock); __instance_destroy(inst); - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); } static inline void @@ -218,14 +227,71 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_unlock_bh(&queue->lock); } +static void +nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + struct page *page; + unsigned int offset; + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) { + skb_copy_bits(from, 0, skb_put(to, len), len); + return; + } + + if (hlen) { + skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + to->truesize += len + plen; + to->len += len + plen; + to->data_len += len + plen; + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); + len -= skb_shinfo(to)->frags[j].size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; +} + +static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet) +{ + __u32 flags = 0; + + if (packet->ip_summed == CHECKSUM_PARTIAL) + flags = NFQA_SKB_CSUMNOTREADY; + if (skb_is_gso(packet)) + flags |= NFQA_SKB_GSO; + + return flags ? 
nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; +} + static struct sk_buff * nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nf_queue_entry *entry, __be32 **packet_id_ptr) { - sk_buff_data_t old_tail; size_t size; size_t data_len = 0, cap_len = 0; + int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; @@ -237,7 +303,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nf_conn *ct = NULL; enum ip_conntrack_info uninitialized_var(ctinfo); - size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -247,8 +313,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) - + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp) - + nla_total_size(sizeof(u_int32_t))); /* cap_len */ + + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + + nla_total_size(sizeof(u_int32_t)); /* cap_len */ + + if (entskb->tstamp.tv64) + size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); outdev = entry->outdev; @@ -258,7 +327,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, break; case NFQNL_COPY_PACKET: - if (entskb->ip_summed == CHECKSUM_PARTIAL && + if (!(queue->flags & NFQA_CFG_F_GSO) && + entskb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(entskb)) return NULL; @@ -266,7 +336,16 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (data_len == 0 || data_len > entskb->len) data_len = entskb->len; - size += nla_total_size(data_len); + + if (!entskb->head_frag || + skb_headlen(entskb) < L1_CACHE_BYTES || + skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS) + hlen = skb_headlen(entskb); + + if (skb_has_frag_list(entskb)) + hlen = entskb->len; + hlen = min_t(int, data_len, hlen); + size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; } @@ -274,11 +353,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (queue->flags & NFQA_CFG_F_CONNTRACK) ct = nfqnl_ct_get(entskb, &size, &ctinfo); - skb = alloc_skb(size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid, + GFP_ATOMIC); if (!skb) return NULL; - old_tail = skb->tail; nlh = nlmsg_put(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg), 0); @@ -383,31 +462,29 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, goto nla_put_failure; } + if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) + goto nla_put_failure; + + if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) + goto nla_put_failure; + + if (nfqnl_put_packet_info(skb, entskb)) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; - int sz = nla_attr_size(data_len); - if (skb_tailroom(skb) < nla_total_size(data_len)) { - printk(KERN_WARNING "nf_queue: no tailroom!\n"); - kfree_skb(skb); - return NULL; - } + if (skb_tailroom(skb) < sizeof(*nla) + hlen) + goto nla_put_failure; - nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len)); + nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); nla->nla_type = NFQA_PAYLOAD; - nla->nla_len = sz; + nla->nla_len = nla_attr_size(data_len); - if (skb_copy_bits(entskb, 0, nla_data(nla), data_len)) - BUG(); + nfqnl_zcopy(skb, entskb, data_len, hlen); } - if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) - goto nla_put_failure; - - if (cap_len > 0 && 
nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) - goto nla_put_failure; - - nlh->nlmsg_len = skb->tail - old_tail; + nlh->nlmsg_len = skb->len; return skb; nla_put_failure: @@ -417,26 +494,14 @@ nla_put_failure: } static int -nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry) { struct sk_buff *nskb; - struct nfqnl_instance *queue; int err = -ENOBUFS; __be32 *packet_id_ptr; int failopen = 0; - /* rcu_read_lock()ed by nf_hook_slow() */ - queue = instance_lookup(queuenum); - if (!queue) { - err = -ESRCH; - goto err_out; - } - - if (queue->copy_mode == NFQNL_COPY_NONE) { - err = -EINVAL; - goto err_out; - } - nskb = nfqnl_build_packet_message(queue, entry, &packet_id_ptr); if (nskb == NULL) { err = -ENOMEM; @@ -463,7 +528,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, &init_net, queue->peer_portid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); if (err < 0) { queue->queue_user_dropped++; goto err_out_unlock; @@ -484,6 +549,141 @@ err_out: return err; } +static struct nf_queue_entry * +nf_queue_entry_dup(struct nf_queue_entry *e) +{ + struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); + if (entry) { + if (nf_queue_entry_get_refs(entry)) + return entry; + kfree(entry); + } + return NULL; +} + +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. + */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +static void free_entry(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); +} + +static int +__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, + struct sk_buff *skb, struct nf_queue_entry *entry) +{ + int ret = -ENOMEM; + struct nf_queue_entry *entry_seg; + + nf_bridge_adjust_segmented_data(skb); + + if (skb->next == NULL) { /* last packet, no need to copy entry */ + struct sk_buff *gso_skb = entry->skb; + entry->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry); + if (ret) + entry->skb = gso_skb; + return ret; + } + + skb->next = NULL; + + entry_seg = nf_queue_entry_dup(entry); + if (entry_seg) { + entry_seg->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry_seg); + if (ret) + free_entry(entry_seg); + } + return ret; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + unsigned int queued; + struct nfqnl_instance *queue; + struct sk_buff *skb, *segs; + int err = -ENOBUFS; + struct net *net = dev_net(entry->indev ? 
+ entry->indev : entry->outdev); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(q, queuenum); + if (!queue) + return -ESRCH; + + if (queue->copy_mode == NFQNL_COPY_NONE) + return -EINVAL; + + if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(entry->skb)) + return __nfqnl_enqueue_packet(net, queue, entry); + + skb = entry->skb; + + switch (entry->pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + nf_bridge_adjust_skb_data(skb); + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to + * mean 'ignore this hook'. + */ + if (IS_ERR(segs)) + goto out_err; + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + if (err == 0) + err = __nfqnl_enqueue_packet_gso(net, queue, + segs, entry); + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + if (queued) { + if (err) /* some segments are already queued */ + free_entry(entry); + kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); + return err; +} + static int nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) { @@ -576,18 +776,18 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void -nfqnl_dev_drop(int ifindex) +nfqnl_dev_drop(struct net *net, int ifindex) { int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp; struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &q->instance_table[i]; - hlist_for_each_entry_rcu(inst, tmp, head, hlist) + hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, dev_cmp, ifindex); } @@ -602,12 +802,9 @@ nfqnl_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) - nfqnl_dev_drop(dev->ifindex); + nfqnl_dev_drop(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } @@ -620,24 +817,24 @@ nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; + struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ - spin_lock(&instances_lock); + spin_lock(&q->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *tmp, *t2; + struct hlist_node *t2; struct nfqnl_instance *inst; - struct hlist_head *head = &instance_table[i]; + struct hlist_head *head = &q->instance_table[i]; - hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if ((n->net == &init_net) && - (n->portid == inst->peer_portid)) + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) __instance_destroy(inst); } } - spin_unlock(&instances_lock); + spin_unlock(&q->instances_lock); } return NOTIFY_DONE; } @@ -658,11 +855,12 @@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_MARK] = { .type = NLA_U32 }, }; -static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlportid) +static struct 
nfqnl_instance * +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid) { struct nfqnl_instance *queue; - queue = instance_lookup(queue_num); + queue = instance_lookup(q, queue_num); if (!queue) return ERR_PTR(-ENODEV); @@ -706,7 +904,11 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, LIST_HEAD(batch_list); u16 queue_num = ntohs(nfmsg->res_id); - queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid); + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); @@ -754,10 +956,13 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, enum ip_conntrack_info uninitialized_var(ctinfo); struct nf_conn *ct = NULL; - queue = instance_lookup(queue_num); - if (!queue) + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); - queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid); + queue = instance_lookup(q, queue_num); + if (!queue) + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); @@ -821,6 +1026,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_instance *queue; struct nfqnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); int ret = 0; if (nfqa[NFQA_CFG_CMD]) { @@ -834,7 +1041,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, } rcu_read_lock(); - queue = instance_lookup(queue_num); + queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; @@ -847,7 +1054,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -EBUSY; goto err_out_unlock; } - queue = instance_create(queue_num, NETLINK_CB(skb).portid); + queue = instance_create(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; @@ -858,7 +1066,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -ENODEV; goto err_out_unlock; } - instance_destroy(queue); + instance_destroy(q, queue); break; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: @@ -952,19 +1160,24 @@ static const struct nfnetlink_subsystem nfqnl_subsys = { #ifdef CONFIG_PROC_FS struct iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; + struct net *net; + struct nfnl_queue_net *q; if (!st) return NULL; + net = seq_file_net(seq); + q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&instance_table[st->bucket])) - return instance_table[st->bucket].first; + if (!hlist_empty(&q->instance_table[st->bucket])) + return q->instance_table[st->bucket].first; } return NULL; } @@ -972,13 +1185,17 @@ static struct hlist_node *get_first(struct seq_file *seq) static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; + struct net *net = seq_file_net(seq); h = h->next; while (!h) { + struct nfnl_queue_net *q; + if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = instance_table[st->bucket].first; + q = nfnl_queue_pernet(net); + h = q->instance_table[st->bucket].first; } return h; } @@ -994,11 +1211,11 @@ static struct hlist_node 
*get_idx(struct seq_file *seq, loff_t pos) return pos ? NULL : head; } -static void *seq_start(struct seq_file *seq, loff_t *pos) - __acquires(instances_lock) +static void *seq_start(struct seq_file *s, loff_t *pos) + __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { - spin_lock(&instances_lock); - return get_idx(seq, *pos); + spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); + return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) @@ -1008,9 +1225,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) } static void seq_stop(struct seq_file *s, void *v) - __releases(instances_lock) + __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { - spin_unlock(&instances_lock); + spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) @@ -1034,7 +1251,7 @@ static const struct seq_operations nfqnl_seq_ops = { static int nfqnl_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &nfqnl_seq_ops, + return seq_open_net(inode, file, &nfqnl_seq_ops, sizeof(struct iter_state)); } @@ -1043,39 +1260,63 @@ static const struct file_operations nfqnl_file_ops = { .open = nfqnl_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* PROC_FS */ -static int __init nfnetlink_queue_init(void) +static int __net_init nfnl_queue_net_init(struct net *net) { - int i, status = -ENOMEM; + unsigned int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&instance_table[i]); + INIT_HLIST_HEAD(&q->instance_table[i]); + + spin_lock_init(&q->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + net->nf.proc_netfilter, &nfqnl_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_queue_net_exit(struct net *net) +{ + remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +} + +static struct pernet_operations nfnl_queue_net_ops = { + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), +}; + +static int __init nfnetlink_queue_init(void) +{ + int status = -ENOMEM; netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { - printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + pr_err("nf_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_queue", 0440, - proc_net_netfilter, &nfqnl_file_ops)) + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); goto cleanup_subsys; -#endif - + } register_netdevice_notifier(&nfqnl_dev_notifier); nf_register_queue_handler(&nfqh); return status; -#ifdef CONFIG_PROC_FS cleanup_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); -#endif cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); return status; @@ -1085,9 +1326,7 @@ static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_queue", proc_net_netfilter); -#endif + unregister_pernet_subsys(&nfnl_queue_net_ops); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git 
a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 7b3a9e5999c0..8b03028cca69 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -2,6 +2,7 @@ * x_tables core - Backend for {ip,ip6,arp}_tables * * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> + * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on existing ip_tables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling @@ -999,7 +1000,7 @@ static int xt_table_open(struct inode *inode, struct file *file) sizeof(struct xt_names_priv)); if (!ret) { priv = ((struct seq_file *)file->private_data)->private; - priv->af = (unsigned long)PDE(inode)->data; + priv->af = (unsigned long)PDE_DATA(inode); } return ret; } @@ -1147,7 +1148,7 @@ static int xt_match_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = trav; - trav->nfproto = (unsigned long)PDE(inode)->data; + trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1211,7 +1212,7 @@ static int xt_target_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = trav; - trav->nfproto = (unsigned long)PDE(inode)->data; + trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1323,12 +1324,12 @@ int xt_proto_init(struct net *net, u_int8_t af) out_remove_matches: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); out_remove_tables: strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); out: return -1; #endif @@ -1342,15 +1343,15 @@ void xt_proto_fini(struct net *net, u_int8_t af) strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); - proc_net_remove(net, buf); + remove_proc_entry(buf, net->proc_net); #endif /*CONFIG_PROC_FS*/ } EXPORT_SYMBOL_GPL(xt_proto_fini); diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index ba92824086f3..3228d7f24eb4 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -124,6 +124,9 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_audit_info *info = par->targinfo; struct audit_buffer *ab; + if (audit_enabled == 0) + goto errout; + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); if (ab == NULL) goto errout; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index bde009ed8d3b..a60261cb0e80 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -20,12 +20,8 @@ #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> -static unsigned int xt_ct_target_v0(struct sk_buff *skb, - const struct xt_action_param *par) +static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) { - const struct xt_ct_target_info *info = par->targinfo; - struct nf_conn *ct = info->ct; - /* Previously seen (loopback)? Ignore. 
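[Note on the x_tables proc hunks above: dereferencing PDE(inode)->data gives way to the opaque PDE_DATA(inode) accessor, part of hiding struct proc_dir_entry from modules. A small sketch of the accessor in an open handler; demo_show/demo_open are illustrative, not the patch's code, which uses its own seq_operations.]

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
        /* m->private carries the pointer handed to proc_create_data() */
        seq_printf(m, "af=%lu\n", (unsigned long)m->private);
        return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
        return single_open(file, demo_show, PDE_DATA(inode));
}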
*/ if (skb->nfct != NULL) return XT_CONTINUE; @@ -37,21 +33,22 @@ static unsigned int xt_ct_target_v0(struct sk_buff *skb, return XT_CONTINUE; } -static unsigned int xt_ct_target_v1(struct sk_buff *skb, +static unsigned int xt_ct_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_ct_target_info_v1 *info = par->targinfo; + const struct xt_ct_target_info *info = par->targinfo; struct nf_conn *ct = info->ct; - /* Previously seen (loopback)? Ignore. */ - if (skb->nfct != NULL) - return XT_CONTINUE; + return xt_ct_target(skb, ct); +} - atomic_inc(&ct->ct_general.use); - skb->nfct = &ct->ct_general; - skb->nfctinfo = IP_CT_NEW; +static unsigned int xt_ct_target_v1(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct xt_ct_target_info_v1 *info = par->targinfo; + struct nf_conn *ct = info->ct; - return XT_CONTINUE; + return xt_ct_target(skb, ct); } static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) @@ -104,67 +101,6 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, return 0; } -static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par) -{ - struct xt_ct_target_info *info = par->targinfo; - struct nf_conntrack_tuple t; - struct nf_conn *ct; - int ret = -EOPNOTSUPP; - - if (info->flags & ~XT_CT_NOTRACK) - return -EINVAL; - - if (info->flags & XT_CT_NOTRACK) { - ct = nf_ct_untracked_get(); - atomic_inc(&ct->ct_general.use); - goto out; - } - -#ifndef CONFIG_NF_CONNTRACK_ZONES - if (info->zone) - goto err1; -#endif - - ret = nf_ct_l3proto_try_module_get(par->family); - if (ret < 0) - goto err1; - - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); - ret = PTR_ERR(ct); - if (IS_ERR(ct)) - goto err2; - - ret = 0; - if ((info->ct_events || info->exp_events) && - !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, - GFP_KERNEL)) - goto err3; - - if (info->helper[0]) { - ret = xt_ct_set_helper(ct, info->helper, par); - if (ret < 0) - goto err3; - } - - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); - - /* Overload tuple linked list to put us in template list. 
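[Note on the xt_CT rework in these hunks: the revision-specific checkentry/destroy handlers collapse onto common helpers, and, as the hunks that follow show, the v0 entry points repack their legacy layout into the v1 structure before delegating. The adapter shape abstracted into a sketch; cfg_v0/cfg_v1/do_check are illustrative types, not the patch's.]

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>

struct cfg_v0 { u32 flags; char helper[16]; };
struct cfg_v1 { u32 flags; char helper[16]; u32 timeout; };

static int do_check(const struct cfg_v1 *cfg)
{
        return (cfg->flags & ~1u) ? -EINVAL : 0;   /* shared validation */
}

static int check_v0(const struct cfg_v0 *old)
{
        struct cfg_v1 tmp = { .flags = old->flags }; /* repack v0 -> v1 */

        memcpy(tmp.helper, old->helper, sizeof(tmp.helper));
        return do_check(&tmp);
}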
*/ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &par->net->ct.tmpl); -out: - info->ct = ct; - return 0; - -err3: - nf_conntrack_free(ct); -err2: - nf_ct_l3proto_module_put(par->family); -err1: - return ret; -} - #ifdef CONFIG_NF_CONNTRACK_TIMEOUT static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout) { @@ -242,16 +178,13 @@ out: #endif } -static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) +static int xt_ct_tg_check(const struct xt_tgchk_param *par, + struct xt_ct_target_info_v1 *info) { - struct xt_ct_target_info_v1 *info = par->targinfo; struct nf_conntrack_tuple t; struct nf_conn *ct; int ret = -EOPNOTSUPP; - if (info->flags & ~XT_CT_NOTRACK) - return -EINVAL; - if (info->flags & XT_CT_NOTRACK) { ct = nf_ct_untracked_get(); atomic_inc(&ct->ct_general.use); @@ -309,20 +242,49 @@ err1: return ret; } -static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) +static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par) { struct xt_ct_target_info *info = par->targinfo; - struct nf_conn *ct = info->ct; - struct nf_conn_help *help; + struct xt_ct_target_info_v1 info_v1 = { + .flags = info->flags, + .zone = info->zone, + .ct_events = info->ct_events, + .exp_events = info->exp_events, + }; + int ret; - if (!nf_ct_is_untracked(ct)) { - help = nfct_help(ct); - if (help) - module_put(help->helper->me); + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; - nf_ct_l3proto_module_put(par->family); - } - nf_ct_put(info->ct); + memcpy(info_v1.helper, info->helper, sizeof(info->helper)); + + ret = xt_ct_tg_check(par, &info_v1); + if (ret < 0) + return ret; + + info->ct = info_v1.ct; + + return ret; +} + +static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; + + return xt_ct_tg_check(par, par->targinfo); +} + +static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + + if (info->flags & ~XT_CT_MASK) + return -EINVAL; + + return xt_ct_tg_check(par, par->targinfo); } static void xt_ct_destroy_timeout(struct nf_conn *ct) @@ -343,9 +305,9 @@ static void xt_ct_destroy_timeout(struct nf_conn *ct) #endif } -static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) +static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, + struct xt_ct_target_info_v1 *info) { - struct xt_ct_target_info_v1 *info = par->targinfo; struct nf_conn *ct = info->ct; struct nf_conn_help *help; @@ -361,6 +323,26 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) nf_ct_put(info->ct); } +static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct xt_ct_target_info_v1 info_v1 = { + .flags = info->flags, + .zone = info->zone, + .ct_events = info->ct_events, + .exp_events = info->exp_events, + .ct = info->ct, + }; + memcpy(info_v1.helper, info->helper, sizeof(info->helper)); + + xt_ct_tg_destroy(par, &info_v1); +} + +static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) +{ + xt_ct_tg_destroy(par, par->targinfo); +} + static struct xt_target xt_ct_tg_reg[] __read_mostly = { { .name = "CT", @@ -383,6 +365,17 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .table = "raw", .me = THIS_MODULE, }, + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .revision = 2, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .checkentry = xt_ct_tg_check_v2, + .destroy = xt_ct_tg_destroy_v1, + 
.target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, }; static unsigned int diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index fa40096940a1..fe573f6c9e91 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -474,7 +474,14 @@ ipt_log_packet(u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - struct sbuff *m = sb_open(); + struct sbuff *m; + struct net *net = dev_net(in ? in : out); + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); if (!loginfo) loginfo = &default_loginfo; @@ -798,7 +805,14 @@ ip6t_log_packet(u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - struct sbuff *m = sb_open(); + struct sbuff *m; + struct net *net = dev_net(in ? in : out); + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = sb_open(); if (!loginfo) loginfo = &default_loginfo; @@ -893,23 +907,55 @@ static struct nf_logger ip6t_log_logger __read_mostly = { }; #endif +static int __net_init log_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger); +#endif + return 0; +} + +static void __net_exit log_net_exit(struct net *net) +{ + nf_log_unset(net, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_unset(net, &ip6t_log_logger); +#endif +} + +static struct pernet_operations log_net_ops = { + .init = log_net_init, + .exit = log_net_exit, +}; + static int __init log_tg_init(void) { int ret; + ret = register_pernet_subsys(&log_net_ops); + if (ret < 0) + goto err_pernet; + ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); if (ret < 0) - return ret; + goto err_target; nf_log_register(NFPROTO_IPV4, &ipt_log_logger); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); #endif return 0; + +err_target: + unregister_pernet_subsys(&log_net_ops); +err_pernet: + return ret; } static void __exit log_tg_exit(void) { + unregister_pernet_subsys(&log_net_ops); nf_log_unregister(&ipt_log_logger); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) nf_log_unregister(&ip6t_log_logger); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 817f9e9f2b16..1e2fae32f81b 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -76,22 +76,31 @@ static u32 hash_v6(const struct sk_buff *skb) } #endif -static unsigned int -nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +static u32 +nfqueue_hash(const struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v1 *info = par->targinfo; u32 queue = info->queuenum; - if (info->queues_total > 1) { - if (par->family == NFPROTO_IPV4) - queue = (((u64) hash_v4(skb) * info->queues_total) >> - 32) + queue; + if (par->family == NFPROTO_IPV4) + queue += ((u64) hash_v4(skb) * info->queues_total) >> 32; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - else if (par->family == NFPROTO_IPV6) - queue = (((u64) hash_v6(skb) * info->queues_total) >> - 32) + queue; + else if (par->family == NFPROTO_IPV6) + queue += ((u64) hash_v6(skb) * info->queues_total) >> 32; #endif - } + + return queue; +} + +static unsigned int +nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) + queue = nfqueue_hash(skb, par); + return 
NF_QUEUE_NR(queue); } @@ -108,7 +117,7 @@ nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) static int nfqueue_tg_check(const struct xt_tgchk_param *par) { - const struct xt_NFQ_info_v2 *info = par->targinfo; + const struct xt_NFQ_info_v3 *info = par->targinfo; u32 maxid; if (unlikely(!rnd_inited)) { @@ -125,11 +134,32 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par) info->queues_total, maxid); return -ERANGE; } - if (par->target->revision == 2 && info->bypass > 1) + if (par->target->revision == 2 && info->flags > 1) return -EINVAL; + if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK) + return -EINVAL; + return 0; } +static unsigned int +nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v3 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) { + if (info->flags & NFQ_FLAG_CPU_FANOUT) { + int cpu = smp_processor_id(); + + queue = info->queuenum + cpu % info->queues_total; + } else + queue = nfqueue_hash(skb, par); + } + + return NF_QUEUE_NR(queue); +} + static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", @@ -156,6 +186,15 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_NFQ_info_v2), .me = THIS_MODULE, }, + { + .name = "NFQUEUE", + .revision = 3, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v3, + .targetsize = sizeof(struct xt_NFQ_info_v3), + .me = THIS_MODULE, + }, }; static int __init nfqueue_tg_init(void) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index f264032b8c56..370adf622cef 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -43,12 +43,11 @@ static void xt_rateest_hash_insert(struct xt_rateest *est) struct xt_rateest *xt_rateest_lookup(const char *name) { struct xt_rateest *est; - struct hlist_node *n; unsigned int h; h = xt_rateest_hash(name); mutex_lock(&xt_rateest_mutex); - hlist_for_each_entry(est, n, &rateest_hash[h], list) { + hlist_for_each_entry(est, &rateest_hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; mutex_unlock(&xt_rateest_mutex); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 71a266de5fb4..a75240f0d42b 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -2,6 +2,7 @@ * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c new file mode 100644 index 000000000000..12d4da8e6c77 --- /dev/null +++ b/net/netfilter/xt_bpf.c @@ -0,0 +1,73 @@ +/* Xtables module to match packets using a BPF filter. + * Copyright 2013 Google Inc. + * Written by Willem de Bruijn <willemb@google.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
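[Note on the NFQUEUE revision-3 hunks above: NFQ_FLAG_CPU_FANOUT spreads packets across the queue range by CPU id instead of by flow hash. The selection logic restated standalone; pick_queue is a hypothetical wrapper around what the hunks compute inline, and the caller is assumed to run with preemption disabled, as a netfilter hook does.]

#include <linux/smp.h>
#include <linux/types.h>

static u32 pick_queue(u32 base, u32 total, bool cpu_fanout, u32 flow_hash)
{
        if (total <= 1)
                return base;
        if (cpu_fanout)                 /* NFQ_FLAG_CPU_FANOUT behaviour */
                return base + smp_processor_id() % total;
        /* hash fanout: map a 32-bit hash uniformly onto [0, total) */
        return base + (u32)(((u64)flow_hash * total) >> 32);
}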
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/filter.h> + +#include <linux/netfilter/xt_bpf.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Willem de Bruijn <willemb@google.com>"); +MODULE_DESCRIPTION("Xtables: BPF filter match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_bpf"); +MODULE_ALIAS("ip6t_bpf"); + +static int bpf_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info *info = par->matchinfo; + struct sock_fprog program; + + program.len = info->bpf_program_num_elem; + program.filter = (struct sock_filter __user *) info->bpf_program; + if (sk_unattached_filter_create(&info->filter, &program)) { + pr_info("bpf: check failed: parse error\n"); + return -EINVAL; + } + + return 0; +} + +static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_bpf_info *info = par->matchinfo; + + return SK_RUN_FILTER(info->filter, skb); +} + +static void bpf_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_bpf_info *info = par->matchinfo; + sk_unattached_filter_destroy(info->filter); +} + +static struct xt_match bpf_mt_reg __read_mostly = { + .name = "bpf", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check, + .match = bpf_mt, + .destroy = bpf_mt_destroy, + .matchsize = sizeof(struct xt_bpf_info), + .me = THIS_MODULE, +}; + +static int __init bpf_mt_init(void) +{ + return xt_register_match(&bpf_mt_reg); +} + +static void __exit bpf_mt_exit(void) +{ + xt_unregister_match(&bpf_mt_reg); +} + +module_init(bpf_mt_init); +module_exit(bpf_mt_exit); diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c new file mode 100644 index 000000000000..9f8719df2001 --- /dev/null +++ b/net/netfilter/xt_connlabel.c @@ -0,0 +1,99 @@ +/* + * (C) 2013 Astaro GmbH & Co KG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
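[Note on the new xt_bpf match above: it validates the user-supplied classic BPF program once at rule insertion via sk_unattached_filter_create() and then runs it against every packet with SK_RUN_FILTER(), a non-zero return meaning the rule matches. For illustration, a one-instruction "accept everything" classic BPF program of the kind such a filter is built from; the userspace rule syntax in the comment is an assumption about the iptables extension, not taken from this patch.]

#include <linux/filter.h>

/* BPF_RET|BPF_K with k = 0xffffffff returns the whole packet length,
 * i.e. every packet matches. Userspace might load something like
 *   iptables -A OUTPUT -m bpf --bytecode '1,6 0 0 4294967295' -j ACCEPT
 * (hypothetical example of the 'len,code jt jf k' encoding).
 */
static struct sock_filter demo_insns[] = {
        { 0x06, 0, 0, 0xffffffff },     /* BPF_RET | BPF_K */
};

static struct sock_fprog demo_prog = {
        .len    = 1,                    /* number of instructions */
        /* in-kernel callers cast, as bpf_mt_check() does above */
        .filter = (struct sock_filter __user *)demo_insns,
};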
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("Xtables: add/match connection tracking labels"); +MODULE_ALIAS("ipt_connlabel"); +MODULE_ALIAS("ip6t_connlabel"); + +static bool +connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connlabel_mtinfo *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + bool invert = info->options & XT_CONNLABEL_OP_INVERT; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL || nf_ct_is_untracked(ct)) + return invert; + + if (info->options & XT_CONNLABEL_OP_SET) + return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; + + return nf_connlabel_match(ct, info->bit) ^ invert; +} + +static int connlabel_mt_check(const struct xt_mtchk_param *par) +{ + const int options = XT_CONNLABEL_OP_INVERT | + XT_CONNLABEL_OP_SET; + struct xt_connlabel_mtinfo *info = par->matchinfo; + int ret; + size_t words; + + if (info->bit > XT_CONNLABEL_MAXBIT) + return -ERANGE; + + if (info->options & ~options) { + pr_err("Unknown options in mask %x\n", info->options); + return -EINVAL; + } + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; + } + + par->net->ct.labels_used++; + words = BITS_TO_LONGS(info->bit+1); + if (words > par->net->ct.label_words) + par->net->ct.label_words = words; + + return ret; +} + +static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) +{ + par->net->ct.labels_used--; + if (par->net->ct.labels_used == 0) + par->net->ct.label_words = 0; + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match connlabels_mt_reg __read_mostly = { + .name = "connlabel", + .family = NFPROTO_UNSPEC, + .checkentry = connlabel_mt_check, + .match = connlabel_mt, + .matchsize = sizeof(struct xt_connlabel_mtinfo), + .destroy = connlabel_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init connlabel_mt_init(void) +{ + return xt_register_match(&connlabels_mt_reg); +} + +static void __exit connlabel_mt_exit(void) +{ + xt_unregister_match(&connlabels_mt_reg); +} + +module_init(connlabel_mt_init); +module_exit(connlabel_mt_exit); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 70b5591a2586..c40b2695633b 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -101,7 +101,7 @@ static int count_them(struct net *net, { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; - struct hlist_node *pos, *n; + struct hlist_node *n; struct nf_conn *found_ct; struct hlist_head *hash; bool addit = true; @@ -115,7 +115,7 @@ static int count_them(struct net *net, rcu_read_lock(); /* check the saved connections */ - hlist_for_each_entry_safe(conn, pos, n, hash, node) { + hlist_for_each_entry_safe(conn, n, hash, node) { found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, &conn->tuple); found_ct = NULL; @@ -258,14 +258,14 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; struct xt_connlimit_conn *conn; - struct hlist_node *pos, *n; + struct hlist_node *n; struct hlist_head *hash = info->data->iphash; unsigned int i; nf_ct_l3proto_module_put(par->family); for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { - 
hlist_for_each_entry_safe(conn, pos, n, &hash[i], node) { + hlist_for_each_entry_safe(conn, n, &hash[i], node) { hlist_del(&conn->node); kfree(conn); } diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 61805d7b38aa..188404b9b002 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -3,6 +3,7 @@ * information. (Superset of Rusty's minimalistic state match.) * * (C) 2001 Marc Boucher (marc@mbsi.ca). + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index a9d7af953ceb..9ff035c71403 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -3,6 +3,7 @@ * separately for each hashbucket (sourceip/sourceport/dstip/dstport) * * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * Development of this code was funded by Astaro AG, http://www.astaro.com/ @@ -107,6 +108,7 @@ struct xt_hashlimit_htable { /* seq_file stuff */ struct proc_dir_entry *pde; + const char *name; struct net *net; struct hlist_head hash[0]; /* hashtable itself */ @@ -141,11 +143,10 @@ dsthash_find(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) { struct dsthash_ent *ent; - struct hlist_node *pos; u_int32_t hash = hash_dst(ht, dst); if (!hlist_empty(&ht->hash[hash])) { - hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node) + hlist_for_each_entry_rcu(ent, &ht->hash[hash], node) if (dst_cmp(ent, dst)) { spin_lock(&ent->lock); return ent; @@ -254,6 +255,11 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; + hinfo->name = kstrdup(minfo->name, GFP_KERNEL); + if (!hinfo->name) { + vfree(hinfo); + return -ENOMEM; + } spin_lock_init(&hinfo->lock); hinfo->pde = proc_create_data(minfo->name, 0, @@ -261,6 +267,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, &dl_file_ops, hinfo); if (hinfo->pde == NULL) { + kfree(hinfo->name); vfree(hinfo); return -ENOMEM; } @@ -297,8 +304,8 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); for (i = 0; i < ht->cfg.size; i++) { struct dsthash_ent *dh; - struct hlist_node *pos, *n; - hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + struct hlist_node *n; + hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { if ((*select)(ht, dh)) dsthash_free(ht, dh); } @@ -331,9 +338,10 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) parent = hashlimit_net->ip6t_hashlimit; if(parent != NULL) - remove_proc_entry(hinfo->pde->name, parent); + remove_proc_entry(hinfo->name, parent); htable_selective_cleanup(hinfo, select_all); + kfree(hinfo->name); vfree(hinfo); } @@ -343,10 +351,9 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; - struct hlist_node *pos; - hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) { - if (!strcmp(name, hinfo->pde->name) && + hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { + if (!strcmp(name, hinfo->name) && hinfo->family == family) { hinfo->use++; return hinfo; @@ -821,10 
+828,9 @@ static int dl_seq_show(struct seq_file *s, void *v) struct xt_hashlimit_htable *htable = s->private; unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; - struct hlist_node *pos; if (!hlist_empty(&htable->hash[*bucket])) { - hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) + hlist_for_each_entry(ent, &htable->hash[*bucket], node) if (dl_seq_real_show(ent, htable->family, s)) return -1; } @@ -844,7 +850,7 @@ static int dl_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode)->data; + sf->private = PDE_DATA(inode); } return ret; } @@ -867,7 +873,7 @@ static int __net_init hashlimit_proc_net_init(struct net *net) #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net); if (!hashlimit_net->ip6t_hashlimit) { - proc_net_remove(net, "ipt_hashlimit"); + remove_proc_entry("ipt_hashlimit", net->proc_net); return -ENOMEM; } #endif @@ -877,7 +883,6 @@ static int __net_init hashlimit_proc_net_init(struct net *net) static void __net_exit hashlimit_proc_net_exit(struct net *net) { struct xt_hashlimit_htable *hinfo; - struct hlist_node *pos; struct proc_dir_entry *pde; struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); @@ -890,16 +895,16 @@ static void __net_exit hashlimit_proc_net_exit(struct net *net) if (pde == NULL) pde = hashlimit_net->ip6t_hashlimit; - hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) - remove_proc_entry(hinfo->pde->name, pde); + hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) + remove_proc_entry(hinfo->name, pde); hashlimit_net->ipt_hashlimit = NULL; hashlimit_net->ip6t_hashlimit = NULL; mutex_unlock(&hashlimit_mutex); - proc_net_remove(net, "ipt_hashlimit"); + remove_proc_entry("ipt_hashlimit", net->proc_net); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - proc_net_remove(net, "ip6t_hashlimit"); + remove_proc_entry("ip6t_hashlimit", net->proc_net); #endif } diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index a4c1e4528cac..bef850596558 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -1,5 +1,6 @@ /* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index a5e673d32bda..647d989a01e6 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -201,6 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) unsigned char opts[MAX_IPOPTLEN]; const struct xt_osf_finger *kf; const struct xt_osf_user_finger *f; + struct net *net = dev_net(p->in ? 
p->in : p->out); if (!info) return false; @@ -325,7 +326,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) fcount++; if (info->flags & XT_OSF_LOG) - nf_log_packet(p->family, p->hooknum, skb, + nf_log_packet(net, p->family, p->hooknum, skb, p->in, p->out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, @@ -341,7 +342,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) rcu_read_unlock(); if (!fcount && (info->flags & XT_OSF_LOG)) - nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL, + nf_log_packet(net, p->family, p->hooknum, skb, p->in, + p->out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 978efc9b555a..1e657cf715c4 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -401,8 +401,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, ret = -ENOMEM; goto out; } - pde->uid = uid; - pde->gid = gid; + proc_set_user(pde, uid, gid); #endif spin_lock_bh(&recent_lock); list_add_tail(&t->list, &recent_net->tables); @@ -525,14 +524,13 @@ static const struct seq_operations recent_seq_ops = { static int recent_seq_open(struct inode *inode, struct file *file) { - struct proc_dir_entry *pde = PDE(inode); struct recent_iter_state *st; st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); if (st == NULL) return -ENOMEM; - st->table = pde->data; + st->table = PDE_DATA(inode); return 0; } @@ -540,8 +538,7 @@ static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - struct recent_table *t = pde->data; + struct recent_table *t = PDE_DATA(file_inode(file)); struct recent_entry *e; char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; const char *c = buf; @@ -643,7 +640,7 @@ static void __net_exit recent_proc_net_exit(struct net *net) recent_net->xt_recent = NULL; spin_unlock_bh(&recent_lock); - proc_net_remove(net, "xt_recent"); + remove_proc_entry("xt_recent", net->proc_net); } #else static inline int recent_proc_net_init(struct net *net) diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 865a9e54f3ad..31790e789e22 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> - * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -30,7 +30,7 @@ MODULE_ALIAS("ip6t_SET"); static inline int match_set(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, - const struct ip_set_adt_opt *opt, int inv) + struct ip_set_adt_opt *opt, int inv) { if (ip_set_test(index, skb, par, opt)) inv = !inv; @@ -38,20 +38,12 @@ match_set(ip_set_id_t index, const struct sk_buff *skb, } #define ADT_OPT(n, f, d, fs, cfs, t) \ -const struct ip_set_adt_opt n = { \ - .family = f, \ - .dim = d, \ - .flags = fs, \ - .cmdflags = cfs, \ - .timeout = t, \ -} -#define ADT_MOPT(n, f, d, fs, cfs, t) \ struct ip_set_adt_opt n = { \ .family = f, \ .dim = d, \ .flags = fs, \ .cmdflags = cfs, \ - .timeout = t, \ + 
.ext.timeout = t, \ } /* Revision 0 interface: backward compatible with netfilter/iptables */ @@ -197,6 +189,9 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, 0, UINT_MAX); + if (opt.flags & IPSET_RETURN_NOMATCH) + opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; + return match_set(info->match_set.index, skb, par, &opt, info->match_set.flags & IPSET_INV_MATCH); } @@ -305,15 +300,15 @@ static unsigned int set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v2 *info = par->targinfo; - ADT_MOPT(add_opt, par->family, info->add_set.dim, - info->add_set.flags, info->flags, info->timeout); + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, info->del_set.flags, 0, UINT_MAX); /* Normalize to fit into jiffies */ - if (add_opt.timeout != IPSET_NO_TIMEOUT && - add_opt.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.timeout = UINT_MAX/MSEC_PER_SEC; + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -325,6 +320,52 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) #define set_target_v2_checkentry set_target_v1_checkentry #define set_target_v2_destroy set_target_v1_destroy +/* Revision 3 match */ + +static bool +match_counter(u64 counter, const struct ip_set_counter_match *info) +{ + switch (info->op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == info->value; + case IPSET_COUNTER_NE: + return counter != info->value; + case IPSET_COUNTER_LT: + return counter < info->value; + case IPSET_COUNTER_GT: + return counter > info->value; + } + return false; +} + +static bool +set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v3 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + info->match_set.flags, info->flags, UINT_MAX); + int ret; + + if (info->packets.op != IPSET_COUNTER_NONE || + info->bytes.op != IPSET_COUNTER_NONE) + opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; + + ret = match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); + + if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) + return ret; + + if (!match_counter(opt.ext.packets, &info->packets)) + return 0; + return match_counter(opt.ext.bytes, &info->bytes); +} + +#define set_match_v3_checkentry set_match_v1_checkentry +#define set_match_v3_destroy set_match_v1_destroy + static struct xt_match set_matches[] __read_mostly = { { .name = "set", @@ -377,6 +418,27 @@ static struct xt_match set_matches[] __read_mostly = { .destroy = set_match_v1_destroy, .me = THIS_MODULE }, + /* counters support: update, match */ + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 3, + .match = set_match_v3, + .matchsize = sizeof(struct xt_set_info_match_v3), + .checkentry = set_match_v3_checkentry, + .destroy = set_match_v3_destroy, + .me = THIS_MODULE + }, }; static struct 
xt_target set_targets[] __read_mostly = { diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 847d495cd4de..8a6c6ea466d8 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1189,8 +1189,6 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; - u32 skip_addr4 = cb->args[2]; - u32 skip_addr6 = cb->args[3]; u32 iter_bkt; u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; @@ -1215,7 +1213,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { - if (iter_addr4++ < skip_addr4) + if (iter_addr4++ < cb->args[2]) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1231,7 +1229,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { - if (iter_addr6++ < skip_addr6) + if (iter_addr6++ < cb->args[3]) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1250,10 +1248,10 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, unlabel_staticlist_return: rcu_read_unlock(); - cb->args[0] = skip_bkt; - cb->args[1] = skip_chain; - cb->args[2] = skip_addr4; - cb->args[3] = skip_addr6; + cb->args[0] = iter_bkt; + cb->args[1] = iter_chain; + cb->args[2] = iter_addr4; + cb->args[3] = iter_addr6; return skb->len; } @@ -1273,12 +1271,9 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, { struct netlbl_unlhsh_walk_arg cb_arg; struct netlbl_unlhsh_iface *iface; - u32 skip_addr4 = cb->args[0]; - u32 skip_addr6 = cb->args[1]; - u32 iter_addr4 = 0; + u32 iter_addr4 = 0, iter_addr6 = 0; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) - u32 iter_addr6 = 0; struct netlbl_af6list *addr6; #endif @@ -1292,7 +1287,7 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, goto unlabel_staticlistdef_return; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { - if (iter_addr4++ < skip_addr4) + if (iter_addr4++ < cb->args[0]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, @@ -1305,7 +1300,7 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { - if (iter_addr6++ < skip_addr6) + if (iter_addr6++ < cb->args[1]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, @@ -1320,8 +1315,8 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, unlabel_staticlistdef_return: rcu_read_unlock(); - cb->args[0] = skip_addr4; - cb->args[1] = skip_addr6; + cb->args[0] = iter_addr4; + cb->args[1] = iter_addr6; return skb->len; } diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig new file mode 100644 index 000000000000..2c5e95e9bfbd --- /dev/null +++ b/net/netlink/Kconfig @@ -0,0 +1,19 @@ +# +# Netlink Sockets +# + +config NETLINK_MMAP + bool "NETLINK: mmaped IO" + ---help--- + This option enables support for memory mapped netlink IO. This + reduces overhead by avoiding copying data between kernel- and + userspace. + + If unsure, say N. + +config NETLINK_DIAG + tristate "NETLINK: socket monitoring interface" + default n + ---help--- + Support for NETLINK socket monitoring interface used by the ss tool. + If unsure, say Y. 
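The NETLINK_MMAP option configured above is driven from userspace by a setsockopt()/mmap() pair that the af_netlink.c changes below implement. What follows is a minimal sketch of that calling sequence, not taken from the patch itself: it assumes the uapi half of this series exports struct nl_mmap_req and the NETLINK_RX_RING/NETLINK_TX_RING option names through <linux/netlink.h>, a 4 KiB PAGE_SIZE, and a purely illustrative ring geometry; error handling is trimmed.

#include <unistd.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270	/* not exposed by every libc header set */
#endif

int main(void)
{
	/* Geometry must pass the netlink_set_ring() checks below: block size
	 * a multiple of PAGE_SIZE, frame size >= NL_MMAP_HDRLEN and aligned
	 * to NL_MMAP_MSG_ALIGNMENT, and frame_nr == block_nr * frames_per_block.
	 */
	struct nl_mmap_req req = {
		.nm_block_size	= 4096,
		.nm_block_nr	= 64,
		.nm_frame_size	= 2048,
		.nm_frame_nr	= 64 * (4096 / 2048),
	};
	void *rx;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;

	/* Requires CAP_NET_ADMIN, per the NETLINK_RX_RING case added to
	 * netlink_setsockopt() below. */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
		return 1;

	/* netlink_mmap() demands that the length cover every configured
	 * ring; only the RX ring exists here, so map exactly its size. */
	rx = mmap(NULL, req.nm_block_size * req.nm_block_nr,
		  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (rx == MAP_FAILED)
		return 1;

	/* Each frame starts with a struct nl_mmap_hdr; its nm_status field
	 * (NL_MMAP_STATUS_UNUSED, _VALID, ...) hands frame ownership back
	 * and forth between kernel and userspace. */

	munmap(rx, req.nm_block_size * req.nm_block_nr);
	close(fd);
	return 0;
}

When both rings are configured, a single mmap() must span the RX ring followed by the TX ring, in that order, since netlink_mmap() inserts the two pg_vecs back to back.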
diff --git a/net/netlink/Makefile b/net/netlink/Makefile index bdd6ddf4e95b..e837917f6c03 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -3,3 +3,6 @@ # obj-y := af_netlink.o genetlink.o + +obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o +netlink_diag-y := diag.o diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c0353d55d56f..12ac6b47a35c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3,6 +3,7 @@ * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -55,87 +56,45 @@ #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> +#include <linux/vmalloc.h> +#include <asm/cacheflush.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> -#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) -#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) - -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 portid; - u32 dst_portid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; - void (*netlink_rcv)(struct sk_buff *skb); - void (*netlink_bind)(int group); - struct module *module; -}; +#include "af_netlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[0]; }; +/* state bits */ +#define NETLINK_CONGESTED 0x0 + +/* flags */ #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 #define NETLINK_BROADCAST_SEND_ERROR 0x4 #define NETLINK_RECV_NO_ENOBUFS 0x8 -static inline struct netlink_sock *nlk_sk(struct sock *sk) -{ - return container_of(sk, struct netlink_sock, sk); -} - static inline int netlink_is_kernel(struct sock *sk) { return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; } -struct nl_portid_hash { - struct hlist_head *table; - unsigned long rehash_time; - - unsigned int mask; - unsigned int shift; - - unsigned int entries; - unsigned int max_shift; - - u32 rnd; -}; - -struct netlink_table { - struct nl_portid_hash hash; - struct hlist_head mc_list; - struct listeners __rcu *listeners; - unsigned int flags; - unsigned int groups; - struct mutex *cb_mutex; - struct module *module; - void (*bind)(int group); - int registered; -}; - -static struct netlink_table *nl_table; +struct netlink_table *nl_table; +EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static int netlink_dump(struct sock *sk); +static void netlink_skb_destructor(struct sk_buff *skb); -static DEFINE_RWLOCK(nl_table_lock); +DEFINE_RWLOCK(nl_table_lock); +EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); @@ -152,6 +111,599 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; } +static void netlink_overrun(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } + } + atomic_inc(&sk->sk_drops); +} + +static void 
netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(NETLINK_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +#ifdef CONFIG_NETLINK_MMAP +static bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +} + +static bool netlink_rx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->rx_ring.pg_vec != NULL; +} + +static bool netlink_tx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->tx_ring.pg_vec != NULL; +} + +static __pure struct page *pgvec_to_page(const void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + else + return virt_to_page(addr); +} + +static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) { + if (pg_vec[i] != NULL) { + if (is_vmalloc_addr(pg_vec[i])) + vfree(pg_vec[i]); + else + free_pages((unsigned long)pg_vec[i], order); + } + } + kfree(pg_vec); +} + +static void *alloc_one_pg_vec_page(unsigned long order) +{ + void *buffer; + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | + __GFP_NOWARN | __GFP_NORETRY; + + buffer = (void *)__get_free_pages(gfp_flags, order); + if (buffer != NULL) + return buffer; + + buffer = vzalloc((1 << order) * PAGE_SIZE); + if (buffer != NULL) + return buffer; + + gfp_flags &= ~__GFP_NORETRY; + return (void *)__get_free_pages(gfp_flags, order); +} + +static void **alloc_pg_vec(struct netlink_sock *nlk, + struct nl_mmap_req *req, unsigned int order) +{ + unsigned int block_nr = req->nm_block_nr; + unsigned int i; + void **pg_vec, *ptr; + + pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); + if (pg_vec == NULL) + return NULL; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = ptr = alloc_one_pg_vec_page(order); + if (pg_vec[i] == NULL) + goto err1; + } + + return pg_vec; +err1: + free_pg_vec(pg_vec, order, block_nr); + return NULL; +} + +static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, + bool closing, bool tx_ring) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct sk_buff_head *queue; + void **pg_vec = NULL; + unsigned int order = 0; + int err; + + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; + queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; + + if (!closing) { + if (atomic_read(&nlk->mapped)) + return -EBUSY; + if (atomic_read(&ring->pending)) + return -EBUSY; + } + + if (req->nm_block_nr) { + if (ring->pg_vec != NULL) + return -EBUSY; + + if ((int)req->nm_block_size <= 0) + return -EINVAL; + if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) + return -EINVAL; + if (req->nm_frame_size < NL_MMAP_HDRLEN) + return -EINVAL; + if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) + return -EINVAL; + + ring->frames_per_block = req->nm_block_size / + req->nm_frame_size; + if (ring->frames_per_block == 0) + return -EINVAL; + if (ring->frames_per_block * req->nm_block_nr != + req->nm_frame_nr) + return -EINVAL; + + order = get_order(req->nm_block_size); + pg_vec = alloc_pg_vec(nlk, req, order); + if (pg_vec == NULL) + return -ENOMEM; + } else { + if (req->nm_frame_nr) + return -EINVAL; + } + + err = -EBUSY; + mutex_lock(&nlk->pg_vec_lock); + if (closing || atomic_read(&nlk->mapped) == 0) { + err = 0; + spin_lock_bh(&queue->lock); + + ring->frame_max = req->nm_frame_nr - 1; + ring->head = 0; + ring->frame_size = req->nm_frame_size; + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; + + swap(ring->pg_vec_len, req->nm_block_nr); + swap(ring->pg_vec_order, order); + swap(ring->pg_vec, pg_vec); + + __skb_queue_purge(queue); + spin_unlock_bh(&queue->lock); + + WARN_ON(atomic_read(&nlk->mapped)); + } + mutex_unlock(&nlk->pg_vec_lock); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->nm_block_nr); + return err; +} + +static void netlink_mm_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_inc(&nlk_sk(sk)->mapped); +} + +static void netlink_mm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_dec(&nlk_sk(sk)->mapped); +} + +static const struct vm_operations_struct netlink_mmap_ops = { + .open = netlink_mm_open, + .close = netlink_mm_close, +}; + +static int netlink_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + unsigned long start, size, expected; + unsigned int i; + int err = -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + mutex_lock(&nlk->pg_vec_lock); + + expected = 0; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; + expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; + } + + if (expected == 0) + goto out; + + size = vma->vm_end - vma->vm_start; + if (size != expected) + goto out; + + start = vma->vm_start; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; + + for (i = 0; i < ring->pg_vec_len; i++) { + struct page *page; + void *kaddr = ring->pg_vec[i]; + unsigned int pg_num; + + for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { + page = pgvec_to_page(kaddr); + err = vm_insert_page(vma, start, page); + if (err < 0) + goto out; + start += PAGE_SIZE; + kaddr += PAGE_SIZE; + } + } + } + + atomic_inc(&nlk->mapped); + vma->vm_ops = &netlink_mmap_ops; + err = 0; +out: + mutex_unlock(&nlk->pg_vec_lock); + return err; +} + +static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr) +{ +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 + struct page *p_start, *p_end; + + /* First page is flushed through 
netlink_{get,set}_status */ + p_start = pgvec_to_page((void *)hdr + PAGE_SIZE); + p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1); + while (p_start <= p_end) { + flush_dcache_page(p_start); + p_start++; + } +#endif +} + +static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) +{ + smp_rmb(); + flush_dcache_page(pgvec_to_page(hdr)); + return hdr->nm_status; +} + +static void netlink_set_status(struct nl_mmap_hdr *hdr, + enum nl_mmap_status status) +{ + hdr->nm_status = status; + flush_dcache_page(pgvec_to_page(hdr)); + smp_wmb(); +} + +static struct nl_mmap_hdr * +__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) +{ + unsigned int pg_vec_pos, frame_off; + + pg_vec_pos = pos / ring->frames_per_block; + frame_off = pos % ring->frames_per_block; + + return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); +} + +static struct nl_mmap_hdr * +netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, + enum nl_mmap_status status) +{ + struct nl_mmap_hdr *hdr; + + hdr = __netlink_lookup_frame(ring, pos); + if (netlink_get_status(hdr) != status) + return NULL; + + return hdr; +} + +static struct nl_mmap_hdr * +netlink_current_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + return netlink_lookup_frame(ring, ring->head, status); +} + +static struct nl_mmap_hdr * +netlink_previous_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + unsigned int prev; + + prev = ring->head ? ring->head - 1 : ring->frame_max; + return netlink_lookup_frame(ring, prev, status); +} + +static void netlink_increment_head(struct netlink_ring *ring) +{ + ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; +} + +static void netlink_forward_ring(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) + break; + if (hdr->nm_status != NL_MMAP_STATUS_SKIP) + break; + netlink_increment_head(ring); + } while (ring->head != head); +} + +static bool netlink_dump_space(struct netlink_sock *nlk) +{ + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + unsigned int n; + + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + return false; + + n = ring->head + ring->frame_max / 2; + if (n > ring->frame_max) + n -= ring->frame_max; + + hdr = __netlink_lookup_frame(ring, n); + + return hdr->nm_status == NL_MMAP_STATUS_UNUSED; +} + +static unsigned int netlink_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int mask; + int err; + + if (nlk->rx_ring.pg_vec != NULL) { + /* Memory mapped sockets don't call recvmsg(), so flow control + * for dumps is performed here. A dump is allowed to continue + * if at least half the ring is unused. 
+ */ + while (nlk->cb != NULL && netlink_dump_space(nlk)) { + err = netlink_dump(sk); + if (err < 0) { + sk->sk_err = err; + sk->sk_error_report(sk); + break; + } + } + netlink_rcv_wake(sk); + } + + mask = datagram_poll(file, sock, wait); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + netlink_forward_ring(&nlk->rx_ring); + if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + spin_lock_bh(&sk->sk_write_queue.lock); + if (nlk->tx_ring.pg_vec) { + if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLOUT | POLLWRNORM; + } + spin_unlock_bh(&sk->sk_write_queue.lock); + + return mask; +} + +static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) +{ + return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); +} + +static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, + struct netlink_ring *ring, + struct nl_mmap_hdr *hdr) +{ + unsigned int size; + void *data; + + size = ring->frame_size - NL_MMAP_HDRLEN; + data = (void *)hdr + NL_MMAP_HDRLEN; + + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->len = 0; + + skb->destructor = netlink_skb_destructor; + NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; + NETLINK_CB(skb).sk = sk; +} + +static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, + u32 dst_portid, u32 dst_group, + struct sock_iocb *siocb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + struct sk_buff *skb; + unsigned int maxlen; + bool excl = true; + int err = 0, len = 0; + + /* Netlink messages are validated by the receiver before processing. + * In order to avoid userspace changing the contents of the message + * after validation, the socket and the ring may only be used by a + * single process, otherwise we fall back to copying. 
+ */ + if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 || + atomic_read(&nlk->mapped) > 1) + excl = false; + + mutex_lock(&nlk->pg_vec_lock); + + ring = &nlk->tx_ring; + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + + do { + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); + if (hdr == NULL) { + if (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending)) + schedule(); + continue; + } + if (hdr->nm_len > maxlen) { + err = -EINVAL; + goto out; + } + + netlink_frame_flush_dcache(hdr); + + if (likely(dst_portid == 0 && dst_group == 0 && excl)) { + skb = alloc_skb_head(GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + sock_hold(sk); + netlink_ring_setup_skb(skb, sk, ring, hdr); + NETLINK_CB(skb).flags |= NETLINK_SKB_TX; + __skb_put(skb, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + } else { + skb = alloc_skb(hdr->nm_len, GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + __skb_put(skb, hdr->nm_len); + memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + } + + netlink_increment_head(ring); + + NETLINK_CB(skb).portid = nlk->portid; + NETLINK_CB(skb).dst_group = dst_group; + NETLINK_CB(skb).creds = siocb->scm->creds; + + err = security_netlink_send(sk, skb); + if (err) { + kfree_skb(skb); + goto out; + } + + if (unlikely(dst_group)) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_portid, dst_group, + GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_portid, + msg->msg_flags & MSG_DONTWAIT); + if (err < 0) + goto out; + len += err; + + } while (hdr != NULL || + (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending))); + + if (len > 0) + err = len; +out: + mutex_unlock(&nlk->pg_vec_lock); + return err; +} + +static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) +{ + struct nl_mmap_hdr *hdr; + + hdr = netlink_mmap_hdr(skb); + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); + netlink_frame_flush_dcache(hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + + NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; + kfree_skb(skb); +} + +static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + + spin_lock_bh(&sk->sk_receive_queue.lock); + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) { + spin_unlock_bh(&sk->sk_receive_queue.lock); + kfree_skb(skb); + netlink_overrun(sk); + return; + } + netlink_increment_head(ring); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); + hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); + netlink_set_status(hdr, NL_MMAP_STATUS_COPY); +} + +#else /* CONFIG_NETLINK_MMAP */ +#define netlink_skb_is_mmaped(skb) false +#define netlink_rx_is_mmaped(sk) false +#define netlink_tx_is_mmaped(sk) false +#define netlink_mmap sock_no_mmap +#define netlink_poll datagram_poll +#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0 
+#endif /* CONFIG_NETLINK_MMAP */ + static void netlink_destroy_callback(struct netlink_callback *cb) { kfree_skb(cb->skb); @@ -164,6 +716,53 @@ static void netlink_consume_callback(struct netlink_callback *cb) kfree(cb); } +static void netlink_skb_destructor(struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP + struct nl_mmap_hdr *hdr; + struct netlink_ring *ring; + struct sock *sk; + + /* If a packet from the kernel to userspace was freed because of an + * error without being delivered to userspace, the kernel must reset + * the status. In the direction userspace to kernel, the status is + * always reset here after the packet was processed and freed. + */ + if (netlink_skb_is_mmaped(skb)) { + hdr = netlink_mmap_hdr(skb); + sk = NETLINK_CB(skb).sk; + + if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + ring = &nlk_sk(sk)->tx_ring; + } else { + if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { + hdr->nm_len = 0; + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + } + ring = &nlk_sk(sk)->rx_ring; + } + + WARN_ON(atomic_read(&ring->pending) == 0); + atomic_dec(&ring->pending); + sock_put(sk); + + skb->data = NULL; + } +#endif + if (skb->sk != NULL) + sock_rfree(skb); +} + +static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + WARN_ON(skb->sk != NULL); + skb->sk = sk; + skb->destructor = netlink_skb_destructor; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); +} + static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); @@ -177,6 +776,18 @@ static void netlink_sock_destruct(struct sock *sk) } skb_queue_purge(&sk->sk_receive_queue); +#ifdef CONFIG_NETLINK_MMAP + if (1) { + struct nl_mmap_req req; + + memset(&req, 0, sizeof(req)); + if (nlk->rx_ring.pg_vec) + netlink_set_ring(sk, &req, true, false); + memset(&req, 0, sizeof(req)); + if (nlk->tx_ring.pg_vec) + netlink_set_ring(sk, &req, true, true); + } +#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -248,11 +859,10 @@ static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) struct nl_portid_hash *hash = &nl_table[protocol].hash; struct hlist_head *head; struct sock *sk; - struct hlist_node *node; read_lock(&nl_table_lock); head = nl_portid_hashfn(hash, portid); - sk_for_each(sk, node, head) { + sk_for_each(sk, head) { if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->portid == portid)) { sock_hold(sk); goto found; @@ -312,9 +922,9 @@ static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow) for (i = 0; i <= omask; i++) { struct sock *sk; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; - sk_for_each_safe(sk, node, tmp, &otable[i]) + sk_for_each_safe(sk, tmp, &otable[i]) __sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid)); } @@ -344,7 +954,6 @@ static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; - struct hlist_node *node; unsigned long mask; unsigned int i; struct listeners *listeners; @@ -355,7 +964,7 @@ netlink_update_listeners(struct sock *sk) for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; - sk_for_each_bound(sk, node, &tbl->mc_list) { + sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } @@ -371,18 +980,17 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid) struct hlist_head *head; int err = -EADDRINUSE; struct sock 
*osk; - struct hlist_node *node; int len; netlink_table_grab(); head = nl_portid_hashfn(hash, portid); len = 0; - sk_for_each(osk, node, head) { + sk_for_each(osk, head) { if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->portid == portid)) break; len++; } - if (node) + if (osk) goto err; err = -EBUSY; @@ -443,6 +1051,9 @@ static int __netlink_create(struct net *net, struct socket *sock, mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); +#ifdef CONFIG_NETLINK_MMAP + mutex_init(&nlk->pg_vec_lock); +#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -575,7 +1186,6 @@ static int netlink_autobind(struct socket *sock) struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; struct hlist_head *head; struct sock *osk; - struct hlist_node *node; s32 portid = task_tgid_vnr(current); int err; static s32 rover = -4097; @@ -584,7 +1194,7 @@ retry: cond_resched(); netlink_table_grab(); head = nl_portid_hashfn(hash, portid); - sk_for_each(osk, node, head) { + sk_for_each(osk, head) { if (!net_eq(sock_net(osk), net)) continue; if (nlk_sk(osk)->portid == portid) { @@ -775,19 +1385,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, return 0; } -static void netlink_overrun(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - } - } - atomic_inc(&sk->sk_drops); -} - static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; @@ -809,7 +1406,7 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) struct sock *netlink_getsockbyfilp(struct file *filp) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct sock *sock; if (!S_ISSOCK(inode->i_mode)) @@ -840,8 +1437,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) { + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(NETLINK_CONGESTED, &nlk->state)) && + !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -855,7 +1453,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) && + test_bit(NETLINK_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -869,7 +1467,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, } return 1; } - skb_set_owner_r(skb, sk); + netlink_skb_set_owner_r(skb, sk); return 0; } @@ -877,7 +1475,14 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; - skb_queue_tail(&sk->sk_receive_queue, skb); +#ifdef CONFIG_NETLINK_MMAP + if (netlink_skb_is_mmaped(skb)) + netlink_queue_mmaped_skb(sk, skb); + else if (netlink_rx_is_mmaped(sk)) + netlink_ring_set_copied(sk, skb); + else +#endif /* CONFIG_NETLINK_MMAP */ + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); return len; } @@ -900,7 +1505,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; - skb_orphan(skb); + WARN_ON(skb->sk != NULL); + if (netlink_skb_is_mmaped(skb)) + return skb; delta = skb->end - skb->tail; if (delta * 2 < skb->truesize) @@ -920,16 +1527,6 @@ static struct sk_buff 
*netlink_trim(struct sk_buff *skb, gfp_t allocation) return skb; } -static void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(0, &nlk->state); - if (!test_bit(0, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { @@ -939,8 +1536,8 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; - skb_set_owner_r(skb, sk); - NETLINK_CB(skb).ssk = ssk; + netlink_skb_set_owner_r(skb, sk); + NETLINK_CB(skb).sk = ssk; nlk->netlink_rcv(skb); consume_skb(skb); } else { @@ -986,6 +1583,69 @@ retry: } EXPORT_SYMBOL(netlink_unicast); +struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ +#ifdef CONFIG_NETLINK_MMAP + struct sock *sk = NULL; + struct sk_buff *skb; + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + unsigned int maxlen; + + sk = netlink_getsockbyportid(ssk, dst_portid); + if (IS_ERR(sk)) + goto out; + + ring = &nlk_sk(sk)->rx_ring; + /* fast-path without atomic ops for common case: non-mmaped receiver */ + if (ring->pg_vec == NULL) + goto out_put; + + skb = alloc_skb_head(gfp_mask); + if (skb == NULL) + goto err1; + + spin_lock_bh(&sk->sk_receive_queue.lock); + /* check again under lock */ + if (ring->pg_vec == NULL) + goto out_free; + + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + if (maxlen < size) + goto out_free; + + netlink_forward_ring(ring); + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + netlink_increment_head(ring); + + spin_unlock_bh(&sk->sk_receive_queue.lock); + return skb; + +err2: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + netlink_overrun(sk); +err1: + sock_put(sk); + return NULL; + +out_free: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); +out_put: + sock_put(sk); +out: +#endif + return alloc_skb(size, gfp_mask); +} +EXPORT_SYMBOL_GPL(netlink_alloc_skb); + int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -1010,8 +1670,8 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(0, &nlk->state)) { - skb_set_owner_r(skb, sk); + !test_bit(NETLINK_CONGESTED, &nlk->state)) { + netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } @@ -1101,7 +1761,6 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; - struct hlist_node *node; struct sock *sk; skb = netlink_trim(skb, allocation); @@ -1124,7 +1783,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid netlink_lock_table(); - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); @@ -1200,7 +1859,6 @@ out: int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; - struct hlist_node *node; struct sock *sk; int ret = 0; @@ -1212,7 +1870,7 @@ int netlink_set_err(struct sock 
*ssk, u32 portid, u32 group, int code) read_lock(&nl_table_lock); - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock(&nl_table_lock); @@ -1248,7 +1906,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optlen >= sizeof(int) && + if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && + optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -1290,13 +1949,32 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, case NETLINK_NO_ENOBUFS: if (val) { nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(0, &nlk->state); + clear_bit(NETLINK_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; } err = 0; break; +#ifdef CONFIG_NETLINK_MMAP + case NETLINK_RX_RING: + case NETLINK_TX_RING: { + struct nl_mmap_req req; + + /* Rings might consume more memory than queue limits, require + * CAP_NET_ADMIN. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (optlen < sizeof(req)) + return -EINVAL; + if (copy_from_user(&req, optval, sizeof(req))) + return -EFAULT; + err = netlink_set_ring(sk, &req, false, + optname == NETLINK_TX_RING); + break; + } +#endif /* CONFIG_NETLINK_MMAP */ default: err = -ENOPROTOOPT; } @@ -1407,6 +2085,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; } + if (netlink_tx_is_mmaped(sk) && + msg->msg_iov->iov_base == NULL) { + err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, + siocb); + goto out; + } + err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; @@ -1676,10 +2361,9 @@ int netlink_change_ngroups(struct sock *sk, unsigned int groups) void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; - struct hlist_node *node; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; - sk_for_each_bound(sk, node, &tbl->mc_list) + sk_for_each_bound(sk, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } @@ -1702,7 +2386,7 @@ struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; - int size = NLMSG_LENGTH(len); + int size = nlmsg_msg_size(len); nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; @@ -1711,7 +2395,7 @@ __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int fla nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) - memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size); + memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); @@ -1740,9 +2424,13 @@ static int netlink_dump(struct sock *sk) alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL); + if (!netlink_rx_is_mmaped(sk) && + atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + goto errout_skb; + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL); if (!skb) goto errout_skb; + netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -1797,13 +2485,25 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, if (cb == NULL) return -ENOBUFS; + /* Memory mapped dump requests need to be copied to avoid looping + * on the pending state in netlink_mmap_sendmsg() while the CB holds + * a reference 
to the skb. + */ + if (netlink_skb_is_mmaped(skb)) { + skb = skb_copy(skb, GFP_KERNEL); + if (skb == NULL) { + kfree(cb); + return -ENOBUFS; + } + } else + atomic_inc(&skb->users); + cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; - atomic_inc(&skb->users); cb->skb = skb; sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); @@ -1857,7 +2557,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (err) payload += nlmsg_len(nlh); - skb = nlmsg_new(payload, GFP_KERNEL); + skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), + NETLINK_CB(in_skb).portid, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -1974,14 +2675,13 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) struct nl_seq_iter *iter = seq->private; int i, j; struct sock *s; - struct hlist_node *node; loff_t off = 0; for (i = 0; i < MAX_LINKS; i++) { struct nl_portid_hash *hash = &nl_table[i].hash; for (j = 0; j <= hash->mask; j++) { - sk_for_each(s, node, &hash->table[j]) { + sk_for_each(s, &hash->table[j]) { if (sock_net(s) != seq_file_net(seq)) continue; if (off == pos) { @@ -2124,7 +2824,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = datagram_poll, + .poll = netlink_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -2132,7 +2832,7 @@ static const struct proto_ops netlink_ops = { .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = sock_no_mmap, + .mmap = netlink_mmap, .sendpage = sock_no_sendpage, }; @@ -2145,7 +2845,7 @@ static const struct net_proto_family netlink_family_ops = { static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS - if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) + if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops)) return -ENOMEM; #endif return 0; @@ -2154,7 +2854,7 @@ static int __net_init netlink_net_init(struct net *net) static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "netlink"); + remove_proc_entry("netlink", net->proc_net); #endif } @@ -2185,7 +2885,6 @@ static struct pernet_operations __net_initdata netlink_net_ops = { static int __init netlink_proto_init(void) { - struct sk_buff *dummy_skb; int i; unsigned long limit; unsigned int order; @@ -2194,7 +2893,7 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; - BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h new file mode 100644 index 000000000000..ed8522265f4e --- /dev/null +++ b/net/netlink/af_netlink.h @@ -0,0 +1,82 @@ +#ifndef _AF_NETLINK_H +#define _AF_NETLINK_H + +#include <net/sock.h> + +#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) +#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) + +struct netlink_ring { + void **pg_vec; + unsigned int head; + unsigned int frames_per_block; + unsigned int frame_size; + unsigned int frame_max; + + unsigned int pg_vec_order; + unsigned int pg_vec_pages; + unsigned int pg_vec_len; + + atomic_t pending; +}; + +struct netlink_sock { + 
/* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 portid; + u32 dst_portid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + void (*netlink_bind)(int group); + struct module *module; +#ifdef CONFIG_NETLINK_MMAP + struct mutex pg_vec_lock; + struct netlink_ring rx_ring; + struct netlink_ring tx_ring; + atomic_t mapped; +#endif /* CONFIG_NETLINK_MMAP */ +}; + +static inline struct netlink_sock *nlk_sk(struct sock *sk) +{ + return container_of(sk, struct netlink_sock, sk); +} + +struct nl_portid_hash { + struct hlist_head *table; + unsigned long rehash_time; + + unsigned int mask; + unsigned int shift; + + unsigned int entries; + unsigned int max_shift; + + u32 rnd; +}; + +struct netlink_table { + struct nl_portid_hash hash; + struct hlist_head mc_list; + struct listeners __rcu *listeners; + unsigned int flags; + unsigned int groups; + struct mutex *cb_mutex; + struct module *module; + void (*bind)(int group); + int registered; +}; + +extern struct netlink_table *nl_table; +extern rwlock_t nl_table_lock; + +#endif diff --git a/net/netlink/diag.c b/net/netlink/diag.c new file mode 100644 index 000000000000..1af29624b92f --- /dev/null +++ b/net/netlink/diag.c @@ -0,0 +1,227 @@ +#include <linux/module.h> + +#include <net/sock.h> +#include <linux/netlink.h> +#include <linux/sock_diag.h> +#include <linux/netlink_diag.h> + +#include "af_netlink.h" + +#ifdef CONFIG_NETLINK_MMAP +static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, + struct sk_buff *nlskb) +{ + struct netlink_diag_ring ndr; + + ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; + ndr.ndr_block_nr = ring->pg_vec_len; + ndr.ndr_frame_size = ring->frame_size; + ndr.ndr_frame_nr = ring->frame_max + 1; + + return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); +} + +static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int ret; + + mutex_lock(&nlk->pg_vec_lock); + ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); + if (!ret) + ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, + nlskb); + mutex_unlock(&nlk->pg_vec_lock); + + return ret; +} +#else +static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) +{ + return 0; +} +#endif + +static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->groups == NULL) + return 0; + + return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups), + nlk->groups); +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_diag_req *req, + u32 portid, u32 seq, u32 flags, int sk_ino) +{ + struct nlmsghdr *nlh; + struct netlink_diag_msg *rep; + struct netlink_sock *nlk = nlk_sk(sk); + + nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); + if (!nlh) + return -EMSGSIZE; + + rep = nlmsg_data(nlh); + rep->ndiag_family = AF_NETLINK; + rep->ndiag_type = sk->sk_type; + rep->ndiag_protocol = sk->sk_protocol; + rep->ndiag_state = sk->sk_state; + + rep->ndiag_ino = sk_ino; + rep->ndiag_portid = nlk->portid; + rep->ndiag_dst_portid = nlk->dst_portid; + rep->ndiag_dst_group = nlk->dst_group; + sock_diag_save_cookie(sk, rep->ndiag_cookie); + + if ((req->ndiag_show & NDIAG_SHOW_GROUPS) && + sk_diag_dump_groups(sk, skb)) + 
goto out_nlmsg_trim; + + if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) && + sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) + goto out_nlmsg_trim; + + if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && + sk_diag_put_rings_cfg(sk, skb)) + goto out_nlmsg_trim; + + return nlmsg_end(skb, nlh); + +out_nlmsg_trim: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + int protocol, int s_num) +{ + struct netlink_table *tbl = &nl_table[protocol]; + struct nl_portid_hash *hash = &tbl->hash; + struct net *net = sock_net(skb->sk); + struct netlink_diag_req *req; + struct sock *sk; + int ret = 0, num = 0, i; + + req = nlmsg_data(cb->nlh); + + for (i = 0; i <= hash->mask; i++) { + sk_for_each(sk, &hash->table[i]) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) { + num++; + continue; + } + + if (sk_diag_fill(sk, skb, req, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + ret = 1; + goto done; + } + + num++; + } + } + + sk_for_each_bound(sk, &tbl->mc_list) { + if (sk_hashed(sk)) + continue; + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) { + num++; + continue; + } + + if (sk_diag_fill(sk, skb, req, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + ret = 1; + goto done; + } + num++; + } +done: + cb->args[0] = num; + cb->args[1] = protocol; + + return ret; +} + +static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netlink_diag_req *req; + int s_num = cb->args[0]; + + req = nlmsg_data(cb->nlh); + + read_lock(&nl_table_lock); + + if (req->sdiag_protocol == NDIAG_PROTO_ALL) { + int i; + + for (i = cb->args[1]; i < MAX_LINKS; i++) { + if (__netlink_diag_dump(skb, cb, i, s_num)) + break; + s_num = 0; + } + } else { + if (req->sdiag_protocol >= MAX_LINKS) { + read_unlock(&nl_table_lock); + return -ENOENT; + } + + __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); + } + + read_unlock(&nl_table_lock); + + return skb->len; +} + +static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct netlink_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = netlink_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } else + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler netlink_diag_handler = { + .family = AF_NETLINK, + .dump = netlink_diag_handler_dump, +}; + +static int __init netlink_diag_init(void) +{ + return sock_diag_register(&netlink_diag_handler); +} + +static void __exit netlink_diag_exit(void) +{ + sock_diag_unregister(&netlink_diag_handler); +} + +module_init(netlink_diag_init); +module_exit(netlink_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f2aabb6f4105..2fd6dbea327a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -16,10 +16,12 @@ #include <linux/skbuff.h> #include <linux/mutex.h> #include <linux/bitmap.h> +#include <linux/rwsem.h> #include <net/sock.h> #include <net/genetlink.h> static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ +static DECLARE_RWSEM(cb_lock); void genl_lock(void) { @@ -41,6 +43,18 @@ int lockdep_genl_is_held(void) 
EXPORT_SYMBOL(lockdep_genl_is_held); #endif +static void genl_lock_all(void) +{ + down_write(&cb_lock); + genl_lock(); +} + +static void genl_unlock_all(void) +{ + genl_unlock(); + up_write(&cb_lock); +} + #define GENL_FAM_TAB_SIZE 16 #define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) @@ -142,8 +156,9 @@ int genl_register_mc_group(struct genl_family *family, int err = 0; BUG_ON(grp->name[0] == '\0'); + BUG_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL); - genl_lock(); + genl_lock_all(); /* special-case our own group */ if (grp == &notify_grp) @@ -212,7 +227,7 @@ int genl_register_mc_group(struct genl_family *family, genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, grp); out: - genl_unlock(); + genl_unlock_all(); return err; } EXPORT_SYMBOL(genl_register_mc_group); @@ -254,9 +269,9 @@ static void __genl_unregister_mc_group(struct genl_family *family, void genl_unregister_mc_group(struct genl_family *family, struct genl_multicast_group *grp) { - genl_lock(); + genl_lock_all(); __genl_unregister_mc_group(family, grp); - genl_unlock(); + genl_unlock_all(); } EXPORT_SYMBOL(genl_unregister_mc_group); @@ -302,9 +317,9 @@ int genl_register_ops(struct genl_family *family, struct genl_ops *ops) if (ops->policy) ops->flags |= GENL_CMD_CAP_HASPOL; - genl_lock(); + genl_lock_all(); list_add_tail(&ops->ops_list, &family->ops_list); - genl_unlock(); + genl_unlock_all(); genl_ctrl_event(CTRL_CMD_NEWOPS, ops); err = 0; @@ -333,16 +348,16 @@ int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) { struct genl_ops *rc; - genl_lock(); + genl_lock_all(); list_for_each_entry(rc, &family->ops_list, ops_list) { if (rc == ops) { list_del(&ops->ops_list); - genl_unlock(); + genl_unlock_all(); genl_ctrl_event(CTRL_CMD_DELOPS, ops); return 0; } } - genl_unlock(); + genl_unlock_all(); return -ENOENT; } @@ -372,7 +387,7 @@ int genl_register_family(struct genl_family *family) INIT_LIST_HEAD(&family->ops_list); INIT_LIST_HEAD(&family->mcast_groups); - genl_lock(); + genl_lock_all(); if (genl_family_find_byname(family->name)) { err = -EEXIST; @@ -393,7 +408,7 @@ int genl_register_family(struct genl_family *family) goto errout_locked; } - if (family->maxattr) { + if (family->maxattr && !family->parallel_ops) { family->attrbuf = kmalloc((family->maxattr+1) * sizeof(struct nlattr *), GFP_KERNEL); if (family->attrbuf == NULL) { @@ -404,14 +419,14 @@ int genl_register_family(struct genl_family *family) family->attrbuf = NULL; list_add_tail(&family->family_list, genl_family_chain(family->id)); - genl_unlock(); + genl_unlock_all(); genl_ctrl_event(CTRL_CMD_NEWFAMILY, family); return 0; errout_locked: - genl_unlock(); + genl_unlock_all(); errout: return err; } @@ -475,7 +490,7 @@ int genl_unregister_family(struct genl_family *family) { struct genl_family *rc; - genl_lock(); + genl_lock_all(); genl_unregister_mc_groups(family); @@ -485,14 +500,14 @@ int genl_unregister_family(struct genl_family *family) list_del(&rc->family_list); INIT_LIST_HEAD(&family->ops_list); - genl_unlock(); + genl_unlock_all(); kfree(family->attrbuf); genl_ctrl_event(CTRL_CMD_DELFAMILY, family); return 0; } - genl_unlock(); + genl_unlock_all(); return -ENOENT; } @@ -529,19 +544,17 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); -static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int genl_family_rcv_msg(struct genl_family *family, + struct sk_buff *skb, + struct nlmsghdr *nlh) { struct genl_ops *ops; - struct genl_family *family; struct net *net = sock_net(skb->sk); struct genl_info 
info; struct genlmsghdr *hdr = nlmsg_data(nlh); + struct nlattr **attrbuf; int hdrlen, err; - family = genl_family_find_byid(nlh->nlmsg_type); - if (family == NULL) - return -ENOENT; - /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) return -ENOENT; @@ -559,29 +572,33 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EPERM; if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ops->dumpit, + .done = ops->done, + }; + if (ops->dumpit == NULL) return -EOPNOTSUPP; - genl_unlock(); - { - struct netlink_dump_control c = { - .dump = ops->dumpit, - .done = ops->done, - }; - err = netlink_dump_start(net->genl_sock, skb, nlh, &c); - } - genl_lock(); - return err; + return netlink_dump_start(net->genl_sock, skb, nlh, &c); } if (ops->doit == NULL) return -EOPNOTSUPP; - if (family->attrbuf) { - err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, + if (family->maxattr && family->parallel_ops) { + attrbuf = kmalloc((family->maxattr+1) * + sizeof(struct nlattr *), GFP_KERNEL); + if (attrbuf == NULL) + return -ENOMEM; + } else + attrbuf = family->attrbuf; + + if (attrbuf) { + err = nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, ops->policy); if (err < 0) - return err; + goto out; } info.snd_seq = nlh->nlmsg_seq; @@ -589,14 +606,14 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; - info.attrs = family->attrbuf; + info.attrs = attrbuf; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); if (family->pre_doit) { err = family->pre_doit(ops, skb, &info); if (err) - return err; + goto out; } err = ops->doit(skb, &info); @@ -604,14 +621,38 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (family->post_doit) family->post_doit(ops, skb, &info); +out: + if (family->parallel_ops) + kfree(attrbuf); + + return err; +} + +static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct genl_family *family; + int err; + + family = genl_family_find_byid(nlh->nlmsg_type); + if (family == NULL) + return -ENOENT; + + if (!family->parallel_ops) + genl_lock(); + + err = genl_family_rcv_msg(family, skb, nlh); + + if (!family->parallel_ops) + genl_unlock(); + return err; } static void genl_rcv(struct sk_buff *skb) { - genl_lock(); + down_read(&cb_lock); netlink_rcv_skb(skb, &genl_rcv_msg); - genl_unlock(); + up_read(&cb_lock); } /************************************************************************** @@ -917,7 +958,6 @@ static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, - .cb_mutex = &genl_mutex, .flags = NL_CFG_F_NONROOT_RECV, }; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7261eb81974f..ec0c80fde69f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -104,10 +104,9 @@ static void nr_remove_socket(struct sock *sk) static void nr_kill_by_device(struct net_device *dev) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&nr_list_lock); - sk_for_each(s, node, &nr_list) + sk_for_each(s, &nr_list) if (nr_sk(s)->device == dev) nr_disconnect(s, ENETUNREACH); spin_unlock_bh(&nr_list_lock); @@ -149,10 +148,9 @@ static void nr_insert_socket(struct sock *sk) static struct sock *nr_find_listener(ax25_address *addr) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&nr_list_lock); - sk_for_each(s, node, &nr_list) + sk_for_each(s, 
&nr_list) if (!ax25cmp(&nr_sk(s)->source_addr, addr) && s->sk_state == TCP_LISTEN) { bh_lock_sock(s); @@ -170,10 +168,9 @@ found: static struct sock *nr_find_socket(unsigned char index, unsigned char id) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&nr_list_lock); - sk_for_each(s, node, &nr_list) { + sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->my_index == index && nr->my_id == id) { @@ -194,10 +191,9 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id, ax25_address *dest) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&nr_list_lock); - sk_for_each(s, node, &nr_list) { + sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->your_index == index && nr->your_id == id && @@ -838,6 +834,8 @@ static int nr_getname(struct socket *sock, struct sockaddr *uaddr, struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); + memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25)); + lock_sock(sk); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) { @@ -1177,6 +1175,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, } if (sax != NULL) { + memset(sax, 0, sizeof(*sax)); sax->sax25_family = AF_NETROM; skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call, AX25_ADDR_LEN); @@ -1452,9 +1451,9 @@ static int __init nr_proto_init(void) nr_loopback_init(); - proc_net_fops_create(&init_net, "nr", S_IRUGO, &nr_info_fops); - proc_net_fops_create(&init_net, "nr_neigh", S_IRUGO, &nr_neigh_fops); - proc_net_fops_create(&init_net, "nr_nodes", S_IRUGO, &nr_nodes_fops); + proc_create("nr", S_IRUGO, init_net.proc_net, &nr_info_fops); + proc_create("nr_neigh", S_IRUGO, init_net.proc_net, &nr_neigh_fops); + proc_create("nr_nodes", S_IRUGO, init_net.proc_net, &nr_nodes_fops); out: return rc; fail: @@ -1482,9 +1481,9 @@ static void __exit nr_exit(void) { int i; - proc_net_remove(&init_net, "nr"); - proc_net_remove(&init_net, "nr_neigh"); - proc_net_remove(&init_net, "nr_nodes"); + remove_proc_entry("nr", init_net.proc_net); + remove_proc_entry("nr_neigh", init_net.proc_net); + remove_proc_entry("nr_nodes", init_net.proc_net); nr_loopback_clear(); nr_rt_free(); diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 70ffff76a967..b976d5eff2de 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -49,10 +49,9 @@ static struct nr_node *nr_node_get(ax25_address *callsign) { struct nr_node *found = NULL; struct nr_node *nr_node; - struct hlist_node *node; spin_lock_bh(&nr_node_list_lock); - nr_node_for_each(nr_node, node, &nr_node_list) + nr_node_for_each(nr_node, &nr_node_list) if (ax25cmp(callsign, &nr_node->callsign) == 0) { nr_node_hold(nr_node); found = nr_node; @@ -67,10 +66,9 @@ static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign, { struct nr_neigh *found = NULL; struct nr_neigh *nr_neigh; - struct hlist_node *node; spin_lock_bh(&nr_neigh_list_lock); - nr_neigh_for_each(nr_neigh, node, &nr_neigh_list) + nr_neigh_for_each(nr_neigh, &nr_neigh_list) if (ax25cmp(callsign, &nr_neigh->callsign) == 0 && nr_neigh->dev == dev) { nr_neigh_hold(nr_neigh); @@ -114,10 +112,9 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, */ if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) { struct nr_node *nr_nodet; - struct hlist_node *node; spin_lock_bh(&nr_node_list_lock); - nr_node_for_each(nr_nodet, node, &nr_node_list) { + nr_node_for_each(nr_nodet, &nr_node_list) { nr_node_lock(nr_nodet); for (i = 0; i < nr_nodet->count; i++) if 
(nr_nodet->routes[i].neighbour == nr_neigh) @@ -485,11 +482,11 @@ static int nr_dec_obs(void) { struct nr_neigh *nr_neigh; struct nr_node *s; - struct hlist_node *node, *nodet; + struct hlist_node *nodet; int i; spin_lock_bh(&nr_node_list_lock); - nr_node_for_each_safe(s, node, nodet, &nr_node_list) { + nr_node_for_each_safe(s, nodet, &nr_node_list) { nr_node_lock(s); for (i = 0; i < s->count; i++) { switch (s->routes[i].obs_count) { @@ -540,15 +537,15 @@ static int nr_dec_obs(void) void nr_rt_device_down(struct net_device *dev) { struct nr_neigh *s; - struct hlist_node *node, *nodet, *node2, *node2t; + struct hlist_node *nodet, *node2t; struct nr_node *t; int i; spin_lock_bh(&nr_neigh_list_lock); - nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) { + nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) { if (s->dev == dev) { spin_lock_bh(&nr_node_list_lock); - nr_node_for_each_safe(t, node2, node2t, &nr_node_list) { + nr_node_for_each_safe(t, node2t, &nr_node_list) { nr_node_lock(t); for (i = 0; i < t->count; i++) { if (t->routes[i].neighbour == s) { @@ -737,11 +734,10 @@ int nr_rt_ioctl(unsigned int cmd, void __user *arg) void nr_link_failed(ax25_cb *ax25, int reason) { struct nr_neigh *s, *nr_neigh = NULL; - struct hlist_node *node; struct nr_node *nr_node = NULL; spin_lock_bh(&nr_neigh_list_lock); - nr_neigh_for_each(s, node, &nr_neigh_list) { + nr_neigh_for_each(s, &nr_neigh_list) { if (s->ax25 == ax25) { nr_neigh_hold(s); nr_neigh = s; @@ -761,7 +757,7 @@ void nr_link_failed(ax25_cb *ax25, int reason) return; } spin_lock_bh(&nr_node_list_lock); - nr_node_for_each(nr_node, node, &nr_node_list) { + nr_node_for_each(nr_node, &nr_node_list) { nr_node_lock(nr_node); if (nr_node->which < nr_node->count && nr_node->routes[nr_node->which].neighbour == nr_neigh) @@ -1013,16 +1009,16 @@ void __exit nr_rt_free(void) { struct nr_neigh *s = NULL; struct nr_node *t = NULL; - struct hlist_node *node, *nodet; + struct hlist_node *nodet; spin_lock_bh(&nr_neigh_list_lock); spin_lock_bh(&nr_node_list_lock); - nr_node_for_each_safe(t, node, nodet, &nr_node_list) { + nr_node_for_each_safe(t, nodet, &nr_node_list) { nr_node_lock(t); nr_remove_node_locked(t); nr_node_unlock(t); } - nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) { + nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) { while(s->count) { s->count--; nr_neigh_put(s); diff --git a/net/nfc/Kconfig b/net/nfc/Kconfig index 60c3bbb63e8e..5948b2fc72f6 100644 --- a/net/nfc/Kconfig +++ b/net/nfc/Kconfig @@ -4,6 +4,7 @@ menuconfig NFC depends on NET + depends on RFKILL || !RFKILL tristate "NFC subsystem support" default n help @@ -15,6 +16,5 @@ menuconfig NFC source "net/nfc/nci/Kconfig" source "net/nfc/hci/Kconfig" -source "net/nfc/llcp/Kconfig" source "drivers/nfc/Kconfig" diff --git a/net/nfc/Makefile b/net/nfc/Makefile index d1a117c2c401..fb799deaed4f 100644 --- a/net/nfc/Makefile +++ b/net/nfc/Makefile @@ -5,6 +5,8 @@ obj-$(CONFIG_NFC) += nfc.o obj-$(CONFIG_NFC_NCI) += nci/ obj-$(CONFIG_NFC_HCI) += hci/ +#obj-$(CONFIG_NFC_LLCP) += llcp/ + +nfc-objs := core.o netlink.o af_nfc.o rawsock.o llcp_core.o llcp_commands.o \ + llcp_sock.o -nfc-objs := core.o netlink.o af_nfc.o rawsock.o -nfc-$(CONFIG_NFC_LLCP) += llcp/llcp.o llcp/commands.o llcp/sock.o diff --git a/net/nfc/core.c b/net/nfc/core.c index aa64ea441676..40d2527693da 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -27,6 +27,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/rfkill.h> #include <linux/nfc.h> #include <net/genetlink.h> @@ 
-58,6 +59,11 @@ int nfc_dev_up(struct nfc_dev *dev) device_lock(&dev->dev); + if (dev->rfkill && rfkill_blocked(dev->rfkill)) { + rc = -ERFKILL; + goto error; + } + if (!device_is_registered(&dev->dev)) { rc = -ENODEV; goto error; @@ -117,6 +123,24 @@ error: return rc; } +static int nfc_rfkill_set_block(void *data, bool blocked) +{ + struct nfc_dev *dev = data; + + pr_debug("%s blocked %d", dev_name(&dev->dev), blocked); + + if (!blocked) + return 0; + + nfc_dev_down(dev); + + return 0; +} + +static const struct rfkill_ops nfc_rfkill_ops = { + .set_block = nfc_rfkill_set_block, +}; + /** * nfc_start_poll - start polling for nfc targets * @@ -143,6 +167,11 @@ int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols) goto error; } + if (!dev->dev_up) { + rc = -ENODEV; + goto error; + } + if (dev->polling) { rc = -EBUSY; goto error; @@ -338,7 +367,7 @@ int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol) dev->active_target = target; dev->rf_mode = NFC_RF_INITIATOR; - if (dev->ops->check_presence) + if (dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } @@ -429,7 +458,7 @@ int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb, rc = dev->ops->im_transceive(dev, dev->active_target, skb, cb, cb_context); - if (!rc && dev->ops->check_presence) + if (!rc && dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } else if (dev->rf_mode == NFC_RF_TARGET && dev->ops->tm_send != NULL) { @@ -684,11 +713,6 @@ static void nfc_release(struct device *d) pr_debug("dev_name=%s\n", dev_name(&dev->dev)); - if (dev->ops->check_presence) { - del_timer_sync(&dev->check_pres_timer); - cancel_work_sync(&dev->check_pres_work); - } - nfc_genl_data_exit(&dev->genl_data); kfree(dev->targets); kfree(dev); @@ -706,15 +730,16 @@ static void nfc_check_pres_work(struct work_struct *work) rc = dev->ops->check_presence(dev, dev->active_target); if (rc == -EOPNOTSUPP) goto exit; - if (!rc) { - mod_timer(&dev->check_pres_timer, jiffies + - msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); - } else { + if (rc) { u32 active_target_idx = dev->active_target->idx; device_unlock(&dev->dev); nfc_target_lost(dev, active_target_idx); return; } + + if (!dev->shutting_down) + mod_timer(&dev->check_pres_timer, jiffies + + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } exit: @@ -734,10 +759,10 @@ struct class nfc_class = { }; EXPORT_SYMBOL(nfc_class); -static int match_idx(struct device *d, void *data) +static int match_idx(struct device *d, const void *data) { struct nfc_dev *dev = to_nfc_dev(d); - unsigned int *idx = data; + const unsigned int *idx = data; return dev->idx == *idx; } @@ -761,6 +786,7 @@ struct nfc_dev *nfc_get_device(unsigned int idx) */ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, u32 supported_protocols, + u32 supported_se, int tx_headroom, int tx_tailroom) { struct nfc_dev *dev; @@ -778,6 +804,8 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, dev->ops = ops; dev->supported_protocols = supported_protocols; + dev->supported_se = supported_se; + dev->active_se = NFC_SE_NONE; dev->tx_headroom = tx_headroom; dev->tx_tailroom = tx_tailroom; @@ -836,6 +864,15 @@ int nfc_register_device(struct nfc_dev *dev) pr_debug("The userspace won't be notified that the device %s was added\n", dev_name(&dev->dev)); + dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev, + RFKILL_TYPE_NFC, 
&nfc_rfkill_ops, dev); + if (dev->rfkill) { + if (rfkill_register(dev->rfkill) < 0) { + rfkill_destroy(dev->rfkill); + dev->rfkill = NULL; + } + } + return 0; } EXPORT_SYMBOL(nfc_register_device); @@ -853,26 +890,32 @@ void nfc_unregister_device(struct nfc_dev *dev) id = dev->idx; - mutex_lock(&nfc_devlist_mutex); - nfc_devlist_generation++; + if (dev->rfkill) { + rfkill_unregister(dev->rfkill); + rfkill_destroy(dev->rfkill); + } - /* lock to avoid unregistering a device while an operation - is in progress */ - device_lock(&dev->dev); - device_del(&dev->dev); - device_unlock(&dev->dev); + if (dev->ops->check_presence) { + device_lock(&dev->dev); + dev->shutting_down = true; + device_unlock(&dev->dev); + del_timer_sync(&dev->check_pres_timer); + cancel_work_sync(&dev->check_pres_work); + } - mutex_unlock(&nfc_devlist_mutex); + rc = nfc_genl_device_removed(dev); + if (rc) + pr_debug("The userspace won't be notified that the device %s " + "was removed\n", dev_name(&dev->dev)); nfc_llcp_unregister_device(dev); - rc = nfc_genl_device_removed(dev); - if (rc) - pr_debug("The userspace won't be notified that the device %s was removed\n", - dev_name(&dev->dev)); + mutex_lock(&nfc_devlist_mutex); + nfc_devlist_generation++; + device_del(&dev->dev); + mutex_unlock(&nfc_devlist_mutex); ida_simple_remove(&nfc_index_ida, id); - } EXPORT_SYMBOL(nfc_unregister_device); diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index 7d99410e6c1a..64f922be9281 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -280,14 +280,19 @@ static int nfc_hci_delete_pipe(struct nfc_hci_dev *hdev, u8 pipe) static int nfc_hci_clear_all_pipes(struct nfc_hci_dev *hdev) { u8 param[2]; + size_t param_len = 2; /* TODO: Find out what the identity reference data is * and fill param with it. HCI spec 6.1.3.5 */ pr_debug("\n"); + if (test_bit(NFC_HCI_QUIRK_SHORT_CLEAR, &hdev->quirks)) + param_len = 0; + return nfc_hci_execute_cmd(hdev, NFC_HCI_ADMIN_PIPE, - NFC_HCI_ADM_CLEAR_ALL_PIPE, param, 2, NULL); + NFC_HCI_ADM_CLEAR_ALL_PIPE, param, param_len, + NULL); } int nfc_hci_disconnect_gate(struct nfc_hci_dev *hdev, u8 gate) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 7bea574d5934..91020b210d87 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -57,6 +57,8 @@ static void nfc_hci_msg_tx_work(struct work_struct *work) int r = 0; mutex_lock(&hdev->msg_tx_mutex); + if (hdev->shutting_down) + goto exit; if (hdev->cmd_pending_msg) { if (timer_pending(&hdev->cmd_timer) == 0) { @@ -295,6 +297,12 @@ void nfc_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event, goto exit; } + if (hdev->ops->event_received) { + r = hdev->ops->event_received(hdev, gate, event, skb); + if (r <= 0) + goto exit_noskb; + } + switch (event) { case NFC_HCI_EVT_TARGET_DISCOVERED: if (skb->len < 1) { /* no status data? */ @@ -320,17 +328,15 @@ void nfc_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event, r = nfc_hci_target_discovered(hdev, gate); break; default: - if (hdev->ops->event_received) { - hdev->ops->event_received(hdev, gate, event, skb); - return; - } - + pr_info("Discarded unknown event %x to gate %x\n", event, gate); + r = -EINVAL; break; } exit: kfree_skb(skb); +exit_noskb: if (r) { /* TODO: There was an error dispatching the event, * how to propagate up to nfc core? 
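/* Editor's sketch, not part of the patch above: with the reordered dispatch
 * in nfc_hci_event_received(), a driver's event_received hook now runs
 * before the HCI core's standard event handling. Judging from the hunk,
 * returning 1 defers to the core, which runs its event switch and frees the
 * skb; returning 0 or a negative value branches to exit_noskb, so the driver
 * keeps ownership of the skb. A minimal hypothetical hook, where
 * foo_hci_event_received(), FOO_PROPRIETARY_GATE and foo_handle_evt() are
 * illustrative names only:
 */
static int foo_hci_event_received(struct nfc_hci_dev *hdev, u8 gate,
				  u8 event, struct sk_buff *skb)
{
	if (gate != FOO_PROPRIETARY_GATE)
		return 1;	/* let the core's switch handle standard events */

	foo_handle_evt(hdev, event, skb);	/* driver consumes the skb */
	return 0;	/* core skips its processing and its kfree_skb() */
}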
@@ -669,8 +675,10 @@ static int hci_tm_send(struct nfc_dev *nfc_dev, struct sk_buff *skb) if (hdev->ops->tm_send) return hdev->ops->tm_send(hdev, skb); - else - return -ENOTSUPP; + + kfree_skb(skb); + + return -ENOTSUPP; } static int hci_check_presence(struct nfc_dev *nfc_dev, @@ -787,7 +795,9 @@ static struct nfc_ops hci_nfc_ops = { struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops, struct nfc_hci_init_data *init_data, + unsigned long quirks, u32 protocols, + u32 supported_se, const char *llc_name, int tx_headroom, int tx_tailroom, @@ -813,7 +823,7 @@ struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops, return NULL; } - hdev->ndev = nfc_allocate_device(&hci_nfc_ops, protocols, + hdev->ndev = nfc_allocate_device(&hci_nfc_ops, protocols, supported_se, tx_headroom + HCI_CMDS_HEADROOM, tx_tailroom); if (!hdev->ndev) { @@ -830,6 +840,8 @@ struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops, memset(hdev->gate2pipe, NFC_HCI_INVALID_PIPE, sizeof(hdev->gate2pipe)); + hdev->quirks = quirks; + return hdev; } EXPORT_SYMBOL(nfc_hci_allocate_device); @@ -868,6 +880,28 @@ void nfc_hci_unregister_device(struct nfc_hci_dev *hdev) { struct hci_msg *msg, *n; + mutex_lock(&hdev->msg_tx_mutex); + + if (hdev->cmd_pending_msg) { + if (hdev->cmd_pending_msg->cb) + hdev->cmd_pending_msg->cb( + hdev->cmd_pending_msg->cb_context, + NULL, -ESHUTDOWN); + kfree(hdev->cmd_pending_msg); + hdev->cmd_pending_msg = NULL; + } + + hdev->shutting_down = true; + + mutex_unlock(&hdev->msg_tx_mutex); + + del_timer_sync(&hdev->cmd_timer); + cancel_work_sync(&hdev->msg_tx_work); + + cancel_work_sync(&hdev->msg_rx_work); + + nfc_unregister_device(hdev->ndev); + skb_queue_purge(&hdev->rx_hcp_frags); skb_queue_purge(&hdev->msg_rx_queue); @@ -876,13 +910,6 @@ void nfc_hci_unregister_device(struct nfc_hci_dev *hdev) skb_queue_purge(&msg->msg_frags); kfree(msg); } - - del_timer_sync(&hdev->cmd_timer); - - nfc_unregister_device(hdev->ndev); - - cancel_work_sync(&hdev->msg_tx_work); - cancel_work_sync(&hdev->msg_rx_work); } EXPORT_SYMBOL(nfc_hci_unregister_device); diff --git a/net/nfc/hci/hcp.c b/net/nfc/hci/hcp.c index bc308a7ca609..b6b4109f2343 100644 --- a/net/nfc/hci/hcp.c +++ b/net/nfc/hci/hcp.c @@ -105,6 +105,13 @@ int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe, } mutex_lock(&hdev->msg_tx_mutex); + + if (hdev->shutting_down) { + err = -ESHUTDOWN; + mutex_unlock(&hdev->msg_tx_mutex); + goto out_skb_err; + } + list_add_tail(&cmd->msg_l, &hdev->msg_tx_queue); mutex_unlock(&hdev->msg_tx_mutex); diff --git a/net/nfc/llcp/llcp.h b/net/nfc/llcp.h index 0d62366f8cc3..ff8c434f7df8 100644 --- a/net/nfc/llcp/llcp.h +++ b/net/nfc/llcp.h @@ -31,6 +31,7 @@ enum llcp_state { #define LLCP_MAX_LTO 0xff #define LLCP_MAX_RW 15 #define LLCP_MAX_MIUX 0x7ff +#define LLCP_MAX_MIU (LLCP_MAX_MIUX + 128) #define LLCP_WKS_NUM_SAP 16 #define LLCP_SDP_NUM_SAP 16 @@ -46,6 +47,19 @@ struct llcp_sock_list { rwlock_t lock; }; +struct nfc_llcp_sdp_tlv { + u8 *tlv; + u8 tlv_len; + + char *uri; + u8 tid; + u8 sap; + + unsigned long time; + + struct hlist_node node; +}; + struct nfc_llcp_local { struct list_head list; struct nfc_dev *dev; @@ -86,6 +100,12 @@ struct nfc_llcp_local { u8 remote_opt; u16 remote_wks; + struct mutex sdreq_lock; + struct hlist_head pending_sdreqs; + struct timer_list sdreq_timer; + struct work_struct sdreq_timeout_work; + u8 sdreq_next_tid; + /* sockets array */ struct llcp_sock_list sockets; struct llcp_sock_list connecting_sockets; @@ -105,7 +125,12 @@ struct nfc_llcp_sock { 
char *service_name; size_t service_name_len; u8 rw; - u16 miu; + __be16 miux; + + + /* Remote link parameters */ + u8 remote_rw; + u16 remote_miu; /* Link variables */ u8 send_n; @@ -121,7 +146,6 @@ struct nfc_llcp_sock { struct sk_buff_head tx_queue; struct sk_buff_head tx_pending_queue; - struct sk_buff_head tx_backlog_queue; struct list_head accept_queue; struct sock *parent; @@ -139,6 +163,7 @@ struct nfc_llcp_ui_cb { #define LLCP_HEADER_SIZE 2 #define LLCP_SEQUENCE_SIZE 1 +#define LLCP_AGF_PDU_HEADER_SIZE 2 /* LLCP versions: 1.1 is 1.0 plus SDP */ #define LLCP_VERSION_10 0x10 @@ -187,6 +212,7 @@ struct nfc_llcp_ui_cb { void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *s); void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *s); +void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock); struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev); struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local); int nfc_llcp_local_put(struct nfc_llcp_local *local); @@ -214,12 +240,20 @@ int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, /* Commands API */ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err); u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length); +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap); +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, + size_t uri_len); +void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp); +void nfc_llcp_free_sdp_tlv_list(struct hlist_head *sdp_head); void nfc_llcp_recv(void *data, struct sk_buff *skb, int err); int nfc_llcp_disconnect(struct nfc_llcp_sock *sock); int nfc_llcp_send_symm(struct nfc_dev *dev); int nfc_llcp_send_connect(struct nfc_llcp_sock *sock); int nfc_llcp_send_cc(struct nfc_llcp_sock *sock); -int nfc_llcp_send_snl(struct nfc_llcp_local *local, u8 tid, u8 sap); +int nfc_llcp_send_snl_sdres(struct nfc_llcp_local *local, + struct hlist_head *tlv_list, size_t tlvs_len); +int nfc_llcp_send_snl_sdreq(struct nfc_llcp_local *local, + struct hlist_head *tlv_list, size_t tlvs_len); int nfc_llcp_send_dm(struct nfc_llcp_local *local, u8 ssap, u8 dsap, u8 reason); int nfc_llcp_send_disconnect(struct nfc_llcp_sock *sock); int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, diff --git a/net/nfc/llcp/Kconfig b/net/nfc/llcp/Kconfig deleted file mode 100644 index a1a41cd68255..000000000000 --- a/net/nfc/llcp/Kconfig +++ /dev/null @@ -1,7 +0,0 @@ -config NFC_LLCP - depends on NFC - bool "NFC LLCP support" - default n - help - Say Y here if you want to build support for a kernel NFC LLCP - implementation.
\ No newline at end of file diff --git a/net/nfc/llcp/commands.c b/net/nfc/llcp_commands.c index df24be48d4da..c1b23eef83ca 100644 --- a/net/nfc/llcp/commands.c +++ b/net/nfc/llcp_commands.c @@ -26,7 +26,7 @@ #include <net/nfc/nfc.h> -#include "../nfc.h" +#include "nfc.h" #include "llcp.h" static u8 llcp_tlv_length[LLCP_TLV_MAX] = { @@ -117,6 +117,88 @@ u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length) return tlv; } +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) +{ + struct nfc_llcp_sdp_tlv *sdres; + u8 value[2]; + + sdres = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); + if (sdres == NULL) + return NULL; + + value[0] = tid; + value[1] = sap; + + sdres->tlv = nfc_llcp_build_tlv(LLCP_TLV_SDRES, value, 2, + &sdres->tlv_len); + if (sdres->tlv == NULL) { + kfree(sdres); + return NULL; + } + + sdres->tid = tid; + sdres->sap = sap; + + INIT_HLIST_NODE(&sdres->node); + + return sdres; +} + +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, + size_t uri_len) +{ + struct nfc_llcp_sdp_tlv *sdreq; + + pr_debug("uri: %s, len: %zu\n", uri, uri_len); + + sdreq = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); + if (sdreq == NULL) + return NULL; + + sdreq->tlv_len = uri_len + 3; + + if (uri[uri_len - 1] == 0) + sdreq->tlv_len--; + + sdreq->tlv = kzalloc(sdreq->tlv_len + 1, GFP_KERNEL); + if (sdreq->tlv == NULL) { + kfree(sdreq); + return NULL; + } + + sdreq->tlv[0] = LLCP_TLV_SDREQ; + sdreq->tlv[1] = sdreq->tlv_len - 2; + sdreq->tlv[2] = tid; + + sdreq->tid = tid; + sdreq->uri = sdreq->tlv + 3; + memcpy(sdreq->uri, uri, uri_len); + + sdreq->time = jiffies; + + INIT_HLIST_NODE(&sdreq->node); + + return sdreq; +} + +void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp) +{ + kfree(sdp->tlv); + kfree(sdp); +} + +void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head) +{ + struct nfc_llcp_sdp_tlv *sdp; + struct hlist_node *n; + + hlist_for_each_entry_safe(sdp, n, head, node) { + hlist_del(&sdp->node); + + nfc_llcp_free_sdp_tlv(sdp); + } +} + int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, u8 *tlv_array, u16 tlv_array_len) { @@ -184,10 +266,10 @@ int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, switch (type) { case LLCP_TLV_MIUX: - sock->miu = llcp_tlv_miux(tlv) + 128; + sock->remote_miu = llcp_tlv_miux(tlv) + 128; break; case LLCP_TLV_RW: - sock->rw = llcp_tlv_rw(tlv); + sock->remote_rw = llcp_tlv_rw(tlv); break; case LLCP_TLV_SN: break; @@ -200,7 +282,8 @@ int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, tlv += length + 2; } - pr_debug("sock %p rw %d miu %d\n", sock, sock->rw, sock->miu); + pr_debug("sock %p rw %d miu %d\n", sock, + sock->remote_rw, sock->remote_miu); return 0; } @@ -304,6 +387,8 @@ int nfc_llcp_send_symm(struct nfc_dev *dev) skb = llcp_add_header(skb, 0, 0, LLCP_PDU_SYMM); + __net_timestamp(skb); + nfc_llcp_send_to_raw_sock(local, skb, NFC_LLCP_DIRECTION_TX); return nfc_data_exchange(dev, local->target_idx, skb, @@ -316,9 +401,9 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) struct sk_buff *skb; u8 *service_name_tlv = NULL, service_name_tlv_length; u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length; + u8 *rw_tlv = NULL, rw_tlv_length, rw; int err; - u16 size = 0; + u16 size = 0, miux; pr_debug("Sending CONNECT\n"); @@ -334,11 +419,16 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) size += service_name_tlv_length; } - miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0, + /* If the socket parameters are not set, use the 
local ones */ + miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ? + local->miux : sock->miux; + rw = sock->rw > LLCP_MAX_RW ? local->rw : sock->rw; + + miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); size += miux_tlv_length; - rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &local->rw, 0, &rw_tlv_length); + rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); size += rw_tlv_length; pr_debug("SKB size %d SN length %zu\n", size, sock->service_name_len); @@ -375,9 +465,9 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) struct nfc_llcp_local *local; struct sk_buff *skb; u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length; + u8 *rw_tlv = NULL, rw_tlv_length, rw; int err; - u16 size = 0; + u16 size = 0, miux; pr_debug("Sending CC\n"); @@ -385,11 +475,16 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) if (local == NULL) return -ENODEV; - miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0, + /* If the socket parameters are not set, use the local ones */ + miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ? + local->miux : sock->miux; + rw = sock->rw > LLCP_MAX_RW ? local->rw : sock->rw; + + miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); size += miux_tlv_length; - rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &local->rw, 0, &rw_tlv_length); + rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); size += rw_tlv_length; skb = llcp_allocate_pdu(sock, LLCP_PDU_CC, size); @@ -414,48 +509,90 @@ error_tlv: return err; } -int nfc_llcp_send_snl(struct nfc_llcp_local *local, u8 tid, u8 sap) +static struct sk_buff *nfc_llcp_allocate_snl(struct nfc_llcp_local *local, + size_t tlv_length) { struct sk_buff *skb; struct nfc_dev *dev; - u8 *sdres_tlv = NULL, sdres_tlv_length, sdres[2]; u16 size = 0; - pr_debug("Sending SNL tid 0x%x sap 0x%x\n", tid, sap); - if (local == NULL) - return -ENODEV; + return ERR_PTR(-ENODEV); dev = local->dev; if (dev == NULL) - return -ENODEV; - - sdres[0] = tid; - sdres[1] = sap; - sdres_tlv = nfc_llcp_build_tlv(LLCP_TLV_SDRES, sdres, 0, - &sdres_tlv_length); - if (sdres_tlv == NULL) - return -ENOMEM; + return ERR_PTR(-ENODEV); size += LLCP_HEADER_SIZE; size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; - size += sdres_tlv_length; + size += tlv_length; skb = alloc_skb(size, GFP_KERNEL); - if (skb == NULL) { - kfree(sdres_tlv); - return -ENOMEM; - } + if (skb == NULL) + return ERR_PTR(-ENOMEM); skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); skb = llcp_add_header(skb, LLCP_SAP_SDP, LLCP_SAP_SDP, LLCP_PDU_SNL); - memcpy(skb_put(skb, sdres_tlv_length), sdres_tlv, sdres_tlv_length); + return skb; +} + +int nfc_llcp_send_snl_sdres(struct nfc_llcp_local *local, + struct hlist_head *tlv_list, size_t tlvs_len) +{ + struct nfc_llcp_sdp_tlv *sdp; + struct hlist_node *n; + struct sk_buff *skb; + + skb = nfc_llcp_allocate_snl(local, tlvs_len); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + hlist_for_each_entry_safe(sdp, n, tlv_list, node) { + memcpy(skb_put(skb, sdp->tlv_len), sdp->tlv, sdp->tlv_len); + + hlist_del(&sdp->node); + + nfc_llcp_free_sdp_tlv(sdp); + } skb_queue_tail(&local->tx_queue, skb); - kfree(sdres_tlv); + return 0; +} + +int nfc_llcp_send_snl_sdreq(struct nfc_llcp_local *local, + struct hlist_head *tlv_list, size_t tlvs_len) +{ + struct nfc_llcp_sdp_tlv *sdreq; + struct hlist_node *n; + struct sk_buff *skb; + + skb = nfc_llcp_allocate_snl(local, tlvs_len); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + mutex_lock(&local->sdreq_lock); + + if 
(hlist_empty(&local->pending_sdreqs)) + mod_timer(&local->sdreq_timer, + jiffies + msecs_to_jiffies(3 * local->remote_lto)); + + hlist_for_each_entry_safe(sdreq, n, tlv_list, node) { + pr_debug("tid %d for %s\n", sdreq->tid, sdreq->uri); + + memcpy(skb_put(skb, sdreq->tlv_len), sdreq->tlv, + sdreq->tlv_len); + + hlist_del(&sdreq->node); + + hlist_add_head(&sdreq->node, &local->pending_sdreqs); + } + + mutex_unlock(&local->sdreq_lock); + + skb_queue_tail(&local->tx_queue, skb); return 0; } @@ -521,6 +658,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, struct nfc_llcp_local *local; size_t frag_len = 0, remaining_len; u8 *msg_data, *msg_ptr; + u16 remote_miu; pr_debug("Send I frame len %zd\n", len); @@ -530,8 +668,8 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, /* Remote is ready but has not acknowledged our frames */ if((sock->remote_ready && - skb_queue_len(&sock->tx_pending_queue) >= sock->rw && - skb_queue_len(&sock->tx_queue) >= 2 * sock->rw)) { + skb_queue_len(&sock->tx_pending_queue) >= sock->remote_rw && + skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) { pr_err("Pending queue is full %d frames\n", skb_queue_len(&sock->tx_pending_queue)); return -ENOBUFS; @@ -539,7 +677,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, /* Remote is not ready and we've been queueing enough frames */ if ((!sock->remote_ready && - skb_queue_len(&sock->tx_queue) >= 2 * sock->rw)) { + skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) { pr_err("Tx queue is full %d frames\n", skb_queue_len(&sock->tx_queue)); return -ENOBUFS; @@ -557,9 +695,11 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, remaining_len = len; msg_ptr = msg_data; - while (remaining_len > 0) { + do { + remote_miu = sock->remote_miu > LLCP_MAX_MIU ? + local->remote_miu : sock->remote_miu; - frag_len = min_t(size_t, sock->miu, remaining_len); + frag_len = min_t(size_t, remote_miu, remaining_len); pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); @@ -571,7 +711,8 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, skb_put(pdu, LLCP_SEQUENCE_SIZE); - memcpy(skb_put(pdu, frag_len), msg_ptr, frag_len); + if (likely(frag_len > 0)) + memcpy(skb_put(pdu, frag_len), msg_ptr, frag_len); skb_queue_tail(&sock->tx_queue, pdu); @@ -583,7 +724,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, remaining_len -= frag_len; msg_ptr += frag_len; - } + } while (remaining_len > 0); kfree(msg_data); @@ -597,6 +738,7 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, struct nfc_llcp_local *local; size_t frag_len = 0, remaining_len; u8 *msg_ptr, *msg_data; + u16 remote_miu; int err; pr_debug("Send UI frame len %zd\n", len); @@ -617,9 +759,11 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, remaining_len = len; msg_ptr = msg_data; - while (remaining_len > 0) { + do { + remote_miu = sock->remote_miu > LLCP_MAX_MIU ? 
+ local->remote_miu : sock->remote_miu; - frag_len = min_t(size_t, sock->miu, remaining_len); + frag_len = min_t(size_t, remote_miu, remaining_len); pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); @@ -633,14 +777,15 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI); - memcpy(skb_put(pdu, frag_len), msg_ptr, frag_len); + if (likely(frag_len > 0)) + memcpy(skb_put(pdu, frag_len), msg_ptr, frag_len); /* No need to check for the peer RW for UI frames */ skb_queue_tail(&local->tx_queue, pdu); remaining_len -= frag_len; msg_ptr += frag_len; - } + } while (remaining_len > 0); kfree(msg_data); diff --git a/net/nfc/llcp/llcp.c b/net/nfc/llcp_core.c index ec43914c92a9..158bdbf668cc 100644 --- a/net/nfc/llcp/llcp.c +++ b/net/nfc/llcp_core.c @@ -24,13 +24,15 @@ #include <linux/list.h> #include <linux/nfc.h> -#include "../nfc.h" +#include "nfc.h" #include "llcp.h" static u8 llcp_magic[3] = {0x46, 0x66, 0x6d}; static struct list_head llcp_devices; +static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb); + void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *sk) { write_lock(&l->lock); @@ -45,6 +47,12 @@ void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *sk) write_unlock(&l->lock); } +void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock) +{ + sock->remote_rw = LLCP_DEFAULT_RW; + sock->remote_miu = LLCP_MAX_MIU + 1; +} + static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local = sock->local; @@ -54,7 +62,6 @@ static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); - skb_queue_purge(&sock->tx_backlog_queue); if (local == NULL) return; @@ -69,17 +76,18 @@ static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) } } -static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen) +static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device, + int err) { struct sock *sk; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; struct nfc_llcp_sock *llcp_sock; skb_queue_purge(&local->tx_queue); write_lock(&local->sockets.lock); - sk_for_each_safe(sk, node, tmp, &local->sockets.head) { + sk_for_each_safe(sk, tmp, &local->sockets.head) { llcp_sock = nfc_llcp_sock(sk); bh_lock_sock(sk); @@ -101,39 +109,51 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen) nfc_llcp_accept_unlink(accept_sk); + if (err) + accept_sk->sk_err = err; accept_sk->sk_state = LLCP_CLOSED; + accept_sk->sk_state_change(sk); bh_unlock_sock(accept_sk); - - sock_orphan(accept_sk); } - - if (listen == true) { - bh_unlock_sock(sk); - continue; - } - } - - /* - * If we have a connection less socket bound, we keep it alive - * if the device is still present. 
- */ - if (sk->sk_state == LLCP_BOUND && sk->sk_type == SOCK_DGRAM && - listen == true) { - bh_unlock_sock(sk); - continue; } + if (err) + sk->sk_err = err; sk->sk_state = LLCP_CLOSED; + sk->sk_state_change(sk); bh_unlock_sock(sk); - sock_orphan(sk); - sk_del_node_init(sk); } write_unlock(&local->sockets.lock); + + /* If we still have a device, we keep the RAW sockets alive */ + if (device == true) + return; + + write_lock(&local->raw_sockets.lock); + + sk_for_each_safe(sk, tmp, &local->raw_sockets.head) { + llcp_sock = nfc_llcp_sock(sk); + + bh_lock_sock(sk); + + nfc_llcp_socket_purge(llcp_sock); + + if (err) + sk->sk_err = err; + sk->sk_state = LLCP_CLOSED; + sk->sk_state_change(sk); + + bh_unlock_sock(sk); + + sk_del_node_init(sk); + } + + write_unlock(&local->raw_sockets.lock); } struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) @@ -143,20 +163,28 @@ struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) return local; } -static void local_release(struct kref *ref) +static void local_cleanup(struct nfc_llcp_local *local) { - struct nfc_llcp_local *local; - - local = container_of(ref, struct nfc_llcp_local, ref); - - list_del(&local->list); - nfc_llcp_socket_release(local, false); + nfc_llcp_socket_release(local, false, ENXIO); del_timer_sync(&local->link_timer); skb_queue_purge(&local->tx_queue); cancel_work_sync(&local->tx_work); cancel_work_sync(&local->rx_work); cancel_work_sync(&local->timeout_work); kfree_skb(local->rx_pending); + del_timer_sync(&local->sdreq_timer); + cancel_work_sync(&local->sdreq_timeout_work); + nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs); +} + +static void local_release(struct kref *ref) +{ + struct nfc_llcp_local *local; + + local = container_of(ref, struct nfc_llcp_local, ref); + + list_del(&local->list); + local_cleanup(local); kfree(local); } @@ -172,7 +200,6 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, u8 ssap, u8 dsap) { struct sock *sk; - struct hlist_node *node; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("ssap dsap %d %d\n", ssap, dsap); @@ -184,7 +211,7 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, llcp_sock = NULL; - sk_for_each(sk, node, &local->sockets.head) { + sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); if (tmp_sock->ssap == ssap && tmp_sock->dsap == dsap) { @@ -225,6 +252,47 @@ static void nfc_llcp_symm_timer(unsigned long data) schedule_work(&local->timeout_work); } +static void nfc_llcp_sdreq_timeout_work(struct work_struct *work) +{ + unsigned long time; + HLIST_HEAD(nl_sdres_list); + struct hlist_node *n; + struct nfc_llcp_sdp_tlv *sdp; + struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, + sdreq_timeout_work); + + mutex_lock(&local->sdreq_lock); + + time = jiffies - msecs_to_jiffies(3 * local->remote_lto); + + hlist_for_each_entry_safe(sdp, n, &local->pending_sdreqs, node) { + if (time_after(sdp->time, time)) + continue; + + sdp->sap = LLCP_SDP_UNBOUND; + + hlist_del(&sdp->node); + + hlist_add_head(&sdp->node, &nl_sdres_list); + } + + if (!hlist_empty(&local->pending_sdreqs)) + mod_timer(&local->sdreq_timer, + jiffies + msecs_to_jiffies(3 * local->remote_lto)); + + mutex_unlock(&local->sdreq_lock); + + if (!hlist_empty(&nl_sdres_list)) + nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); +} + +static void nfc_llcp_sdreq_timer(unsigned long data) +{ + struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; + + schedule_work(&local->sdreq_timeout_work); +} + 
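/* Editor's sketch, not part of the patch above: the aging rule behind
 * nfc_llcp_sdreq_timeout_work(). Each pending SDREQ TLV is stamped with
 * jiffies when built (sdp->time in nfc_llcp_build_sdreq_tlv()), and the
 * worker expires entries once three remote link timeouts have elapsed,
 * reporting them to user space as unbound via nfc_genl_llc_send_sdres().
 * The per-entry test, restated as a hypothetical helper foo_sdreq_expired():
 */
static bool foo_sdreq_expired(const struct nfc_llcp_sdp_tlv *sdp,
			      const struct nfc_llcp_local *local)
{
	/* local->remote_lto is kept in milliseconds; !time_after() mirrors
	 * the worker's check and is safe across jiffies wraparound. */
	return !time_after(sdp->time,
			   jiffies - msecs_to_jiffies(3 * local->remote_lto));
}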
struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev) { struct nfc_llcp_local *local, *n; @@ -273,7 +341,6 @@ struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, u8 *sn, size_t sn_len) { struct sock *sk; - struct hlist_node *node; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("sn %zd %p\n", sn_len, sn); @@ -285,7 +352,7 @@ struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, llcp_sock = NULL; - sk_for_each(sk, node, &local->sockets.head) { + sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); pr_debug("llcp sock %p\n", tmp_sock); @@ -550,14 +617,13 @@ int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) pr_err("No LLCP device\n"); return -ENODEV; } + if (gb_len < 3) + return -EINVAL; memset(local->remote_gb, 0, NFC_MAX_GT_LEN); memcpy(local->remote_gb, gb, gb_len); local->remote_gb_len = gb_len; - if (local->remote_gb == NULL || local->remote_gb_len == 0) - return -ENODEV; - if (memcmp(local->remote_gb, llcp_magic, 3)) { pr_err("MAC does not support LLCP\n"); return -EINVAL; @@ -603,14 +669,13 @@ static void nfc_llcp_set_nrns(struct nfc_llcp_sock *sock, struct sk_buff *pdu) void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction) { - struct hlist_node *node; struct sk_buff *skb_copy = NULL, *nskb; struct sock *sk; u8 *data; read_lock(&local->raw_sockets.lock); - sk_for_each(sk, node, &local->raw_sockets.head) { + sk_for_each(sk, &local->raw_sockets.head) { if (sk->sk_state != LLCP_BOUND) continue; @@ -668,6 +733,8 @@ static void nfc_llcp_tx_work(struct work_struct *work) if (ptype == LLCP_PDU_I) copy_skb = skb_copy(skb, GFP_ATOMIC); + __net_timestamp(skb); + nfc_llcp_send_to_raw_sock(local, skb, NFC_LLCP_DIRECTION_TX); @@ -697,11 +764,10 @@ static struct nfc_llcp_sock *nfc_llcp_connecting_sock_get(struct nfc_llcp_local { struct sock *sk; struct nfc_llcp_sock *llcp_sock; - struct hlist_node *node; read_lock(&local->connecting_sockets.lock); - sk_for_each(sk, node, &local->connecting_sockets.head) { + sk_for_each(sk, &local->connecting_sockets.head) { llcp_sock = nfc_llcp_sock(sk); if (llcp_sock->ssap == ssap) { @@ -770,8 +836,6 @@ static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, ui_cb->dsap = dsap; ui_cb->ssap = ssap; - printk("%s %d %d\n", __func__, dsap, ssap); - pr_debug("%d %d\n", dsap, ssap); /* We're looking for a bound socket, not a client one */ @@ -781,9 +845,14 @@ static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, /* There is no sequence with UI frames */ skb_pull(skb, LLCP_HEADER_SIZE); - if (sock_queue_rcv_skb(&llcp_sock->sk, skb)) { - pr_err("receive queue is full\n"); - skb_queue_head(&llcp_sock->tx_backlog_queue, skb); + if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { + /* + * UI frames will be freed from the socket layer, so we + * need to keep them alive until someone receives them. 
+ */ + skb_get(skb); + } else { + pr_err("Receive queue is full\n"); } nfc_llcp_sock_put(llcp_sock); @@ -863,7 +932,9 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, new_sock = nfc_llcp_sock(new_sk); new_sock->dev = local->dev; new_sock->local = nfc_llcp_local_get(local); - new_sock->miu = local->remote_miu; + new_sock->rw = sock->rw; + new_sock->miux = sock->miux; + new_sock->remote_miu = local->remote_miu; new_sock->nfc_protocol = sock->nfc_protocol; new_sock->dsap = ssap; new_sock->target_idx = local->target_idx; @@ -917,11 +988,11 @@ int nfc_llcp_queue_i_frames(struct nfc_llcp_sock *sock) pr_debug("Remote ready %d tx queue len %d remote rw %d", sock->remote_ready, skb_queue_len(&sock->tx_pending_queue), - sock->rw); + sock->remote_rw); /* Try to queue some I frames for transmission */ while (sock->remote_ready && - skb_queue_len(&sock->tx_pending_queue) < sock->rw) { + skb_queue_len(&sock->tx_pending_queue) < sock->remote_rw) { struct sk_buff *pdu; pdu = skb_dequeue(&sock->tx_queue); @@ -976,9 +1047,14 @@ static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, pr_err("Received out of sequence I PDU\n"); skb_pull(skb, LLCP_HEADER_SIZE + LLCP_SEQUENCE_SIZE); - if (sock_queue_rcv_skb(&llcp_sock->sk, skb)) { - pr_err("receive queue is full\n"); - skb_queue_head(&llcp_sock->tx_backlog_queue, skb); + if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { + /* + * I frames will be freed from the socket layer, so we + * need to keep them alive until someone receives them. + */ + skb_get(skb); + } else { + pr_err("Receive queue is full\n"); } } @@ -1030,6 +1106,12 @@ static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); + if ((dsap == 0) && (ssap == 0)) { + pr_debug("Connection termination"); + nfc_dep_link_down(local->dev); + return; + } + llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); if (llcp_sock == NULL) { nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); @@ -1136,6 +1218,10 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, u16 tlv_len, offset; char *service_name; size_t service_name_len; + struct nfc_llcp_sdp_tlv *sdp; + HLIST_HEAD(llc_sdres_list); + size_t sdres_tlvs_len; + HLIST_HEAD(nl_sdres_list); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); @@ -1150,6 +1236,7 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, tlv = &skb->data[LLCP_HEADER_SIZE]; tlv_len = skb->len - LLCP_HEADER_SIZE; offset = 0; + sdres_tlvs_len = 0; while (offset < tlv_len) { type = tlv[0]; @@ -1167,14 +1254,14 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, !strncmp(service_name, "urn:nfc:sn:sdp", service_name_len)) { sap = 1; - goto send_snl; + goto add_snl; } llcp_sock = nfc_llcp_sock_from_sn(local, service_name, service_name_len); if (!llcp_sock) { sap = 0; - goto send_snl; + goto add_snl; } /* @@ -1191,7 +1278,7 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, if (sap == LLCP_SAP_MAX) { sap = 0; - goto send_snl; + goto add_snl; } client_count = @@ -1208,8 +1295,37 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, pr_debug("%p %d\n", llcp_sock, sap); -send_snl: - nfc_llcp_send_snl(local, tid, sap); +add_snl: + sdp = nfc_llcp_build_sdres_tlv(tid, sap); + if (sdp == NULL) + goto exit; + + sdres_tlvs_len += sdp->tlv_len; + hlist_add_head(&sdp->node, &llc_sdres_list); + break; + + case LLCP_TLV_SDRES: + mutex_lock(&local->sdreq_lock); + + pr_debug("LLCP_TLV_SDRES: searching tid %d\n", tlv[2]); + + hlist_for_each_entry(sdp, &local->pending_sdreqs, node) { + if 
(sdp->tid != tlv[2]) + continue; + + sdp->sap = tlv[3]; + + pr_debug("Found: uri=%s, sap=%d\n", + sdp->uri, sdp->sap); + + hlist_del(&sdp->node); + + hlist_add_head(&sdp->node, &nl_sdres_list); + + break; + } + + mutex_unlock(&local->sdreq_lock); break; default: @@ -1220,21 +1336,63 @@ send_snl: offset += length + 2; tlv += length + 2; } + +exit: + if (!hlist_empty(&nl_sdres_list)) + nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); + + if (!hlist_empty(&llc_sdres_list)) + nfc_llcp_send_snl_sdres(local, &llc_sdres_list, sdres_tlvs_len); } -static void nfc_llcp_rx_work(struct work_struct *work) +static void nfc_llcp_recv_agf(struct nfc_llcp_local *local, struct sk_buff *skb) { - struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, - rx_work); - u8 dsap, ssap, ptype; - struct sk_buff *skb; + u8 ptype; + u16 pdu_len; + struct sk_buff *new_skb; - skb = local->rx_pending; - if (skb == NULL) { - pr_debug("No pending SKB\n"); + if (skb->len <= LLCP_HEADER_SIZE) { + pr_err("Malformed AGF PDU\n"); return; } + skb_pull(skb, LLCP_HEADER_SIZE); + + while (skb->len > LLCP_AGF_PDU_HEADER_SIZE) { + pdu_len = skb->data[0] << 8 | skb->data[1]; + + skb_pull(skb, LLCP_AGF_PDU_HEADER_SIZE); + + if (pdu_len < LLCP_HEADER_SIZE || pdu_len > skb->len) { + pr_err("Malformed AGF PDU\n"); + return; + } + + ptype = nfc_llcp_ptype(skb); + + if (ptype == LLCP_PDU_SYMM || ptype == LLCP_PDU_AGF) + goto next; + + new_skb = nfc_alloc_recv_skb(pdu_len, GFP_KERNEL); + if (new_skb == NULL) { + pr_err("Could not allocate PDU\n"); + return; + } + + memcpy(skb_put(new_skb, pdu_len), skb->data, pdu_len); + + nfc_llcp_rx_skb(local, new_skb); + + kfree_skb(new_skb); +next: + skb_pull(skb, pdu_len); + } +} + +static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb) +{ + u8 dsap, ssap, ptype; + ptype = nfc_llcp_ptype(skb); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); @@ -1245,8 +1403,6 @@ static void nfc_llcp_rx_work(struct work_struct *work) print_hex_dump(KERN_DEBUG, "LLCP Rx: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, skb->len, true); - nfc_llcp_send_to_raw_sock(local, skb, NFC_LLCP_DIRECTION_RX); - switch (ptype) { case LLCP_PDU_SYMM: pr_debug("SYMM\n"); @@ -1289,13 +1445,43 @@ static void nfc_llcp_rx_work(struct work_struct *work) nfc_llcp_recv_hdlc(local, skb); break; + case LLCP_PDU_AGF: + pr_debug("AGF frame\n"); + nfc_llcp_recv_agf(local, skb); + break; } +} + +static void nfc_llcp_rx_work(struct work_struct *work) +{ + struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, + rx_work); + struct sk_buff *skb; + + skb = local->rx_pending; + if (skb == NULL) { + pr_debug("No pending SKB\n"); + return; + } + + __net_timestamp(skb); + + nfc_llcp_send_to_raw_sock(local, skb, NFC_LLCP_DIRECTION_RX); + + nfc_llcp_rx_skb(local, skb); schedule_work(&local->tx_work); kfree_skb(local->rx_pending); local->rx_pending = NULL; } +static void __nfc_llcp_recv(struct nfc_llcp_local *local, struct sk_buff *skb) +{ + local->rx_pending = skb; + del_timer(&local->link_timer); + schedule_work(&local->rx_work); +} + void nfc_llcp_recv(void *data, struct sk_buff *skb, int err) { struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; @@ -1306,9 +1492,7 @@ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err) return; } - local->rx_pending = skb_get(skb); - del_timer(&local->link_timer); - schedule_work(&local->rx_work); + __nfc_llcp_recv(local, skb); } int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb) @@ -1319,9 +1503,7 @@ int 
nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb) if (local == NULL) return -ENODEV; - local->rx_pending = skb_get(skb); - del_timer(&local->link_timer); - schedule_work(&local->rx_work); + __nfc_llcp_recv(local, skb); return 0; } @@ -1334,8 +1516,11 @@ void nfc_llcp_mac_is_down(struct nfc_dev *dev) if (local == NULL) return; + local->remote_miu = LLCP_DEFAULT_MIU; + local->remote_lto = LLCP_DEFAULT_LTO; + /* Close and purge all existing sockets */ - nfc_llcp_socket_release(local, true); + nfc_llcp_socket_release(local, true, 0); } void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, @@ -1400,6 +1585,13 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) local->remote_miu = LLCP_DEFAULT_MIU; local->remote_lto = LLCP_DEFAULT_LTO; + mutex_init(&local->sdreq_lock); + INIT_HLIST_HEAD(&local->pending_sdreqs); + init_timer(&local->sdreq_timer); + local->sdreq_timer.data = (unsigned long) local; + local->sdreq_timer.function = nfc_llcp_sdreq_timer; + INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); + list_add(&local->list, &llcp_devices); return 0; @@ -1414,6 +1606,8 @@ void nfc_llcp_unregister_device(struct nfc_dev *dev) return; } + local_cleanup(local); + nfc_llcp_local_put(local); } diff --git a/net/nfc/llcp/sock.c b/net/nfc/llcp_sock.c index fea22eb41b82..380253eccb74 100644 --- a/net/nfc/llcp/sock.c +++ b/net/nfc/llcp_sock.c @@ -24,7 +24,7 @@ #include <linux/module.h> #include <linux/nfc.h> -#include "../nfc.h" +#include "nfc.h" #include "llcp.h" static int sock_wait_state(struct sock *sk, int state, unsigned long timeo) @@ -223,6 +223,156 @@ error: return ret; } +static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); + u32 opt; + int err = 0; + + pr_debug("%p optname %d\n", sk, optname); + + if (level != SOL_NFC) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case NFC_LLCP_RW: + if (sk->sk_state == LLCP_CONNECTED || + sk->sk_state == LLCP_BOUND || + sk->sk_state == LLCP_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + if (opt > LLCP_MAX_RW) { + err = -EINVAL; + break; + } + + llcp_sock->rw = (u8) opt; + + break; + + case NFC_LLCP_MIUX: + if (sk->sk_state == LLCP_CONNECTED || + sk->sk_state == LLCP_BOUND || + sk->sk_state == LLCP_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + if (opt > LLCP_MAX_MIUX) { + err = -EINVAL; + break; + } + + llcp_sock->miux = cpu_to_be16((u16) opt); + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + + pr_debug("%p rw %d miux %d\n", llcp_sock, + llcp_sock->rw, llcp_sock->miux); + + return err; +} + +static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct nfc_llcp_local *local; + struct sock *sk = sock->sk; + struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); + int len, err = 0; + u16 miux, remote_miu; + u8 rw; + + pr_debug("%p optname %d\n", sk, optname); + + if (level != SOL_NFC) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + local = llcp_sock->local; + if (!local) + return -ENODEV; + + len = min_t(u32, len, sizeof(u32)); + + lock_sock(sk); + + switch (optname) { + case NFC_LLCP_RW: + rw = llcp_sock->rw > LLCP_MAX_RW ? 
local->rw : llcp_sock->rw; + if (put_user(rw, (u32 __user *) optval)) + err = -EFAULT; + + break; + + case NFC_LLCP_MIUX: + miux = be16_to_cpu(llcp_sock->miux) > LLCP_MAX_MIUX ? + be16_to_cpu(local->miux) : be16_to_cpu(llcp_sock->miux); + + if (put_user(miux, (u32 __user *) optval)) + err = -EFAULT; + + break; + + case NFC_LLCP_REMOTE_MIU: + remote_miu = llcp_sock->remote_miu > LLCP_MAX_MIU ? + local->remote_miu : llcp_sock->remote_miu; + + if (put_user(remote_miu, (u32 __user *) optval)) + err = -EFAULT; + + break; + + case NFC_LLCP_REMOTE_LTO: + if (put_user(local->remote_lto / 10, (u32 __user *) optval)) + err = -EFAULT; + + break; + + case NFC_LLCP_REMOTE_RW: + if (put_user(llcp_sock->remote_rw, (u32 __user *) optval)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + + if (put_user(len, optlen)) + return -EFAULT; + + return err; +} + void nfc_llcp_accept_unlink(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -270,7 +420,9 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent, } if (sk->sk_state == LLCP_CONNECTED || !newsock) { - nfc_llcp_accept_unlink(sk); + list_del_init(&lsk->accept_queue); + sock_put(sk); + if (newsock) sock_graft(sk, newsock); @@ -278,6 +430,8 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent, pr_debug("Returning sk state %d\n", sk->sk_state); + sk_acceptq_removed(parent); + return sk; } @@ -354,12 +508,13 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, pr_debug("%p %d %d %d\n", sk, llcp_sock->target_idx, llcp_sock->dsap, llcp_sock->ssap); - uaddr->sa_family = AF_NFC; - + memset(llcp_addr, 0, sizeof(*llcp_addr)); *len = sizeof(struct sockaddr_nfc_llcp); + llcp_addr->sa_family = AF_NFC; llcp_addr->dev_idx = llcp_sock->dev->idx; llcp_addr->target_idx = llcp_sock->target_idx; + llcp_addr->nfc_protocol = llcp_sock->nfc_protocol; llcp_addr->dsap = llcp_sock->dsap; llcp_addr->ssap = llcp_sock->ssap; llcp_addr->service_name_len = llcp_sock->service_name_len; @@ -401,7 +556,8 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock, return llcp_accept_poll(sk); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= POLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? 
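
/*
 * Editorial sketch, not part of the patch: how the socket options added
 * above look from userspace. A minimal sketch assuming kernel headers
 * that provide SOL_NFC and the NFC_LLCP_* option names; all values are
 * passed as u32, and RW/MIUX changes are only accepted before the
 * socket is bound, listening or connected.
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/socket.h>
#include <linux/nfc.h>

static int tune_llcp_sock(int fd)
{
	uint32_t rw = 4;	/* receive window, at most LLCP_MAX_RW (15) */
	uint32_t miux = 1024;	/* MIU extension, at most LLCP_MAX_MIUX (2047) */

	if (setsockopt(fd, SOL_NFC, NFC_LLCP_RW, &rw, sizeof(rw)) < 0)
		return -1;
	return setsockopt(fd, SOL_NFC, NFC_LLCP_MIUX, &miux, sizeof(miux));
}

static void show_remote_params(int fd)
{
	uint32_t v;
	socklen_t len = sizeof(v);

	if (getsockopt(fd, SOL_NFC, NFC_LLCP_REMOTE_MIU, &v, &len) == 0)
		printf("remote MIU %u\n", v);
	if (getsockopt(fd, SOL_NFC, NFC_LLCP_REMOTE_RW, &v, &len) == 0)
		printf("remote RW %u\n", v);
}
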
POLLPRI : 0); if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; @@ -462,8 +618,6 @@ static int llcp_sock_release(struct socket *sock) nfc_llcp_accept_unlink(accept_sk); release_sock(accept_sk); - - sock_orphan(accept_sk); } } @@ -541,7 +695,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->dev = dev; llcp_sock->local = nfc_llcp_local_get(local); - llcp_sock->miu = llcp_sock->local->remote_miu; + llcp_sock->remote_miu = llcp_sock->local->remote_miu; llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -ENOMEM; @@ -644,6 +798,8 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, pr_debug("%p %zu\n", sk, len); + msg->msg_namelen = 0; + lock_sock(sk); if (sk->sk_state == LLCP_CLOSED && @@ -672,25 +828,28 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = min_t(unsigned int, rlen, len); cskb = skb; - if (memcpy_toiovec(msg->msg_iov, cskb->data, copied)) { + if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; } + sock_recv_timestamp(msg, sk, skb); + if (sk->sk_type == SOCK_DGRAM && msg->msg_name) { struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb); - struct sockaddr_nfc_llcp sockaddr; + struct sockaddr_nfc_llcp *sockaddr = + (struct sockaddr_nfc_llcp *) msg->msg_name; - pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap); + msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp); - sockaddr.sa_family = AF_NFC; - sockaddr.nfc_protocol = NFC_PROTO_NFC_DEP; - sockaddr.dsap = ui_cb->dsap; - sockaddr.ssap = ui_cb->ssap; + pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap); - memcpy(msg->msg_name, &sockaddr, sizeof(sockaddr)); - msg->msg_namelen = sizeof(sockaddr); + memset(sockaddr, 0, sizeof(*sockaddr)); + sockaddr->sa_family = AF_NFC; + sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP; + sockaddr->dsap = ui_cb->dsap; + sockaddr->ssap = ui_cb->ssap; } /* Mark read part of skb as used */ @@ -733,8 +892,8 @@ static const struct proto_ops llcp_sock_ops = { .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, + .setsockopt = nfc_llcp_setsockopt, + .getsockopt = nfc_llcp_getsockopt, .sendmsg = llcp_sock_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, @@ -798,15 +957,15 @@ struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp) llcp_sock->ssap = 0; llcp_sock->dsap = LLCP_SAP_SDP; - llcp_sock->rw = LLCP_DEFAULT_RW; - llcp_sock->miu = LLCP_DEFAULT_MIU; + llcp_sock->rw = LLCP_MAX_RW + 1; + llcp_sock->miux = cpu_to_be16(LLCP_MAX_MIUX + 1); llcp_sock->send_n = llcp_sock->send_ack_n = 0; llcp_sock->recv_n = llcp_sock->recv_ack_n = 0; llcp_sock->remote_ready = 1; llcp_sock->reserved_ssap = LLCP_SAP_MAX; + nfc_llcp_socket_remote_param_init(llcp_sock); skb_queue_head_init(&llcp_sock->tx_queue); skb_queue_head_init(&llcp_sock->tx_pending_queue); - skb_queue_head_init(&llcp_sock->tx_backlog_queue); INIT_LIST_HEAD(&llcp_sock->accept_queue); if (sock != NULL) @@ -821,7 +980,6 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); - skb_queue_purge(&sock->tx_backlog_queue); list_del_init(&sock->accept_queue); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 5f98dc1bf039..48ada0ec749e 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c 
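
/*
 * Editorial sketch, not part of the patch: receiving an LLCP datagram
 * with the peer's SAPs filled into msg_name, matching the recvmsg()
 * changes above. A minimal sketch assuming fd is a bound AF_NFC
 * SOCK_DGRAM socket and <linux/nfc.h> declares sockaddr_nfc_llcp.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/nfc.h>

static void recv_llcp_dgram(int fd)
{
	char buf[1024];
	struct sockaddr_nfc_llcp from;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_name = &from,
		.msg_namelen = sizeof(from),
		.msg_iov = &iov,
		.msg_iovlen = 1,
	};
	ssize_t n = recvmsg(fd, &msg, 0);

	if (n >= 0 && msg.msg_namelen == sizeof(from))
		printf("%zd bytes from ssap %u to dsap %u\n",
		       n, (unsigned)from.ssap, (unsigned)from.dsap);
}
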
@@ -658,6 +658,7 @@ static struct nfc_ops nci_nfc_ops = { */ struct nci_dev *nci_allocate_device(struct nci_ops *ops, __u32 supported_protocols, + __u32 supported_se, int tx_headroom, int tx_tailroom) { struct nci_dev *ndev; @@ -680,6 +681,7 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, ndev->nfc_dev = nfc_allocate_device(&nci_nfc_ops, supported_protocols, + supported_se, tx_headroom + NCI_DATA_HDR_SIZE, tx_tailroom); if (!ndev->nfc_dev) diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 3568ae16786d..f0c4d61f37c0 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -28,8 +28,7 @@ #include <linux/slab.h> #include "nfc.h" - -#include "llcp/llcp.h" +#include "llcp.h" static struct genl_multicast_group nfc_genl_event_mcgrp = { .name = NFC_GENL_MCAST_EVENT_NAME, @@ -53,6 +52,15 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 }, + [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 }, + [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, + [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, + [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { + [NFC_SDP_ATTR_URI] = { .type = NLA_STRING }, + [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, @@ -348,6 +356,74 @@ free_msg: return -EMSGSIZE; } +int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) +{ + struct sk_buff *msg; + struct nlattr *sdp_attr, *uri_attr; + struct nfc_llcp_sdp_tlv *sdres; + struct hlist_node *n; + void *hdr; + int rc = -EMSGSIZE; + int i; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, + NFC_EVENT_LLC_SDRES); + if (!hdr) + goto free_msg; + + if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) + goto nla_put_failure; + + sdp_attr = nla_nest_start(msg, NFC_ATTR_LLC_SDP); + if (sdp_attr == NULL) { + rc = -ENOMEM; + goto nla_put_failure; + } + + i = 1; + hlist_for_each_entry_safe(sdres, n, sdres_list, node) { + pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); + + uri_attr = nla_nest_start(msg, i++); + if (uri_attr == NULL) { + rc = -ENOMEM; + goto nla_put_failure; + } + + if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap)) + goto nla_put_failure; + + if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri)) + goto nla_put_failure; + + nla_nest_end(msg, uri_attr); + + hlist_del(&sdres->node); + + nfc_llcp_free_sdp_tlv(sdres); + } + + nla_nest_end(msg, sdp_attr); + + genlmsg_end(msg, hdr); + + return genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_ATOMIC); + +nla_put_failure: + genlmsg_cancel(msg, hdr); + +free_msg: + nlmsg_free(msg); + + nfc_llcp_free_sdp_tlv_list(sdres_list); + + return rc; +} + static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, @@ -366,6 +442,7 @@ static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || + nla_put_u32(msg, NFC_ATTR_SE, dev->supported_se) || nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) goto nla_put_failure; @@ -858,6 
+935,96 @@ exit: return rc; } +static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) +{ + struct nfc_dev *dev; + struct nfc_llcp_local *local; + struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1]; + u32 idx; + u8 tid; + char *uri; + int rc = 0, rem; + size_t uri_len, tlvs_len; + struct hlist_head sdreq_list; + struct nfc_llcp_sdp_tlv *sdreq; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || + !info->attrs[NFC_ATTR_LLC_SDP]) + return -EINVAL; + + idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + + dev = nfc_get_device(idx); + if (!dev) { + rc = -ENODEV; + goto exit; + } + + device_lock(&dev->dev); + + if (dev->dep_link_up == false) { + rc = -ENOLINK; + goto exit; + } + + local = nfc_llcp_find_local(dev); + if (!local) { + nfc_put_device(dev); + rc = -ENODEV; + goto exit; + } + + INIT_HLIST_HEAD(&sdreq_list); + + tlvs_len = 0; + + nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { + rc = nla_parse_nested(sdp_attrs, NFC_SDP_ATTR_MAX, attr, + nfc_sdp_genl_policy); + + if (rc != 0) { + rc = -EINVAL; + goto exit; + } + + if (!sdp_attrs[NFC_SDP_ATTR_URI]) + continue; + + uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]); + if (uri_len == 0) + continue; + + uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); + if (uri == NULL || *uri == 0) + continue; + + tid = local->sdreq_next_tid++; + + sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); + if (sdreq == NULL) { + rc = -ENOMEM; + goto exit; + } + + tlvs_len += sdreq->tlv_len; + + hlist_add_head(&sdreq->node, &sdreq_list); + } + + if (hlist_empty(&sdreq_list)) { + rc = -EINVAL; + goto exit; + } + + rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); +exit: + device_unlock(&dev->dev); + + nfc_put_device(dev); + + return rc; +} + static struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, @@ -912,6 +1079,11 @@ static struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_llc_set_params, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_LLC_SDREQ, + .doit = nfc_genl_llc_sdreq, + .policy = nfc_genl_policy, + }, }; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 87d914d2876a..afa1f84ba040 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -46,7 +46,7 @@ struct nfc_rawsock { #define to_rawsock_sk(_tx_work) \ ((struct sock *) container_of(_tx_work, struct nfc_rawsock, tx_work)) -#ifdef CONFIG_NFC_LLCP +struct nfc_llcp_sdp_tlv; void nfc_llcp_mac_is_down(struct nfc_dev *dev); void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, @@ -59,60 +59,8 @@ int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb); struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev); int __init nfc_llcp_init(void); void nfc_llcp_exit(void); - -#else - -static inline void nfc_llcp_mac_is_down(struct nfc_dev *dev) -{ -} - -static inline void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, - u8 comm_mode, u8 rf_mode) -{ -} - -static inline int nfc_llcp_register_device(struct nfc_dev *dev) -{ - return 0; -} - -static inline void nfc_llcp_unregister_device(struct nfc_dev *dev) -{ -} - -static inline int nfc_llcp_set_remote_gb(struct nfc_dev *dev, - u8 *gb, u8 gb_len) -{ - return 0; -} - -static inline u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *gb_len) -{ - *gb_len = 0; - return NULL; -} - -static inline int nfc_llcp_data_received(struct nfc_dev *dev, - struct sk_buff *skb) -{ - return 0; -} - -static inline struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev) -{ - return NULL; -} - -static inline int nfc_llcp_init(void) -{ - return 0; -} - -static inline void nfc_llcp_exit(void) 
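
/*
 * Editorial sketch, not part of the patch: the shape of a userspace
 * service-discovery request that the nfc_genl_llc_sdreq() handler above
 * would parse. A minimal sketch assuming libnl-3 and <linux/nfc.h>;
 * socket setup and "nfc" family resolution are assumed already done.
 */
#include <stdint.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <linux/nfc.h>

static int send_sdreq(struct nl_sock *sk, int family, uint32_t dev_idx,
		      const char *uri)
{
	struct nl_msg *msg;
	struct nlattr *sdp, *entry;
	int err;

	msg = nlmsg_alloc();
	if (!msg)
		return -1;

	if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
			 NFC_CMD_LLC_SDREQ, 0))
		goto err;

	nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev_idx);

	/* One nested entry per URI; the kernel walks these with
	 * nla_for_each_nested() and nla_parse_nested(). */
	sdp = nla_nest_start(msg, NFC_ATTR_LLC_SDP);
	entry = nla_nest_start(msg, 1);
	if (!sdp || !entry)
		goto err;
	nla_put_string(msg, NFC_SDP_ATTR_URI, uri);
	nla_nest_end(msg, entry);
	nla_nest_end(msg, sdp);

	err = nl_send_auto(sk, msg);
	nlmsg_free(msg);
	return err < 0 ? -1 : 0;
err:
	nlmsg_free(msg);
	return -1;
}
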
-{ -} - -#endif +void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp); +void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head); int __init rawsock_init(void); void rawsock_exit(void); @@ -144,6 +92,8 @@ int nfc_genl_dep_link_down_event(struct nfc_dev *dev); int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol); int nfc_genl_tm_deactivated(struct nfc_dev *dev); +int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list); + struct nfc_dev *nfc_get_device(unsigned int idx); static inline void nfc_put_device(struct nfc_dev *dev) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ac2defeeba83..894b6cbdd929 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -58,7 +58,7 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_sub(skb->csum, csum_partial(skb->data - + ETH_HLEN, VLAN_HLEN, 0)); + + (2 * ETH_ALEN), VLAN_HLEN, 0)); vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); *current_tci = vhdr->h_vlan_TCI; @@ -98,7 +98,7 @@ static int pop_vlan(struct sk_buff *skb) if (unlikely(err)) return err; - __vlan_hwaccel_put_tag(skb, ntohs(tci)); + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci)); return 0; } @@ -110,15 +110,15 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla /* push down current VLAN tag */ current_tag = vlan_tx_tag_get(skb); - if (!__vlan_put_tag(skb, current_tag)) + if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) return -ENOMEM; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(skb->data - + ETH_HLEN, VLAN_HLEN, 0)); + + (2 * ETH_ALEN), VLAN_HLEN, 0)); } - __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); + __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); return 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f996db343247..d12d6b8b5e8b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -44,6 +44,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/inetdevice.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> @@ -55,39 +56,61 @@ #include "datapath.h" #include "flow.h" #include "vport-internal_dev.h" +#include "vport-netdev.h" -/** - * struct ovs_net - Per net-namespace data for ovs. - * @dps: List of datapaths to enable dumping them all out. - * Protected by genl_mutex. - */ -struct ovs_net { - struct list_head dps; -}; - -static int ovs_net_id __read_mostly; #define REHASH_FLOW_INTERVAL (10 * 60 * HZ) static void rehash_flow_table(struct work_struct *work); static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table); +int ovs_net_id __read_mostly; + +static void ovs_notify(struct sk_buff *skb, struct genl_info *info, + struct genl_multicast_group *grp) +{ + genl_notify(skb, genl_info_net(info), info->snd_portid, + grp->id, info->nlhdr, GFP_KERNEL); +} + /** * DOC: Locking: * - * Writes to device state (add/remove datapath, port, set operations on vports, - * etc.) are protected by RTNL. - * - * Writes to other state (flow table modifications, set miscellaneous datapath - * parameters, etc.) are protected by genl_mutex. The RTNL lock nests inside - * genl_mutex. + * All writes e.g. 
Writes to device state (add/remove datapath, port, set + * operations on vports, etc.), Writes to other state (flow table + * modifications, set miscellaneous datapath parameters, etc.) are protected + * by ovs_lock. * * Reads are protected by RCU. * * There are a few special cases (mostly stats) that have their own * synchronization but they nest under all of above and don't interact with * each other. + * + * The RTNL lock nests inside ovs_mutex. */ +static DEFINE_MUTEX(ovs_mutex); + +void ovs_lock(void) +{ + mutex_lock(&ovs_mutex); +} + +void ovs_unlock(void) +{ + mutex_unlock(&ovs_mutex); +} + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void) +{ + if (debug_locks) + return lockdep_is_held(&ovs_mutex); + else + return 1; +} +#endif + static struct vport *new_vport(const struct vport_parms *); static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *, const struct dp_upcall_info *); @@ -95,7 +118,7 @@ static int queue_userspace_packet(struct net *, int dp_ifindex, struct sk_buff *, const struct dp_upcall_info *); -/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */ +/* Must be called with rcu_read_lock or ovs_mutex. */ static struct datapath *get_dp(struct net *net, int dp_ifindex) { struct datapath *dp = NULL; @@ -113,10 +136,10 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) return dp; } -/* Must be called with rcu_read_lock or RTNL lock. */ +/* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { - struct vport *vport = ovs_vport_rtnl_rcu(dp, OVSP_LOCAL); + struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); } @@ -129,7 +152,7 @@ static int get_dpifindex(struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = local->ops->get_ifindex(local); + ifindex = netdev_vport_priv(local)->dev->ifindex; else ifindex = 0; @@ -158,18 +181,17 @@ static struct hlist_head *vport_hash_bucket(const struct datapath *dp, struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) { struct vport *vport; - struct hlist_node *n; struct hlist_head *head; head = vport_hash_bucket(dp, port_no); - hlist_for_each_entry_rcu(vport, n, head, dp_hash_node) { + hlist_for_each_entry_rcu(vport, head, dp_hash_node) { if (vport->port_no == port_no) return vport; } return NULL; } -/* Called with RTNL lock and genl_lock. */ +/* Called with ovs_mutex. */ static struct vport *new_vport(const struct vport_parms *parms) { struct vport *vport; @@ -181,14 +203,12 @@ static struct vport *new_vport(const struct vport_parms *parms) hlist_add_head_rcu(&vport->dp_hash_node, head); } - return vport; } -/* Called with RTNL lock. */ void ovs_dp_detach_port(struct vport *p) { - ASSERT_RTNL(); + ASSERT_OVSL(); /* First drop references to device. 
*/ hlist_del_rcu(&p->dp_hash_node); @@ -251,7 +271,8 @@ static struct genl_family dp_packet_genl_family = { .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, .maxattr = OVS_PACKET_ATTR_MAX, - .netnsok = true + .netnsok = true, + .parallel_ops = true, }; int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, @@ -301,7 +322,7 @@ static int queue_gso_packets(struct net *net, int dp_ifindex, struct sk_buff *segs, *nskb; int err; - segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM); + segs = __skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM, false); if (IS_ERR(segs)) return PTR_ERR(segs); @@ -338,6 +359,35 @@ static int queue_gso_packets(struct net *net, int dp_ifindex, return err; } +static size_t key_attr_size(void) +{ + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_8021Q */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + +static size_t upcall_msg_size(const struct sk_buff *skb, + const struct nlattr *userdata) +{ + size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + /* OVS_PACKET_ATTR_USERDATA */ + if (userdata) + size += NLA_ALIGN(userdata->nla_len); + + return size; +} + static int queue_userspace_packet(struct net *net, int dp_ifindex, struct sk_buff *skb, const struct dp_upcall_info *upcall_info) @@ -346,7 +396,6 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, struct sk_buff *nskb = NULL; struct sk_buff *user_skb; /* to be queued to userspace */ struct nlattr *nla; - unsigned int len; int err; if (vlan_tx_tag_present(skb)) { @@ -354,7 +403,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, if (!nskb) return -ENOMEM; - nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb)); + nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb)); if (!nskb) return -ENOMEM; @@ -367,13 +416,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, goto out; } - len = sizeof(struct ovs_header); - len += nla_total_size(skb->len); - len += nla_total_size(FLOW_BUFSIZE); - if (upcall_info->cmd == OVS_PACKET_CMD_ACTION) - len += nla_total_size(8); - - user_skb = genlmsg_new(len, GFP_ATOMIC); + user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -388,13 +431,15 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, nla_nest_end(user_skb, nla); if (upcall_info->userdata) - nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA, - nla_get_u64(upcall_info->userdata)); + __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, + nla_len(upcall_info->userdata), + nla_data(upcall_info->userdata)); nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); skb_copy_and_csum_dev(skb, nla_data(nla)); + genlmsg_end(user_skb, upcall); err = genlmsg_unicast(net, user_skb, upcall_info->portid); out: @@ -402,13 +447,13 @@ out: return err; } -/* Called with genl_mutex. */ +/* Called with ovs_mutex. 
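
/*
 * Editorial sketch, not part of the patch: upcall_msg_size() and
 * key_attr_size() above precompute the genl message size as a sum of
 * netlink attribute totals. Each attribute occupies a 4-byte header
 * plus its payload, rounded up to a 4-byte boundary. A standalone
 * model of that arithmetic (macro values mirror the uapi definitions):
 */
#include <stdio.h>

#define NLA_ALIGNTO	4
#define NLA_ALIGN(len)	(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
#define NLA_HDRLEN	4	/* struct nlattr: u16 len + u16 type */

static int nla_total_size_model(int payload)
{
	return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
	/* e.g. the 40-byte OVS_KEY_ATTR_IPV6 payload occupies 44 bytes */
	printf("%d\n", nla_total_size_model(40));
	return 0;
}
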
*/ static int flush_flows(struct datapath *dp) { struct flow_table *old_table; struct flow_table *new_table; - old_table = genl_dereference(dp->table); + old_table = ovsl_dereference(dp->table); new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS); if (!new_table) return -ENOMEM; @@ -544,7 +589,7 @@ static int validate_userspace(const struct nlattr *attr) { static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, - [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, }; struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; @@ -661,8 +706,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || - !a[OVS_PACKET_ATTR_ACTIONS] || - nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN) + !a[OVS_PACKET_ATTR_ACTIONS]) goto err; len = nla_len(a[OVS_PACKET_ATTR_PACKET]); @@ -672,7 +716,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err; skb_reserve(packet, NET_IP_ALIGN); - memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len); + nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); skb_reset_mac_header(packet); eth = eth_hdr(packet); @@ -680,7 +724,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) /* Normally, setting the skb 'protocol' field would be handled by a * call to eth_type_trans(), but it assumes there's a sending * device, which we may not have. */ - if (ntohs(eth->h_proto) >= 1536) + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) packet->protocol = eth->h_proto; else packet->protocol = htons(ETH_P_802_2); @@ -743,7 +787,7 @@ err: } static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { - [OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC }, + [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, }; @@ -759,7 +803,7 @@ static struct genl_ops dp_packet_genl_ops[] = { static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) { int i; - struct flow_table *table = genl_dereference(dp->table); + struct flow_table *table = ovsl_dereference(dp->table); stats->n_flows = ovs_flow_tbl_count(table); @@ -794,14 +838,25 @@ static struct genl_family dp_flow_genl_family = { .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, .maxattr = OVS_FLOW_ATTR_MAX, - .netnsok = true + .netnsok = true, + .parallel_ops = true, }; static struct genl_multicast_group ovs_dp_flow_multicast_group = { .name = OVS_FLOW_MCGROUP }; -/* Called with genl_lock. */ +static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) +{ + return NLMSG_ALIGN(sizeof(struct ovs_header)) + + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ + + nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */ +} + +/* Called with ovs_mutex. 
*/ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) @@ -815,8 +870,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, u8 tcp_flags; int err; - sf_acts = rcu_dereference_protected(flow->sf_acts, - lockdep_genl_is_held()); + sf_acts = ovsl_dereference(flow->sf_acts); ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); if (!ovs_header) @@ -879,25 +933,10 @@ error: static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow) { const struct sw_flow_actions *sf_acts; - int len; - - sf_acts = rcu_dereference_protected(flow->sf_acts, - lockdep_genl_is_held()); - - /* OVS_FLOW_ATTR_KEY */ - len = nla_total_size(FLOW_BUFSIZE); - /* OVS_FLOW_ATTR_ACTIONS */ - len += nla_total_size(sf_acts->actions_len); - /* OVS_FLOW_ATTR_STATS */ - len += nla_total_size(sizeof(struct ovs_flow_stats)); - /* OVS_FLOW_ATTR_TCP_FLAGS */ - len += nla_total_size(1); - /* OVS_FLOW_ATTR_USED */ - len += nla_total_size(8); - len += NLMSG_ALIGN(sizeof(struct ovs_header)); + sf_acts = ovsl_dereference(flow->sf_acts); - return genlmsg_new(len, GFP_KERNEL); + return genlmsg_new(ovs_flow_cmd_msg_size(sf_acts), GFP_KERNEL); } static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow, @@ -946,12 +985,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto error; } + ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); error = -ENODEV; if (!dp) - goto error; + goto err_unlock_ovs; - table = genl_dereference(dp->table); + table = ovsl_dereference(dp->table); flow = ovs_flow_tbl_lookup(table, &key, key_len); if (!flow) { struct sw_flow_actions *acts; @@ -959,7 +999,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) - goto error; + goto err_unlock_ovs; /* Expand table, if necessary, to make room. */ if (ovs_flow_tbl_need_to_expand(table)) { @@ -969,7 +1009,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) if (!IS_ERR(new_table)) { rcu_assign_pointer(dp->table, new_table); ovs_flow_tbl_deferred_destroy(table); - table = genl_dereference(dp->table); + table = ovsl_dereference(dp->table); } } @@ -977,7 +1017,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) flow = ovs_flow_alloc(); if (IS_ERR(flow)) { error = PTR_ERR(flow); - goto error; + goto err_unlock_ovs; } flow->key = key; clear_stats(flow); @@ -1010,11 +1050,10 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) error = -EEXIST; if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW && info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) - goto error; + goto err_unlock_ovs; /* Update actions. 
*/ - old_acts = rcu_dereference_protected(flow->sf_acts, - lockdep_genl_is_held()); + old_acts = ovsl_dereference(flow->sf_acts); acts_attrs = a[OVS_FLOW_ATTR_ACTIONS]; if (acts_attrs && (old_acts->actions_len != nla_len(acts_attrs) || @@ -1025,7 +1064,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) new_acts = ovs_flow_actions_alloc(acts_attrs); error = PTR_ERR(new_acts); if (IS_ERR(new_acts)) - goto error; + goto err_unlock_ovs; rcu_assign_pointer(flow->sf_acts, new_acts); ovs_flow_deferred_free_acts(old_acts); @@ -1041,11 +1080,10 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&flow->lock); } } + ovs_unlock(); if (!IS_ERR(reply)) - genl_notify(reply, genl_info_net(info), info->snd_portid, - ovs_dp_flow_multicast_group.id, info->nlhdr, - GFP_KERNEL); + ovs_notify(reply, info, &ovs_dp_flow_multicast_group); else netlink_set_err(sock_net(skb->sk)->genl_sock, 0, ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); @@ -1053,6 +1091,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) error_free_flow: ovs_flow_free(flow); +err_unlock_ovs: + ovs_unlock(); error: return error; } @@ -1075,21 +1115,32 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) if (err) return err; + ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) - return -ENODEV; + if (!dp) { + err = -ENODEV; + goto unlock; + } - table = genl_dereference(dp->table); + table = ovsl_dereference(dp->table); flow = ovs_flow_tbl_lookup(table, &key, key_len); - if (!flow) - return -ENOENT; + if (!flow) { + err = -ENOENT; + goto unlock; + } reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, info->snd_seq, OVS_FLOW_CMD_NEW); - if (IS_ERR(reply)) - return PTR_ERR(reply); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto unlock; + } + ovs_unlock(); return genlmsg_reply(reply, info); +unlock: + ovs_unlock(); + return err; } static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) @@ -1104,25 +1155,33 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) int err; int key_len; + ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) - return -ENODEV; - - if (!a[OVS_FLOW_ATTR_KEY]) - return flush_flows(dp); + if (!dp) { + err = -ENODEV; + goto unlock; + } + if (!a[OVS_FLOW_ATTR_KEY]) { + err = flush_flows(dp); + goto unlock; + } err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); if (err) - return err; + goto unlock; - table = genl_dereference(dp->table); + table = ovsl_dereference(dp->table); flow = ovs_flow_tbl_lookup(table, &key, key_len); - if (!flow) - return -ENOENT; + if (!flow) { + err = -ENOENT; + goto unlock; + } reply = ovs_flow_cmd_alloc_info(flow); - if (!reply) - return -ENOMEM; + if (!reply) { + err = -ENOMEM; + goto unlock; + } ovs_flow_tbl_remove(table, flow); @@ -1131,10 +1190,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) BUG_ON(err < 0); ovs_flow_deferred_free(flow); + ovs_unlock(); - genl_notify(reply, genl_info_net(info), info->snd_portid, - ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL); + ovs_notify(reply, info, &ovs_dp_flow_multicast_group); return 0; +unlock: + ovs_unlock(); + return err; } static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) @@ -1143,11 +1205,14 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) struct datapath *dp; struct flow_table *table; + ovs_lock(); 
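
/*
 * Editorial sketch, not part of the patch: the ovsl_dereference() /
 * lockdep_ovsl_is_held() pattern used throughout these hunks, modelled
 * in userspace with a pthread mutex and an assertion standing in for
 * lockdep. Names are illustrative.
 */
#include <assert.h>
#include <pthread.h>

static pthread_mutex_t ovs_mutex_model = PTHREAD_MUTEX_INITIALIZER;
static int ovs_mutex_held;	/* stand-in for lockdep_is_held() */

static void ovs_lock_model(void)
{
	pthread_mutex_lock(&ovs_mutex_model);
	ovs_mutex_held = 1;
}

static void ovs_unlock_model(void)
{
	ovs_mutex_held = 0;
	pthread_mutex_unlock(&ovs_mutex_model);
}

/* A dereference that is only legal while the writer lock is held,
 * the moral equivalent of ovsl_dereference(p). */
#define ovsl_deref_model(p)	(assert(ovs_mutex_held), (p))
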
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) + if (!dp) { + ovs_unlock(); return -ENODEV; + } - table = genl_dereference(dp->table); + table = ovsl_dereference(dp->table); for (;;) { struct sw_flow *flow; @@ -1168,6 +1233,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0] = bucket; cb->args[1] = obj; } + ovs_unlock(); return skb->len; } @@ -1206,13 +1272,24 @@ static struct genl_family dp_datapath_genl_family = { .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, .maxattr = OVS_DP_ATTR_MAX, - .netnsok = true + .netnsok = true, + .parallel_ops = true, }; static struct genl_multicast_group ovs_dp_datapath_multicast_group = { .name = OVS_DATAPATH_MCGROUP }; +static size_t ovs_dp_cmd_msg_size(void) +{ + size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); + + msgsize += nla_total_size(IFNAMSIZ); + msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); + + return msgsize; +} + static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { @@ -1251,7 +1328,7 @@ static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 portid, struct sk_buff *skb; int retval; - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + skb = genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -1263,7 +1340,7 @@ static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 portid, return skb; } -/* Called with genl_mutex and optionally with RTNL lock also. */ +/* Called with ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) @@ -1297,12 +1374,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - rtnl_lock(); + ovs_lock(); err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_unlock_rtnl; + goto err_unlock_ovs; ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); @@ -1353,55 +1430,49 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); list_add_tail(&dp->list_node, &ovs_net->dps); - rtnl_unlock(); - genl_notify(reply, genl_info_net(info), info->snd_portid, - ovs_dp_datapath_multicast_group.id, info->nlhdr, - GFP_KERNEL); + ovs_unlock(); + + ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); return 0; err_destroy_local_port: - ovs_dp_detach_port(ovs_vport_rtnl(dp, OVSP_LOCAL)); + ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); err_destroy_ports_array: kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(genl_dereference(dp->table)); + ovs_flow_tbl_destroy(ovsl_dereference(dp->table)); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); -err_unlock_rtnl: - rtnl_unlock(); +err_unlock_ovs: + ovs_unlock(); err: return err; } -/* Called with genl_mutex. */ +/* Called with ovs_mutex. 
 */
 static void __dp_destroy(struct datapath *dp)
 {
 	int i;
 
-	rtnl_lock();
-
 	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
 		struct vport *vport;
-		struct hlist_node *node, *n;
+		struct hlist_node *n;
 
-		hlist_for_each_entry_safe(vport, node, n, &dp->ports[i], dp_hash_node)
+		hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
 			if (vport->port_no != OVSP_LOCAL)
 				ovs_dp_detach_port(vport);
 	}
 
 	list_del(&dp->list_node);
-	ovs_dp_detach_port(ovs_vport_rtnl(dp, OVSP_LOCAL));
 
-	/* rtnl_unlock() will wait until all the references to devices that
-	 * are pending unregistration have been dropped.  We do it here to
-	 * ensure that any internal devices (which contain DP pointers) are
-	 * fully destroyed before freeing the datapath.
+	/* OVSP_LOCAL is the datapath's internal port.  We need to make sure
+	 * that all ports in the datapath are destroyed before the datapath
+	 * itself is freed.
 	 */
-	rtnl_unlock();
+	ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
 
 	call_rcu(&dp->rcu, destroy_dp_rcu);
 }
 
@@ -1412,24 +1483,27 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
 {
 	struct sk_buff *reply;
 	struct datapath *dp;
 	int err;
 
+	ovs_lock();
 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
 	err = PTR_ERR(dp);
 	if (IS_ERR(dp))
-		return err;
+		goto unlock;
 
 	reply = ovs_dp_cmd_build_info(dp, info->snd_portid,
 				      info->snd_seq, OVS_DP_CMD_DEL);
 	err = PTR_ERR(reply);
 	if (IS_ERR(reply))
-		return err;
+		goto unlock;
 
 	__dp_destroy(dp);
+	ovs_unlock();
 
-	genl_notify(reply, genl_info_net(info), info->snd_portid,
-		    ovs_dp_datapath_multicast_group.id, info->nlhdr,
-		    GFP_KERNEL);
+	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
 
 	return 0;
+unlock:
+	ovs_unlock();
+	return err;
 }
 
 static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 {
@@ -1438,9 +1512,11 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	int err;
 
+	ovs_lock();
 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+	err = PTR_ERR(dp);
 	if (IS_ERR(dp))
-		return PTR_ERR(dp);
+		goto unlock;
 
 	reply = ovs_dp_cmd_build_info(dp, info->snd_portid,
 				      info->snd_seq, OVS_DP_CMD_NEW);
@@ -1448,31 +1524,45 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 		err = PTR_ERR(reply);
 		netlink_set_err(sock_net(skb->sk)->genl_sock, 0,
 				ovs_dp_datapath_multicast_group.id, err);
-		return 0;
+		err = 0;
+		goto unlock;
 	}
 
-	genl_notify(reply, genl_info_net(info), info->snd_portid,
-		    ovs_dp_datapath_multicast_group.id, info->nlhdr,
-		    GFP_KERNEL);
+	ovs_unlock();
+	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
 
 	return 0;
+unlock:
+	ovs_unlock();
+	return err;
 }
 
 static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 {
 	struct sk_buff *reply;
 	struct datapath *dp;
+	int err;
 
+	ovs_lock();
 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
-	if (IS_ERR(dp))
-		return PTR_ERR(dp);
+	if (IS_ERR(dp)) {
+		err = PTR_ERR(dp);
+		goto unlock;
+	}
 
 	reply = ovs_dp_cmd_build_info(dp, info->snd_portid,
 				      info->snd_seq, OVS_DP_CMD_NEW);
-	if (IS_ERR(reply))
-		return PTR_ERR(reply);
+	if (IS_ERR(reply)) {
+		err = PTR_ERR(reply);
+		goto unlock;
+	}
+	ovs_unlock();
 
 	return genlmsg_reply(reply, info);
+
+unlock:
+	ovs_unlock();
+	return err;
 }
 
 static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -1482,6 +1572,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	int skip = cb->args[0];
 	int i = 0;
 
+	ovs_lock();
 	list_for_each_entry(dp, &ovs_net->dps, list_node) {
 		if (i >= skip &&
 		    ovs_dp_cmd_fill_info(dp, skb,
NETLINK_CB(cb->skb).portid, @@ -1490,6 +1581,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) break; i++; } + ovs_unlock(); cb->args[0] = i; @@ -1535,14 +1627,15 @@ static struct genl_family dp_vport_genl_family = { .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, .maxattr = OVS_VPORT_ATTR_MAX, - .netnsok = true + .netnsok = true, + .parallel_ops = true, }; struct genl_multicast_group ovs_dp_vport_multicast_group = { .name = OVS_VPORT_MCGROUP }; -/* Called with RTNL lock or RCU read lock. */ +/* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { @@ -1581,7 +1674,7 @@ error: return err; } -/* Called with RTNL lock or RCU read lock. */ +/* Called with ovs_mutex or RCU read lock. */ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, u32 seq, u8 cmd) { @@ -1593,14 +1686,12 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, return ERR_PTR(-ENOMEM); retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); - if (retval < 0) { - kfree_skb(skb); - return ERR_PTR(retval); - } + BUG_ON(retval < 0); + return skb; } -/* Called with RTNL lock or RCU read lock. */ +/* Called with ovs_mutex or RCU read lock. */ static struct vport *lookup_vport(struct net *net, struct ovs_header *ovs_header, struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) @@ -1626,9 +1717,9 @@ static struct vport *lookup_vport(struct net *net, if (!dp) return ERR_PTR(-ENODEV); - vport = ovs_vport_rtnl_rcu(dp, port_no); + vport = ovs_vport_ovsl_rcu(dp, port_no); if (!vport) - return ERR_PTR(-ENOENT); + return ERR_PTR(-ENODEV); return vport; } else return ERR_PTR(-EINVAL); @@ -1650,7 +1741,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) !a[OVS_VPORT_ATTR_UPCALL_PID]) goto exit; - rtnl_lock(); + ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) @@ -1663,7 +1754,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (port_no >= DP_MAX_PORTS) goto exit_unlock; - vport = ovs_vport_rtnl_rcu(dp, port_no); + vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) goto exit_unlock; @@ -1673,7 +1764,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) err = -EFBIG; goto exit_unlock; } - vport = ovs_vport_rtnl(dp, port_no); + vport = ovs_vport_ovsl(dp, port_no); if (!vport) break; } @@ -1691,6 +1782,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(vport)) goto exit_unlock; + err = 0; reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, OVS_VPORT_CMD_NEW); if (IS_ERR(reply)) { @@ -1698,11 +1790,11 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_detach_port(vport); goto exit_unlock; } - genl_notify(reply, genl_info_net(info), info->snd_portid, - ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL); + + ovs_notify(reply, info, &ovs_dp_vport_multicast_group); exit_unlock: - rtnl_unlock(); + ovs_unlock(); exit: return err; } @@ -1714,7 +1806,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) struct vport *vport; int err; - rtnl_lock(); + ovs_lock(); vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); err = PTR_ERR(vport); if (IS_ERR(vport)) @@ -1725,26 +1817,35 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != 
vport->ops->type)
 		err = -EINVAL;
 
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply) {
+		err = -ENOMEM;
+		goto exit_unlock;
+	}
+
 	if (!err && a[OVS_VPORT_ATTR_OPTIONS])
 		err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
 	if (err)
-		goto exit_unlock;
+		goto exit_free;
+
 	if (a[OVS_VPORT_ATTR_UPCALL_PID])
 		vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
 
-	reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq,
-					 OVS_VPORT_CMD_NEW);
-	if (IS_ERR(reply)) {
-		netlink_set_err(sock_net(skb->sk)->genl_sock, 0,
-				ovs_dp_vport_multicast_group.id, PTR_ERR(reply));
-		goto exit_unlock;
-	}
+	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+				      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+	BUG_ON(err < 0);
+
+	ovs_unlock();
+	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
+	return 0;
 
-	genl_notify(reply, genl_info_net(info), info->snd_portid,
-		    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
 
+exit_free:
+	kfree_skb(reply);
 exit_unlock:
-	rtnl_unlock();
+	ovs_unlock();
 	return err;
 }
 
@@ -1755,7 +1856,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	struct vport *vport;
 	int err;
 
-	rtnl_lock();
+	ovs_lock();
 	vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
 	err = PTR_ERR(vport);
 	if (IS_ERR(vport))
@@ -1772,13 +1873,13 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	if (IS_ERR(reply))
 		goto exit_unlock;
 
+	err = 0;
 	ovs_dp_detach_port(vport);
 
-	genl_notify(reply, genl_info_net(info), info->snd_portid,
-		    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
+	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
 
 exit_unlock:
-	rtnl_unlock();
+	ovs_unlock();
 	return err;
 }
 
@@ -1825,10 +1926,9 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	rcu_read_lock();
 	for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
 		struct vport *vport;
-		struct hlist_node *n;
 
 		j = 0;
-		hlist_for_each_entry_rcu(vport, n, &dp->ports[i], dp_hash_node) {
+		hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
 			if (j >= skip &&
 			    ovs_vport_cmd_fill_info(vport, skb,
 						    NETLINK_CB(cb->skb).portid,
@@ -1939,13 +2039,13 @@ static void rehash_flow_table(struct work_struct *work)
 	struct datapath *dp;
 	struct net *net;
 
-	genl_lock();
+	ovs_lock();
 	rtnl_lock();
 	for_each_net(net) {
 		struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
 		list_for_each_entry(dp, &ovs_net->dps, list_node) {
-			struct flow_table *old_table = genl_dereference(dp->table);
+			struct flow_table *old_table = ovsl_dereference(dp->table);
 			struct flow_table *new_table;
 
 			new_table = ovs_flow_tbl_rehash(old_table);
@@ -1956,8 +2056,7 @@ static void rehash_flow_table(struct work_struct *work)
 		}
 	}
 	rtnl_unlock();
-	genl_unlock();
-
+	ovs_unlock();
 	schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
 }
 
@@ -1966,18 +2065,21 @@ static int __net_init ovs_init_net(struct net *net)
 {
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
 	INIT_LIST_HEAD(&ovs_net->dps);
+	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
 	return 0;
 }
 
 static void __net_exit ovs_exit_net(struct net *net)
 {
-	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 	struct datapath *dp, *dp_next;
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
-	genl_lock();
+	ovs_lock();
 	list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
 		__dp_destroy(dp);
-	genl_unlock();
+	ovs_unlock();
+
+	cancel_work_sync(&ovs_net->dp_notify_work);
 }
 
 static struct pernet_operations ovs_net_ops = {
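
/*
 * Editorial sketch, not part of the patch: the BUILD_BUG_ON() change in
 * the dp_init() hunk just below drops the dummy_skb local by measuring
 * the cb[] member through a null pointer. A userspace equivalent of
 * that compile-time check, using C11 _Static_assert (the struct sizes
 * here are illustrative, not the real kernel layouts):
 */
#include <stddef.h>

#define FIELD_SIZEOF(t, f)	(sizeof(((t *)0)->f))

struct skb_model    { char cb[48]; };	/* like sk_buff::cb */
struct ovs_cb_model { void *flow; };	/* like ovs_skb_cb */

_Static_assert(sizeof(struct ovs_cb_model) <=
	       FIELD_SIZEOF(struct skb_model, cb),
	       "ovs_skb_cb must fit inside skb->cb");
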
@@ -1989,10 +2091,9 @@ static struct pernet_operations ovs_net_ops = {
 
 static int __init dp_init(void)
 {
-	struct sk_buff *dummy_skb;
 	int err;
 
-	BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));
+	BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
 
 	pr_info("Open vSwitch switching datapath\n");
 
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 031dfbf37c93..16b840695216 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -57,10 +57,9 @@ struct dp_stats_percpu {
  * struct datapath - datapath for flow-based packet switching
  * @rcu: RCU callback head for deferred destruction.
  * @list_node: Element in global 'dps' list.
- * @n_flows: Number of flows currently in flow table.
- * @table: Current flow table.  Protected by genl_lock and RCU.
+ * @table: Current flow table.  Protected by ovs_mutex and RCU.
  * @ports: Hash table for ports.  %OVSP_LOCAL port always exists.  Protected by
- *	RTNL and RCU.
+ *	ovs_mutex and RCU.
  * @stats_percpu: Per-CPU datapath statistics.
  * @net: Reference to net namespace.
  *
@@ -86,26 +85,6 @@ struct datapath {
 #endif
 };
 
-struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no);
-
-static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held());
-	return ovs_lookup_vport(dp, port_no);
-}
-
-static inline struct vport *ovs_vport_rtnl_rcu(const struct datapath *dp, int port_no)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rtnl_is_locked());
-	return ovs_lookup_vport(dp, port_no);
-}
-
-static inline struct vport *ovs_vport_rtnl(const struct datapath *dp, int port_no)
-{
-	ASSERT_RTNL();
-	return ovs_lookup_vport(dp, port_no);
-}
-
 /**
  * struct ovs_skb_cb - OVS data in skb CB
  * @flow: The flow associated with this packet.  May be %NULL if no flow.
@@ -119,7 +98,7 @@ struct ovs_skb_cb {
  * struct dp_upcall - metadata to include with a packet to send to userspace
  * @cmd: One of %OVS_PACKET_CMD_*.
  * @key: Becomes %OVS_PACKET_ATTR_KEY.  Must be nonnull.
- * @userdata: If nonnull, its u64 value is extracted and passed to userspace as
+ * @userdata: If nonnull, its variable-length value is passed to userspace as
  *	%OVS_PACKET_ATTR_USERDATA.
 * @pid: Netlink PID to which packet should be sent.  If @pid is 0 then no
 *	packet is sent and the packet is accounted in the datapath's @n_lost
@@ -132,6 +111,30 @@ struct dp_upcall_info {
 	u32 portid;
 };
 
+/**
+ * struct ovs_net - Per net-namespace data for ovs.
+ * @dps: List of datapaths to enable dumping them all out.
+ *	Protected by ovs_mutex.
+ */ +struct ovs_net { + struct list_head dps; + struct work_struct dp_notify_work; +}; + +extern int ovs_net_id; +void ovs_lock(void); +void ovs_unlock(void); + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void); +#else +#define lockdep_ovsl_is_held() 1 +#endif + +#define ASSERT_OVSL() WARN_ON(unlikely(!lockdep_ovsl_is_held())) +#define ovsl_dereference(p) \ + rcu_dereference_protected(p, lockdep_ovsl_is_held()) + static inline struct net *ovs_dp_get_net(struct datapath *dp) { return read_pnet(&dp->net); @@ -142,6 +145,26 @@ static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) write_pnet(&dp->net, net); } +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); + +static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) +{ + ASSERT_OVSL(); + return ovs_lookup_vport(dp, port_no); +} + extern struct notifier_block ovs_dp_device_notifier; extern struct genl_multicast_group ovs_dp_vport_multicast_group; @@ -155,4 +178,5 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); +void ovs_dp_notify_wq(struct work_struct *work); #endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 5558350e0d33..ef4feec6cd84 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -18,46 +18,78 @@ #include <linux/netdevice.h> #include <net/genetlink.h> +#include <net/netns/generic.h> #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" +static void dp_detach_port_notify(struct vport *vport) +{ + struct sk_buff *notify; + struct datapath *dp; + + dp = vport->dp; + notify = ovs_vport_cmd_build_info(vport, 0, 0, + OVS_VPORT_CMD_DEL); + ovs_dp_detach_port(vport); + if (IS_ERR(notify)) { + netlink_set_err(ovs_dp_get_net(dp)->genl_sock, 0, + ovs_dp_vport_multicast_group.id, + PTR_ERR(notify)); + return; + } + + genlmsg_multicast_netns(ovs_dp_get_net(dp), notify, 0, + ovs_dp_vport_multicast_group.id, + GFP_KERNEL); +} + +void ovs_dp_notify_wq(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); + struct datapath *dp; + + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + struct hlist_node *n; + + hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { + struct netdev_vport *netdev_vport; + + if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + continue; + + netdev_vport = netdev_vport_priv(vport); + if (netdev_vport->dev->reg_state == NETREG_UNREGISTERED || + netdev_vport->dev->reg_state == NETREG_UNREGISTERING) + dp_detach_port_notify(vport); + } + } + } + ovs_unlock(); +} + static int dp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct ovs_net *ovs_net; struct net_device *dev = ptr; - struct vport *vport; + struct vport *vport = NULL; - if (ovs_is_internal_dev(dev)) - vport = ovs_internal_dev_get_vport(dev); - else + if (!ovs_is_internal_dev(dev)) vport = ovs_netdev_get_vport(dev); if (!vport) return 
NOTIFY_DONE; - switch (event) { - case NETDEV_UNREGISTER: - if (!ovs_is_internal_dev(dev)) { - struct sk_buff *notify; - struct datapath *dp = vport->dp; - - notify = ovs_vport_cmd_build_info(vport, 0, 0, - OVS_VPORT_CMD_DEL); - ovs_dp_detach_port(vport); - if (IS_ERR(notify)) { - netlink_set_err(ovs_dp_get_net(dp)->genl_sock, 0, - ovs_dp_vport_multicast_group.id, - PTR_ERR(notify)); - break; - } - - genlmsg_multicast_netns(ovs_dp_get_net(dp), notify, 0, - ovs_dp_vport_multicast_group.id, - GFP_KERNEL); - } - break; + if (event == NETDEV_UNREGISTER) { + ovs_net = net_generic(dev_net(dev), ovs_net_id); + queue_work(system_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index c3294cebc4f2..b15321a2228c 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -211,7 +211,7 @@ struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions) return ERR_PTR(-ENOMEM); sfa->actions_len = actions_len; - memcpy(sfa->actions, nla_data(actions), actions_len); + nla_memcpy(sfa->actions, actions, actions_len); return sfa; } @@ -299,10 +299,10 @@ void ovs_flow_tbl_destroy(struct flow_table *table) for (i = 0; i < table->n_buckets; i++) { struct sw_flow *flow; struct hlist_head *head = flex_array_get(table->buckets, i); - struct hlist_node *node, *n; + struct hlist_node *n; int ver = table->node_ver; - hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) { + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { hlist_del_rcu(&flow->hash_node[ver]); ovs_flow_free(flow); } @@ -332,7 +332,6 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la { struct sw_flow *flow; struct hlist_head *head; - struct hlist_node *n; int ver; int i; @@ -340,7 +339,7 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la while (*bucket < table->n_buckets) { i = 0; head = flex_array_get(table->buckets, *bucket); - hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) { + hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { if (i < *last) { i++; continue; @@ -367,11 +366,10 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new for (i = 0; i < old->n_buckets; i++) { struct sw_flow *flow; struct hlist_head *head; - struct hlist_node *n; head = flex_array_get(old->buckets, i); - hlist_for_each_entry(flow, n, head, hash_node[old_ver]) + hlist_for_each_entry(flow, head, hash_node[old_ver]) ovs_flow_tbl_insert(new, flow); } old->keep_flows = true; @@ -468,7 +466,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) proto = *(__be16 *) skb->data; __skb_pull(skb, sizeof(__be16)); - if (ntohs(proto) >= 1536) + if (ntohs(proto) >= ETH_P_802_3_MIN) return proto; if (skb->len < sizeof(struct llc_snap_hdr)) @@ -484,7 +482,11 @@ static __be16 parse_ethertype(struct sk_buff *skb) return htons(ETH_P_802_2); __skb_pull(skb, sizeof(struct llc_snap_hdr)); - return llc->ethertype; + + if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + return llc->ethertype; + + return htons(ETH_P_802_2); } static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, @@ -766,14 +768,13 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, struct sw_flow_key *key, int key_len) { struct sw_flow *flow; - struct hlist_node *n; struct hlist_head *head; u32 hash; hash = ovs_flow_hash(key, key_len); head = find_bucket(table, hash); - hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) { + hlist_for_each_entry_rcu(flow, head, 
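
/*
 * Editorial sketch, not part of the patch: the parse_ethertype() change
 * above replaces the literal 1536 with ETH_P_802_3_MIN and also applies
 * the same rule to the SNAP-encapsulated ethertype. The classification
 * rule in isolation (constants mirror <linux/if_ether.h> values):
 */
#include <stdint.h>
#include <arpa/inet.h>

#define ETH_P_802_3_MIN_MODEL	0x0600	/* smallest valid EtherType */
#define ETH_P_802_2_MODEL	0x0004	/* "no EtherType" marker */

/* Values below 0x0600 are 802.3 frame lengths, not EtherTypes. */
static uint16_t classify_ethertype(uint16_t proto_be)
{
	if (ntohs(proto_be) >= ETH_P_802_3_MIN_MODEL)
		return proto_be;
	return htons(ETH_P_802_2_MODEL);
}
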
hash_node[table->node_ver]) { if (flow->hash == hash && !memcmp(&flow->key, key, key_len)) { @@ -794,9 +795,9 @@ void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { + BUG_ON(table->count == 0); hlist_del_rcu(&flow->hash_node[table->node_ver]); table->count--; - BUG_ON(table->count < 0); } /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -1037,7 +1038,7 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (ntohs(swkey->eth.type) < 1536) + if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN) return -EINVAL; attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); } else { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a7bb60ff3b5b..0875fde65b9c 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -138,27 +138,6 @@ int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *, void ovs_flow_used(struct sw_flow *, struct sk_buff *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -/* Upper bound on the length of a nlattr-formatted flow key. The longest - * nlattr-formatted flow key would be: - * - * struct pad nl hdr total - * ------ --- ------ ----- - * OVS_KEY_ATTR_PRIORITY 4 -- 4 8 - * OVS_KEY_ATTR_IN_PORT 4 -- 4 8 - * OVS_KEY_ATTR_SKB_MARK 4 -- 4 8 - * OVS_KEY_ATTR_ETHERNET 12 -- 4 16 - * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (outer VLAN ethertype) - * OVS_KEY_ATTR_8021Q 4 -- 4 8 - * OVS_KEY_ATTR_ENCAP 0 -- 4 4 (VLAN encapsulation) - * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (inner VLAN ethertype) - * OVS_KEY_ATTR_IPV6 40 -- 4 44 - * OVS_KEY_ATTR_ICMPV6 2 2 4 8 - * OVS_KEY_ATTR_ND 28 -- 4 32 - * ------------------------------------------------- - * total 152 - */ -#define FLOW_BUFSIZE 152 - int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, const struct nlattr *); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 5d460c37df07..84e0a0379186 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -63,17 +63,6 @@ static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netde return stats; } -static int internal_dev_mac_addr(struct net_device *dev, void *p) -{ - struct sockaddr *addr = p; - - if (!is_valid_ether_addr(addr->sa_data)) - return -EADDRNOTAVAIL; - dev->addr_assign_type &= ~NET_ADDR_RANDOM; - memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); - return 0; -} - /* Called with rcu_read_lock_bh. 
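
The parse_ethertype() hunks above replace the literal 1536 with ETH_P_802_3_MIN (0x0600), the boundary between an 802.3 length field and a real EtherType. A one-liner capturing the check (the helper name is illustrative):

#include <linux/if_ether.h>
#include <linux/types.h>

/* Values below ETH_P_802_3_MIN in h_proto are an 802.3 frame length,
 * not an EtherType, so the payload is LLC/SNAP rather than Ethernet II. */
static bool eth_proto_is_ethertype(__be16 proto)
{
        return ntohs(proto) >= ETH_P_802_3_MIN;
}
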
*/ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -98,7 +87,7 @@ static int internal_dev_stop(struct net_device *netdev) static void internal_dev_getinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { - strcpy(info->driver, "openvswitch"); + strlcpy(info->driver, "openvswitch", sizeof(info->driver)); } static const struct ethtool_ops internal_dev_ethtool_ops = { @@ -127,7 +116,7 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, - .ndo_set_mac_address = internal_dev_mac_addr, + .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_dev_get_stats, }; @@ -139,6 +128,7 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev->destructor = internal_dev_destructor; SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops); netdev->tx_queue_len = 0; @@ -147,7 +137,7 @@ static void do_setup(struct net_device *netdev) NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO; netdev->vlan_features = netdev->features; - netdev->features |= NETIF_F_HW_VLAN_TX; + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; netdev->hw_features = netdev->features & ~NETIF_F_LLTX; eth_hw_addr_random(netdev); } @@ -183,16 +173,19 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) if (vport->port_no == OVSP_LOCAL) netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + rtnl_lock(); err = register_netdevice(netdev_vport->dev); if (err) goto error_free_netdev; dev_set_promiscuity(netdev_vport->dev, 1); + rtnl_unlock(); netif_start_queue(netdev_vport->dev); return vport; error_free_netdev: + rtnl_unlock(); free_netdev(netdev_vport->dev); error_free_vport: ovs_vport_free(vport); @@ -205,10 +198,13 @@ static void internal_dev_destroy(struct vport *vport) struct netdev_vport *netdev_vport = netdev_vport_priv(vport); netif_stop_queue(netdev_vport->dev); + rtnl_lock(); dev_set_promiscuity(netdev_vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ unregister_netdevice(netdev_vport->dev); + + rtnl_unlock(); } static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) @@ -236,7 +232,6 @@ const struct vport_ops ovs_internal_vport_ops = { .create = internal_dev_create, .destroy = internal_dev_destroy, .get_name = ovs_netdev_get_name, - .get_ifindex = ovs_netdev_get_ifindex, .send = internal_dev_recv, }; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 670cbc3518de..4f01c6d2ffa4 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -43,8 +43,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). - * (No one comes after us, since we tell handle_bridge() that we took - * the packet.) 
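
The internal-dev hunks above drop the driver-private MAC handler in favor of the generic eth_mac_addr(), which refuses to change the address of a running interface unless IFF_LIVE_ADDR_CHANGE is set; hence the extra priv_flags bit in do_setup(). A sketch of the same wiring for a hypothetical device (hypo_* names are assumptions):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static const struct net_device_ops hypo_netdev_ops = {
        .ndo_set_mac_address = eth_mac_addr,    /* generic validate + copy */
};

static void hypo_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->netdev_ops = &hypo_netdev_ops;
        /* allow eth_mac_addr() while the interface is up */
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
}
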
*/ + */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; @@ -101,16 +100,20 @@ static struct vport *netdev_create(const struct vport_parms *parms) goto error_put; } + rtnl_lock(); err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, vport); if (err) - goto error_put; + goto error_unlock; dev_set_promiscuity(netdev_vport->dev, 1); netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + rtnl_unlock(); return vport; +error_unlock: + rtnl_unlock(); error_put: dev_put(netdev_vport->dev); error_free_vport: @@ -132,9 +135,11 @@ static void netdev_destroy(struct vport *vport) { struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + rtnl_lock(); netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(netdev_vport->dev); dev_set_promiscuity(netdev_vport->dev, -1); + rtnl_unlock(); call_rcu(&netdev_vport->rcu, free_port_rcu); } @@ -145,12 +150,6 @@ const char *ovs_netdev_get_name(const struct vport *vport) return netdev_vport->dev->name; } -int ovs_netdev_get_ifindex(const struct vport *vport) -{ - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->ifindex; -} - static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; @@ -201,6 +200,5 @@ const struct vport_ops ovs_netdev_vport_ops = { .create = netdev_create, .destroy = netdev_destroy, .get_name = ovs_netdev_get_name, - .get_ifindex = ovs_netdev_get_ifindex, .send = netdev_send, }; diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 6478079b3417..a3cb3a32cd77 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -40,6 +40,5 @@ netdev_vport_priv(const struct vport *vport) const char *ovs_netdev_get_name(const struct vport *); const char *ovs_netdev_get_config(const struct vport *); -int ovs_netdev_get_ifindex(const struct vport *); #endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 70af0bedbac4..720623190eaa 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -40,7 +40,7 @@ static const struct vport_ops *vport_ops_list[] = { &ovs_internal_vport_ops, }; -/* Protected by RCU read lock for reading, RTNL lock for writing. */ +/* Protected by RCU read lock for reading, ovs_mutex for writing. */ static struct hlist_head *dev_table; #define VPORT_HASH_BUCKETS 1024 @@ -80,15 +80,14 @@ static struct hlist_head *hash_bucket(struct net *net, const char *name) * * @name: name of port to find * - * Must be called with RTNL or RCU read lock. + * Must be called with ovs or RCU read lock. */ struct vport *ovs_vport_locate(struct net *net, const char *name) { struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; - struct hlist_node *node; - hlist_for_each_entry_rcu(vport, node, bucket, hash_node) + hlist_for_each_entry_rcu(vport, bucket, hash_node) if (!strcmp(name, vport->ops->get_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -129,7 +128,7 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); - vport->percpu_stats = alloc_percpu(struct vport_percpu_stats); + vport->percpu_stats = alloc_percpu(struct pcpu_tstats); if (!vport->percpu_stats) { kfree(vport); return ERR_PTR(-ENOMEM); @@ -162,7 +161,7 @@ void ovs_vport_free(struct vport *vport) * @parms: Information about new vport. 
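
Now that vport setup no longer runs under RTNL implicitly, netdev_create() above takes rtnl_lock() itself around netdev_rx_handler_register(), which asserts RTNL internally. A minimal sketch of that locking shape (hypo_* names are illustrative):

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static rx_handler_result_t hypo_frame_hook(struct sk_buff **pskb)
{
        return RX_HANDLER_PASS;         /* real hook would steal the skb */
}

static int hypo_attach(struct net_device *dev, void *priv)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(dev, hypo_frame_hook, priv);
        if (!err)
                dev_set_promiscuity(dev, 1);
        rtnl_unlock();
        return err;
}
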
* * Creates a new vport with the specified configuration (which is dependent on - * device type). RTNL lock must be held. + * device type). ovs_mutex must be held. */ struct vport *ovs_vport_add(const struct vport_parms *parms) { @@ -170,8 +169,6 @@ struct vport *ovs_vport_add(const struct vport_parms *parms) int err = 0; int i; - ASSERT_RTNL(); - for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { if (vport_ops_list[i]->type == parms->type) { struct hlist_head *bucket; @@ -202,12 +199,10 @@ out: * @port: New configuration. * * Modifies an existing device with the specified configuration (which is - * dependent on device type). RTNL lock must be held. + * dependent on device type). ovs_mutex must be held. */ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) { - ASSERT_RTNL(); - if (!vport->ops->set_options) return -EOPNOTSUPP; return vport->ops->set_options(vport, options); @@ -219,11 +214,11 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) * @vport: vport to delete. * * Detaches @vport from its datapath and destroys it. It is possible to fail - * for reasons such as lack of memory. RTNL lock must be held. + * for reasons such as lack of memory. ovs_mutex must be held. */ void ovs_vport_del(struct vport *vport) { - ASSERT_RTNL(); + ASSERT_OVSL(); hlist_del_rcu(&vport->hash_node); @@ -238,7 +233,7 @@ void ovs_vport_del(struct vport *vport) * * Retrieves transmit, receive, and error stats for the given device. * - * Must be called with RTNL lock or rcu_read_lock. + * Must be called with ovs_mutex or rcu_read_lock. */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { @@ -265,16 +260,16 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) spin_unlock_bh(&vport->stats_lock); for_each_possible_cpu(i) { - const struct vport_percpu_stats *percpu_stats; - struct vport_percpu_stats local_stats; + const struct pcpu_tstats *percpu_stats; + struct pcpu_tstats local_stats; unsigned int start; percpu_stats = per_cpu_ptr(vport->percpu_stats, i); do { - start = u64_stats_fetch_begin_bh(&percpu_stats->sync); + start = u64_stats_fetch_begin_bh(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); + } while (u64_stats_fetch_retry_bh(&percpu_stats->syncp, start)); stats->rx_bytes += local_stats.rx_bytes; stats->rx_packets += local_stats.rx_packets; @@ -297,22 +292,24 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) * negative error code if a real error occurred. If an error occurs, @skb is * left unmodified. * - * Must be called with RTNL lock or rcu_read_lock. + * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; + int err; + + if (!vport->ops->get_options) + return 0; nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); if (!nla) return -EMSGSIZE; - if (vport->ops->get_options) { - int err = vport->ops->get_options(vport, skb); - if (err) { - nla_nest_cancel(skb, nla); - return err; - } + err = vport->ops->get_options(vport, skb); + if (err) { + nla_nest_cancel(skb, nla); + return err; } nla_nest_end(skb, nla); @@ -326,18 +323,17 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) * @skb: skb that was received * * Must be called with rcu_read_lock. The packet cannot be shared and - * skb->data should point to the Ethernet header. 
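
The ovs_vport_get_stats() hunk above is the reader half of the u64_stats_sync pattern: on 32-bit SMP a 64-bit counter can be torn mid-update, so the snapshot is retried until the sequence count is stable. A self-contained sketch (hypo_* name is an assumption; struct pcpu_tstats comes from linux/if_tunnel.h, presumably why the vport.h hunk adds that include):

#include <linux/if_tunnel.h>
#include <linux/u64_stats_sync.h>

static u64 hypo_total_rx_bytes(struct pcpu_tstats __percpu *stats)
{
        u64 total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pcpu_tstats *s = per_cpu_ptr(stats, cpu);
                struct pcpu_tstats snap;
                unsigned int start;

                do {    /* retry if a writer raced with the copy */
                        start = u64_stats_fetch_begin_bh(&s->syncp);
                        snap = *s;
                } while (u64_stats_fetch_retry_bh(&s->syncp, start));
                total += snap.rx_bytes;
        }
        return total;
}
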
The caller must have already - * called compute_ip_summed() to initialize the checksumming fields. + * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) { - struct vport_percpu_stats *stats; + struct pcpu_tstats *stats; stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); stats->rx_packets++; stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->sync); + u64_stats_update_end(&stats->syncp); ovs_dp_process_received_packet(vport, skb); } @@ -348,7 +344,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) * @vport: vport on which to send the packet * @skb: skb to send * - * Sends the given packet and returns the length of data sent. Either RTNL + * Sends the given packet and returns the length of data sent. Either ovs * lock or rcu_read_lock must be held. */ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) @@ -356,14 +352,14 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) int sent = vport->ops->send(vport, skb); if (likely(sent)) { - struct vport_percpu_stats *stats; + struct pcpu_tstats *stats; stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); stats->tx_packets++; stats->tx_bytes += sent; - u64_stats_update_end(&stats->sync); + u64_stats_update_end(&stats->syncp); } return sent; } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 3f7961ea3c56..68a377bc0841 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -19,6 +19,7 @@ #ifndef VPORT_H #define VPORT_H 1 +#include <linux/if_tunnel.h> #include <linux/list.h> #include <linux/netlink.h> #include <linux/openvswitch.h> @@ -50,14 +51,6 @@ int ovs_vport_send(struct vport *, struct sk_buff *); /* The following definitions are for implementers of vport devices: */ -struct vport_percpu_stats { - u64 rx_bytes; - u64 rx_packets; - u64 tx_bytes; - u64 tx_packets; - struct u64_stats_sync sync; -}; - struct vport_err_stats { u64 rx_dropped; u64 rx_errors; @@ -68,10 +61,10 @@ struct vport_err_stats { /** * struct vport - one port within a datapath * @rcu: RCU callback head for deferred destruction. - * @port_no: Index into @dp's @ports array. * @dp: Datapath to which this port belongs. * @upcall_portid: The Netlink port to use for packets received on this port that * miss the flow table. + * @port_no: Index into @dp's @ports array. * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. @@ -81,15 +74,15 @@ struct vport_err_stats { */ struct vport { struct rcu_head rcu; - u16 port_no; struct datapath *dp; u32 upcall_portid; + u16 port_no; struct hlist_node hash_node; struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct vport_percpu_stats __percpu *percpu_stats; + struct pcpu_tstats __percpu *percpu_stats; spinlock_t stats_lock; struct vport_err_stats err_stats; @@ -131,24 +124,22 @@ struct vport_parms { * have any configuration. * @get_name: Get the device's name. * @get_config: Get the device's configuration. - * @get_ifindex: Get the system interface index associated with the device. * May be null if the device does not have an ifindex. * @send: Send a packet on the device. Returns the length of the packet sent. */ struct vport_ops { enum ovs_vport_type type; - /* Called with RTNL lock. */ + /* Called with ovs_mutex. 
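
ovs_vport_receive() and ovs_vport_send() above are the writer half of the same pattern: bump the per-CPU counters inside an update section so readers can detect torn 64-bit stores. A sketch of the hot-path helper (hypo_* name is illustrative):

#include <linux/if_tunnel.h>
#include <linux/u64_stats_sync.h>

static void hypo_count_rx(struct pcpu_tstats __percpu *stats,
                          unsigned int len)
{
        struct pcpu_tstats *s = this_cpu_ptr(stats);

        u64_stats_update_begin(&s->syncp);
        s->rx_packets++;
        s->rx_bytes += len;
        u64_stats_update_end(&s->syncp);
}
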
*/ struct vport *(*create)(const struct vport_parms *); void (*destroy)(struct vport *); int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - /* Called with rcu_read_lock or RTNL lock. */ + /* Called with rcu_read_lock or ovs_mutex. */ const char *(*get_name)(const struct vport *); void (*get_config)(const struct vport *, void *); - int (*get_ifindex)(const struct vport *); int (*send)(struct vport *, struct sk_buff *); }; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c111bd0e083a..8ec1bca7f859 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -158,10 +158,16 @@ struct packet_mreq_max { unsigned char mr_address[MAX_ADDR_LEN]; }; +union tpacket_uhdr { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + struct tpacket3_hdr *h3; + void *raw; +}; + static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); - #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) @@ -181,6 +187,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, struct packet_sock; static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, @@ -288,11 +296,7 @@ static inline __pure struct page *pgv_to_page(void *addr) static void __packet_set_status(struct packet_sock *po, void *frame, int status) { - union { - struct tpacket_hdr *h1; - struct tpacket2_hdr *h2; - void *raw; - } h; + union tpacket_uhdr h; h.raw = frame; switch (po->tp_version) { @@ -315,11 +319,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) static int __packet_get_status(struct packet_sock *po, void *frame) { - union { - struct tpacket_hdr *h1; - struct tpacket2_hdr *h2; - void *raw; - } h; + union tpacket_uhdr h; smp_rmb(); @@ -339,17 +339,66 @@ static int __packet_get_status(struct packet_sock *po, void *frame) } } +static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, + unsigned int flags) +{ + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + + if (shhwtstamps) { + if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && + ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) + return TP_STATUS_TS_SYS_HARDWARE; + if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && + ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) + return TP_STATUS_TS_RAW_HARDWARE; + } + + if (ktime_to_timespec_cond(skb->tstamp, ts)) + return TP_STATUS_TS_SOFTWARE; + + return 0; +} + +static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, + struct sk_buff *skb) +{ + union tpacket_uhdr h; + struct timespec ts; + __u32 ts_status; + + if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) + return 0; + + h.raw = frame; + switch (po->tp_version) { + case TPACKET_V1: + h.h1->tp_sec = ts.tv_sec; + h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; + break; + case TPACKET_V2: + h.h2->tp_sec = ts.tv_sec; + h.h2->tp_nsec = ts.tv_nsec; + break; + case TPACKET_V3: + default: + WARN(1, "TPACKET version not supported.\n"); + BUG(); + } + + /* one flush is safe, as both fields always lie on the same cacheline */ + flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); + smp_wmb(); + + return ts_status; +} + static void *packet_lookup_frame(struct packet_sock *po, struct packet_ring_buffer *rb, 
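
tpacket_get_timestamp() above returns a TP_STATUS_TS_* bit identifying which clock filled the timestamp, and that bit is OR'd into tp_status. A hypothetical userspace RX-ring consumer can then tell the sources apart (assumes a TPACKET_V2 ring; the helper name is illustrative):

#include <stdio.h>
#include <linux/if_packet.h>

static void show_timestamp(const struct tpacket2_hdr *h)
{
        const char *src = "none";

        if (h->tp_status & TP_STATUS_TS_RAW_HARDWARE)
                src = "raw hw";
        else if (h->tp_status & TP_STATUS_TS_SYS_HARDWARE)
                src = "sys hw";
        else if (h->tp_status & TP_STATUS_TS_SOFTWARE)
                src = "software";
        printf("%s %u.%09u\n", src, h->tp_sec, h->tp_nsec);
}
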
unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; - union { - struct tpacket_hdr *h1; - struct tpacket2_hdr *h2; - void *raw; - } h; + union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; @@ -479,7 +528,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; p1->last_kactive_blk_num = 0; - po->stats_u.stats3.tp_freeze_q_cnt = 0; + po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; else @@ -647,7 +696,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; - if (po->stats.tp_drops) + if (po->stats.stats3.tp_drops) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; @@ -693,36 +742,33 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, smp_rmb(); - if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) { + /* We could have just memset this but we will lose the + * flexibility of making the priv area sticky + */ - /* We could have just memset this but we will lose the - * flexibility of making the priv area sticky - */ - BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; - BLOCK_NUM_PKTS(pbd1) = 0; - BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); - getnstimeofday(&ts); - h1->ts_first_pkt.ts_sec = ts.tv_sec; - h1->ts_first_pkt.ts_nsec = ts.tv_nsec; - pkc1->pkblk_start = (char *)pbd1; - pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); - BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); - BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; - pbd1->version = pkc1->version; - pkc1->prev = pkc1->nxt_offset; - pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; - prb_thaw_queue(pkc1); - _prb_refresh_rx_retire_blk_timer(pkc1); + BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; + BLOCK_NUM_PKTS(pbd1) = 0; + BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); - smp_wmb(); + getnstimeofday(&ts); - return; - } + h1->ts_first_pkt.ts_sec = ts.tv_sec; + h1->ts_first_pkt.ts_nsec = ts.tv_nsec; + + pkc1->pkblk_start = (char *)pbd1; + pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); + + BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); + BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; + + pbd1->version = pkc1->version; + pkc1->prev = pkc1->nxt_offset; + pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; + + prb_thaw_queue(pkc1); + _prb_refresh_rx_retire_blk_timer(pkc1); - WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n", - pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num); - dump_stack(); - BUG(); + smp_wmb(); } /* @@ -752,7 +798,7 @@ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; - po->stats_u.stats3.tp_freeze_q_cnt++; + po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) @@ -813,10 +859,6 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, prb_close_block(pkc, pbd, po, status); return; } - - WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd); - dump_stack(); - BUG(); } static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, @@ -973,11 +1015,11 @@ static void *packet_current_rx_frame(struct packet_sock *po, static void *prb_lookup_block(struct packet_sock *po, struct packet_ring_buffer *rb, - unsigned int previous, + unsigned int idx, int status) { 
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); - struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous); + struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; @@ -1041,6 +1083,29 @@ static void packet_increment_head(struct packet_ring_buffer *buff) buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } +static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + struct sock *sk = &po->sk; + bool has_room; + + if (po->prot_hook.func != tpacket_rcv) + return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) + <= sk->sk_rcvbuf; + + spin_lock(&sk->sk_receive_queue.lock); + if (po->tp_version == TPACKET_V3) + has_room = prb_lookup_block(po, &po->rx_ring, + po->rx_ring.prb_bdqc.kactive_blk_num, + TP_STATUS_KERNEL); + else + has_room = packet_lookup_frame(po, &po->rx_ring, + po->rx_ring.head, + TP_STATUS_KERNEL); + spin_unlock(&sk->sk_receive_queue.lock); + + return has_room; +} + static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); @@ -1066,16 +1131,16 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num) return x; } -static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) +static unsigned int fanout_demux_hash(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) { - u32 idx, hash = skb->rxhash; - - idx = ((u64)hash * num) >> 32; - - return f->arr[idx]; + return (((u64)skb->rxhash) * num) >> 32; } -static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) +static unsigned int fanout_demux_lb(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) { int cur, old; @@ -1083,14 +1148,40 @@ static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb while ((old = atomic_cmpxchg(&f->rr_cur, cur, fanout_rr_next(f, num))) != cur) cur = old; - return f->arr[cur]; + return cur; +} + +static unsigned int fanout_demux_cpu(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + return smp_processor_id() % num; } -static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) +static unsigned int fanout_demux_rollover(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int idx, unsigned int skip, + unsigned int num) { - unsigned int cpu = smp_processor_id(); + unsigned int i, j; + + i = j = min_t(int, f->next[idx], num - 1); + do { + if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + if (i != j) + f->next[idx] = i; + return i; + } + if (++i == num) + i = 0; + } while (i != j); + + return idx; +} - return f->arr[cpu % num]; +static bool fanout_has_flag(struct packet_fanout *f, u16 flag) +{ + return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, @@ -1099,7 +1190,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_fanout *f = pt->af_packet_priv; unsigned int num = f->num_members; struct packet_sock *po; - struct sock *sk; + unsigned int idx; if (!net_eq(dev_net(dev), read_pnet(&f->net)) || !num) { @@ -1110,23 +1201,31 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, switch (f->type) { case PACKET_FANOUT_HASH: default: - if (f->defrag) { + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } skb_get_rxhash(skb); - sk = fanout_demux_hash(f, skb, num); + idx = 
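
fanout_demux_hash() above maps a 32-bit rxhash onto [0, num) with a multiply-and-shift instead of a modulo, which avoids a division and stays uniform when the hash is uniform. The trick in isolation (the function name is illustrative):

#include <linux/types.h>

/* top 32 bits of hash * num == floor(hash / 2^32 * num) */
static unsigned int scale_to(u32 hash, unsigned int num)
{
        return ((u64)hash * num) >> 32;
}
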
fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: - sk = fanout_demux_lb(f, skb, num); + idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: - sk = fanout_demux_cpu(f, skb, num); + idx = fanout_demux_cpu(f, skb, num); + break; + case PACKET_FANOUT_ROLLOVER: + idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); break; } - po = pkt_sk(sk); + po = pkt_sk(f->arr[idx]); + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && + unlikely(!packet_rcv_has_room(po, skb))) { + idx = fanout_demux_rollover(f, skb, idx, idx, num); + po = pkt_sk(f->arr[idx]); + } return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } @@ -1175,10 +1274,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f, *match; u8 type = type_flags & 0xff; - u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0; + u8 flags = type_flags >> 8; int err; switch (type) { + case PACKET_FANOUT_ROLLOVER: + if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) + return -EINVAL; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: @@ -1203,7 +1305,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } } err = -EINVAL; - if (match && match->defrag != defrag) + if (match && match->flags != flags) goto out; if (!match) { err = -ENOMEM; @@ -1213,7 +1315,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; - match->defrag = defrag; + match->flags = flags; atomic_set(&match->rr_cur, 0); INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); @@ -1443,13 +1545,14 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (err < 0) - goto out_unlock; + + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; + skb_probe_transport_header(skb, 0); + dev_queue_xmit(skb); rcu_read_unlock(); return len; @@ -1577,7 +1680,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, nf_reset(skb); spin_lock(&sk->sk_receive_queue.lock); - po->stats.tp_packets++; + po->stats.stats1.tp_packets++; skb->dropcount = atomic_read(&sk->sk_drops); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); @@ -1586,7 +1689,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: spin_lock(&sk->sk_receive_queue.lock); - po->stats.tp_drops++; + po->stats.stats1.tp_drops++; atomic_inc(&sk->sk_drops); spin_unlock(&sk->sk_receive_queue.lock); @@ -1606,21 +1709,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct sock *sk; struct packet_sock *po; struct sockaddr_ll *sll; - union { - struct tpacket_hdr *h1; - struct tpacket2_hdr *h2; - struct tpacket3_hdr *h3; - void *raw; - } h; + union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, netoff, hdrlen; struct sk_buff *copy_skb = NULL; - struct timeval tv; struct timespec ts; - struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + __u32 ts_status; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -1692,10 +1789,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. 
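
fanout_add() above unpacks the PACKET_FANOUT argument as group id in the low 16 bits and mode-plus-flags in the high 16, with the new rollover behaviour selected by a flag bit. A hypothetical userspace setup enabling hash fanout with rollover fallback (join_fanout is an illustrative name):

#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout(int fd, unsigned short id)
{
        int arg = id |
                  ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_ROLLOVER) << 16);

        return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
}
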
*/ - if (po->stats.tp_drops) + if (po->stats.stats1.tp_drops) status |= TP_STATUS_LOSING; } - po->stats.tp_packets++; + po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; __skb_queue_tail(&sk->sk_receive_queue, copy_skb); @@ -1704,24 +1801,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, skb_copy_bits(skb, 0, h.raw + macoff, snaplen); + if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) + getnstimeofday(&ts); + + status |= ts_status; + switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; - if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) - && shhwtstamps->syststamp.tv64) - tv = ktime_to_timeval(shhwtstamps->syststamp); - else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) - && shhwtstamps->hwtstamp.tv64) - tv = ktime_to_timeval(shhwtstamps->hwtstamp); - else if (skb->tstamp.tv64) - tv = ktime_to_timeval(skb->tstamp); - else - do_gettimeofday(&tv); - h.h1->tp_sec = tv.tv_sec; - h.h1->tp_usec = tv.tv_usec; + h.h1->tp_sec = ts.tv_sec; + h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: @@ -1729,16 +1821,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; - if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) - && shhwtstamps->syststamp.tv64) - ts = ktime_to_timespec(shhwtstamps->syststamp); - else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) - && shhwtstamps->hwtstamp.tv64) - ts = ktime_to_timespec(shhwtstamps->hwtstamp); - else if (skb->tstamp.tv64) - ts = ktime_to_timespec(skb->tstamp); - else - getnstimeofday(&ts); h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (vlan_tx_tag_present(skb)) { @@ -1759,16 +1841,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; - if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) - && shhwtstamps->syststamp.tv64) - ts = ktime_to_timespec(shhwtstamps->syststamp); - else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) - && shhwtstamps->hwtstamp.tv64) - ts = ktime_to_timespec(shhwtstamps->hwtstamp); - else if (skb->tstamp.tv64) - ts = ktime_to_timespec(skb->tstamp); - else - getnstimeofday(&ts); h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; hdrlen = sizeof(*h.h3); @@ -1819,7 +1891,7 @@ drop: return 0; ring_is_full: - po->stats.tp_drops++; + po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk, 0); @@ -1833,10 +1905,14 @@ static void tpacket_destruct_skb(struct sk_buff *skb) void *ph; if (likely(po->tx_ring.pg_vec)) { + __u32 ts; + ph = skb_shinfo(skb)->destructor_arg; BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); - __packet_set_status(po, ph, TP_STATUS_AVAILABLE); + + ts = __packet_set_timestamp(po, ph, skb); + __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); } sock_wfree(skb); @@ -1846,11 +1922,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, int size_max, __be16 proto, unsigned char *addr, int hlen) { - union { - struct tpacket_hdr *h1; - struct tpacket2_hdr *h2; - void *raw; - } ph; + union tpacket_uhdr ph; int to_write, offset, len, tp_len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; @@ -1863,6 +1935,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct 
sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; + sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; switch (po->tp_version) { @@ -1880,6 +1953,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb_reserve(skb, hlen); skb_reset_network_header(skb); + skb_probe_transport_header(skb, 0); if (po->tp_tx_has_off) { int off_min, off_max, off; @@ -2247,9 +2321,8 @@ static int packet_snd(struct socket *sock, err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); if (err) goto out_free; - err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (err < 0) - goto out_free; + + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (!gso_type && (len > dev->mtu + reserve + extra_len)) { /* Earlier code assumed this would be a VLAN pkt, @@ -2289,6 +2362,8 @@ static int packet_snd(struct socket *sock, len += vnet_hdr_len; } + skb_probe_transport_header(skb, reserve); + if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -3165,8 +3240,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; - struct tpacket_stats st; - union tpacket_stats_u st_u; + union tpacket_stats_u st; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3180,22 +3254,18 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); + memcpy(&st, &po->stats, sizeof(st)); + memset(&po->stats, 0, sizeof(po->stats)); + spin_unlock_bh(&sk->sk_receive_queue.lock); + if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); - memcpy(&st_u.stats3, &po->stats, - sizeof(struct tpacket_stats)); - st_u.stats3.tp_freeze_q_cnt = - po->stats_u.stats3.tp_freeze_q_cnt; - st_u.stats3.tp_packets += po->stats.tp_drops; - data = &st_u.stats3; + data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); - st = po->stats; - st.tp_packets += st.tp_drops; - data = &st; + data = &st.stats1; } - memset(&po->stats, 0, sizeof(st)); - spin_unlock_bh(&sk->sk_receive_queue.lock); + break; case PACKET_AUXDATA: val = po->auxdata; @@ -3240,7 +3310,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, case PACKET_FANOUT: val = (po->fanout ? 
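
The PACKET_STATISTICS hunk above reworks the counters around union tpacket_stats_u: V1/V2 sockets get struct tpacket_stats, TPACKET_V3 sockets struct tpacket_stats_v3, and reading still zeroes the counters. A hypothetical userspace poller for a V1/V2 socket (dump_stats is an illustrative name):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* counters are reset on read, so call this periodically */
static void dump_stats(int fd)
{
        struct tpacket_stats st;
        socklen_t len = sizeof(st);

        if (!getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len))
                fprintf(stderr, "packets=%u drops=%u\n",
                        st.tp_packets, st.tp_drops);
}
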
((u32)po->fanout->id | - ((u32)po->fanout->type << 16)) : + ((u32)po->fanout->type << 16) | + ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_TX_HAS_OFF: @@ -3263,12 +3334,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) { struct sock *sk; - struct hlist_node *node; struct net_device *dev = data; struct net *net = dev_net(dev); rcu_read_lock(); - sk_for_each_rcu(sk, node, &net->packet.sklist) { + sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { @@ -3828,7 +3898,7 @@ static int __net_init packet_net_init(struct net *net) mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); - if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) + if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops)) return -ENOMEM; return 0; @@ -3836,7 +3906,7 @@ static int __net_init packet_net_init(struct net *net) static void __net_exit packet_net_exit(struct net *net) { - proc_net_remove(net, "packet"); + remove_proc_entry("packet", net->proc_net); } static struct pernet_operations packet_net_ops = { diff --git a/net/packet/diag.c b/net/packet/diag.c index 8db6e21c46bd..a9584a2f6d69 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -125,8 +125,10 @@ static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb) return ret; } -static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag_req *req, - u32 portid, u32 seq, u32 flags, int sk_ino) +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct packet_diag_req *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct packet_diag_msg *rp; @@ -147,6 +149,11 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag pdiag_put_info(po, skb)) goto out_nlmsg_trim; + if ((req->pdiag_show & PACKET_SHOW_INFO) && + nla_put_u32(skb, PACKET_DIAG_UID, + from_kuid_munged(user_ns, sock_i_uid(sk)))) + goto out_nlmsg_trim; + if ((req->pdiag_show & PACKET_SHOW_MCLIST) && pdiag_put_mclist(po, skb)) goto out_nlmsg_trim; @@ -159,6 +166,14 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag pdiag_put_fanout(po, skb)) goto out_nlmsg_trim; + if ((req->pdiag_show & PACKET_SHOW_MEMINFO) && + sock_diag_put_meminfo(sk, skb, PACKET_DIAG_MEMINFO)) + goto out_nlmsg_trim; + + if ((req->pdiag_show & PACKET_SHOW_FILTER) && + sock_diag_put_filterinfo(user_ns, sk, skb, PACKET_DIAG_FILTER)) + goto out_nlmsg_trim; + return nlmsg_end(skb, nlh); out_nlmsg_trim: @@ -172,21 +187,22 @@ static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct packet_diag_req *req; struct net *net; struct sock *sk; - struct hlist_node *node; net = sock_net(skb->sk); req = nlmsg_data(cb->nlh); mutex_lock(&net->packet.sklist_lock); - sk_for_each(sk, node, &net->packet.sklist) { + sk_for_each(sk, &net->packet.sklist) { if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; - if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - sock_i_ino(sk)) < 0) + if (sk_diag_fill(sk, skb, req, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + sock_i_ino(sk)) < 0) goto done; next: num++; diff --git a/net/packet/internal.h b/net/packet/internal.h index e84cab8cb7a9..c4e4b4561207 100644 --- a/net/packet/internal.h +++ 
b/net/packet/internal.h @@ -54,6 +54,7 @@ struct pgv { struct packet_ring_buffer { struct pgv *pg_vec; + unsigned int head; unsigned int frames_per_block; unsigned int frame_size; @@ -63,8 +64,9 @@ struct packet_ring_buffer { unsigned int pg_vec_pages; unsigned int pg_vec_len; - struct tpacket_kbdq_core prb_bdqc; atomic_t pending; + + struct tpacket_kbdq_core prb_bdqc; }; extern struct mutex fanout_mutex; @@ -77,10 +79,11 @@ struct packet_fanout { unsigned int num_members; u16 id; u8 type; - u8 defrag; + u8 flags; atomic_t rr_cur; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; + int next[PACKET_FANOUT_MAX]; spinlock_t lock; atomic_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; @@ -90,8 +93,7 @@ struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; struct packet_fanout *fanout; - struct tpacket_stats stats; - union tpacket_stats_u stats_u; + union tpacket_stats_u stats; struct packet_ring_buffer rx_ring; struct packet_ring_buffer tx_ring; int copy_thresh; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 576f22c9c76e..e77411735de8 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -640,11 +640,10 @@ static struct sock *pep_find_pipe(const struct hlist_head *hlist, const struct sockaddr_pn *dst, u8 pipe_handle) { - struct hlist_node *node; struct sock *sknode; u16 dobj = pn_sockaddr_get_object(dst); - sk_for_each(sknode, node, hlist) { + sk_for_each(sknode, hlist) { struct pep_sock *pnnode = pep_sk(sknode); /* Ports match, but addresses might not: */ diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 5bf6341e2dd4..45a7df6575de 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -320,7 +320,7 @@ static int __net_init phonet_init_net(struct net *net) { struct phonet_net *pnn = phonet_pernet(net); - if (!proc_net_fops_create(net, "phonet", 0, &pn_sock_seq_fops)) + if (!proc_create("phonet", 0, net->proc_net, &pn_sock_seq_fops)) return -ENOMEM; INIT_LIST_HEAD(&pnn->pndevs.list); @@ -331,7 +331,7 @@ static int __net_init phonet_init_net(struct net *net) static void __net_exit phonet_exit_net(struct net *net) { - proc_net_remove(net, "phonet"); + remove_proc_entry("phonet", net->proc_net); } static struct pernet_operations phonet_net_ops = { @@ -348,7 +348,7 @@ int __init phonet_device_init(void) if (err) return err; - proc_net_fops_create(&init_net, "pnresource", 0, &pn_res_seq_fops); + proc_create("pnresource", 0, init_net.proc_net, &pn_res_seq_fops); register_netdevice_notifier(&phonet_device_notifier); err = phonet_netlink_register(); if (err) @@ -361,7 +361,7 @@ void phonet_device_exit(void) rtnl_unregister_all(PF_PHONET); unregister_netdevice_notifier(&phonet_device_notifier); unregister_pernet_subsys(&phonet_net_ops); - proc_net_remove(&init_net, "pnresource"); + remove_proc_entry("pnresource", init_net.proc_net); } int phonet_route_add(struct net_device *dev, u8 daddr) diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 0193630d3061..dc15f4300808 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -61,7 +61,7 @@ static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U8 }, }; -static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr) +static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -224,7 +224,7 @@ static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { [RTA_OIF] = { .type = NLA_U32 }, }; -static int 
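
The phonet hunks above (and the rose/rxrpc ones further down) all make the same mechanical switch from proc_net_fops_create()/proc_net_remove() to proc_create()/remove_proc_entry() rooted at the namespace's proc_net directory. The shape of a per-netns conversion, with hypothetical names (hypo_*; real code would point hypo_seq_fops at seq_file ops):

#include <linux/proc_fs.h>
#include <net/net_namespace.h>

static const struct file_operations hypo_seq_fops; /* seq_file ops elsewhere */

static int __net_init hypo_net_init(struct net *net)
{
        if (!proc_create("hypo", 0444, net->proc_net, &hypo_seq_fops))
                return -ENOMEM;
        return 0;
}

static void __net_exit hypo_net_exit(struct net *net)
{
        remove_proc_entry("hypo", net->proc_net);
}
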
route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr) +static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[RTA_MAX+1]; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index b7e982782255..1afd1381cdc7 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -76,7 +76,6 @@ static struct hlist_head *pn_hash_list(u16 obj) */ struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn) { - struct hlist_node *node; struct sock *sknode; struct sock *rval = NULL; u16 obj = pn_sockaddr_get_object(spn); @@ -84,7 +83,7 @@ struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn) struct hlist_head *hlist = pn_hash_list(obj); rcu_read_lock(); - sk_for_each_rcu(sknode, node, hlist) { + sk_for_each_rcu(sknode, hlist) { struct pn_sock *pn = pn_sk(sknode); BUG_ON(!pn->sobject); /* unbound socket */ @@ -120,10 +119,9 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) rcu_read_lock(); for (h = 0; h < PN_HASHSIZE; h++) { - struct hlist_node *node; struct sock *sknode; - sk_for_each(sknode, node, hlist) { + sk_for_each(sknode, hlist) { struct sk_buff *clone; if (!net_eq(sock_net(sknode), net)) @@ -543,12 +541,11 @@ static struct sock *pn_sock_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct hlist_head *hlist = pnsocks.hlist; - struct hlist_node *node; struct sock *sknode; unsigned int h; for (h = 0; h < PN_HASHSIZE; h++) { - sk_for_each_rcu(sknode, node, hlist) { + sk_for_each_rcu(sknode, hlist) { if (!net_eq(net, sock_net(sknode))) continue; if (!pos) diff --git a/net/rds/Kconfig b/net/rds/Kconfig index ec753b3ae72a..f2c670ba7b9b 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,7 +1,7 @@ config RDS - tristate "The RDS Protocol (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "The RDS Protocol" + depends on INET ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, sequenced delivery of datagrams over Infiniband, iWARP, diff --git a/net/rds/bind.c b/net/rds/bind.c index 637bde56c9db..b5ad65a0067e 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -52,13 +52,12 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *insert) { struct rds_sock *rs; - struct hlist_node *node; struct hlist_head *head = hash_to_bucket(addr, port); u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); rcu_read_lock(); - hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { + hlist_for_each_entry_rcu(rs, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); diff --git a/net/rds/connection.c b/net/rds/connection.c index 9e07c756d1f9..642ad42c416b 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -69,9 +69,8 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, struct rds_transport *trans) { struct rds_connection *conn, *ret = NULL; - struct hlist_node *pos; - hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && conn->c_trans == trans) { ret = conn; @@ -376,7 +375,6 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, int want_send) { struct hlist_head *head; - struct hlist_node *pos; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; @@ -390,7 +388,7 @@ static void rds_conn_message_info(struct socket 
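
rds_bind_lookup() above compares a single u64 "needle" per node instead of two separate fields: the IPv4 address sits in the high half and the port in the low half. The key construction on its own (bind_key is an illustrative name):

#include <linux/types.h>
#include <asm/byteorder.h>

static u64 bind_key(__be32 addr, __be16 port)
{
        return ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
}
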
*sock, unsigned int len, for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (want_send) list = &conn->c_send_queue; else @@ -439,7 +437,6 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, { uint64_t buffer[(item_len + 7) / 8]; struct hlist_head *head; - struct hlist_node *pos; struct rds_connection *conn; size_t i; @@ -450,7 +447,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) diff --git a/net/rds/message.c b/net/rds/message.c index f0a4658f3273..aba232f9f308 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -82,10 +82,7 @@ static void rds_message_purge(struct rds_message *rm) void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); - if (atomic_read(&rm->m_refcount) == 0) { -printk(KERN_CRIT "danger refcount zero on %p\n", rm); -WARN_ON(1); - } + WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); if (atomic_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_conn_item)); @@ -197,6 +194,9 @@ struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) { struct rds_message *rm; + if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message)) + return NULL; + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); if (!rm) goto out; diff --git a/net/rds/stats.c b/net/rds/stats.c index 7be790d60b90..73be187d389e 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -87,6 +87,7 @@ void rds_stats_info_copy(struct rds_info_iterator *iter, for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); + ctr.name[sizeof(ctr.name) - 1] = '\0'; ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 9b9be5279f5d..1cec5e4f3a5e 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -587,7 +587,7 @@ static ssize_t rfkill_name_show(struct device *dev, static const char *rfkill_get_type_str(enum rfkill_type type) { - BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_FM + 1); + BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_NFC + 1); switch (type) { case RFKILL_TYPE_WLAN: @@ -604,6 +604,8 @@ static const char *rfkill_get_type_str(enum rfkill_type type) return "gps"; case RFKILL_TYPE_FM: return "fm"; + case RFKILL_TYPE_NFC: + return "nfc"; default: BUG(); } diff --git a/net/rfkill/input.c b/net/rfkill/input.c index c9d931e7ffec..b85107b5ef62 100644 --- a/net/rfkill/input.c +++ b/net/rfkill/input.c @@ -148,11 +148,9 @@ static unsigned long rfkill_ratelimit(const unsigned long last) static void rfkill_schedule_ratelimited(void) { - if (delayed_work_pending(&rfkill_op_work)) - return; - schedule_delayed_work(&rfkill_op_work, - rfkill_ratelimit(rfkill_last_scheduled)); - rfkill_last_scheduled = jiffies; + if (schedule_delayed_work(&rfkill_op_work, + rfkill_ratelimit(rfkill_last_scheduled))) + rfkill_last_scheduled = jiffies; } static void rfkill_schedule_global_op(enum rfkill_sched_op op) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 78fc0937948d..fb076cd6f808 100644 --- 
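
The rds_stats hunk above fixes a classic strncpy() pitfall: when the source fills the buffer, strncpy() leaves the destination unterminated, so the copy now pins the final byte. As a reusable helper (the name is illustrative):

#include <string.h>

static void copy_counter_name(char *dst, size_t size, const char *src)
{
        strncpy(dst, src, size - 1);
        dst[size - 1] = '\0';
}
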
a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -131,6 +131,7 @@ static int rfkill_gpio_probe(struct platform_device *pdev) rfkill->pwr_clk = clk_get(&pdev->dev, pdata->power_clk_name); if (IS_ERR(rfkill->pwr_clk)) { pr_warn("%s: can't find pwr_clk.\n", __func__); + ret = PTR_ERR(rfkill->pwr_clk); goto fail_shutdown_name; } } @@ -152,9 +153,11 @@ static int rfkill_gpio_probe(struct platform_device *pdev) } rfkill->rfkill_dev = rfkill_alloc(pdata->name, &pdev->dev, pdata->type, - &rfkill_gpio_ops, rfkill); - if (!rfkill->rfkill_dev) + &rfkill_gpio_ops, rfkill); + if (!rfkill->rfkill_dev) { + ret = -ENOMEM; goto fail_shutdown; + } ret = rfkill_register(rfkill->rfkill_dev); if (ret < 0) diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c index 4b5ab21ecb24..d11ac79246e4 100644 --- a/net/rfkill/rfkill-regulator.c +++ b/net/rfkill/rfkill-regulator.c @@ -51,7 +51,7 @@ static int rfkill_regulator_set_block(void *data, bool blocked) return 0; } -struct rfkill_ops rfkill_regulator_ops = { +static struct rfkill_ops rfkill_regulator_ops = { .set_block = rfkill_regulator_set_block, }; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index c4719ce604c2..9c8347451597 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -165,10 +165,9 @@ static void rose_remove_socket(struct sock *sk) void rose_kill_by_neigh(struct rose_neigh *neigh) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&rose_list_lock); - sk_for_each(s, node, &rose_list) { + sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (rose->neighbour == neigh) { @@ -186,10 +185,9 @@ void rose_kill_by_neigh(struct rose_neigh *neigh) static void rose_kill_by_device(struct net_device *dev) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&rose_list_lock); - sk_for_each(s, node, &rose_list) { + sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (rose->device == dev) { @@ -246,10 +244,9 @@ static void rose_insert_socket(struct sock *sk) static struct sock *rose_find_listener(rose_address *addr, ax25_address *call) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&rose_list_lock); - sk_for_each(s, node, &rose_list) { + sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (!rosecmp(&rose->source_addr, addr) && @@ -258,7 +255,7 @@ static struct sock *rose_find_listener(rose_address *addr, ax25_address *call) goto found; } - sk_for_each(s, node, &rose_list) { + sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (!rosecmp(&rose->source_addr, addr) && @@ -278,10 +275,9 @@ found: struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh) { struct sock *s; - struct hlist_node *node; spin_lock_bh(&rose_list_lock); - sk_for_each(s, node, &rose_list) { + sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (rose->lci == lci && rose->neighbour == neigh) @@ -1257,6 +1253,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (srose != NULL) { + memset(srose, 0, msg->msg_namelen); srose->srose_family = AF_ROSE; srose->srose_addr = rose->dest_addr; srose->srose_call = rose->dest_call; @@ -1575,10 +1572,13 @@ static int __init rose_proto_init(void) rose_add_loopback_neigh(); - proc_net_fops_create(&init_net, "rose", S_IRUGO, &rose_info_fops); - proc_net_fops_create(&init_net, "rose_neigh", S_IRUGO, &rose_neigh_fops); - proc_net_fops_create(&init_net, "rose_nodes", S_IRUGO, &rose_nodes_fops); - proc_net_fops_create(&init_net, 
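
The rfkill-gpio hunks above fix probe paths that jumped to the error exit without setting a return code, so failure returned 0. A minimal sketch of the corrected shape under assumed resources (hypo_probe and the 64-byte buffer are illustrative; real code would stash clk/buf in drvdata):

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

static int hypo_probe(struct platform_device *pdev)
{
        struct clk *clk;
        void *buf;
        int ret;

        clk = clk_get(&pdev->dev, NULL);
        if (IS_ERR(clk))
                return PTR_ERR(clk);    /* never fall through with 0 */

        buf = kzalloc(64, GFP_KERNEL);
        if (!buf) {
                ret = -ENOMEM;          /* the bug was a goto without this */
                goto err_clk;
        }
        return 0;

err_clk:
        clk_put(clk);
        return ret;
}
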
"rose_routes", S_IRUGO, &rose_routes_fops); + proc_create("rose", S_IRUGO, init_net.proc_net, &rose_info_fops); + proc_create("rose_neigh", S_IRUGO, init_net.proc_net, + &rose_neigh_fops); + proc_create("rose_nodes", S_IRUGO, init_net.proc_net, + &rose_nodes_fops); + proc_create("rose_routes", S_IRUGO, init_net.proc_net, + &rose_routes_fops); out: return rc; fail: @@ -1605,10 +1605,10 @@ static void __exit rose_exit(void) { int i; - proc_net_remove(&init_net, "rose"); - proc_net_remove(&init_net, "rose_neigh"); - proc_net_remove(&init_net, "rose_nodes"); - proc_net_remove(&init_net, "rose_routes"); + remove_proc_entry("rose", init_net.proc_net); + remove_proc_entry("rose_neigh", init_net.proc_net); + remove_proc_entry("rose_nodes", init_net.proc_net); + remove_proc_entry("rose_routes", init_net.proc_net); rose_loopback_clear(); rose_rt_free(); diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 0d3103c4f11c..23dcef12b986 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -4,7 +4,7 @@ config AF_RXRPC tristate "RxRPC session sockets" - depends on INET && EXPERIMENTAL + depends on INET select CRYPTO select KEYS help diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 05996d0dd828..e61aa6001c65 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -10,6 +10,7 @@ */ #include <linux/module.h> +#include <linux/kernel.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/skbuff.h> @@ -792,10 +793,9 @@ static const struct net_proto_family rxrpc_family_ops = { */ static int __init af_rxrpc_init(void) { - struct sk_buff *dummy_skb; int ret = -1; - BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); rxrpc_epoch = htonl(get_seconds()); @@ -839,8 +839,9 @@ static int __init af_rxrpc_init(void) } #ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "rxrpc_calls", 0, &rxrpc_call_seq_fops); - proc_net_fops_create(&init_net, "rxrpc_conns", 0, &rxrpc_connection_seq_fops); + proc_create("rxrpc_calls", 0, init_net.proc_net, &rxrpc_call_seq_fops); + proc_create("rxrpc_conns", 0, init_net.proc_net, + &rxrpc_connection_seq_fops); #endif return 0; @@ -878,8 +879,8 @@ static void __exit af_rxrpc_exit(void) _debug("flush scheduled work"); flush_workqueue(rxrpc_workqueue); - proc_net_remove(&init_net, "rxrpc_conns"); - proc_net_remove(&init_net, "rxrpc_calls"); + remove_proc_entry("rxrpc_conns", init_net.proc_net); + remove_proc_entry("rxrpc_calls", init_net.proc_net); destroy_workqueue(rxrpc_workqueue); kmem_cache_destroy(rxrpc_call_jar); _leave(""); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 65d240cbf74b..fd7072827a40 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -485,8 +485,9 @@ errout: return err; } -struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) +struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind) { struct tc_action *a; struct tc_action_ops *a_o; @@ -542,9 +543,9 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); else - err = a_o->init(nla, est, a, ovr, bind); + err = a_o->init(net, nla, est, a, ovr, bind); if (err < 0) goto err_free; @@ -566,8 +567,9 @@ err_out: return ERR_PTR(err); } -struct 
tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) +struct tc_action *tcf_action_init(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *head = NULL, *act, *act_prev = NULL; @@ -579,7 +581,7 @@ struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, return ERR_PTR(err); for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(tb[i], est, name, ovr, bind); + act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); if (IS_ERR(act)) goto err; act->order = i; @@ -960,7 +962,7 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct tc_action *a; u32 seq = n->nlmsg_seq; - act = tcf_action_init(nla, NULL, NULL, ovr, 0); + act = tcf_action_init(net, nla, NULL, NULL, ovr, 0); if (act == NULL) goto done; if (IS_ERR(act)) { @@ -980,7 +982,7 @@ done: return ret; } -static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ACT_MAX + 1]; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 2c8ad7c86e43..3a4c0caa1f7d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -51,7 +51,7 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, +static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_CSUM_MAX + 1]; @@ -166,15 +166,17 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct icmp6hdr *icmp6h; + const struct ipv6hdr *ip6h; icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); if (icmp6h == NULL) return 0; + ip6h = ipv6_hdr(skb); icmp6h->icmp6_cksum = 0; skb->csum = csum_partial(icmp6h, ipl - ihl, 0); icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -186,15 +188,17 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, return 1; } -static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct tcphdr *tcph; + const struct iphdr *iph; tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; + iph = ip_hdr(skb); tcph->check = 0; skb->csum = csum_partial(tcph, ipl - ihl, 0); tcph->check = tcp_v4_check(ipl - ihl, @@ -205,15 +209,17 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, return 1; } -static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct tcphdr *tcph; + const struct ipv6hdr *ip6h; tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; + ip6h = ipv6_hdr(skb); tcph->check = 0; skb->csum = csum_partial(tcph, ipl - ihl, 0); tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -225,10 +231,11 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, return 1; } -static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, +static int 
tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl, int udplite) { struct udphdr *udph; + const struct iphdr *iph; u16 ul; /* @@ -242,6 +249,7 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, if (udph == NULL) return 0; + iph = ip_hdr(skb); ul = ntohs(udph->len); if (udplite || udph->check) { @@ -276,10 +284,11 @@ ignore_obscure_skb: return 1; } -static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl, int udplite) { struct udphdr *udph; + const struct ipv6hdr *ip6h; u16 ul; /* @@ -293,6 +302,7 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, if (udph == NULL) return 0; + ip6h = ipv6_hdr(skb); ul = ntohs(udph->len); udph->check = 0; @@ -328,7 +338,7 @@ ignore_obscure_skb: static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) { - struct iphdr *iph; + const struct iphdr *iph; int ntkoff; ntkoff = skb_network_offset(skb); @@ -353,19 +363,19 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) break; case IPPROTO_TCP: if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) - if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4, + if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4, ntohs(iph->tot_len))) goto fail; break; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) - if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, ntohs(iph->tot_len), 0)) goto fail; break; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) - if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, ntohs(iph->tot_len), 1)) goto fail; break; @@ -377,7 +387,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto fail; - ip_send_check(iph); + ip_send_check(ip_hdr(skb)); } return 1; @@ -456,6 +466,7 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) ixhl = ipv6_optlen(ip6xh); if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); if ((nexthdr == NEXTHDR_HOP) && !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) goto fail; @@ -464,25 +475,25 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) break; case IPPROTO_ICMPV6: if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) - if (!tcf_csum_ipv6_icmp(skb, ip6h, + if (!tcf_csum_ipv6_icmp(skb, hl, pl + sizeof(*ip6h))) goto fail; goto done; case IPPROTO_TCP: if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) - if (!tcf_csum_ipv6_tcp(skb, ip6h, + if (!tcf_csum_ipv6_tcp(skb, hl, pl + sizeof(*ip6h))) goto fail; goto done; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) - if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + if (!tcf_csum_ipv6_udp(skb, hl, pl + sizeof(*ip6h), 0)) goto fail; goto done; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) - if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + if (!tcf_csum_ipv6_udp(skb, hl, pl + sizeof(*ip6h), 1)) goto fail; goto done; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 05d60859d8e3..fd2b3cff5fa2 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -58,8 +58,9 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, }; -static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_gact_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int 
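Dropping the iph/ip6h parameters from the tcf_csum helpers is more than a cleanup: tcf_csum_skb_nextlayer() can trigger a pskb_may_pull(), which may reallocate skb->data and leave any previously saved header pointer dangling. Re-reading the header with ip_hdr()/ipv6_hdr() after the helper returns, as the hunks above do, avoids that. Condensed to the TCP/IPv4 case (a sketch of the same pattern, not a new helper):

/* Sketch mirroring the patched helper above. */
static int example_ipv4_tcp(struct sk_buff *skb, unsigned int ihl,
                            unsigned int ipl)
{
        struct tcphdr *tcph;
        const struct iphdr *iph;

        tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
        if (tcph == NULL)
                return 0;

        iph = ip_hdr(skb);      /* re-read: skb->data may have moved */
        tcph->check = 0;
        skb->csum = csum_partial(tcph, ipl - ihl, 0);
        tcph->check = tcp_v4_check(ipl - ihl, iph->saddr, iph->daddr,
                                   skb->csum);
        return 1;
}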
ovr, int bind) { struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 58fb3c7aab9e..60d88b6b9560 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Copyright: Jamal Hadi Salim (2002-4) + * Copyright: Jamal Hadi Salim (2002-13) */ #include <linux/types.h> @@ -102,7 +102,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, +static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; @@ -207,10 +207,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_ipt *ipt = a->priv; struct xt_action_param par; - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return TC_ACT_UNSPEC; - } + if (skb_unclone(skb, GFP_ATOMIC)) + return TC_ACT_UNSPEC; spin_lock(&ipt->tcf_lock); @@ -305,17 +303,44 @@ static struct tc_action_ops act_ipt_ops = { .walk = tcf_generic_walker }; -MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +static struct tc_action_ops act_xt_ops = { + .kind = "xt", + .hinfo = &ipt_hash_info, + .type = TCA_ACT_IPT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_ipt, + .dump = tcf_ipt_dump, + .cleanup = tcf_ipt_cleanup, + .lookup = tcf_hash_search, + .init = tcf_ipt_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); MODULE_DESCRIPTION("Iptables target actions"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("act_xt"); static int __init ipt_init_module(void) { - return tcf_register_action(&act_ipt_ops); + int ret1, ret2; + ret1 = tcf_register_action(&act_xt_ops); + if (ret1 < 0) + printk("Failed to load xt action\n"); + ret2 = tcf_register_action(&act_ipt_ops); + if (ret2 < 0) + printk("Failed to load ipt action\n"); + + if (ret1 < 0 && ret2 < 0) + return ret1; + else + return 0; } static void __exit ipt_cleanup_module(void) { + tcf_unregister_action(&act_xt_ops); tcf_unregister_action(&act_ipt_ops); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 9c0fd0c78814..5d676edc22a6 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -62,8 +62,9 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; -static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_mirred_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; @@ -88,7 +89,7 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, return -EINVAL; } if (parm->ifindex) { - dev = __dev_get_by_index(&init_net, parm->ifindex); + dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) return -ENODEV; switch (dev->type) { diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b5d029eb44f2..876f0ef29694 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -44,7 +44,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; -static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, +static int tcf_nat_init(struct net *net, 
struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_NAT_MAX + 1]; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 45c53ab067a6..7ed78c9e505c 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -38,8 +38,9 @@ static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; -static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_pedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; @@ -130,8 +131,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, int i, munged = 0; unsigned int off; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return p->tcf_action; off = skb_network_offset(skb); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a9de23297d47..823463adbd21 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,8 +22,23 @@ #include <net/act_api.h> #include <net/netlink.h> -#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L) -#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L) +struct tcf_police { + struct tcf_common common; + int tcfp_result; + u32 tcfp_ewma_rate; + s64 tcfp_burst; + u32 tcfp_mtu; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_mtu_ptoks; + s64 tcfp_t_c; + struct psched_ratecfg rate; + bool rate_present; + struct psched_ratecfg peak; + bool peak_present; +}; +#define to_police(pc) \ + container_of(pc, struct tcf_police, common) #define POL_TAB_MASK 15 static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; @@ -108,10 +123,6 @@ static void tcf_police_destroy(struct tcf_police *p) write_unlock_bh(&police_lock); gen_kill_estimator(&p->tcf_bstats, &p->tcf_rate_est); - if (p->tcfp_R_tab) - qdisc_put_rtab(p->tcfp_R_tab); - if (p->tcfp_P_tab) - qdisc_put_rtab(p->tcfp_P_tab); /* * gen_estimator est_timer() might access p->tcf_lock * or bstats, wait a RCU grace period before freeing p @@ -130,8 +141,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, }; -static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_act_police_locate(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { unsigned int h; int ret = 0, err; @@ -211,26 +223,36 @@ override: } /* No failure allowed after this point */ - if (R_tab != NULL) { - qdisc_put_rtab(police->tcfp_R_tab); - police->tcfp_R_tab = R_tab; + police->tcfp_mtu = parm->mtu; + if (police->tcfp_mtu == 0) { + police->tcfp_mtu = ~0; + if (R_tab) + police->tcfp_mtu = 255 << R_tab->rate.cell_log; + } + if (R_tab) { + police->rate_present = true; + psched_ratecfg_precompute(&police->rate, R_tab->rate.rate); + qdisc_put_rtab(R_tab); + } else { + police->rate_present = false; } - if (P_tab != NULL) { - qdisc_put_rtab(police->tcfp_P_tab); - police->tcfp_P_tab = P_tab; + if (P_tab) { + police->peak_present = true; + psched_ratecfg_precompute(&police->peak, P_tab->rate.rate); + qdisc_put_rtab(P_tab); + } else { + police->peak_present = false; } if (tb[TCA_POLICE_RESULT]) police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - police->tcfp_toks = police->tcfp_burst = parm->burst; - police->tcfp_mtu = parm->mtu; - if (police->tcfp_mtu == 0) 
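Both act_ipt and act_pedit now use skb_unclone(), which folds the clone test and the head expansion into a single call. Its behavior is equivalent to the open-coded sequence it replaces (shown for comparison; use the real helper in new code):

/* Equivalent to skb_unclone(skb, pri). */
static inline int example_unclone(struct sk_buff *skb, gfp_t pri)
{
        if (skb_cloned(skb))
                return pskb_expand_head(skb, 0, 0, pri);
        return 0;
}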
{ - police->tcfp_mtu = ~0; - if (police->tcfp_R_tab) - police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log; + police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); + police->tcfp_toks = police->tcfp_burst; + if (police->peak_present) { + police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak, + police->tcfp_mtu); + police->tcfp_ptoks = police->tcfp_mtu_ptoks; } - if (police->tcfp_P_tab) - police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu); police->tcf_action = parm->action; if (tb[TCA_POLICE_AVRATE]) @@ -240,7 +262,7 @@ override: if (ret != ACT_P_CREATED) return ret; - police->tcfp_t_c = psched_get_time(); + police->tcfp_t_c = ktime_to_ns(ktime_get()); police->tcf_index = parm->index ? parm->index : tcf_hash_new_index(&police_idx_gen, &police_hash_info); h = tcf_hash(police->tcf_index, POL_TAB_MASK); @@ -286,9 +308,9 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = a->priv; - psched_time_t now; - long toks; - long ptoks = 0; + s64 now; + s64 toks; + s64 ptoks = 0; spin_lock(&police->tcf_lock); @@ -304,24 +326,25 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, } if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { - if (police->tcfp_R_tab == NULL) { + if (!police->rate_present) { spin_unlock(&police->tcf_lock); return police->tcfp_result; } - now = psched_get_time(); - toks = psched_tdiff_bounded(now, police->tcfp_t_c, - police->tcfp_burst); - if (police->tcfp_P_tab) { + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - police->tcfp_t_c, + police->tcfp_burst); + if (police->peak_present) { ptoks = toks + police->tcfp_ptoks; - if (ptoks > (long)L2T_P(police, police->tcfp_mtu)) - ptoks = (long)L2T_P(police, police->tcfp_mtu); - ptoks -= L2T_P(police, qdisc_pkt_len(skb)); + if (ptoks > police->tcfp_mtu_ptoks) + ptoks = police->tcfp_mtu_ptoks; + ptoks -= (s64) psched_l2t_ns(&police->peak, + qdisc_pkt_len(skb)); } toks += police->tcfp_toks; - if (toks > (long)police->tcfp_burst) + if (toks > police->tcfp_burst) toks = police->tcfp_burst; - toks -= L2T(police, qdisc_pkt_len(skb)); + toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; @@ -347,15 +370,15 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) .index = police->tcf_index, .action = police->tcf_action, .mtu = police->tcfp_mtu, - .burst = police->tcfp_burst, + .burst = PSCHED_NS2TICKS(police->tcfp_burst), .refcnt = police->tcf_refcnt - ref, .bindcnt = police->tcf_bindcnt - bind, }; - if (police->tcfp_R_tab) - opt.rate = police->tcfp_R_tab->rate; - if (police->tcfp_P_tab) - opt.peakrate = police->tcfp_P_tab->rate; + if (police->rate_present) + opt.rate.rate = psched_ratecfg_getrate(&police->rate); + if (police->peak_present) + opt.peakrate.rate = psched_ratecfg_getrate(&police->peak); if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (police->tcfp_result && diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 3714f60f0b3c..7725eb4ab756 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -95,8 +95,9 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; -static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_simp_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int 
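With the rate tables gone, the policer accounts tokens directly in nanoseconds: credit accrues as wall-clock time since the last check-point (capped at the burst), and each packet costs psched_l2t_ns() nanoseconds at the configured rate. The conforming test from tcf_act_police(), reduced to the single-rate case:

/* Hedged sketch; the real code also updates state and the peak bucket. */
static bool example_conforms(struct tcf_police *p, unsigned int len, s64 now)
{
        s64 toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst);

        toks += p->tcfp_toks;
        if (toks > p->tcfp_burst)
                toks = p->tcfp_burst;
        toks -= (s64) psched_l2t_ns(&p->rate, len);
        return toks >= 0;
}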
ovr, int bind) { struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 476e0fac6712..cb4221171f93 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -67,8 +67,9 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, }; -static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_skbedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ff55ed6c49b2..8e118af90973 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -22,7 +22,6 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> -#include <linux/netlink.h> #include <linux/err.h> #include <linux/slab.h> #include <net/net_namespace.h> @@ -118,7 +117,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) /* Add/change/delete/get a filter node */ -static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; @@ -141,7 +140,12 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN)) return -EPERM; + replay: + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); + if (err < 0) + return err; + t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); @@ -164,10 +168,6 @@ replay: if (dev == NULL) return -ENODEV; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); - if (err < 0) - return err; - /* Find qdisc */ if (!parent) { q = dev->qdisc; @@ -321,7 +321,7 @@ replay: } } - err = tp->ops->change(skb, tp, cl, t->tcm_handle, tca, &fh); + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh); if (err == 0) { if (tp_created) { spin_lock_bh(root_lock); @@ -427,7 +427,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) const struct Qdisc_class_ops *cops; struct tcf_dump_args arg; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) @@ -508,7 +508,7 @@ void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_destroy); -int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, +int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, const struct tcf_ext_map *map) { @@ -519,7 +519,7 @@ int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, struct tc_action *act; if (map->police && tb[map->police]) { - act = tcf_action_init_1(tb[map->police], rate_tlv, + act = tcf_action_init_1(net, tb[map->police], rate_tlv, "police", TCA_ACT_NOREPLACE, TCA_ACT_BIND); if (IS_ERR(act)) @@ -528,8 +528,9 @@ int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, act->type = TCA_OLD_COMPAT; exts->action = act; } else if (map->action && tb[map->action]) { - act = tcf_action_init(tb[map->action], rate_tlv, NULL, - TCA_ACT_NOREPLACE, TCA_ACT_BIND); + act = tcf_action_init(net, tb[map->action], rate_tlv, + NULL, TCA_ACT_NOREPLACE, + TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); diff 
--git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 344a11b342e5..d76a35d0dc85 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -132,15 +132,16 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, }; -static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, - unsigned long base, struct nlattr **tb, +static int basic_set_parms(struct net *net, struct tcf_proto *tp, + struct basic_filter *f, unsigned long base, + struct nlattr **tb, struct nlattr *est) { int err = -EINVAL; struct tcf_exts e; struct tcf_ematch_tree t; - err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &basic_ext_map); if (err < 0) return err; @@ -162,7 +163,7 @@ errout: return err; } -static int basic_change(struct sk_buff *in_skb, +static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { @@ -182,7 +183,7 @@ static int basic_change(struct sk_buff *in_skb, if (f != NULL) { if (handle && f->handle != handle) return -EINVAL; - return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); } err = -ENOBUFS; @@ -208,7 +209,7 @@ static int basic_change(struct sk_buff *in_skb, f->handle = head->hgenerator; } - err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); if (err < 0) goto errout; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 6db7855b9029..3a294eb98d61 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -178,7 +178,7 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; -static int cls_cgroup_change(struct sk_buff *in_skb, +static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) @@ -215,7 +215,8 @@ static int cls_cgroup_change(struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, + &cgroup_ext_map); if (err < 0) return err; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index ce82d0cb1b47..7881e2fccbc2 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -351,7 +351,7 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; -static int flow_change(struct sk_buff *in_skb, +static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) @@ -393,11 +393,11 @@ static int flow_change(struct sk_buff *in_skb, return -EOPNOTSUPP; if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && - sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns) + sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) return -EOPNOTSUPP; } - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &flow_ext_map); if (err < 0) return err; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 4075a0aef2aa..9b97172db84a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -192,7 +192,7 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { }; static int -fw_change_attrs(struct tcf_proto *tp, struct 
fw_filter *f, +fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, struct nlattr **tca, unsigned long base) { struct fw_head *head = (struct fw_head *)tp->root; @@ -200,11 +200,10 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, u32 mask; int err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &fw_ext_map); if (err < 0) return err; - err = -EINVAL; if (tb[TCA_FW_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); tcf_bind_filter(tp, &f->res, base); @@ -218,6 +217,7 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, } #endif /* CONFIG_NET_CLS_IND */ + err = -EINVAL; if (tb[TCA_FW_MASK]) { mask = nla_get_u32(tb[TCA_FW_MASK]); if (mask != head->mask) @@ -233,7 +233,7 @@ errout: return err; } -static int fw_change(struct sk_buff *in_skb, +static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -255,7 +255,7 @@ static int fw_change(struct sk_buff *in_skb, if (f != NULL) { if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(tp, f, tb, tca, base); + return fw_change_attrs(net, tp, f, tb, tca, base); } if (!handle) @@ -282,7 +282,7 @@ static int fw_change(struct sk_buff *in_skb, f->id = handle; - err = fw_change_attrs(tp, f, tb, tca, base); + err = fw_change_attrs(net, tp, f, tb, tca, base); if (err < 0) goto errout; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index c10d57bf98f2..37da567d833e 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -335,9 +335,10 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, }; -static int route4_set_parms(struct tcf_proto *tp, unsigned long base, - struct route4_filter *f, u32 handle, struct route4_head *head, - struct nlattr **tb, struct nlattr *est, int new) +static int route4_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct route4_filter *f, + u32 handle, struct route4_head *head, + struct nlattr **tb, struct nlattr *est, int new) { int err; u32 id = 0, to = 0, nhandle = 0x8000; @@ -346,7 +347,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, struct route4_bucket *b; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &route_ext_map); if (err < 0) return err; @@ -427,7 +428,7 @@ errout: return err; } -static int route4_change(struct sk_buff *in_skb, +static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -457,7 +458,7 @@ static int route4_change(struct sk_buff *in_skb, if (f->bkt) old_handle = f->handle; - err = route4_set_parms(tp, base, f, handle, head, tb, + err = route4_set_parms(net, tp, base, f, handle, head, tb, tca[TCA_RATE], 0); if (err < 0) return err; @@ -480,7 +481,7 @@ static int route4_change(struct sk_buff *in_skb, if (f == NULL) goto errout; - err = route4_set_parms(tp, base, f, handle, head, tb, + err = route4_set_parms(net, tp, base, f, handle, head, tb, tca[TCA_RATE], 1); if (err < 0) goto errout; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 494bbb90924a..252d8b05872e 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -416,7 +416,7 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) 
}, }; -static int rsvp_change(struct sk_buff *in_skb, +static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -440,7 +440,7 @@ static int rsvp_change(struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); if (err < 0) return err; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index a1293b4ab7a1..b86535a40169 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -197,9 +197,10 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { }; static int -tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, - struct tcindex_data *p, struct tcindex_filter_result *r, - struct nlattr **tb, struct nlattr *est) +tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, + u32 handle, struct tcindex_data *p, + struct tcindex_filter_result *r, struct nlattr **tb, + struct nlattr *est) { int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; @@ -208,7 +209,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &tcindex_ext_map); if (err < 0) return err; @@ -332,7 +333,7 @@ errout: } static int -tcindex_change(struct sk_buff *in_skb, +tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { @@ -353,7 +354,8 @@ tcindex_change(struct sk_buff *in_skb, if (err < 0) return err; - return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]); + return tcindex_set_parms(net, tp, base, handle, p, r, tb, + tca[TCA_RATE]); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c7c27bc91b5a..eb07a1e536e6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -488,15 +488,15 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, }; -static int u32_set_parms(struct tcf_proto *tp, unsigned long base, - struct tc_u_hnode *ht, +static int u32_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct tc_u_hnode *ht, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est) { int err; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &u32_ext_map); if (err < 0) return err; @@ -544,7 +544,7 @@ errout: return err; } -static int u32_change(struct sk_buff *in_skb, +static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) @@ -570,7 +570,8 @@ static int u32_change(struct sk_buff *in_skb, if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]); + return u32_set_parms(net, tp, base, n->ht_up, n, tb, + tca[TCA_RATE]); } if (tb[TCA_U32_DIVISOR]) { @@ -656,7 +657,7 @@ static int u32_change(struct sk_buff *in_skb, } #endif - err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE]); if (err == 0) { struct tc_u_knode **ins; for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) diff --git 
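The common thread through the cls_* hunks is that every classifier ->change() and the shared tcf_exts_validate() now carry the caller's struct net down to action creation, so actions resolve devices and state in the correct namespace rather than init_net (the mirred hunk earlier is the payoff). Schematically:

/* Illustrative call chain only; names abbreviated: */
tc_ctl_tfilter(skb, n)                    /* net = sock_net(skb->sk) */
  -> tp->ops->change(net, skb, tp, ...)   /* e.g. u32_change()       */
    -> tcf_exts_validate(net, tp, ...)
      -> tcf_action_init_1(net, nla, ...)
        -> a_o->init(net, ...)            /* e.g. tcf_mirred_init(): */
                                          /* __dev_get_by_index(net) */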
a/net/sched/em_ipset.c b/net/sched/em_ipset.c index 3130320997e2..938b7cbf5627 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -83,7 +83,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, opt.dim = set->dim; opt.flags = set->flags; opt.cmdflags = 0; - opt.timeout = ~0u; + opt.ext.timeout = ~0u; network_offset = skb_network_offset(skb); skb_pull(skb, network_offset); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d84f7e734cd7..2b935e7cfe7b 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -493,7 +493,7 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) @@ -502,10 +502,10 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) qdisc_throttled(wd->qdisc); hrtimer_start(&wd->timer, - ns_to_ktime(PSCHED_TICKS2NS(expires)), + ns_to_ktime(expires), HRTIMER_MODE_ABS); } -EXPORT_SYMBOL(qdisc_watchdog_schedule); +EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { @@ -545,7 +545,7 @@ static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n) void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) { struct Qdisc_class_common *cl; - struct hlist_node *n, *next; + struct hlist_node *next; struct hlist_head *nhash, *ohash; unsigned int nsize, nmask, osize; unsigned int i, h; @@ -564,7 +564,7 @@ void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) sch_tree_lock(sch); for (i = 0; i < osize; i++) { - hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) { + hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { h = qdisc_class_hash(cl->classid, nmask); hlist_add_head(&cl->hnode, &nhash[h]); } @@ -971,13 +971,13 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. */ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; - u32 clid = tcm->tcm_parent; + u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; int err; @@ -985,14 +985,15 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN)) return -EPERM; - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); if (err < 0) return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { @@ -1038,7 +1039,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) * Create/change qdisc. */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm; @@ -1053,6 +1054,10 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) replay: /* Reinit, just in case something touches this. 
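qdisc_watchdog_schedule() becomes a nanosecond-based primitive here. Callers still holding psched ticks can convert at the boundary; a trivial wrapper of this shape (an assumption about how tick-based callers keep working, not part of the hunks shown) suffices:

/* Assumed compatibility wrapper: tick-based callers convert once. */
static inline void example_watchdog_schedule(struct qdisc_watchdog *wd,
                                             psched_time_t expires)
{
        qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires));
}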
*/ + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + tcm = nlmsg_data(n); clid = tcm->tcm_parent; q = p = NULL; @@ -1061,9 +1066,6 @@ replay: if (!dev) return -ENODEV; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); - if (err < 0) - return err; if (clid) { if (clid != TC_H_ROOT) { @@ -1372,7 +1374,7 @@ done: -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); @@ -1382,22 +1384,22 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) const struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; - u32 portid = tcm->tcm_parent; - u32 clid = tcm->tcm_handle; - u32 qid = TC_H_MAJ(clid); + u32 portid; + u32 clid; + u32 qid; int err; if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN)) return -EPERM; - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); if (err < 0) return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. @@ -1413,6 +1415,10 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Step 1. Determine qdisc handle X:0 */ + portid = tcm->tcm_parent; + clid = tcm->tcm_handle; + qid = TC_H_MAJ(clid); + if (portid != TC_H_ROOT) { u32 qid1 = TC_H_MAJ(portid); @@ -1636,7 +1642,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; int t, s_t; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return 0; dev = dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) @@ -1768,7 +1774,7 @@ static int __net_init psched_net_init(struct net *net) { struct proc_dir_entry *e; - e = proc_net_fops_create(net, "psched", 0, &psched_fops); + e = proc_create("psched", 0, net->proc_net, &psched_fops); if (e == NULL) return -ENOMEM; @@ -1777,7 +1783,7 @@ static int __net_init psched_net_init(struct net *net) static void __net_exit psched_net_exit(struct net *net) { - proc_net_remove(net, "psched"); + remove_proc_entry("psched", net->proc_net); } #else static int __net_init psched_net_init(struct net *net) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 0e19948470b8..1bc210ffcba2 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -962,8 +962,11 @@ cbq_dequeue(struct Qdisc *sch) cbq_update(q); if ((incr -= incr2) < 0) incr = 0; + q->now += incr; + } else { + if (now > q->now) + q->now = now; } - q->now += incr; q->now_rt = now; for (;;) { @@ -1041,14 +1044,13 @@ static void cbq_adjust_levels(struct cbq_class *this) static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) { struct cbq_class *cl; - struct hlist_node *n; unsigned int h; if (q->quanta[prio] == 0) return; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { /* BUGGGG... Beware! This expression suffer of * arithmetic overflows! 
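The rtnetlink handlers are also reordered so that nlmsg_parse() validates the message before any tcmsg field is trusted; previously tcm_parent/tcm_handle were read from an unparsed payload. After the change each handler follows the same validated-first shape (condensed sketch):

static int example_ctl(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tca[TCA_MAX + 1];
        struct tcmsg *tcm;
        struct net_device *dev;
        int err;

        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;             /* parse before trusting tcm */

        tcm = nlmsg_data(n);
        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;
        /* only now read tcm->tcm_parent, tcm->tcm_handle, ... */
        return 0;
}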
*/ @@ -1087,10 +1089,9 @@ static void cbq_sync_defmap(struct cbq_class *cl) continue; for (h = 0; h < q->clhash.hashsize; h++) { - struct hlist_node *n; struct cbq_class *c; - hlist_for_each_entry(c, n, &q->clhash.hash[h], + hlist_for_each_entry(c, &q->clhash.hash[h], common.hnode) { if (c->split == split && c->level < level && c->defmap & (1<<i)) { @@ -1210,7 +1211,6 @@ cbq_reset(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; - struct hlist_node *n; int prio; unsigned int h; @@ -1228,7 +1228,7 @@ cbq_reset(struct Qdisc *sch) q->active[prio] = NULL; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { qdisc_reset(cl->q); cl->next_alive = NULL; @@ -1697,7 +1697,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) static void cbq_destroy(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct cbq_class *cl; unsigned int h; @@ -1710,11 +1710,11 @@ static void cbq_destroy(struct Qdisc *sch) * be bound to classes which have been destroyed already. --TGR '04 */ for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) tcf_destroy_chain(&cl->filter_list); } for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], common.hnode) cbq_destroy_class(sch, cl); } @@ -2013,14 +2013,13 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; - struct hlist_node *n; unsigned int h; if (arg->stop) return; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index cc37dd52ecf9..ef53ab8d0aae 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -80,7 +80,7 @@ struct choke_sched_data { /* deliver a random number between 0 and N - 1 */ static u32 random_N(unsigned int N) { - return reciprocal_divide(random32(), N); + return reciprocal_divide(prandom_u32(), N); } /* number of elements in queue including holes */ diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 71e50c80315f..759b308d1a8d 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -293,14 +293,13 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -451,11 +450,10 @@ static void drr_reset_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) list_del(&cl->alist); qdisc_reset(cl->qdisc); @@ -468,13 +466,13 @@ static void 
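The recurring deletion of struct hlist_node *n across cbq, drr, hfsc, htb and qfq tracks the hlist iterator change in this kernel cycle: hlist_for_each_entry() now takes only the typed cursor, while the _safe variant still needs a spare next pointer. Schematically (the two forms cannot coexist; the old one is shown for contrast):

/* old */ hlist_for_each_entry(cl, n, head, common.hnode)
/* new */ hlist_for_each_entry(cl, head, common.hnode)
/* new, deletion-safe */
          hlist_for_each_entry_safe(cl, next, head, common.hnode)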
drr_destroy_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n, *next; + struct hlist_node *next; unsigned int i; tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) drr_destroy_class(sch, cl); } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4e606fcb2534..55786283a3df 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -195,7 +195,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow->deficit = q->quantum; flow->dropped = 0; } - if (++sch->q.qlen < sch->limit) + if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; q->drop_overlimit++; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5d81a4478514..eac7e0ee23c1 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,6 +25,7 @@ #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -896,3 +897,39 @@ void dev_shutdown(struct net_device *dev) WARN_ON(timer_pending(&dev->watchdog_timer)); } + +void psched_ratecfg_precompute(struct psched_ratecfg *r, u32 rate) +{ + u64 factor; + u64 mult; + int shift; + + r->rate_bps = (u64)rate << 3; + r->shift = 0; + r->mult = 1; + /* + * Calibrate mult, shift so that token counting is accurate + * for smallest packet size (64 bytes). Token (time in ns) is + * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will + * work as long as the smallest packet transfer time can be + * accurately represented in nanosec. + */ + if (r->rate_bps > 0) { + /* + * Higher shift gives better accuracy. Find the largest + * shift such that mult fits in 32 bits. 
+ */ + for (shift = 0; shift < 16; shift++) { + r->shift = shift; + factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); + mult = div64_u64(factor, r->rate_bps); + if (mult > UINT_MAX) + break; + } + + r->shift = shift - 1; + factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); + r->mult = div64_u64(factor, r->rate_bps); + } +} +EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 6c2ec4510540..9facea03faeb 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1389,7 +1389,6 @@ static void hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct hfsc_sched *q = qdisc_priv(sch); - struct hlist_node *n; struct hfsc_class *cl; unsigned int i; @@ -1397,7 +1396,7 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { if (arg->count < arg->skip) { arg->count++; @@ -1523,11 +1522,10 @@ hfsc_reset_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) hfsc_reset_class(cl); } q->eligible = RB_ROOT; @@ -1540,16 +1538,16 @@ static void hfsc_destroy_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct hfsc_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) tcf_destroy_chain(&cl->filter_list); } for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], cl_common.hnode) hfsc_destroy_class(sch, cl); } @@ -1564,12 +1562,11 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; struct hfsc_class *cl; - struct hlist_node *n; unsigned int i; sch->qstats.backlog = 0; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) sch->qstats.backlog += cl->qdisc->qstats.backlog; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 79e8ed4ac7ce..79b1876b6cd2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -38,6 +38,7 @@ #include <linux/workqueue.h> #include <linux/slab.h> #include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> /* HTB algorithm. 
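psched_ratecfg_precompute() reduces byte-to-nanosecond conversion to one multiply and shift: l2t_ns(len) = (len * mult) >> shift, with mult = 8 * NSEC_PER_SEC * 2^shift / rate_bps, choosing the largest shift for which mult still fits in 32 bits. A standalone userspace check of that arithmetic (the rate is an arbitrary example value):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        uint64_t rate_bps = 8ULL * 1000 * 1000; /* 1 Mbyte/s */
        uint32_t mult = 1;
        int shift;

        for (shift = 0; shift < 16; shift++) {
                uint64_t m = (8ULL * NSEC_PER_SEC << shift) / rate_bps;
                if (m > UINT32_MAX)
                        break;
                mult = (uint32_t)m;
        }
        shift--; /* largest shift whose mult fits in 32 bits */

        /* 1500 bytes at 1 Mbyte/s -> prints 1500000 ns (1.5 ms) */
        printf("%llu ns\n",
               (unsigned long long)(((uint64_t)1500 * mult) >> shift));
        return 0;
}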
@@ -71,12 +72,6 @@ enum htb_cmode { HTB_CAN_SEND /* class can send */ }; -struct htb_rate_cfg { - u64 rate_bps; - u32 mult; - u32 shift; -}; - /* interior & leaf nodes; props specific to leaves are marked L: */ struct htb_class { struct Qdisc_class_common common; @@ -124,8 +119,8 @@ struct htb_class { int filter_cnt; /* token bucket parameters */ - struct htb_rate_cfg rate; - struct htb_rate_cfg ceil; + struct psched_ratecfg rate; + struct psched_ratecfg ceil; s64 buffer, cbuffer; /* token bucket depth/rate */ psched_tdiff_t mbuffer; /* max wait time */ s64 tokens, ctokens; /* current number of tokens */ @@ -168,45 +163,6 @@ struct htb_sched { struct work_struct work; }; -static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len) -{ - return ((u64)len * r->mult) >> r->shift; -} - -static void htb_precompute_ratedata(struct htb_rate_cfg *r) -{ - u64 factor; - u64 mult; - int shift; - - r->shift = 0; - r->mult = 1; - /* - * Calibrate mult, shift so that token counting is accurate - * for smallest packet size (64 bytes). Token (time in ns) is - * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will - * work as long as the smallest packet transfer time can be - * accurately represented in nanosec. - */ - if (r->rate_bps > 0) { - /* - * Higher shift gives better accuracy. Find the largest - * shift such that mult fits in 32 bits. - */ - for (shift = 0; shift < 16; shift++) { - r->shift = shift; - factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); - mult = div64_u64(factor, r->rate_bps); - if (mult > UINT_MAX) - break; - } - - r->shift = shift - 1; - factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); - r->mult = div64_u64(factor, r->rate_bps); - } -} - /* find class in global hash table using given handle */ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) { @@ -632,7 +588,7 @@ static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff) if (toks > cl->buffer) toks = cl->buffer; - toks -= (s64) l2t_ns(&cl->rate, bytes); + toks -= (s64) psched_l2t_ns(&cl->rate, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; @@ -645,7 +601,7 @@ static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) if (toks > cl->cbuffer) toks = cl->cbuffer; - toks -= (s64) l2t_ns(&cl->ceil, bytes); + toks -= (s64) psched_l2t_ns(&cl->ceil, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; @@ -993,11 +949,10 @@ static void htb_reset(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->level) memset(&cl->un.inner, 0, sizeof(cl->un.inner)); else { @@ -1026,6 +981,7 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) }, [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, }; static void htb_work_func(struct work_struct *work) @@ -1039,7 +995,7 @@ static void htb_work_func(struct work_struct *work) static int htb_init(struct Qdisc *sch, struct nlattr *opt) { struct htb_sched *q = qdisc_priv(sch); - struct nlattr *tb[TCA_HTB_INIT + 1]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; int err; int i; @@ -1047,20 +1003,16 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err 
= nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy); + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); if (err < 0) return err; - if (tb[TCA_HTB_INIT] == NULL) { - pr_err("HTB: hey probably you have bad tc tool ?\n"); + if (!tb[TCA_HTB_INIT]) return -EINVAL; - } + gopt = nla_data(tb[TCA_HTB_INIT]); - if (gopt->version != HTB_VER >> 16) { - pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n", - HTB_VER >> 16, HTB_VER & 0xffff, gopt->version); + if (gopt->version != HTB_VER >> 16) return -EINVAL; - } err = qdisc_class_hash_init(&q->clhash); if (err < 0) @@ -1072,10 +1024,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) INIT_WORK(&q->work, htb_work_func); skb_queue_head_init(&q->direct_queue); - q->direct_qlen = qdisc_dev(sch)->tx_queue_len; - if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ - q->direct_qlen = 2; - + if (tb[TCA_HTB_DIRECT_QLEN]) + q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); + else { + q->direct_qlen = qdisc_dev(sch)->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + } if ((q->rate2quantum = gopt->rate2quantum) < 1) q->rate2quantum = 1; q->defcls = gopt->defcls; @@ -1101,7 +1056,8 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt)) + if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || + nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) goto nla_put_failure; nla_nest_end(skb, nest); @@ -1134,9 +1090,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, memset(&opt, 0, sizeof(opt)); - opt.rate.rate = cl->rate.rate_bps >> 3; + opt.rate.rate = psched_ratecfg_getrate(&cl->rate); opt.buffer = PSCHED_NS2TICKS(cl->buffer); - opt.ceil.rate = cl->ceil.rate_bps >> 3; + opt.ceil.rate = psched_ratecfg_getrate(&cl->ceil); opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer); opt.quantum = cl->quantum; opt.prio = cl->prio; @@ -1262,7 +1218,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) static void htb_destroy(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct htb_class *cl; unsigned int i; @@ -1276,11 +1232,11 @@ static void htb_destroy(struct Qdisc *sch) tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) tcf_destroy_chain(&cl->filter_list); } for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) htb_destroy_class(sch, cl); } @@ -1356,7 +1312,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[__TCA_HTB_MAX]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; /* extract all subattrs from opt attr */ @@ -1459,8 +1415,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->parent = parent; /* set class to be in HTB_CAN_SEND state */ - cl->tokens = hopt->buffer; - cl->ctokens = hopt->cbuffer; + cl->tokens = PSCHED_TICKS2NS(hopt->buffer); + cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC; /* 1min */ cl->t_c = 
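HTB's buffer/cbuffer move from psched ticks to nanoseconds through PSCHED_TICKS2NS()/PSCHED_NS2TICKS(). Assuming the pkt_sched.h definitions with PSCHED_SHIFT == 6, one tick is 64 ns (stated as an assumption; the macros are not shown in these hunks):

/* Assumed: PSCHED_TICKS2NS(x) == ((s64)(x) << 6),
 *          PSCHED_NS2TICKS(x) == ((x) >> 6);
 * e.g. a 10000-tick buffer from userspace becomes 640000 ns of credit.
 */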
psched_get_time(); cl->cmode = HTB_CAN_SEND; @@ -1503,17 +1459,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - cl->buffer = hopt->buffer; - cl->cbuffer = hopt->cbuffer; - - cl->rate.rate_bps = (u64)hopt->rate.rate << 3; - cl->ceil.rate_bps = (u64)hopt->ceil.rate << 3; - - htb_precompute_ratedata(&cl->rate); - htb_precompute_ratedata(&cl->ceil); + psched_ratecfg_precompute(&cl->rate, hopt->rate.rate); + psched_ratecfg_precompute(&cl->ceil, hopt->ceil.rate); - cl->buffer = hopt->buffer << PSCHED_SHIFT; - cl->cbuffer = hopt->buffer << PSCHED_SHIFT; + cl->buffer = PSCHED_TICKS2NS(hopt->buffer); + cl->cbuffer = PSCHED_TICKS2NS(hopt->buffer); sch_tree_unlock(sch); @@ -1566,14 +1516,13 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6ed37652a4c3..d51852bba01c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -276,9 +276,8 @@ static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, u32 lmax, u32 weight) { struct qfq_aggregate *agg; - struct hlist_node *n; - hlist_for_each_entry(agg, n, &q->nonfull_aggs, nonfull_next) + hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) if (agg->lmax == lmax && agg->class_weight == weight) return agg; @@ -299,6 +298,10 @@ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, new_num_classes == q->max_agg_classes - 1) /* agg no more full */ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + /* The next assignment may let + * agg->initial_budget > agg->budgetmax + * hold, we will take it into account in charge_actual_service(). + */ agg->budgetmax = new_num_classes * agg->lmax; new_agg_weight = agg->class_weight * new_num_classes; agg->inv_w = ONE_FP/new_agg_weight; @@ -670,14 +673,13 @@ static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -819,7 +821,7 @@ static void qfq_make_eligible(struct qfq_sched *q) unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { - unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; + unsigned long mask = (1ULL << fls(vslot ^ old_vslot)) - 1; qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } @@ -990,12 +992,23 @@ static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, /* Update F according to the actual service received by the aggregate. 
*/ static inline void charge_actual_service(struct qfq_aggregate *agg) { - /* compute the service received by the aggregate */ - u32 service_received = agg->initial_budget - agg->budget; + /* Compute the service received by the aggregate, taking into + * account that, after decreasing the number of classes in + * agg, it may happen that + * agg->initial_budget - agg->budget > agg->budgetmax + */ + u32 service_received = min(agg->budgetmax, + agg->initial_budget - agg->budget); agg->F = agg->S + (u64)service_received * agg->inv_w; } +static inline void qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, + enum update_reason reason); + +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); + static struct sk_buff *qfq_dequeue(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); @@ -1023,7 +1036,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) in_serv_agg->initial_budget = in_serv_agg->budget = in_serv_agg->budgetmax; - if (!list_empty(&in_serv_agg->active)) + if (!list_empty(&in_serv_agg->active)) { /* * Still active: reschedule for * service. Possible optimization: if no other * handle it, we would need to maintain an * extra num_active_aggs field. */ - qfq_activate_agg(q, in_serv_agg, requeue); - else if (sch->q.qlen == 0) { /* no aggregate to serve */ + qfq_update_agg_ts(q, in_serv_agg, requeue); + qfq_schedule_agg(q, in_serv_agg); + } else if (sch->q.qlen == 0) { /* no aggregate to serve */ q->in_serv_agg = NULL; return NULL; } @@ -1054,7 +1068,15 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) qdisc_bstats_update(sch, skb); agg_dequeue(in_serv_agg, cl, len); - in_serv_agg->budget -= len; + /* If lmax is lowered, through qfq_change_class, for a class + * owning pending packets with larger size than the new value + * of lmax, then the following condition may hold. + */ + if (unlikely(in_serv_agg->budget < len)) + in_serv_agg->budget = 0; + else + in_serv_agg->budget -= len; + q->V += (u64)len * IWSUM; pr_debug("qfq dequeue: len %u F %lld now %lld\n", len, (unsigned long long) in_serv_agg->F, @@ -1219,17 +1241,11 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl->deficit = agg->lmax; list_add_tail(&cl->alist, &agg->active); - if (list_first_entry(&agg->active, struct qfq_class, alist) != cl) - return err; /* aggregate was not empty, nothing else to do */ + if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || + q->in_serv_agg == agg) + return err; /* non-empty or in service, nothing else to do */ - /* recharge budget */ - agg->initial_budget = agg->budget = agg->budgetmax; - - qfq_update_agg_ts(q, agg, enqueue); - if (q->in_serv_agg == NULL) - q->in_serv_agg = agg; - else if (agg != q->in_serv_agg) - qfq_schedule_agg(q, agg); + qfq_activate_agg(q, agg, enqueue); return err; } @@ -1263,7 +1279,8 @@ static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); - } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && + q->in_serv_agg == NULL) q->V = roundedS; grp->S = roundedS; @@ -1286,8 +1303,15 @@ skip_update: static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, enum update_reason reason) { + agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. 
*/ + qfq_update_agg_ts(q, agg, reason); - qfq_schedule_agg(q, agg); + if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ + q->in_serv_agg = agg; /* start serving this aggregate */ + /* update V: to be in service, agg must be eligible */ + q->oldV = q->V = agg->S; + } else if (agg != q->in_serv_agg) + qfq_schedule_agg(q, agg); } static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, @@ -1359,8 +1383,6 @@ static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) __set_bit(grp->index, &q->bitmaps[s]); } } - - qfq_update_eligible(q); } static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) @@ -1376,11 +1398,10 @@ static unsigned int qfq_drop_from_slot(struct qfq_sched *q, struct hlist_head *slot) { struct qfq_aggregate *agg; - struct hlist_node *n; struct qfq_class *cl; unsigned int len; - hlist_for_each_entry(agg, n, slot, next) { + hlist_for_each_entry(agg, slot, next) { list_for_each_entry(cl, &agg->active, alist) { if (!cl->qdisc->ops->drop) @@ -1459,11 +1480,10 @@ static void qfq_reset_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen > 0) qfq_deactivate_class(q, cl); @@ -1477,13 +1497,13 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - struct hlist_node *n, *next; + struct hlist_node *next; unsigned int i; tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { qfq_destroy_class(sch, cl); } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 4b056c15e90c..c8388f3c3426 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> @@ -100,23 +101,21 @@ struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ - u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ - u32 mtu; + s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + s64 mtu; u32 max_size; - struct qdisc_rate_table *R_tab; - struct qdisc_rate_table *P_tab; + struct psched_ratecfg rate; + struct psched_ratecfg peak; + bool peak_present; /* Variables */ - long tokens; /* Current number of B tokens */ - long ptokens; /* Current number of P tokens */ - psched_time_t t_c; /* Time check-point */ + s64 tokens; /* Current number of B tokens */ + s64 ptokens; /* Current number of P tokens */ + s64 t_c; /* Time check-point */ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ struct qdisc_watchdog watchdog; /* Watchdog timer */ }; -#define L2T(q, L) qdisc_l2t((q)->R_tab, L) -#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L) - static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -156,24 +155,24 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) skb = q->qdisc->ops->peek(q->qdisc); if (skb) { - psched_time_t now; - long toks; - long ptoks = 0; + s64 now; + s64 toks; + s64 ptoks = 0; unsigned int len = qdisc_pkt_len(skb); - now = psched_get_time(); - toks = psched_tdiff_bounded(now, q->t_c, q->buffer); + now = 
ktime_to_ns(ktime_get()); + toks = min_t(s64, now - q->t_c, q->buffer); - if (q->P_tab) { + if (q->peak_present) { ptoks = toks + q->ptokens; - if (ptoks > (long)q->mtu) + if (ptoks > q->mtu) ptoks = q->mtu; - ptoks -= L2T_P(q, len); + ptoks -= (s64) psched_l2t_ns(&q->peak, len); } toks += q->tokens; - if (toks > (long)q->buffer) + if (toks > q->buffer) toks = q->buffer; - toks -= L2T(q, len); + toks -= (s64) psched_l2t_ns(&q->rate, len); if ((toks|ptoks) >= 0) { skb = qdisc_dequeue_peeked(q->qdisc); @@ -189,8 +188,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) return skb; } - qdisc_watchdog_schedule(&q->watchdog, - now + max_t(long, -toks, -ptoks)); + qdisc_watchdog_schedule_ns(&q->watchdog, + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -214,7 +213,7 @@ static void tbf_reset(struct Qdisc *sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - q->t_c = psched_get_time(); + q->t_c = ktime_to_ns(ktime_get()); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); @@ -293,14 +292,19 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->qdisc = child; } q->limit = qopt->limit; - q->mtu = qopt->mtu; + q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; - q->buffer = qopt->buffer; + q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; - swap(q->R_tab, rtab); - swap(q->P_tab, ptab); + psched_ratecfg_precompute(&q->rate, rtab->rate.rate); + if (ptab) { + psched_ratecfg_precompute(&q->peak, ptab->rate.rate); + q->peak_present = true; + } else { + q->peak_present = false; + } sch_tree_unlock(sch); err = 0; @@ -319,7 +323,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - q->t_c = psched_get_time(); + q->t_c = ktime_to_ns(ktime_get()); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; @@ -331,12 +335,6 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); - - if (q->P_tab) - qdisc_put_rtab(q->P_tab); - if (q->R_tab) - qdisc_put_rtab(q->R_tab); - qdisc_destroy(q->qdisc); } @@ -352,13 +350,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; opt.limit = q->limit; - opt.rate = q->R_tab->rate; - if (q->P_tab) - opt.peakrate = q->P_tab->rate; + opt.rate.rate = psched_ratecfg_getrate(&q->rate); + if (q->peak_present) + opt.peakrate.rate = psched_ratecfg_getrate(&q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - opt.mtu = q->mtu; - opt.buffer = q->buffer; + opt.mtu = PSCHED_NS2TICKS(q->mtu); + opt.buffer = PSCHED_NS2TICKS(q->buffer); if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index b45ed1f96921..91cfd8f94a19 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -66,13 +66,6 @@ static void sctp_assoc_bh_rcv(struct work_struct *work); static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc); static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc); -/* Keep track of the new idr low so that we don't re-use association id - * numbers too fast. It is protected by they idr spin lock is in the - * range of 1 - INT_MAX. - */ -static u32 idr_low = 1; - - /* 1st Level Abstractions. */ /* Initialize a new association from provided memory. 
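The TBF conversion above drops the psched tick rate tables in favour of nanosecond arithmetic: token counts become time (s64 nanoseconds), and psched_l2t_ns() returns how long the configured rate needs to emit a given length. A simplified model of the admission test in tbf_dequeue(), where len_to_ns() is an assumed stand-in for psched_l2t_ns():

        /* Time to emit `len` bytes at `rate` bytes per second. */
        static s64 len_to_ns(u64 rate, unsigned int len)
        {
                return div64_u64((u64)len * NSEC_PER_SEC, rate);
        }

        /* Token-bucket test: credit accrues with elapsed time, capped at
         * the bucket depth `buffer`, and one packet costs its wire time.
         */
        static bool tbf_admit(s64 *tokens, s64 buffer, s64 elapsed_ns,
                              u64 rate, unsigned int len)
        {
                s64 toks = min_t(s64, elapsed_ns, buffer) + *tokens;

                if (toks > buffer)
                        toks = buffer;
                toks -= len_to_ns(rate, len);
                if (toks < 0)
                        return false;   /* watchdog sleeps for -toks ns */
                *tokens = toks;
                return true;
        }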
*/ @@ -104,8 +97,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* Initialize the object handling fields. */ atomic_set(&asoc->base.refcnt, 1); - asoc->base.dead = 0; - asoc->base.malloced = 0; + asoc->base.dead = false; /* Initialize the bind addr area. */ sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port); @@ -371,7 +363,6 @@ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, if (!sctp_association_init(asoc, ep, sk, scope, gfp)) goto fail_init; - asoc->base.malloced = 1; SCTP_DBG_OBJCNT_INC(assoc); SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc); @@ -409,7 +400,7 @@ void sctp_association_free(struct sctp_association *asoc) /* Mark as dead, so other users can know this structure is * going away. */ - asoc->base.dead = 1; + asoc->base.dead = true; /* Dispose of any data lying around in the outqueue. */ sctp_outq_free(&asoc->outqueue); @@ -434,8 +425,7 @@ void sctp_association_free(struct sctp_association *asoc) * on our state. */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { - if (timer_pending(&asoc->timers[i]) && - del_timer(&asoc->timers[i])) + if (del_timer(&asoc->timers[i])) sctp_association_put(asoc); } @@ -485,10 +475,8 @@ static void sctp_association_destroy(struct sctp_association *asoc) WARN_ON(atomic_read(&asoc->rmem_alloc)); - if (asoc->base.malloced) { - kfree(asoc); - SCTP_DBG_OBJCNT_DEC(assoc); - } + kfree(asoc); + SCTP_DBG_OBJCNT_DEC(assoc); } /* Change the primary destination address for the peer. */ @@ -1080,7 +1068,7 @@ struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc, transports) { if (transport == active) - break; + continue; list_for_each_entry(chunk, &transport->transmitted, transmitted_list) { if (key == chunk->subh.data_hdr->tsn) { @@ -1497,7 +1485,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) /* Stop the SACK timer. */ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; - if (timer_pending(timer) && del_timer(timer)) + if (del_timer(timer)) sctp_association_put(asoc); } } @@ -1592,32 +1580,26 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { - int assoc_id; - int error = 0; + bool preload = gfp & __GFP_WAIT; + int ret; /* If the id is already assigned, keep it. 
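The timer hunks in sctp_association_free() above rely on del_timer()'s return value: it returns 1 exactly when the timer was still pending and has now been stopped, so the previous timer_pending() && del_timer() combination tested the same condition twice. The resulting idiom, as an illustrative fragment:

        /* del_timer() == 1 means the timer was pending and is now off,
         * which is precisely when the reference taken when arming it
         * must be dropped.
         */
        if (del_timer(&transport->T3_rtx_timer))
                sctp_transport_put(transport);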
*/ if (asoc->assoc_id) - return error; -retry: - if (unlikely(!idr_pre_get(&sctp_assocs_id, gfp))) - return -ENOMEM; + return 0; + if (preload) + idr_preload(gfp); spin_lock_bh(&sctp_assocs_id_lock); - error = idr_get_new_above(&sctp_assocs_id, (void *)asoc, - idr_low, &assoc_id); - if (!error) { - idr_low = assoc_id + 1; - if (idr_low == INT_MAX) - idr_low = 1; - } + /* 0 is not a valid assoc_id, must be >= 1 */ + ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, 1, 0, GFP_NOWAIT); spin_unlock_bh(&sctp_assocs_id_lock); - if (error == -EAGAIN) - goto retry; - else if (error) - return error; + if (preload) + idr_preload_end(); + if (ret < 0) + return ret; - asoc->assoc_id = (sctp_assoc_t) assoc_id; - return error; + asoc->assoc_id = (sctp_assoc_t)ret; + return 0; } /* Free the ASCONF queue */ diff --git a/net/sctp/auth.c b/net/sctp/auth.c index d8420ae614dc..ba1dfc3f8def 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -200,27 +200,28 @@ static struct sctp_auth_bytes *sctp_auth_make_key_vector( struct sctp_auth_bytes *new; __u32 len; __u32 offset = 0; + __u16 random_len, hmacs_len, chunks_len = 0; - len = ntohs(random->param_hdr.length) + ntohs(hmacs->param_hdr.length); - if (chunks) - len += ntohs(chunks->param_hdr.length); + random_len = ntohs(random->param_hdr.length); + hmacs_len = ntohs(hmacs->param_hdr.length); + if (chunks) + chunks_len = ntohs(chunks->param_hdr.length); - new = kmalloc(sizeof(struct sctp_auth_bytes) + len, gfp); + len = random_len + hmacs_len + chunks_len; + + new = sctp_auth_create_key(len, gfp); if (!new) return NULL; - new->len = len; - - memcpy(new->data, random, ntohs(random->param_hdr.length)); - offset += ntohs(random->param_hdr.length); + memcpy(new->data, random, random_len); + offset += random_len; if (chunks) { - memcpy(new->data + offset, chunks, - ntohs(chunks->param_hdr.length)); - offset += ntohs(chunks->param_hdr.length); + memcpy(new->data + offset, chunks, chunks_len); + offset += chunks_len; } - memcpy(new->data + offset, hmacs, ntohs(hmacs->param_hdr.length)); + memcpy(new->data + offset, hmacs, hmacs_len); return new; } @@ -350,8 +351,8 @@ static struct sctp_auth_bytes *sctp_auth_asoc_create_secret( secret = sctp_auth_asoc_set_secret(ep_key, first_vector, last_vector, gfp); out: - kfree(local_key_vector); - kfree(peer_key_vector); + sctp_auth_key_put(local_key_vector); + sctp_auth_key_put(peer_key_vector); return secret; } diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index d886b3bf84f5..41145fe31813 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -131,8 +131,6 @@ int sctp_bind_addr_dup(struct sctp_bind_addr *dest, */ void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) { - bp->malloced = 0; - INIT_LIST_HEAD(&bp->address_list); bp->port = port; } @@ -155,11 +153,6 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp) { /* Empty the bind address list. */ sctp_bind_addr_clean(bp); - - if (bp->malloced) { - kfree(bp); - SCTP_DBG_OBJCNT_DEC(bind_addr); - } } /* Add an address to the bind address list in the SCTP_bind_addr structure. */ diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 1a9c5fb77310..5fbd7bc6bb11 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -121,8 +121,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Initialize the basic object fields. */ atomic_set(&ep->base.refcnt, 1); - ep->base.dead = 0; - ep->base.malloced = 1; + ep->base.dead = false; /* Create an input queue. 
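sctp_assoc_set_id() above moves from the removed idr_pre_get()/idr_get_new_above() retry loop to the preload-based idr API: preallocate outside the spinlock with the caller's gfp mask, then allocate atomically under the lock. A minimal sketch of the pattern; my_idr, my_lock and assign_id() are placeholders:

        static int assign_id(struct idr *my_idr, spinlock_t *my_lock,
                             void *obj, gfp_t gfp)
        {
                bool preload = gfp & __GFP_WAIT;
                int id;

                if (preload)
                        idr_preload(gfp);       /* stock per-cpu id nodes */
                spin_lock_bh(my_lock);
                /* ids start at 1; 0 stays invalid, allocation is cyclic */
                id = idr_alloc_cyclic(my_idr, obj, 1, 0, GFP_NOWAIT);
                spin_unlock_bh(my_lock);
                if (preload)
                        idr_preload_end();
                return id;                      /* negative errno on failure */
        }

The preload step makes the GFP_NOWAIT allocation under the lock unlikely to fail, which is what replaces the old -EAGAIN retry loop.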
*/ sctp_inq_init(&ep->base.inqueue); @@ -151,13 +150,11 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, ep->rcvbuf_policy = net->sctp.rcvbuf_policy; /* Initialize the secret key used with cookie. */ - get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); - ep->last_key = ep->current_key = 0; - ep->key_changed_at = jiffies; + get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); - null_key = sctp_auth_shkey_create(0, GFP_KERNEL); + null_key = sctp_auth_shkey_create(0, gfp); if (!null_key) goto nomem; @@ -200,7 +197,7 @@ struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp) goto fail; if (!sctp_endpoint_init(ep, sk, gfp)) goto fail_init; - ep->base.malloced = 1; + SCTP_DBG_OBJCNT_INC(ep); return ep; @@ -236,7 +233,7 @@ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, */ void sctp_endpoint_free(struct sctp_endpoint *ep) { - ep->base.dead = 1; + ep->base.dead = true; ep->base.sk->sk_state = SCTP_SS_CLOSED; @@ -249,8 +246,6 @@ void sctp_endpoint_free(struct sctp_endpoint *ep) /* Final destructor for endpoint. */ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) { - int i; - SCTP_ASSERT(ep->base.dead, "Endpoint is not dead", return); /* Free up the HMAC transform. */ @@ -273,8 +268,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); - for (i = 0; i < SCTP_HOW_MANY_SECRETS; ++i) - memset(&ep->secret_key[i], 0, SCTP_SECRET_SIZE); + memset(ep->secret_key, 0, sizeof(ep->secret_key)); /* Remove and free the port */ if (sctp_sk(ep->base.sk)->bind_hash) @@ -284,11 +278,8 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) if (ep->base.sk) sock_put(ep->base.sk); - /* Finally, free up our memory. */ - if (ep->base.malloced) { - kfree(ep); - SCTP_DBG_OBJCNT_DEC(ep); - } + kfree(ep); + SCTP_DBG_OBJCNT_DEC(ep); } /* Hold a reference to an endpoint. 
*/ @@ -337,7 +328,6 @@ static struct sctp_association *__sctp_endpoint_lookup_assoc( struct sctp_transport *t = NULL; struct sctp_hashbucket *head; struct sctp_ep_common *epb; - struct hlist_node *node; int hash; int rport; @@ -355,7 +345,7 @@ static struct sctp_association *__sctp_endpoint_lookup_assoc( rport); head = &sctp_assoc_hashtable[hash]; read_lock(&head->lock); - sctp_for_each_hentry(epb, node, &head->chain) { + sctp_for_each_hentry(epb, &head->chain) { tmp = sctp_assoc(epb); if (tmp->ep != ep || rport != tmp->peer.port) continue; diff --git a/net/sctp/input.c b/net/sctp/input.c index 8bd3c279427e..4b2c83146aa7 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -468,8 +468,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, } else { struct net *net = sock_net(sk); - if (timer_pending(&t->proto_unreach_timer) && - del_timer(&t->proto_unreach_timer)) + if (del_timer(&t->proto_unreach_timer)) sctp_association_put(asoc); sctp_do_sm(net, SCTP_EVENT_T_OTHER, @@ -785,13 +784,12 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, struct sctp_hashbucket *head; struct sctp_ep_common *epb; struct sctp_endpoint *ep; - struct hlist_node *node; int hash; hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); - sctp_for_each_hentry(epb, node, &head->chain) { + sctp_for_each_hentry(epb, &head->chain) { ep = sctp_ep(epb); if (sctp_endpoint_is_match(ep, net, laddr)) goto hit; @@ -877,7 +875,6 @@ static struct sctp_association *__sctp_lookup_association( struct sctp_ep_common *epb; struct sctp_association *asoc; struct sctp_transport *transport; - struct hlist_node *node; int hash; /* Optimize here for direct hit, only listening connections can @@ -887,7 +884,7 @@ static struct sctp_association *__sctp_lookup_association( ntohs(peer->v4.sin_port)); head = &sctp_assoc_hashtable[hash]; read_lock(&head->lock); - sctp_for_each_hentry(epb, node, &head->chain) { + sctp_for_each_hentry(epb, &head->chain) { asoc = sctp_assoc(epb); transport = sctp_assoc_is_match(asoc, net, local, peer); if (transport) diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 2d5ad280de38..3221d073448c 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -58,8 +58,6 @@ void sctp_inq_init(struct sctp_inq *queue) /* Create a task for delivering data. */ INIT_WORK(&queue->immediate, NULL); - - queue->malloced = 0; } /* Release the memory associated with an SCTP inqueue. */ @@ -80,11 +78,6 @@ void sctp_inq_free(struct sctp_inq *queue) sctp_chunk_free(queue->in_progress); queue->in_progress = NULL; } - - if (queue->malloced) { - /* Dump the master memory segment. */ - kfree(queue); - } } /* Put a new packet in an SCTP inqueue. diff --git a/net/sctp/output.c b/net/sctp/output.c index f5200a2ad852..bbef4a7a9b56 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -136,7 +136,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, packet->overhead = overhead; sctp_packet_reset(packet); packet->vtag = 0; - packet->malloced = 0; + return packet; } @@ -151,9 +151,6 @@ void sctp_packet_free(struct sctp_packet *packet) list_del_init(&chunk->list); sctp_chunk_free(chunk); } - - if (packet->malloced) - kfree(packet); } /* This routine tries to append the chunk to the offered packet. 
If adding diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 9bcdbd02d777..32a4625fef77 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -217,8 +217,6 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) q->outstanding_bytes = 0; q->empty = 1; q->cork = 0; - - q->malloced = 0; q->out_qlen = 0; } @@ -295,10 +293,6 @@ void sctp_outq_free(struct sctp_outq *q) { /* Throw away leftover chunks. */ __sctp_outq_teardown(q); - - /* If we were kmalloc()'d, free the memory. */ - if (q->malloced) - kfree(q); } /* Put a new chunk in an sctp_outq. */ @@ -707,11 +701,10 @@ redo: /* Cork the outqueue so queued chunks are really queued. */ int sctp_outq_uncork(struct sctp_outq *q) { - int error = 0; if (q->cork) q->cork = 0; - error = sctp_outq_flush(q, 0); - return error; + + return sctp_outq_flush(q, 0); } @@ -1700,10 +1693,8 @@ static void sctp_check_transmitted(struct sctp_outq *q, * address. */ if (!transport->flight_size) { - if (timer_pending(&transport->T3_rtx_timer) && - del_timer(&transport->T3_rtx_timer)) { + if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); - } } else if (restart_timer) { if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 5f7518de2fd1..e62c22535be4 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -63,7 +63,7 @@ static struct { struct timespec tstart; } sctpw; -static void printl(const char *fmt, ...) +static __printf(1, 2) void printl(const char *fmt, ...) { va_list args; int len; @@ -122,12 +122,12 @@ static const struct file_operations sctpprobe_fops = { .llseek = noop_llseek, }; -sctp_disposition_t jsctp_sf_eat_sack(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const sctp_subtype_t type, - void *arg, - sctp_cmd_seq_t *commands) +static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) { struct sctp_transport *sp; static __u32 lcwnd = 0; @@ -183,13 +183,20 @@ static __init int sctpprobe_init(void) { int ret = -ENOMEM; + /* Warning: if the function signature of sctp_sf_eat_sack_6_2, + * has been changed, you also have to change the signature of + * jsctp_sf_eat_sack, otherwise you end up right here! 
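As the probe.c comment above explains, jsctp_sf_eat_sack must keep the exact prototype of the probed sctp_sf_eat_sack_6_2, and the new BUILD_BUG_ON enforces that at compile time. The general shape of the guard, sketched with probed_fn and handler_fn as placeholders:

        static struct jprobe my_probe = {
                .kp.symbol_name = "probed_fn",
                .entry          = handler_fn,
        };

        static int __init my_probe_init(void)
        {
                /* __same_type() wraps __builtin_types_compatible_p(); the
                 * build breaks as soon as the handler's prototype drifts
                 * from the probed function's.
                 */
                BUILD_BUG_ON(!__same_type(probed_fn, handler_fn));
                return register_jprobe(&my_probe);
        }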
+ */ + BUILD_BUG_ON(__same_type(sctp_sf_eat_sack_6_2, + jsctp_sf_eat_sack) == 0); + init_waitqueue_head(&sctpw.wait); spin_lock_init(&sctpw.lock); if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL)) return ret; - if (!proc_net_fops_create(&init_net, procname, S_IRUSR, - &sctpprobe_fops)) + if (!proc_create(procname, S_IRUSR, init_net.proc_net, + &sctpprobe_fops)) goto free_kfifo; ret = register_jprobe(&sctp_recv_probe); @@ -201,7 +208,7 @@ static __init int sctpprobe_init(void) return 0; remove_proc: - proc_net_remove(&init_net, procname); + remove_proc_entry(procname, init_net.proc_net); free_kfifo: kfifo_free(&sctpw.fifo); return ret; @@ -210,7 +217,7 @@ free_kfifo: static __exit void sctpprobe_exit(void) { kfifo_free(&sctpw.fifo); - proc_net_remove(&init_net, procname); + remove_proc_entry(procname, init_net.proc_net); unregister_jprobe(&sctp_recv_probe); } diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 8c19e97262ca..4e45ee35d0db 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -213,7 +213,6 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) struct sctp_ep_common *epb; struct sctp_endpoint *ep; struct sock *sk; - struct hlist_node *node; int hash = *(loff_t *)v; if (hash >= sctp_ep_hashsize) @@ -222,7 +221,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) head = &sctp_ep_hashtable[hash]; sctp_local_bh_disable(); read_lock(&head->lock); - sctp_for_each_hentry(epb, node, &head->chain) { + sctp_for_each_hentry(epb, &head->chain) { ep = sctp_ep(epb); sk = epb->sk; if (!net_eq(sock_net(sk), seq_file_net(seq))) @@ -296,7 +295,8 @@ static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) seq_printf(seq, " ASSOC SOCK STY SST ST HBKT " "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT " "RPORT LADDRS <-> RADDRS " - "HBINT INS OUTS MAXRT T1X T2X RTXC\n"); + "HBINT INS OUTS MAXRT T1X T2X RTXC " + "wmema wmemq sndbuf rcvbuf\n"); return (void *)pos; } @@ -321,7 +321,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) struct sctp_ep_common *epb; struct sctp_association *assoc; struct sock *sk; - struct hlist_node *node; int hash = *(loff_t *)v; if (hash >= sctp_assoc_hashsize) @@ -330,7 +329,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) head = &sctp_assoc_hashtable[hash]; sctp_local_bh_disable(); read_lock(&head->lock); - sctp_for_each_hentry(epb, node, &head->chain) { + sctp_for_each_hentry(epb, &head->chain) { assoc = sctp_assoc(epb); sk = epb->sk; if (!net_eq(sock_net(sk), seq_file_net(seq))) @@ -351,11 +350,16 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sctp_seq_dump_local_addrs(seq, epb); seq_printf(seq, "<-> "); sctp_seq_dump_remote_addrs(seq, assoc); - seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d ", + seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " + "%8d %8d %8d %8d", assoc->hbinterval, assoc->c.sinit_max_instreams, assoc->c.sinit_num_ostreams, assoc->max_retrans, assoc->init_retries, assoc->shutdown_retries, - assoc->rtx_data_chunks); + assoc->rtx_data_chunks, + atomic_read(&sk->sk_wmem_alloc), + sk->sk_wmem_queued, + sk->sk_sndbuf, + sk->sk_rcvbuf); seq_printf(seq, "\n"); } read_unlock(&head->lock); @@ -436,7 +440,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) struct sctp_hashbucket *head; struct sctp_ep_common *epb; struct sctp_association *assoc; - struct hlist_node *node; struct sctp_transport *tsp; int hash = *(loff_t *)v; @@ -447,7 +450,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) sctp_local_bh_disable(); read_lock(&head->lock); 
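The many one-line sctp_for_each_hentry() and hlist_for_each_entry() edits across these files all stem from the same hlist API change in this release: the iterator now derives the typed entry itself via hlist_entry_safe(), so the separate struct hlist_node cursor disappears. Old versus new, as an illustrative fragment (process() is a placeholder):

        struct sctp_ep_common *epb;
        struct hlist_node *pos;

        /* Before: a raw hlist_node cursor had to be threaded through. */
        hlist_for_each_entry(epb, pos, &head->chain, node)
                process(epb);

        /* After: the entry pointer itself is the cursor; the pos
         * variable can simply be deleted, as in the hunks above.
         */
        hlist_for_each_entry(epb, &head->chain, node)
                process(epb);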
rcu_read_lock(); - sctp_for_each_hentry(epb, node, &head->chain) { + sctp_for_each_hentry(epb, &head->chain) { if (!net_eq(sock_net(epb->sk), seq_file_net(seq))) continue; assoc = sctp_assoc(epb); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f898b1c58bd2..eaee00c61139 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -595,7 +595,7 @@ static void sctp_v4_ecn_capable(struct sock *sk) INET_ECN_xmit(sk); } -void sctp_addr_wq_timeout_handler(unsigned long arg) +static void sctp_addr_wq_timeout_handler(unsigned long arg) { struct net *net = (struct net *)arg; struct sctp_sockaddr_entry *addrw, *temp; @@ -1403,7 +1403,7 @@ SCTP_STATIC __init int sctp_init(void) /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; - sctp_ep_hashtable = (struct sctp_hashbucket *) + sctp_ep_hashtable = kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { pr_err("Failed endpoint_hash alloc\n"); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e1c5fc2be6b8..cf579e71cff0 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1201,7 +1201,7 @@ nodata: * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) * This is a helper function to allocate an error chunk for * for those invalid parameter codes in which we may not want - * to report all the errors, if the incomming chunk is large + * to report all the errors, if the incoming chunk is large */ static inline struct sctp_chunk *sctp_make_op_error_fixed( const struct sctp_association *asoc, @@ -1589,8 +1589,6 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, struct sctp_signed_cookie *cookie; struct scatterlist sg; int headersize, bodysize; - unsigned int keylen; - char *key; /* Header size is static data prior to the actual cookie, including * any padding. @@ -1650,12 +1648,11 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, /* Sign the message. */ sg_init_one(&sg, &cookie->c, bodysize); - keylen = SCTP_SECRET_SIZE; - key = (char *)ep->secret_key[ep->current_key]; desc.tfm = sctp_sk(ep->base.sk)->hmac; desc.flags = 0; - if (crypto_hash_setkey(desc.tfm, key, keylen) || + if (crypto_hash_setkey(desc.tfm, ep->secret_key, + sizeof(ep->secret_key)) || crypto_hash_digest(&desc, &sg, bodysize, cookie->signature)) goto free_cookie; } @@ -1682,8 +1679,7 @@ struct sctp_association *sctp_unpack_cookie( int headersize, bodysize, fixed_size; __u8 *digest = ep->digest; struct scatterlist sg; - unsigned int keylen, len; - char *key; + unsigned int len; sctp_scope_t scope; struct sk_buff *skb = chunk->skb; struct timeval tv; @@ -1718,34 +1714,21 @@ struct sctp_association *sctp_unpack_cookie( goto no_hmac; /* Check the signature. */ - keylen = SCTP_SECRET_SIZE; sg_init_one(&sg, bear_cookie, bodysize); - key = (char *)ep->secret_key[ep->current_key]; desc.tfm = sctp_sk(ep->base.sk)->hmac; desc.flags = 0; memset(digest, 0x00, SCTP_SIGNATURE_SIZE); - if (crypto_hash_setkey(desc.tfm, key, keylen) || + if (crypto_hash_setkey(desc.tfm, ep->secret_key, + sizeof(ep->secret_key)) || crypto_hash_digest(&desc, &sg, bodysize, digest)) { *error = -SCTP_IERROR_NOMEM; goto fail; } if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { - /* Try the previous key. 
*/ - key = (char *)ep->secret_key[ep->last_key]; - memset(digest, 0x00, SCTP_SIGNATURE_SIZE); - if (crypto_hash_setkey(desc.tfm, key, keylen) || - crypto_hash_digest(&desc, &sg, bodysize, digest)) { - *error = -SCTP_IERROR_NOMEM; - goto fail; - } - - if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { - /* Yikes! Still bad signature! */ - *error = -SCTP_IERROR_BAD_SIG; - goto fail; - } + *error = -SCTP_IERROR_BAD_SIG; + goto fail; } no_hmac: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index c9577754a708..8aab894aeabe 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -674,10 +674,8 @@ static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds, list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { - if (timer_pending(&t->T3_rtx_timer) && - del_timer(&t->T3_rtx_timer)) { + if (del_timer(&t->T3_rtx_timer)) sctp_transport_put(t); - } } } @@ -1517,7 +1515,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_TIMER_STOP: timer = &asoc->timers[cmd->obj.to]; - if (timer_pending(timer) && del_timer(timer)) + if (del_timer(timer)) sctp_association_put(asoc); break; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5131fcfedb03..de1a0138317f 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2082,7 +2082,7 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, } /* Delete the tempory new association. */ - sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); /* Restore association pointer to provide SCTP command interpeter diff --git a/net/sctp/socket.c b/net/sctp/socket.c index cedd9bf67b8c..f631c5ff4dbf 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1119,9 +1119,10 @@ static int __sctp_connect(struct sock* sk, /* Make sure the destination port is correctly set * in all addresses. */ - if (asoc && asoc->peer.port && asoc->peer.port != port) + if (asoc && asoc->peer.port && asoc->peer.port != port) { + err = -EINVAL; goto out_free; - + } /* Check if there already is a matching association on the * endpoint (other than the one created here). 
@@ -5653,6 +5654,9 @@ static int sctp_getsockopt_assoc_stats(struct sock *sk, int len, if (len < sizeof(sctp_assoc_t)) return -EINVAL; + /* Allow the struct to grow and fill in as much as possible */ + len = min_t(size_t, len, sizeof(sas)); + if (copy_from_user(&sas, optval, len)) return -EFAULT; @@ -5686,9 +5690,6 @@ static int sctp_getsockopt_assoc_stats(struct sock *sk, int len, /* Mark beginning of a new observation period */ asoc->stats.max_obs_rto = asoc->rto_min; - /* Allow the struct to grow and fill in as much as possible */ - len = min_t(size_t, len, sizeof(sas)); - if (put_user(len, optlen)) return -EFAULT; @@ -5882,8 +5883,7 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { struct sctp_bind_hashbucket *head; /* hash list */ - struct sctp_bind_bucket *pp; /* hash list port iterator */ - struct hlist_node *node; + struct sctp_bind_bucket *pp; unsigned short snum; int ret; @@ -5910,7 +5910,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) index = sctp_phashfn(sock_net(sk), rover); head = &sctp_port_hashtable[index]; sctp_spin_lock(&head->lock); - sctp_for_each_hentry(pp, node, &head->chain) + sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && net_eq(sock_net(sk), pp->net)) goto next; @@ -5938,7 +5938,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), snum)]; sctp_spin_lock(&head->lock); - sctp_for_each_hentry(pp, node, &head->chain) { + sctp_for_each_hentry(pp, &head->chain) { if ((pp->port == snum) && net_eq(pp->net, sock_net(sk))) goto pp_found; } @@ -5970,7 +5970,7 @@ pp_found: * that this port/socket (sk) combination are already * in an endpoint. */ - sk_for_each_bound(sk2, node, &pp->owner) { + sk_for_each_bound(sk2, &pp->owner) { struct sctp_endpoint *ep2; ep2 = sctp_sk(sk2)->ep; @@ -6186,7 +6186,8 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Is there any exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= POLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c index 442ad4ed6315..da8603523808 100644 --- a/net/sctp/ssnmap.c +++ b/net/sctp/ssnmap.c @@ -41,8 +41,6 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> -#define MAX_KMALLOC_SIZE 131072 - static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, __u16 out); @@ -65,7 +63,7 @@ struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int size; size = sctp_ssnmap_size(in, out); - if (size <= MAX_KMALLOC_SIZE) + if (size <= KMALLOC_MAX_SIZE) retval = kmalloc(size, gfp); else retval = (struct sctp_ssnmap *) @@ -76,13 +74,12 @@ struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, if (!sctp_ssnmap_init(retval, in, out)) goto fail_map; - retval->malloced = 1; SCTP_DBG_OBJCNT_INC(ssnmap); return retval; fail_map: - if (size <= MAX_KMALLOC_SIZE) + if (size <= KMALLOC_MAX_SIZE) kfree(retval); else free_pages((unsigned long)retval, get_order(size)); @@ -120,14 +117,16 @@ void sctp_ssnmap_clear(struct sctp_ssnmap *map) /* Dispose of a ssnmap. 
*/ void sctp_ssnmap_free(struct sctp_ssnmap *map) { - if (map && map->malloced) { - int size; - - size = sctp_ssnmap_size(map->in.len, map->out.len); - if (size <= MAX_KMALLOC_SIZE) - kfree(map); - else - free_pages((unsigned long)map, get_order(size)); - SCTP_DBG_OBJCNT_DEC(ssnmap); - } + int size; + + if (unlikely(!map)) + return; + + size = sctp_ssnmap_size(map->in.len, map->out.len); + if (size <= KMALLOC_MAX_SIZE) + kfree(map); + else + free_pages((unsigned long)map, get_order(size)); + + SCTP_DBG_OBJCNT_DEC(ssnmap); } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 4e45bb68aef0..098f1d5f769e 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -123,7 +123,6 @@ struct sctp_transport *sctp_transport_new(struct net *net, if (!sctp_transport_init(net, transport, addr, gfp)) goto fail_init; - transport->malloced = 1; SCTP_DBG_OBJCNT_INC(transport); return transport; @@ -151,13 +150,11 @@ void sctp_transport_free(struct sctp_transport *transport) * structure hang around in memory since we know * the tranport is going away. */ - if (timer_pending(&transport->T3_rtx_timer) && - del_timer(&transport->T3_rtx_timer)) + if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); /* Delete the ICMP proto unreachable timer if it's active. */ - if (timer_pending(&transport->proto_unreach_timer) && - del_timer(&transport->proto_unreach_timer)) + if (del_timer(&transport->proto_unreach_timer)) sctp_association_put(transport->asoc); sctp_transport_put(transport); @@ -168,10 +165,6 @@ static void sctp_transport_destroy_rcu(struct rcu_head *head) struct sctp_transport *transport; transport = container_of(head, struct sctp_transport, rcu); - if (transport->asoc) - sctp_association_put(transport->asoc); - - sctp_packet_free(&transport->packet); dst_release(transport->dst); kfree(transport); @@ -186,6 +179,11 @@ static void sctp_transport_destroy(struct sctp_transport *transport) SCTP_ASSERT(transport->dead, "Transport is not dead", return); call_rcu(&transport->rcu, sctp_transport_destroy_rcu); + + sctp_packet_free(&transport->packet); + + if (transport->asoc) + sctp_association_put(transport->asoc); } /* Start T3_rtx timer if it is not already running and update the heartbeat @@ -654,10 +652,9 @@ void sctp_transport_reset(struct sctp_transport *t) void sctp_transport_immediate_rtx(struct sctp_transport *t) { /* Stop pending T3_rtx_timer */ - if (timer_pending(&t->T3_rtx_timer)) { - (void)del_timer(&t->T3_rtx_timer); + if (del_timer(&t->T3_rtx_timer)) sctp_transport_put(t); - } + sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX); if (!timer_pending(&t->T3_rtx_timer)) { if (!mod_timer(&t->T3_rtx_timer, jiffies + t->rto)) diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c index 5f25e0c92c31..396c45174e5b 100644 --- a/net/sctp/tsnmap.c +++ b/net/sctp/tsnmap.c @@ -51,7 +51,7 @@ static void sctp_tsnmap_update(struct sctp_tsnmap *map); static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, __u16 len, __u16 *start, __u16 *end); -static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap); +static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size); /* Initialize a block of memory as a tsnmap. 
*/ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len, @@ -124,7 +124,7 @@ int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn, gap = tsn - map->base_tsn; - if (gap >= map->len && !sctp_tsnmap_grow(map, gap)) + if (gap >= map->len && !sctp_tsnmap_grow(map, gap + 1)) return -ENOMEM; if (!sctp_tsnmap_has_gap(map) && gap == 0) { @@ -360,23 +360,24 @@ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, return ngaps; } -static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap) +static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size) { unsigned long *new; unsigned long inc; u16 len; - if (gap >= SCTP_TSN_MAP_SIZE) + if (size > SCTP_TSN_MAP_SIZE) return 0; - inc = ALIGN((gap - map->len),BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT; + inc = ALIGN((size - map->len), BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT; len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE); new = kzalloc(len>>3, GFP_ATOMIC); if (!new) return 0; - bitmap_copy(new, map->tsn_map, map->max_tsn_seen - map->base_tsn); + bitmap_copy(new, map->tsn_map, + map->max_tsn_seen - map->cumulative_tsn_ack_point); kfree(map->tsn_map); map->tsn_map = new; map->len = len; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index ada17464b65b..04e3d470f877 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -68,7 +68,6 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, skb_queue_head_init(&ulpq->reasm); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; - ulpq->malloced = 0; return ulpq; } @@ -96,8 +95,6 @@ void sctp_ulpq_flush(struct sctp_ulpq *ulpq) void sctp_ulpq_free(struct sctp_ulpq *ulpq) { sctp_ulpq_flush(ulpq); - if (ulpq->malloced) - kfree(ulpq); } /* Process an incoming DATA chunk. */ @@ -106,6 +103,7 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, { struct sk_buff_head temp; struct sctp_ulpevent *event; + int event_eor = 0; /* Create an event from the incoming chunk. */ event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); @@ -127,10 +125,12 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, /* Send event to the ULP. 'event' is the sctp_ulpevent for * very first SKB on the 'temp' list. */ - if (event) + if (event) { + event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_ulpq_tail_event(ulpq, event); + } - return 0; + return event_eor; } /* Add a new event for propagation to the ULP. 
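The sctp_tsnmap_grow() hunk above renames the parameter from gap to size and passes gap + 1 at the call site, fixing an off-by-one: a TSN at bit offset gap needs gap + 1 bits of map. A worked example with assumed numbers:

        /* Example (values assumed for illustration):
         * base_tsn = 1000 and map->len = 64 cover TSNs 1000..1063.
         * Marking tsn = 1064 gives gap = 64, so the map must be able to
         * hold gap + 1 = 65 offsets; growing it to only `gap` bits kept
         * the new TSN just out of range.
         */
        if (gap >= map->len && !sctp_tsnmap_grow(map, gap + 1))
                return -ENOMEM;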
*/ @@ -540,14 +540,19 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) ctsn = cevent->tsn; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (!first_frag) + return NULL; + goto done; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { first_frag = pos; next_tsn = ctsn + 1; last_frag = pos; - } else if (next_tsn == ctsn) + } else if (next_tsn == ctsn) { next_tsn++; - else + last_frag = pos; + } else goto done; break; case SCTP_DATA_LAST_FRAG: @@ -651,6 +656,14 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) } else goto done; break; + + case SCTP_DATA_LAST_FRAG: + if (!first_frag) + return NULL; + else + goto done; + break; + default: return NULL; } @@ -962,20 +975,43 @@ static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list, __u16 needed) { __u16 freed = 0; - __u32 tsn; - struct sk_buff *skb; + __u32 tsn, last_tsn; + struct sk_buff *skb, *flist, *last; struct sctp_ulpevent *event; struct sctp_tsnmap *tsnmap; tsnmap = &ulpq->asoc->peer.tsn_map; - while ((skb = __skb_dequeue_tail(list)) != NULL) { - freed += skb_headlen(skb); + while ((skb = skb_peek_tail(list)) != NULL) { event = sctp_skb2event(skb); tsn = event->tsn; + /* Don't renege below the Cumulative TSN ACK Point. */ + if (TSN_lte(tsn, sctp_tsnmap_get_ctsn(tsnmap))) + break; + + /* Events in ordering queue may have multiple fragments + * corresponding to additional TSNs. Sum the total + * freed space; find the last TSN. + */ + freed += skb_headlen(skb); + flist = skb_shinfo(skb)->frag_list; + for (last = flist; flist; flist = flist->next) { + last = flist; + freed += skb_headlen(last); + } + if (last) + last_tsn = sctp_skb2event(last)->tsn; + else + last_tsn = tsn; + + /* Unlink the event, then renege all applicable TSNs. */ + __skb_unlink(skb, list); sctp_ulpevent_free(event); - sctp_tsnmap_renege(tsnmap, tsn); + while (TSN_lte(tsn, last_tsn)) { + sctp_tsnmap_renege(tsnmap, tsn); + tsn++; + } if (freed >= needed) return freed; } @@ -1002,16 +1038,28 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event; struct sctp_association *asoc; struct sctp_sock *sp; + __u32 ctsn; + struct sk_buff *skb; asoc = ulpq->asoc; sp = sctp_sk(asoc->base.sk); /* If the association is already in Partial Delivery mode - * we have noting to do. + * we have nothing to do. */ if (ulpq->pd_mode) return; + /* Data must be at or below the Cumulative TSN ACK Point to + * start partial delivery. + */ + skb = skb_peek(&asoc->ulpq.reasm); + if (skb != NULL) { + ctsn = sctp_skb2event(skb)->tsn; + if (!TSN_lte(ctsn, sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map))) + return; + } + /* If the user enabled fragment interleave socket option, * multiple associations can enter partial delivery. * Otherwise, we can only enter partial delivery if the @@ -1054,12 +1102,16 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, } /* If able to free enough room, accept this chunk. */ if (chunk && (freed >= needed)) { - __u32 tsn; - tsn = ntohl(chunk->subh.data_hdr->tsn); - sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn, chunk->transport); - sctp_ulpq_tail_data(ulpq, chunk, gfp); - - sctp_ulpq_partial_delivery(ulpq, gfp); + int retval; + retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); + /* + * Enter partial delivery if chunk has not been + * delivered; otherwise, drain the reassembly queue. 
+ */ + if (retval <= 0) + sctp_ulpq_partial_delivery(ulpq, gfp); + else if (retval == 1) + sctp_ulpq_reasm_drain(ulpq); } sk_mem_reclaim(asoc->base.sk); diff --git a/net/socket.c b/net/socket.c index 1bbc37b7a312..6b94633ca61d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -69,7 +69,6 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mutex.h> -#include <linux/wanrouter.h> #include <linux/if_bridge.h> #include <linux/if_frad.h> #include <linux/if_vlan.h> @@ -370,16 +369,15 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); - if (unlikely(!file)) { + if (unlikely(IS_ERR(file))) { /* drop dentry, keep inode */ ihold(path.dentry->d_inode); path_put(&path); - return ERR_PTR(-ENFILE); + return file; } sock->file = file; file->f_flags = O_RDWR | (flags & O_NONBLOCK); - file->f_pos = 0; file->private_data = sock; return file; } @@ -602,7 +600,7 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) +void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) { *tx_flags = 0; if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) @@ -611,7 +609,6 @@ int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) *tx_flags |= SKBTX_SW_TSTAMP; if (sock_flag(sk, SOCK_WIFI_STATUS)) *tx_flags |= SKBTX_WIFI_STATUS; - return 0; } EXPORT_SYMBOL(sock_tx_timestamp); @@ -684,16 +681,6 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_sendmsg); -static int ktime2ts(ktime_t kt, struct timespec *ts) -{ - if (kt.tv64) { - *ts = ktime_to_timespec(kt); - return 1; - } else { - return 0; - } -} - /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ @@ -726,17 +713,15 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, memset(ts, 0, sizeof(ts)); - if (skb->tstamp.tv64 && - sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) { - skb_get_timestampns(skb, ts + 0); + if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) && + ktime_to_timespec_cond(skb->tstamp, ts + 0)) empty = 0; - } if (shhwtstamps) { if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE) && - ktime2ts(shhwtstamps->syststamp, ts + 1)) + ktime_to_timespec_cond(shhwtstamps->syststamp, ts + 1)) empty = 0; if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) && - ktime2ts(shhwtstamps->hwtstamp, ts + 2)) + ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts + 2)) empty = 0; } if (!empty) @@ -1175,15 +1160,6 @@ static int sock_mmap(struct file *file, struct vm_area_struct *vma) static int sock_close(struct inode *inode, struct file *filp) { - /* - * It was possible the inode is NULL we were - * closing an unfinished socket. 
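The sock_alloc_file() fix above matters because alloc_file() signals failure through an error pointer, never NULL, so the old !file check could not fire and every failure was flattened to -ENFILE. The ERR_PTR idiom it now follows, sketched around a hypothetical allocator:

        struct file *file = alloc_something();   /* hypothetical helper */
        if (IS_ERR(file)) {
                cleanup_partial_state();          /* hypothetical undo */
                return file;                      /* keep the real errno */
        }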
- */ - - if (!inode) { - printk(KERN_DEBUG "sock_close: NULL inode\n"); - return 0; - } sock_release(SOCKET_I(inode)); return 0; } @@ -2840,7 +2816,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) } ifr = compat_alloc_user_space(buf_size); - rxnfc = (void *)ifr + ALIGN(sizeof(struct ifreq), 8); + rxnfc = (void __user *)ifr + ALIGN(sizeof(struct ifreq), 8); if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) return -EFAULT; @@ -2864,12 +2840,12 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) offsetof(struct ethtool_rxnfc, fs.ring_cookie)); if (copy_in_user(rxnfc, compat_rxnfc, - (void *)(&rxnfc->fs.m_ext + 1) - - (void *)rxnfc) || + (void __user *)(&rxnfc->fs.m_ext + 1) - + (void __user *)rxnfc) || copy_in_user(&rxnfc->fs.ring_cookie, &compat_rxnfc->fs.ring_cookie, - (void *)(&rxnfc->fs.location + 1) - - (void *)&rxnfc->fs.ring_cookie) || + (void __user *)(&rxnfc->fs.location + 1) - + (void __user *)&rxnfc->fs.ring_cookie) || copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, sizeof(rxnfc->rule_cnt))) return -EFAULT; @@ -2881,12 +2857,12 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) if (convert_out) { if (copy_in_user(compat_rxnfc, rxnfc, - (const void *)(&rxnfc->fs.m_ext + 1) - - (const void *)rxnfc) || + (const void __user *)(&rxnfc->fs.m_ext + 1) - + (const void __user *)rxnfc) || copy_in_user(&compat_rxnfc->fs.ring_cookie, &rxnfc->fs.ring_cookie, - (const void *)(&rxnfc->fs.location + 1) - - (const void *)&rxnfc->fs.ring_cookie) || + (const void __user *)(&rxnfc->fs.location + 1) - + (const void __user *)&rxnfc->fs.ring_cookie) || copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt, sizeof(rxnfc->rule_cnt))) return -EFAULT; diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 03d03e37a7d5..241b54f30204 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -3,6 +3,7 @@ config SUNRPC config SUNRPC_GSS tristate + select OID_REGISTRY config SUNRPC_BACKCHANNEL bool @@ -10,7 +11,7 @@ config SUNRPC_BACKCHANNEL config SUNRPC_XPRT_RDMA tristate - depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND help This option allows the NFS client and server to support @@ -24,7 +25,6 @@ config SUNRPC_XPRT_RDMA config SUNRPC_SWAP bool depends on SUNRPC - select NETVM config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index d11418f97f1f..a622ad64acd8 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -17,7 +17,8 @@ */ #include <net/ipv6.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/msg_prot.h> #include <linux/slab.h> #include <linux/export.h> diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index b5c067bccc45..ed2fdd210c0b 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -82,7 +82,7 @@ MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); static u32 pseudoflavor_to_flavor(u32 flavor) { - if (flavor >= RPC_AUTH_MAXFLAVOR) + if (flavor > RPC_AUTH_MAXFLAVOR) return RPC_AUTH_GSS; return flavor; } @@ -124,6 +124,79 @@ rpcauth_unregister(const struct rpc_authops *ops) EXPORT_SYMBOL_GPL(rpcauth_unregister); /** + * rpcauth_get_pseudoflavor - check if security flavor is supported + * @flavor: a security flavor + * @info: a GSS mech OID, quality of protection, and service value + * + * Verifies that an appropriate kernel module is 
available or already loaded. + * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is + * not supported locally. + */ +rpc_authflavor_t +rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) +{ + const struct rpc_authops *ops; + rpc_authflavor_t pseudoflavor; + + ops = auth_flavors[flavor]; + if (ops == NULL) + request_module("rpc-auth-%u", flavor); + spin_lock(&rpc_authflavor_lock); + ops = auth_flavors[flavor]; + if (ops == NULL || !try_module_get(ops->owner)) { + spin_unlock(&rpc_authflavor_lock); + return RPC_AUTH_MAXFLAVOR; + } + spin_unlock(&rpc_authflavor_lock); + + pseudoflavor = flavor; + if (ops->info2flavor != NULL) + pseudoflavor = ops->info2flavor(info); + + module_put(ops->owner); + return pseudoflavor; +} +EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); + +/** + * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor + * @pseudoflavor: GSS pseudoflavor to match + * @info: rpcsec_gss_info structure to fill in + * + * Returns zero and fills in "info" if pseudoflavor matches a + * supported mechanism. + */ +int +rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) +{ + rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor); + const struct rpc_authops *ops; + int result; + + if (flavor >= RPC_AUTH_MAXFLAVOR) + return -EINVAL; + + ops = auth_flavors[flavor]; + if (ops == NULL) + request_module("rpc-auth-%u", flavor); + spin_lock(&rpc_authflavor_lock); + ops = auth_flavors[flavor]; + if (ops == NULL || !try_module_get(ops->owner)) { + spin_unlock(&rpc_authflavor_lock); + return -ENOENT; + } + spin_unlock(&rpc_authflavor_lock); + + result = -ENOENT; + if (ops->flavor2info != NULL) + result = ops->flavor2info(pseudoflavor, info); + + module_put(ops->owner); + return result; +} +EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); + +/** * rpcauth_list_flavors - discover registered flavors and pseudoflavors * @array: array to fill in * @size: size of "array" @@ -407,15 +480,14 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; - struct hlist_node *pos; struct rpc_cred *cred = NULL, *entry, *new; unsigned int nr; - nr = hash_long(acred->uid, cache->hashbits); + nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits); rcu_read_lock(); - hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) { + hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; spin_lock(&cache->lock); @@ -439,7 +511,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, } spin_lock(&cache->lock); - hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) { + hlist_for_each_entry(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); @@ -519,8 +591,8 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { - .uid = 0, - .gid = 0, + .uid = GLOBAL_ROOT_UID, + .gid = GLOBAL_ROOT_GID, }; dprintk("RPC: %5u looking up %s cred\n", diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 6ed6f201b022..b6badafc6494 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -18,8 +18,8 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -#define RPC_MACHINE_CRED_USERID ((uid_t)0) -#define RPC_MACHINE_CRED_GROUPID ((gid_t)0) +#define 
RPC_MACHINE_CRED_USERID GLOBAL_ROOT_UID +#define RPC_MACHINE_CRED_GROUPID GLOBAL_ROOT_GID struct generic_cred { struct rpc_cred gc_base; @@ -96,7 +96,9 @@ generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) dprintk("RPC: allocated %s cred %p for uid %d gid %d\n", gcred->acred.machine_cred ? "machine" : "generic", - gcred, acred->uid, acred->gid); + gcred, + from_kuid(&init_user_ns, acred->uid), + from_kgid(&init_user_ns, acred->gid)); return &gcred->gc_base; } @@ -129,8 +131,8 @@ machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flag { if (!gcred->acred.machine_cred || gcred->acred.principal != acred->principal || - gcred->acred.uid != acred->uid || - gcred->acred.gid != acred->gid) + !uid_eq(gcred->acred.uid, acred->uid) || + !gid_eq(gcred->acred.gid, acred->gid)) return 0; return 1; } @@ -147,8 +149,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) if (acred->machine_cred) return machine_cred_match(acred, gcred, flags); - if (gcred->acred.uid != acred->uid || - gcred->acred.gid != acred->gid || + if (!uid_eq(gcred->acred.uid, acred->uid) || + !gid_eq(gcred->acred.gid, acred->gid) || gcred->acred.machine_cred != 0) goto out_nomatch; diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index 9e4cb59ef9f0..14e9e53e63d5 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o auth_rpcgss-y := auth_gss.o gss_generic_token.o \ - gss_mech_switch.o svcauth_gss.o + gss_mech_switch.o svcauth_gss.o \ + gss_rpc_upcall.o gss_rpc_xdr.o obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 6e5c824b040b..7da6b457f66a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -238,7 +238,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct p = ERR_PTR(-EFAULT); goto err; } - ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS); + ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS); if (ret < 0) { p = ERR_PTR(ret); goto err; @@ -247,8 +247,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct __func__, ctx->gc_expiry, now, timeout); return q; err: - dprintk("RPC: %s returns %ld gc_expiry %lu now %lu timeout %u\n", - __func__, -PTR_ERR(p), ctx->gc_expiry, now, timeout); + dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p)); return p; } @@ -256,7 +255,7 @@ err: struct gss_upcall_msg { atomic_t count; - uid_t uid; + kuid_t uid; struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; @@ -303,11 +302,11 @@ gss_release_msg(struct gss_upcall_msg *gss_msg) } static struct gss_upcall_msg * -__gss_find_upcall(struct rpc_pipe *pipe, uid_t uid) +__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { - if (pos->uid != uid) + if (!uid_eq(pos->uid, uid)) continue; atomic_inc(&pos->count); dprintk("RPC: %s found msg %p\n", __func__, pos); @@ -395,8 +394,11 @@ gss_upcall_callback(struct rpc_task *task) static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) { - gss_msg->msg.data = &gss_msg->uid; - gss_msg->msg.len = sizeof(gss_msg->uid); + uid_t uid = from_kuid(&init_user_ns, gss_msg->uid); + memcpy(gss_msg->databuf, &uid, sizeof(uid)); + gss_msg->msg.data = gss_msg->databuf; + gss_msg->msg.len = sizeof(uid); + 
BUG_ON(sizeof(uid) > UPCALL_BUF_LEN); } static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, @@ -409,7 +411,7 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ", mech->gm_name, - gss_msg->uid); + from_kuid(&init_user_ns, gss_msg->uid)); p += gss_msg->msg.len; if (clnt->cl_principal) { len = sprintf(p, "target=%s ", clnt->cl_principal); @@ -445,7 +447,7 @@ static void gss_encode_msg(struct gss_upcall_msg *gss_msg, static struct gss_upcall_msg * gss_alloc_msg(struct gss_auth *gss_auth, struct rpc_clnt *clnt, - uid_t uid, const char *service_name) + kuid_t uid, const char *service_name) { struct gss_upcall_msg *gss_msg; int vers; @@ -475,7 +477,7 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; - uid_t uid = cred->cr_uid; + kuid_t uid = cred->cr_uid; gss_new = gss_alloc_msg(gss_auth, clnt, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) @@ -517,7 +519,7 @@ gss_refresh_upcall(struct rpc_task *task) int err = 0; dprintk("RPC: %5u %s for uid %u\n", - task->tk_pid, __func__, cred->cr_uid); + task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_uid)); gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we @@ -549,7 +551,8 @@ gss_refresh_upcall(struct rpc_task *task) gss_release_msg(gss_msg); out: dprintk("RPC: %5u %s for uid %u result %d\n", - task->tk_pid, __func__, cred->cr_uid, err); + task->tk_pid, __func__, + from_kuid(&init_user_ns, cred->cr_uid), err); return err; } @@ -562,7 +565,8 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) DEFINE_WAIT(wait); int err = 0; - dprintk("RPC: %s for uid %u\n", __func__, cred->cr_uid); + dprintk("RPC: %s for uid %u\n", + __func__, from_kuid(&init_user_ns, cred->cr_uid)); retry: gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { @@ -604,7 +608,7 @@ out_intr: gss_release_msg(gss_msg); out: dprintk("RPC: %s for uid %u result %d\n", - __func__, cred->cr_uid, err); + __func__, from_kuid(&init_user_ns, cred->cr_uid), err); return err; } @@ -616,9 +620,10 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) const void *p, *end; void *buf; struct gss_upcall_msg *gss_msg; - struct rpc_pipe *pipe = RPC_I(filp->f_dentry->d_inode)->pipe; + struct rpc_pipe *pipe = RPC_I(file_inode(filp))->pipe; struct gss_cl_ctx *ctx; - uid_t uid; + uid_t id; + kuid_t uid; ssize_t err = -EFBIG; if (mlen > MSG_BUF_MAXSIZE) @@ -633,12 +638,18 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) goto err; end = (const void *)((char *)buf + mlen); - p = simple_get_bytes(buf, end, &uid, sizeof(uid)); + p = simple_get_bytes(buf, end, &id, sizeof(id)); if (IS_ERR(p)) { err = PTR_ERR(p); goto err; } + uid = make_kuid(&init_user_ns, id); + if (!uid_valid(uid)) { + err = -EINVAL; + goto err; + } + err = -ENOMEM; ctx = gss_alloc_context(); if (ctx == NULL) @@ -856,8 +867,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) err = -EINVAL; gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); if (!gss_auth->mech) { - printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n", - __func__, flavor); + dprintk("RPC: Pseudoflavor %d not found!\n", flavor); goto err_free; } gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); 
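(A note on the pattern running through these auth_gss.c hunks: every uid that crosses the user/kernel boundary is now converted through init_user_ns and compared with uid_eq() instead of ==. Below is a minimal self-contained sketch of that discipline; the helper name uid_round_trips is invented for illustration and is not part of the patch:)

#include <linux/uidgid.h>
#include <linux/user_namespace.h>

/* Convert a kernel uid to its wire representation and back, validating
 * the result, as the dprintk and pipe-message hunks above do. */
static bool uid_round_trips(kuid_t kuid)
{
	uid_t id = from_kuid(&init_user_ns, kuid);	/* kuid_t -> wire id */
	kuid_t back = make_kuid(&init_user_ns, id);	/* wire id -> kuid_t */

	if (!uid_valid(back))		/* ids from userspace must be checked */
		return false;
	return uid_eq(back, kuid);	/* kuid_t is opaque; do not use == */
}

(gss_pipe_downcall above follows exactly this shape: the raw id read from the pipe goes through make_kuid() and is rejected with -EINVAL when uid_valid() fails.)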
@@ -1059,7 +1069,8 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) int err = -ENOMEM; dprintk("RPC: %s for uid %d, flavor %d\n", - __func__, acred->uid, auth->au_flavor); + __func__, from_kuid(&init_user_ns, acred->uid), + auth->au_flavor); if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) goto out_err; @@ -1115,7 +1126,7 @@ out: } if (gss_cred->gc_principal != NULL) return 0; - return rc->cr_uid == acred->uid; + return uid_eq(rc->cr_uid, acred->uid); } /* @@ -1154,7 +1165,7 @@ gss_marshal(struct rpc_task *task, __be32 *p) /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ - iov.iov_base = xprt_skip_transport_header(task->tk_xprt, + iov.iov_base = xprt_skip_transport_header(req->rq_xprt, req->rq_snd_buf.head[0].iov_base); iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); @@ -1629,6 +1640,8 @@ static const struct rpc_authops authgss_ops = { .pipes_create = gss_pipes_dentries_create, .pipes_destroy = gss_pipes_dentries_destroy, .list_pseudoflavors = gss_mech_list_pseudoflavors, + .info2flavor = gss_mech_info2flavor, + .flavor2info = gss_mech_flavor2info, }; static const struct rpc_credops gss_credops = { @@ -1721,6 +1734,7 @@ static void __exit exit_rpcsec_gss(void) rcu_barrier(); /* Wait for completion of call_rcu()'s */ } +MODULE_ALIAS("rpc-auth-6"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index d3611f11a8df..0d3c158ef8fa 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -679,6 +679,7 @@ out_err: static int gss_import_sec_context_kerberos(const void *p, size_t len, struct gss_ctx *ctx_id, + time_t *endtime, gfp_t gfp_mask) { const void *end = (const void *)((const char *)p + len); @@ -694,9 +695,11 @@ gss_import_sec_context_kerberos(const void *p, size_t len, else ret = gss_import_v2_context(p, end, ctx, gfp_mask); - if (ret == 0) + if (ret == 0) { ctx_id->internal_ctx_id = ctx; - else + if (endtime) + *endtime = ctx->endtime; + } else kfree(ctx); dprintk("RPC: %s: returning %d\n", __func__, ret); @@ -729,16 +732,19 @@ static const struct gss_api_ops gss_kerberos_ops = { static struct pf_desc gss_kerberos_pfs[] = { [0] = { .pseudoflavor = RPC_AUTH_GSS_KRB5, + .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_NONE, .name = "krb5", }, [1] = { .pseudoflavor = RPC_AUTH_GSS_KRB5I, + .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", }, [2] = { .pseudoflavor = RPC_AUTH_GSS_KRB5P, + .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_PRIVACY, .name = "krb5p", }, @@ -750,11 +756,12 @@ MODULE_ALIAS("rpc-auth-gss-krb5p"); MODULE_ALIAS("rpc-auth-gss-390003"); MODULE_ALIAS("rpc-auth-gss-390004"); MODULE_ALIAS("rpc-auth-gss-390005"); +MODULE_ALIAS("rpc-auth-gss-1.2.840.113554.1.2.2"); static struct gss_api_mech gss_kerberos_mech = { .gm_name = "krb5", .gm_owner = THIS_MODULE, - .gm_oid = {9, (void *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02"}, + .gm_oid = { 9, "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02" }, .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 107c4528654f..1da52d1406fc 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -130,8 +130,8 @@ gss_krb5_make_confounder(char 
*p, u32 conflen) /* initialize to random value */ if (i == 0) { - i = random32(); - i = (i << 32) | random32(); + i = prandom_u32(); + i = (i << 32) | prandom_u32(); } switch (conflen) { @@ -574,6 +574,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip; + /* Trim off the checksum blob */ + xdr_buf_trim(buf, GSS_KRB5_TOK_HDR_LEN + tailskip); return GSS_S_COMPLETE; } diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index b174fcd9ff4c..defa9d33925c 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -36,6 +36,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/oid_registry.h> #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/gss_asn1.h> #include <linux/sunrpc/auth_gss.h> @@ -102,8 +103,13 @@ out: return status; } -int -gss_mech_register(struct gss_api_mech *gm) +/** + * gss_mech_register - register a GSS mechanism + * @gm: GSS mechanism handle + * + * Returns zero if successful, or a negative errno. + */ +int gss_mech_register(struct gss_api_mech *gm) { int status; @@ -116,11 +122,14 @@ gss_mech_register(struct gss_api_mech *gm) dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); return 0; } - EXPORT_SYMBOL_GPL(gss_mech_register); -void -gss_mech_unregister(struct gss_api_mech *gm) +/** + * gss_mech_unregister - release a GSS mechanism + * @gm: GSS mechanism handle + * + */ +void gss_mech_unregister(struct gss_api_mech *gm) { spin_lock(&registered_mechs_lock); list_del(&gm->gm_list); @@ -128,19 +137,15 @@ gss_mech_unregister(struct gss_api_mech *gm) dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); gss_mech_free(gm); } - EXPORT_SYMBOL_GPL(gss_mech_unregister); -struct gss_api_mech * -gss_mech_get(struct gss_api_mech *gm) +static struct gss_api_mech *gss_mech_get(struct gss_api_mech *gm) { __module_get(gm->gm_owner); return gm; } -EXPORT_SYMBOL_GPL(gss_mech_get); - -struct gss_api_mech * +static struct gss_api_mech * _gss_mech_get_by_name(const char *name) { struct gss_api_mech *pos, *gm = NULL; @@ -169,12 +174,16 @@ struct gss_api_mech * gss_mech_get_by_name(const char *name) } return gm; } -EXPORT_SYMBOL_GPL(gss_mech_get_by_name); -struct gss_api_mech * -gss_mech_get_by_OID(struct xdr_netobj *obj) +struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) { struct gss_api_mech *pos, *gm = NULL; + char buf[32]; + + if (sprint_oid(obj->data, obj->len, buf, sizeof(buf)) < 0) + return NULL; + dprintk("RPC: %s(%s)\n", __func__, buf); + request_module("rpc-auth-gss-%s", buf); spin_lock(&registered_mechs_lock); list_for_each_entry(pos, &registered_mechs, gm_list) { @@ -188,11 +197,8 @@ gss_mech_get_by_OID(struct xdr_netobj *obj) } spin_unlock(&registered_mechs_lock); return gm; - } -EXPORT_SYMBOL_GPL(gss_mech_get_by_OID); - static inline int mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) { @@ -205,7 +211,7 @@ mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) return 0; } -struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) +static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) { struct gss_api_mech *gm = NULL, *pos; @@ -237,8 +243,6 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) return gm; } -EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); - /** * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors * @array: 
array to fill in @@ -268,19 +272,82 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) return i; } -u32 -gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) +/** + * gss_svc_to_pseudoflavor - map a GSS service number to a pseudoflavor + * @gm: GSS mechanism handle + * @qop: GSS quality-of-protection value + * @service: GSS service value + * + * Returns a matching security flavor, or RPC_AUTH_MAXFLAVOR if none is found. + */ +rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 qop, + u32 service) { int i; for (i = 0; i < gm->gm_pf_num; i++) { - if (gm->gm_pfs[i].service == service) { + if (gm->gm_pfs[i].qop == qop && + gm->gm_pfs[i].service == service) { return gm->gm_pfs[i].pseudoflavor; } } - return RPC_AUTH_MAXFLAVOR; /* illegal value */ + return RPC_AUTH_MAXFLAVOR; +} + +/** + * gss_mech_info2flavor - look up a pseudoflavor given a GSS tuple + * @info: a GSS mech OID, quality of protection, and service value + * + * Returns a matching pseudoflavor, or RPC_AUTH_MAXFLAVOR if the tuple is + * not supported. + */ +rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *info) +{ + rpc_authflavor_t pseudoflavor; + struct gss_api_mech *gm; + + gm = gss_mech_get_by_OID(&info->oid); + if (gm == NULL) + return RPC_AUTH_MAXFLAVOR; + + pseudoflavor = gss_svc_to_pseudoflavor(gm, info->qop, info->service); + + gss_mech_put(gm); + return pseudoflavor; +} + +/** + * gss_mech_flavor2info - look up a GSS tuple for a given pseudoflavor + * @pseudoflavor: GSS pseudoflavor to match + * @info: rpcsec_gss_info structure to fill in + * + * Returns zero and fills in "info" if pseudoflavor matches a + * supported mechanism. Otherwise a negative errno is returned. + */ +int gss_mech_flavor2info(rpc_authflavor_t pseudoflavor, + struct rpcsec_gss_info *info) +{ + struct gss_api_mech *gm; + int i; + + gm = gss_mech_get_by_pseudoflavor(pseudoflavor); + if (gm == NULL) + return -ENOENT; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) { + memcpy(info->oid.data, gm->gm_oid.data, gm->gm_oid.len); + info->oid.len = gm->gm_oid.len; + info->qop = gm->gm_pfs[i].qop; + info->service = gm->gm_pfs[i].service; + gss_mech_put(gm); + return 0; + } + } + + gss_mech_put(gm); + return -ENOENT; } -EXPORT_SYMBOL_GPL(gss_svc_to_pseudoflavor); u32 gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) @@ -294,8 +361,6 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) return 0; } -EXPORT_SYMBOL_GPL(gss_pseudoflavor_to_service); - char * gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) { @@ -308,8 +373,6 @@ gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) return NULL; } -EXPORT_SYMBOL_GPL(gss_service_to_auth_domain_name); - void gss_mech_put(struct gss_api_mech * gm) { @@ -317,22 +380,21 @@ gss_mech_put(struct gss_api_mech * gm) module_put(gm->gm_owner); } -EXPORT_SYMBOL_GPL(gss_mech_put); - /* The mech could probably be determined from the token instead, but it's just * as easy for now to pass it in. 
*/ int gss_import_sec_context(const void *input_token, size_t bufsize, struct gss_api_mech *mech, struct gss_ctx **ctx_id, + time_t *endtime, gfp_t gfp_mask) { if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) return -ENOMEM; (*ctx_id)->mech_type = gss_mech_get(mech); - return mech->gm_ops - ->gss_import_sec_context(input_token, bufsize, *ctx_id, gfp_mask); + return mech->gm_ops->gss_import_sec_context(input_token, bufsize, + *ctx_id, endtime, gfp_mask); } /* gss_get_mic: compute a mic over message and return mic_token. */ diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c new file mode 100644 index 000000000000..d304f41260f2 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -0,0 +1,358 @@ +/* + * linux/net/sunrpc/gss_rpc_upcall.c + * + * Copyright (C) 2012 Simo Sorce <simo@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/types.h> +#include <linux/un.h> + +#include <linux/sunrpc/svcauth.h> +#include "gss_rpc_upcall.h" + +#define GSSPROXY_SOCK_PATHNAME "/var/run/gssproxy.sock" + +#define GSSPROXY_PROGRAM (400112u) +#define GSSPROXY_VERS_1 (1u) + +/* + * Encoding/Decoding functions + */ + +enum { + GSSX_NULL = 0, /* Unused */ + GSSX_INDICATE_MECHS = 1, + GSSX_GET_CALL_CONTEXT = 2, + GSSX_IMPORT_AND_CANON_NAME = 3, + GSSX_EXPORT_CRED = 4, + GSSX_IMPORT_CRED = 5, + GSSX_ACQUIRE_CRED = 6, + GSSX_STORE_CRED = 7, + GSSX_INIT_SEC_CONTEXT = 8, + GSSX_ACCEPT_SEC_CONTEXT = 9, + GSSX_RELEASE_HANDLE = 10, + GSSX_GET_MIC = 11, + GSSX_VERIFY = 12, + GSSX_WRAP = 13, + GSSX_UNWRAP = 14, + GSSX_WRAP_SIZE_LIMIT = 15, +}; + +#define PROC(proc, name) \ +[GSSX_##proc] = { \ + .p_proc = GSSX_##proc, \ + .p_encode = (kxdreproc_t)gssx_enc_##name, \ + .p_decode = (kxdrdproc_t)gssx_dec_##name, \ + .p_arglen = GSSX_ARG_##name##_sz, \ + .p_replen = GSSX_RES_##name##_sz, \ + .p_statidx = GSSX_##proc, \ + .p_name = #proc, \ +} + +static struct rpc_procinfo gssp_procedures[] = { + PROC(INDICATE_MECHS, indicate_mechs), + PROC(GET_CALL_CONTEXT, get_call_context), + PROC(IMPORT_AND_CANON_NAME, import_and_canon_name), + PROC(EXPORT_CRED, export_cred), + PROC(IMPORT_CRED, import_cred), + PROC(ACQUIRE_CRED, acquire_cred), + PROC(STORE_CRED, store_cred), + PROC(INIT_SEC_CONTEXT, init_sec_context), + PROC(ACCEPT_SEC_CONTEXT, accept_sec_context), + PROC(RELEASE_HANDLE, release_handle), + PROC(GET_MIC, get_mic), + PROC(VERIFY, verify), + PROC(WRAP, wrap), + PROC(UNWRAP, unwrap), + PROC(WRAP_SIZE_LIMIT, wrap_size_limit), +}; + + + +/* + * Common transport functions + */ + +static const struct rpc_program gssp_program; + +static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) +{ + static const struct sockaddr_un gssp_localaddr = { + .sun_family = AF_LOCAL, + .sun_path = GSSPROXY_SOCK_PATHNAME, + }; + struct rpc_create_args args = { + .net = net, + .protocol = XPRT_TRANSPORT_LOCAL, 
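+ /* XPRT_TRANSPORT_LOCAL selects the AF_LOCAL (unix socket) RPC + * transport; the address below points the client at gssproxy's + * listening socket. */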
+ .address = (struct sockaddr *)&gssp_localaddr, + .addrsize = sizeof(gssp_localaddr), + .servername = "localhost", + .program = &gssp_program, + .version = GSSPROXY_VERS_1, + .authflavor = RPC_AUTH_NULL, + /* + * Note we want connection to be done in the caller's + * filesystem namespace. We therefore turn off the idle + * timeout, which would result in reconnections being + * done without the correct namespace: + */ + .flags = RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_NO_IDLE_TIMEOUT + }; + struct rpc_clnt *clnt; + int result = 0; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("RPC: failed to create AF_LOCAL gssproxy " + "client (errno %ld).\n", PTR_ERR(clnt)); + result = -PTR_ERR(clnt); + *_clnt = NULL; + goto out; + } + + dprintk("RPC: created new gssp local client (gssp_local_clnt: " + "%p)\n", clnt); + *_clnt = clnt; + +out: + return result; +} + +void init_gssp_clnt(struct sunrpc_net *sn) +{ + mutex_init(&sn->gssp_lock); + sn->gssp_clnt = NULL; + init_waitqueue_head(&sn->gssp_wq); +} + +int set_gssp_clnt(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + struct rpc_clnt *clnt; + int ret; + + mutex_lock(&sn->gssp_lock); + ret = gssp_rpc_create(net, &clnt); + if (!ret) { + if (sn->gssp_clnt) + rpc_shutdown_client(sn->gssp_clnt); + sn->gssp_clnt = clnt; + } + mutex_unlock(&sn->gssp_lock); + wake_up(&sn->gssp_wq); + return ret; +} + +void clear_gssp_clnt(struct sunrpc_net *sn) +{ + mutex_lock(&sn->gssp_lock); + if (sn->gssp_clnt) { + rpc_shutdown_client(sn->gssp_clnt); + sn->gssp_clnt = NULL; + } + mutex_unlock(&sn->gssp_lock); +} + +static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) +{ + struct rpc_clnt *clnt; + + mutex_lock(&sn->gssp_lock); + clnt = sn->gssp_clnt; + if (clnt) + atomic_inc(&clnt->cl_count); + mutex_unlock(&sn->gssp_lock); + return clnt; +} + +static int gssp_call(struct net *net, struct rpc_message *msg) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + struct rpc_clnt *clnt; + int status; + + clnt = get_gssp_clnt(sn); + if (!clnt) + return -EIO; + status = rpc_call_sync(clnt, msg, 0); + if (status < 0) { + dprintk("gssp: rpc_call returned error %d\n", -status); + switch (status) { + case -EPROTONOSUPPORT: + status = -EINVAL; + break; + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ENOTCONN: + status = -EAGAIN; + break; + case -ERESTARTSYS: + if (signalled ()) + status = -EINTR; + break; + default: + break; + } + } + rpc_release_client(clnt); + return status; +} + + +/* + * Public functions + */ + +/* numbers somewhat arbitrary but large enough for current needs */ +#define GSSX_MAX_OUT_HANDLE 128 +#define GSSX_MAX_SRC_PRINC 256 +#define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \ + GSSX_max_oid_sz + \ + GSSX_max_princ_sz + \ + sizeof(struct svc_cred)) + +int gssp_accept_sec_context_upcall(struct net *net, + struct gssp_upcall_data *data) +{ + struct gssx_ctx ctxh = { + .state = data->in_handle + }; + struct gssx_arg_accept_sec_context arg = { + .input_token = data->in_token, + }; + struct gssx_ctx rctxh = { + /* + * pass in the max length we expect for each of these + * buffers but let the xdr code kmalloc them: + */ + .exported_context_token.len = GSSX_max_output_handle_sz, + .mech.len = GSS_OID_MAX_LEN, + .src_name.display_name.len = GSSX_max_princ_sz + }; + struct gssx_res_accept_sec_context res = { + .context_handle = &rctxh, + .output_token = &data->out_token + }; + struct rpc_message msg = { + .rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT], + .rpc_argp = &arg, + .rpc_resp = &res, + 
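/* A NULL cred means the RPC layer falls back to the client's + * default credential (AUTH_NULL for this client). */ +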
.rpc_cred = NULL, /* FIXME ? */ + }; + struct xdr_netobj client_name = { 0 , NULL }; + int ret; + + if (data->in_handle.len != 0) + arg.context_handle = &ctxh; + res.output_token->len = GSSX_max_output_token_sz; + + /* use nfs/ for targ_name ? */ + + ret = gssp_call(net, &msg); + + /* we need to fetch all data even in case of error so + * that we can free special structures if they have been allocated */ + data->major_status = res.status.major_status; + data->minor_status = res.status.minor_status; + if (res.context_handle) { + data->out_handle = rctxh.exported_context_token; + data->mech_oid.len = rctxh.mech.len; + memcpy(data->mech_oid.data, rctxh.mech.data, + data->mech_oid.len); + client_name = rctxh.src_name.display_name; + } + + if (res.options.count == 1) { + gssx_buffer *value = &res.options.data[0].value; + /* Currently we only decode CREDS_VALUE; if we add + * anything else we'll have to loop and match on the + * option name */ + if (value->len == 1) { + /* steal group info from struct svc_cred */ + data->creds = *(struct svc_cred *)value->data; + data->found_creds = 1; + } + /* whether we use it or not, free data */ + kfree(value->data); + } + + if (res.options.count != 0) { + kfree(res.options.data); + } + + /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */ + if (data->found_creds && client_name.data != NULL) { + char *c; + + data->creds.cr_principal = kstrndup(client_name.data, + client_name.len, GFP_KERNEL); + if (data->creds.cr_principal) { + /* terminate and remove realm part */ + c = strchr(data->creds.cr_principal, '@'); + if (c) { + *c = '\0'; + + /* change service-hostname delimiter */ + c = strchr(data->creds.cr_principal, '/'); + if (c) *c = '@'; + } + if (!c) { + /* not a service principal */ + kfree(data->creds.cr_principal); + data->creds.cr_principal = NULL; + } + } + } + kfree(client_name.data); + + return ret; +} + +void gssp_free_upcall_data(struct gssp_upcall_data *data) +{ + kfree(data->in_handle.data); + kfree(data->out_handle.data); + kfree(data->out_token.data); + kfree(data->mech_oid.data); + free_svc_cred(&data->creds); +} + +/* + * Initialization stuff + */ + +static const struct rpc_version gssp_version1 = { + .number = GSSPROXY_VERS_1, + .nrprocs = ARRAY_SIZE(gssp_procedures), + .procs = gssp_procedures, +}; + +static const struct rpc_version *gssp_version[] = { + NULL, + &gssp_version1, +}; + +static struct rpc_stat gssp_stats; + +static const struct rpc_program gssp_program = { + .name = "gssproxy", + .number = GSSPROXY_PROGRAM, + .nrvers = ARRAY_SIZE(gssp_version), + .version = gssp_version, + .stats = &gssp_stats, +}; diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.h b/net/sunrpc/auth_gss/gss_rpc_upcall.h new file mode 100644 index 000000000000..1e542aded90a --- /dev/null +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.h @@ -0,0 +1,48 @@ +/* + * linux/net/sunrpc/gss_rpc_upcall.h + * + * Copyright (C) 2012 Simo Sorce <simo@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _GSS_RPC_UPCALL_H +#define _GSS_RPC_UPCALL_H + +#include <linux/sunrpc/gss_api.h> +#include <linux/sunrpc/auth_gss.h> +#include "gss_rpc_xdr.h" +#include "../netns.h" + +struct gssp_upcall_data { + struct xdr_netobj in_handle; + struct gssp_in_token in_token; + struct xdr_netobj out_handle; + struct xdr_netobj out_token; + struct rpcsec_gss_oid mech_oid; + struct svc_cred creds; + int found_creds; + int major_status; + int minor_status; +}; + +int gssp_accept_sec_context_upcall(struct net *net, + struct gssp_upcall_data *data); +void gssp_free_upcall_data(struct gssp_upcall_data *data); + +void init_gssp_clnt(struct sunrpc_net *); +int set_gssp_clnt(struct net *); +void clear_gssp_clnt(struct sunrpc_net *); +#endif /* _GSS_RPC_UPCALL_H */ diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c new file mode 100644 index 000000000000..357f613df7ff --- /dev/null +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -0,0 +1,840 @@ +/* + * GSS Proxy upcall module + * + * Copyright (C) 2012 Simo Sorce <simo@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/sunrpc/svcauth.h> +#include "gss_rpc_xdr.h" + +static int gssx_enc_bool(struct xdr_stream *xdr, int v) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + *p = v ? 
xdr_one : xdr_zero; + return 0; +} + +static int gssx_dec_bool(struct xdr_stream *xdr, u32 *v) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + *v = be32_to_cpu(*p); + return 0; +} + +static int gssx_enc_buffer(struct xdr_stream *xdr, + gssx_buffer *buf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(u32) + buf->len); + if (!p) + return -ENOSPC; + xdr_encode_opaque(p, buf->data, buf->len); + return 0; +} + +static int gssx_enc_in_token(struct xdr_stream *xdr, + struct gssp_in_token *in) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return -ENOSPC; + *p = cpu_to_be32(in->page_len); + + /* all we need to do is to write pages */ + xdr_write_pages(xdr, in->pages, in->page_base, in->page_len); + + return 0; +} + + +static int gssx_dec_buffer(struct xdr_stream *xdr, + gssx_buffer *buf) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + + length = be32_to_cpup(p); + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + return -ENOSPC; + + if (buf->len == 0) { + /* we intentionally are not interested in this buffer */ + return 0; + } + if (length > buf->len) + return -ENOSPC; + + if (!buf->data) { + buf->data = kmemdup(p, length, GFP_KERNEL); + if (!buf->data) + return -ENOMEM; + } else { + memcpy(buf->data, p, length); + } + buf->len = length; + return 0; +} + +static int gssx_enc_option(struct xdr_stream *xdr, + struct gssx_option *opt) +{ + int err; + + err = gssx_enc_buffer(xdr, &opt->option); + if (err) + return err; + err = gssx_enc_buffer(xdr, &opt->value); + return err; +} + +static int gssx_dec_option(struct xdr_stream *xdr, + struct gssx_option *opt) +{ + int err; + + err = gssx_dec_buffer(xdr, &opt->option); + if (err) + return err; + err = gssx_dec_buffer(xdr, &opt->value); + return err; +} + +static int dummy_enc_opt_array(struct xdr_stream *xdr, + struct gssx_option_array *oa) +{ + __be32 *p; + + if (oa->count != 0) + return -EINVAL; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return -ENOSPC; + *p = 0; + + return 0; +} + +static int dummy_dec_opt_array(struct xdr_stream *xdr, + struct gssx_option_array *oa) +{ + struct gssx_option dummy; + u32 count, i; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + count = be32_to_cpup(p++); + memset(&dummy, 0, sizeof(dummy)); + for (i = 0; i < count; i++) { + gssx_dec_option(xdr, &dummy); + } + + oa->count = 0; + oa->data = NULL; + return 0; +} + +static int get_s32(void **p, void *max, s32 *res) +{ + void *base = *p; + void *next = (void *)((char *)base + sizeof(s32)); + if (unlikely(next > max || next < base)) + return -EINVAL; + memcpy(res, base, sizeof(s32)); + *p = next; + return 0; +} + +static int gssx_dec_linux_creds(struct xdr_stream *xdr, + struct svc_cred *creds) +{ + u32 length; + __be32 *p; + void *q, *end; + s32 tmp; + int N, i, err; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + + length = be32_to_cpup(p); + + /* FIXME: we do not want to use the scratch buffer for this one + * may need to use functions that allows us to access an io vector + * directly */ + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + return -ENOSPC; + + q = p; + end = q + length; + + /* uid */ + err = get_s32(&q, end, &tmp); + if (err) + return err; + creds->cr_uid = make_kuid(&init_user_ns, tmp); + + /* gid */ + err = get_s32(&q, end, &tmp); + if (err) + return err; + creds->cr_gid = make_kgid(&init_user_ns, tmp); + + 
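/* Note that cr_uid and cr_gid are not run through uid_valid()/ + * gid_valid() at this point, unlike the supplementary gids below. */ +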
/* number of additional gid's */ + err = get_s32(&q, end, &tmp); + if (err) + return err; + N = tmp; + creds->cr_group_info = groups_alloc(N); + if (creds->cr_group_info == NULL) + return -ENOMEM; + + /* gid's */ + for (i = 0; i < N; i++) { + kgid_t kgid; + err = get_s32(&q, end, &tmp); + if (err) + goto out_free_groups; + err = -EINVAL; + kgid = make_kgid(&init_user_ns, tmp); + if (!gid_valid(kgid)) + goto out_free_groups; + GROUP_AT(creds->cr_group_info, i) = kgid; + } + + return 0; +out_free_groups: + groups_free(creds->cr_group_info); + return err; +} + +static int gssx_dec_option_array(struct xdr_stream *xdr, + struct gssx_option_array *oa) +{ + struct svc_cred *creds; + u32 count, i; + __be32 *p; + int err; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + count = be32_to_cpup(p++); + if (!count) + return 0; + + /* we recognize only 1 currently: CREDS_VALUE */ + oa->count = 1; + + oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL); + if (!oa->data) + return -ENOMEM; + + creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL); + if (!creds) { + kfree(oa->data); + return -ENOMEM; + } + + oa->data[0].option.data = CREDS_VALUE; + oa->data[0].option.len = sizeof(CREDS_VALUE); + oa->data[0].value.data = (void *)creds; + oa->data[0].value.len = 0; + + for (i = 0; i < count; i++) { + gssx_buffer dummy = { 0, NULL }; + u32 length; + + /* option buffer */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + + length = be32_to_cpup(p); + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + return -ENOSPC; + + if (length == sizeof(CREDS_VALUE) && + memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) { + /* We have creds here. parse them */ + err = gssx_dec_linux_creds(xdr, creds); + if (err) + return err; + oa->data[0].value.len = 1; /* presence */ + } else { + /* consume uninteresting buffer */ + err = gssx_dec_buffer(xdr, &dummy); + if (err) + return err; + } + } + return 0; +} + +static int gssx_dec_status(struct xdr_stream *xdr, + struct gssx_status *status) +{ + __be32 *p; + int err; + + /* status->major_status */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + return -ENOSPC; + p = xdr_decode_hyper(p, &status->major_status); + + /* status->mech */ + err = gssx_dec_buffer(xdr, &status->mech); + if (err) + return err; + + /* status->minor_status */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + return -ENOSPC; + p = xdr_decode_hyper(p, &status->minor_status); + + /* status->major_status_string */ + err = gssx_dec_buffer(xdr, &status->major_status_string); + if (err) + return err; + + /* status->minor_status_string */ + err = gssx_dec_buffer(xdr, &status->minor_status_string); + if (err) + return err; + + /* status->server_ctx */ + err = gssx_dec_buffer(xdr, &status->server_ctx); + if (err) + return err; + + /* we assume we have no options for now, so simply consume them */ + /* status->options */ + err = dummy_dec_opt_array(xdr, &status->options); + + return err; +} + +static int gssx_enc_call_ctx(struct xdr_stream *xdr, + struct gssx_call_ctx *ctx) +{ + struct gssx_option opt; + __be32 *p; + int err; + + /* ctx->locale */ + err = gssx_enc_buffer(xdr, &ctx->locale); + if (err) + return err; + + /* ctx->server_ctx */ + err = gssx_enc_buffer(xdr, &ctx->server_ctx); + if (err) + return err; + + /* we always want to ask for lucid contexts */ + /* ctx->options */ + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(2); + + /* we want a lucid_v1 context */ + opt.option.data = LUCID_OPTION; + 
opt.option.len = sizeof(LUCID_OPTION); + opt.value.data = LUCID_VALUE; + opt.value.len = sizeof(LUCID_VALUE); + err = gssx_enc_option(xdr, &opt); + + /* ..and user creds */ + opt.option.data = CREDS_OPTION; + opt.option.len = sizeof(CREDS_OPTION); + opt.value.data = CREDS_VALUE; + opt.value.len = sizeof(CREDS_VALUE); + err = gssx_enc_option(xdr, &opt); + + return err; +} + +static int gssx_dec_name_attr(struct xdr_stream *xdr, + struct gssx_name_attr *attr) +{ + int err; + + /* attr->attr */ + err = gssx_dec_buffer(xdr, &attr->attr); + if (err) + return err; + + /* attr->value */ + err = gssx_dec_buffer(xdr, &attr->value); + if (err) + return err; + + /* attr->extensions */ + err = dummy_dec_opt_array(xdr, &attr->extensions); + + return err; +} + +static int dummy_enc_nameattr_array(struct xdr_stream *xdr, + struct gssx_name_attr_array *naa) +{ + __be32 *p; + + if (naa->count != 0) + return -EINVAL; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return -ENOSPC; + *p = 0; + + return 0; +} + +static int dummy_dec_nameattr_array(struct xdr_stream *xdr, + struct gssx_name_attr_array *naa) +{ + struct gssx_name_attr dummy; + u32 count, i; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -ENOSPC; + count = be32_to_cpup(p++); + for (i = 0; i < count; i++) { + gssx_dec_name_attr(xdr, &dummy); + } + + naa->count = 0; + naa->data = NULL; + return 0; +} + +static struct xdr_netobj zero_netobj = {}; + +static struct gssx_name_attr_array zero_name_attr_array = {}; + +static struct gssx_option_array zero_option_array = {}; + +static int gssx_enc_name(struct xdr_stream *xdr, + struct gssx_name *name) +{ + int err; + + /* name->display_name */ + err = gssx_enc_buffer(xdr, &name->display_name); + if (err) + return err; + + /* name->name_type */ + err = gssx_enc_buffer(xdr, &zero_netobj); + if (err) + return err; + + /* name->exported_name */ + err = gssx_enc_buffer(xdr, &zero_netobj); + if (err) + return err; + + /* name->exported_composite_name */ + err = gssx_enc_buffer(xdr, &zero_netobj); + if (err) + return err; + + /* leave name_attributes empty for now, will add once we have any + * to pass up at all */ + /* name->name_attributes */ + err = dummy_enc_nameattr_array(xdr, &zero_name_attr_array); + if (err) + return err; + + /* leave options empty for now, will add once we have any options + * to pass up at all */ + /* name->extensions */ + err = dummy_enc_opt_array(xdr, &zero_option_array); + + return err; +} + +static int gssx_dec_name(struct xdr_stream *xdr, + struct gssx_name *name) +{ + struct xdr_netobj dummy_netobj; + struct gssx_name_attr_array dummy_name_attr_array; + struct gssx_option_array dummy_option_array; + int err; + + /* name->display_name */ + err = gssx_dec_buffer(xdr, &name->display_name); + if (err) + return err; + + /* name->name_type */ + err = gssx_dec_buffer(xdr, &dummy_netobj); + if (err) + return err; + + /* name->exported_name */ + err = gssx_dec_buffer(xdr, &dummy_netobj); + if (err) + return err; + + /* name->exported_composite_name */ + err = gssx_dec_buffer(xdr, &dummy_netobj); + if (err) + return err; + + /* we assume we have no attributes for now, so simply consume them */ + /* name->name_attributes */ + err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array); + if (err) + return err; + + /* we assume we have no options for now, so simply consume them */ + /* name->extensions */ + err = dummy_dec_opt_array(xdr, &dummy_option_array); + + return err; +} + +static int dummy_enc_credel_array(struct xdr_stream *xdr, + struct 
gssx_cred_element_array *cea) +{ + __be32 *p; + + if (cea->count != 0) + return -EINVAL; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return -ENOSPC; + *p = 0; + + return 0; +} + +static int gssx_enc_cred(struct xdr_stream *xdr, + struct gssx_cred *cred) +{ + int err; + + /* cred->desired_name */ + err = gssx_enc_name(xdr, &cred->desired_name); + if (err) + return err; + + /* cred->elements */ + err = dummy_enc_credel_array(xdr, &cred->elements); + + /* cred->cred_handle_reference */ + err = gssx_enc_buffer(xdr, &cred->cred_handle_reference); + if (err) + return err; + + /* cred->needs_release */ + err = gssx_enc_bool(xdr, cred->needs_release); + + return err; +} + +static int gssx_enc_ctx(struct xdr_stream *xdr, + struct gssx_ctx *ctx) +{ + __be32 *p; + int err; + + /* ctx->exported_context_token */ + err = gssx_enc_buffer(xdr, &ctx->exported_context_token); + if (err) + return err; + + /* ctx->state */ + err = gssx_enc_buffer(xdr, &ctx->state); + if (err) + return err; + + /* ctx->need_release */ + err = gssx_enc_bool(xdr, ctx->need_release); + if (err) + return err; + + /* ctx->mech */ + err = gssx_enc_buffer(xdr, &ctx->mech); + if (err) + return err; + + /* ctx->src_name */ + err = gssx_enc_name(xdr, &ctx->src_name); + if (err) + return err; + + /* ctx->targ_name */ + err = gssx_enc_name(xdr, &ctx->targ_name); + if (err) + return err; + + /* ctx->lifetime */ + p = xdr_reserve_space(xdr, 8+8); + if (!p) + return -ENOSPC; + p = xdr_encode_hyper(p, ctx->lifetime); + + /* ctx->ctx_flags */ + p = xdr_encode_hyper(p, ctx->ctx_flags); + + /* ctx->locally_initiated */ + err = gssx_enc_bool(xdr, ctx->locally_initiated); + if (err) + return err; + + /* ctx->open */ + err = gssx_enc_bool(xdr, ctx->open); + if (err) + return err; + + /* leave options empty for now, will add once we have any options + * to pass up at all */ + /* ctx->options */ + err = dummy_enc_opt_array(xdr, &ctx->options); + + return err; +} + +static int gssx_dec_ctx(struct xdr_stream *xdr, + struct gssx_ctx *ctx) +{ + __be32 *p; + int err; + + /* ctx->exported_context_token */ + err = gssx_dec_buffer(xdr, &ctx->exported_context_token); + if (err) + return err; + + /* ctx->state */ + err = gssx_dec_buffer(xdr, &ctx->state); + if (err) + return err; + + /* ctx->need_release */ + err = gssx_dec_bool(xdr, &ctx->need_release); + if (err) + return err; + + /* ctx->mech */ + err = gssx_dec_buffer(xdr, &ctx->mech); + if (err) + return err; + + /* ctx->src_name */ + err = gssx_dec_name(xdr, &ctx->src_name); + if (err) + return err; + + /* ctx->targ_name */ + err = gssx_dec_name(xdr, &ctx->targ_name); + if (err) + return err; + + /* ctx->lifetime */ + p = xdr_inline_decode(xdr, 8+8); + if (unlikely(p == NULL)) + return -ENOSPC; + p = xdr_decode_hyper(p, &ctx->lifetime); + + /* ctx->ctx_flags */ + p = xdr_decode_hyper(p, &ctx->ctx_flags); + + /* ctx->locally_initiated */ + err = gssx_dec_bool(xdr, &ctx->locally_initiated); + if (err) + return err; + + /* ctx->open */ + err = gssx_dec_bool(xdr, &ctx->open); + if (err) + return err; + + /* we assume we have no options for now, so simply consume them */ + /* ctx->options */ + err = dummy_dec_opt_array(xdr, &ctx->options); + + return err; +} + +static int gssx_enc_cb(struct xdr_stream *xdr, struct gssx_cb *cb) +{ + __be32 *p; + int err; + + /* cb->initiator_addrtype */ + p = xdr_reserve_space(xdr, 8); + if (!p) + return -ENOSPC; + p = xdr_encode_hyper(p, cb->initiator_addrtype); + + /* cb->initiator_address */ + err = gssx_enc_buffer(xdr, &cb->initiator_address); + if (err) + return err; + 
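/* Each channel-binding address is a 64-bit address type followed + * by an opaque buffer; the acceptor fields below mirror the + * initiator encoding above. */ +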
+ /* cb->acceptor_addrtype */ + p = xdr_reserve_space(xdr, 8); + if (!p) + return -ENOSPC; + p = xdr_encode_hyper(p, cb->acceptor_addrtype); + + /* cb->acceptor_address */ + err = gssx_enc_buffer(xdr, &cb->acceptor_address); + if (err) + return err; + + /* cb->application_data */ + err = gssx_enc_buffer(xdr, &cb->application_data); + + return err; +} + +void gssx_enc_accept_sec_context(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct gssx_arg_accept_sec_context *arg) +{ + int err; + + err = gssx_enc_call_ctx(xdr, &arg->call_ctx); + if (err) + goto done; + + /* arg->context_handle */ + if (arg->context_handle) { + err = gssx_enc_ctx(xdr, arg->context_handle); + if (err) + goto done; + } else { + err = gssx_enc_bool(xdr, 0); + } + + /* arg->cred_handle */ + if (arg->cred_handle) { + err = gssx_enc_cred(xdr, arg->cred_handle); + if (err) + goto done; + } else { + err = gssx_enc_bool(xdr, 0); + } + + /* arg->input_token */ + err = gssx_enc_in_token(xdr, &arg->input_token); + if (err) + goto done; + + /* arg->input_cb */ + if (arg->input_cb) { + err = gssx_enc_cb(xdr, arg->input_cb); + if (err) + goto done; + } else { + err = gssx_enc_bool(xdr, 0); + } + + err = gssx_enc_bool(xdr, arg->ret_deleg_cred); + if (err) + goto done; + + /* leave options empty for now, will add once we have any options + * to pass up at all */ + /* arg->options */ + err = dummy_enc_opt_array(xdr, &arg->options); + +done: + if (err) + dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); +} + +int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct gssx_res_accept_sec_context *res) +{ + u32 value_follows; + int err; + + /* res->status */ + err = gssx_dec_status(xdr, &res->status); + if (err) + return err; + + /* res->context_handle */ + err = gssx_dec_bool(xdr, &value_follows); + if (err) + return err; + if (value_follows) { + err = gssx_dec_ctx(xdr, res->context_handle); + if (err) + return err; + } else { + res->context_handle = NULL; + } + + /* res->output_token */ + err = gssx_dec_bool(xdr, &value_follows); + if (err) + return err; + if (value_follows) { + err = gssx_dec_buffer(xdr, res->output_token); + if (err) + return err; + } else { + res->output_token = NULL; + } + + /* res->delegated_cred_handle */ + err = gssx_dec_bool(xdr, &value_follows); + if (err) + return err; + if (value_follows) { + /* we do not support upcall servers sending this data. */ + return -EINVAL; + } + + /* res->options */ + err = gssx_dec_option_array(xdr, &res->options); + + return err; +} diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h new file mode 100644 index 000000000000..1c98b27d870c --- /dev/null +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h @@ -0,0 +1,264 @@ +/* + * GSS Proxy upcall module + * + * Copyright (C) 2012 Simo Sorce <simo@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef _LINUX_GSS_RPC_XDR_H +#define _LINUX_GSS_RPC_XDR_H + +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprtsock.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +#define LUCID_OPTION "exported_context_type" +#define LUCID_VALUE "linux_lucid_v1" +#define CREDS_OPTION "exported_creds_type" +#define CREDS_VALUE "linux_creds_v1" + +typedef struct xdr_netobj gssx_buffer; +typedef struct xdr_netobj utf8string; +typedef struct xdr_netobj gssx_OID; + +enum gssx_cred_usage { + GSSX_C_INITIATE = 1, + GSSX_C_ACCEPT = 2, + GSSX_C_BOTH = 3, +}; + +struct gssx_option { + gssx_buffer option; + gssx_buffer value; +}; + +struct gssx_option_array { + u32 count; + struct gssx_option *data; +}; + +struct gssx_status { + u64 major_status; + gssx_OID mech; + u64 minor_status; + utf8string major_status_string; + utf8string minor_status_string; + gssx_buffer server_ctx; + struct gssx_option_array options; +}; + +struct gssx_call_ctx { + utf8string locale; + gssx_buffer server_ctx; + struct gssx_option_array options; +}; + +struct gssx_name_attr { + gssx_buffer attr; + gssx_buffer value; + struct gssx_option_array extensions; +}; + +struct gssx_name_attr_array { + u32 count; + struct gssx_name_attr *data; +}; + +struct gssx_name { + gssx_buffer display_name; +}; +typedef struct gssx_name gssx_name; + +struct gssx_cred_element { + gssx_name MN; + gssx_OID mech; + u32 cred_usage; + u64 initiator_time_rec; + u64 acceptor_time_rec; + struct gssx_option_array options; +}; + +struct gssx_cred_element_array { + u32 count; + struct gssx_cred_element *data; +}; + +struct gssx_cred { + gssx_name desired_name; + struct gssx_cred_element_array elements; + gssx_buffer cred_handle_reference; + u32 needs_release; +}; + +struct gssx_ctx { + gssx_buffer exported_context_token; + gssx_buffer state; + u32 need_release; + gssx_OID mech; + gssx_name src_name; + gssx_name targ_name; + u64 lifetime; + u64 ctx_flags; + u32 locally_initiated; + u32 open; + struct gssx_option_array options; +}; + +struct gssx_cb { + u64 initiator_addrtype; + gssx_buffer initiator_address; + u64 acceptor_addrtype; + gssx_buffer acceptor_address; + gssx_buffer application_data; +}; + + +/* This structure is not defined in the protocol. 
+ * It is used in the kernel to carry around a big buffer + * as a set of pages */ +struct gssp_in_token { + struct page **pages; /* Array of contiguous pages */ + unsigned int page_base; /* Start of page data */ + unsigned int page_len; /* Length of page data */ +}; + +struct gssx_arg_accept_sec_context { + struct gssx_call_ctx call_ctx; + struct gssx_ctx *context_handle; + struct gssx_cred *cred_handle; + struct gssp_in_token input_token; + struct gssx_cb *input_cb; + u32 ret_deleg_cred; + struct gssx_option_array options; +}; + +struct gssx_res_accept_sec_context { + struct gssx_status status; + struct gssx_ctx *context_handle; + gssx_buffer *output_token; + /* struct gssx_cred *delegated_cred_handle; not used in kernel */ + struct gssx_option_array options; +}; + + + +#define gssx_enc_indicate_mechs NULL +#define gssx_dec_indicate_mechs NULL +#define gssx_enc_get_call_context NULL +#define gssx_dec_get_call_context NULL +#define gssx_enc_import_and_canon_name NULL +#define gssx_dec_import_and_canon_name NULL +#define gssx_enc_export_cred NULL +#define gssx_dec_export_cred NULL +#define gssx_enc_import_cred NULL +#define gssx_dec_import_cred NULL +#define gssx_enc_acquire_cred NULL +#define gssx_dec_acquire_cred NULL +#define gssx_enc_store_cred NULL +#define gssx_dec_store_cred NULL +#define gssx_enc_init_sec_context NULL +#define gssx_dec_init_sec_context NULL +void gssx_enc_accept_sec_context(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct gssx_arg_accept_sec_context *args); +int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct gssx_res_accept_sec_context *res); +#define gssx_enc_release_handle NULL +#define gssx_dec_release_handle NULL +#define gssx_enc_get_mic NULL +#define gssx_dec_get_mic NULL +#define gssx_enc_verify NULL +#define gssx_dec_verify NULL +#define gssx_enc_wrap NULL +#define gssx_dec_wrap NULL +#define gssx_enc_unwrap NULL +#define gssx_dec_unwrap NULL +#define gssx_enc_wrap_size_limit NULL +#define gssx_dec_wrap_size_limit NULL + +/* non implemented calls are set to 0 size */ +#define GSSX_ARG_indicate_mechs_sz 0 +#define GSSX_RES_indicate_mechs_sz 0 +#define GSSX_ARG_get_call_context_sz 0 +#define GSSX_RES_get_call_context_sz 0 +#define GSSX_ARG_import_and_canon_name_sz 0 +#define GSSX_RES_import_and_canon_name_sz 0 +#define GSSX_ARG_export_cred_sz 0 +#define GSSX_RES_export_cred_sz 0 +#define GSSX_ARG_import_cred_sz 0 +#define GSSX_RES_import_cred_sz 0 +#define GSSX_ARG_acquire_cred_sz 0 +#define GSSX_RES_acquire_cred_sz 0 +#define GSSX_ARG_store_cred_sz 0 +#define GSSX_RES_store_cred_sz 0 +#define GSSX_ARG_init_sec_context_sz 0 +#define GSSX_RES_init_sec_context_sz 0 + +#define GSSX_default_in_call_ctx_sz (4 + 4 + 4 + \ + 8 + sizeof(LUCID_OPTION) + sizeof(LUCID_VALUE) + \ + 8 + sizeof(CREDS_OPTION) + sizeof(CREDS_VALUE)) +#define GSSX_default_in_ctx_hndl_sz (4 + 4+8 + 4 + 4 + 6*4 + 6*4 + 8 + 8 + \ + 4 + 4 + 4) +#define GSSX_default_in_cred_sz 4 /* we send in no cred_handle */ +#define GSSX_default_in_token_sz 4 /* does *not* include token data */ +#define GSSX_default_in_cb_sz 4 /* we do not use channel bindings */ +#define GSSX_ARG_accept_sec_context_sz (GSSX_default_in_call_ctx_sz + \ + GSSX_default_in_ctx_hndl_sz + \ + GSSX_default_in_cred_sz + \ + GSSX_default_in_token_sz + \ + GSSX_default_in_cb_sz + \ + 4 /* no deleg creds boolean */ + \ + 4) /* empty options */ + +/* somewhat arbitrary numbers but large enough (we ignore some of the data + * sent down, but it is part of the protocol so we need enough space to 
take + * it in) */ +#define GSSX_default_status_sz 8 + 24 + 8 + 256 + 256 + 16 + 4 +#define GSSX_max_output_handle_sz 128 +#define GSSX_max_oid_sz 16 +#define GSSX_max_princ_sz 256 +#define GSSX_default_ctx_sz (GSSX_max_output_handle_sz + \ + 16 + 4 + GSSX_max_oid_sz + \ + 2 * GSSX_max_princ_sz + \ + 8 + 8 + 4 + 4 + 4) +#define GSSX_max_output_token_sz 1024 +#define GSSX_max_creds_sz (4 + 4 + 4 + NGROUPS_MAX * 4) +#define GSSX_RES_accept_sec_context_sz (GSSX_default_status_sz + \ + GSSX_default_ctx_sz + \ + GSSX_max_output_token_sz + \ + 4 + GSSX_max_creds_sz) + +#define GSSX_ARG_release_handle_sz 0 +#define GSSX_RES_release_handle_sz 0 +#define GSSX_ARG_get_mic_sz 0 +#define GSSX_RES_get_mic_sz 0 +#define GSSX_ARG_verify_sz 0 +#define GSSX_RES_verify_sz 0 +#define GSSX_ARG_wrap_sz 0 +#define GSSX_RES_wrap_sz 0 +#define GSSX_ARG_unwrap_sz 0 +#define GSSX_RES_unwrap_sz 0 +#define GSSX_ARG_wrap_size_limit_sz 0 +#define GSSX_RES_wrap_size_limit_sz 0 + + + +#endif /* _LINUX_GSS_RPC_XDR_H */ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73e957386600..871c73c92165 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -48,8 +48,8 @@ #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/cache.h> +#include "gss_rpc_upcall.h" -#include "../netns.h" #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -182,12 +182,6 @@ static void rsi_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) -{ - return sunrpc_cache_pipe_upcall(cd, h, rsi_request); -} - - static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { @@ -275,7 +269,7 @@ static struct cache_detail rsi_cache_template = { .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", .cache_put = rsi_put, - .cache_upcall = rsi_upcall, + .cache_request = rsi_request, .cache_parse = rsi_parse, .match = rsi_match, .init = rsi_init, @@ -418,6 +412,7 @@ static int rsc_parse(struct cache_detail *cd, { /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */ char *buf = mesg; + int id; int len, rv; struct rsc rsci, *rscp = NULL; time_t expiry; @@ -444,7 +439,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; /* uid, or NEGATIVE */ - rv = get_int(&mesg, &rsci.cred.cr_uid); + rv = get_int(&mesg, &id); if (rv == -EINVAL) goto out; if (rv == -ENOENT) @@ -452,9 +447,21 @@ static int rsc_parse(struct cache_detail *cd, else { int N, i; + /* + * NOTE: we skip uid_valid()/gid_valid() checks here: + * instead, * -1 id's are later mapped to the + * (export-specific) anonymous id by nfsd_setuser. + * + * (But supplementary gid's get no such special + * treatment so are checked for validity here.) 
+ */ + /* uid */ + rsci.cred.cr_uid = make_kuid(&init_user_ns, id); + /* gid */ - if (get_int(&mesg, &rsci.cred.cr_gid)) + if (get_int(&mesg, &id)) goto out; + rsci.cred.cr_gid = make_kgid(&init_user_ns, id); /* number of additional gid's */ if (get_int(&mesg, &N)) @@ -467,11 +474,10 @@ static int rsc_parse(struct cache_detail *cd, /* gid's */ status = -EINVAL; for (i=0; i<N; i++) { - gid_t gid; kgid_t kgid; - if (get_int(&mesg, &gid)) + if (get_int(&mesg, &id)) goto out; - kgid = make_kgid(&init_user_ns, gid); + kgid = make_kgid(&init_user_ns, id); if (!gid_valid(kgid)) goto out; GROUP_AT(rsci.cred.cr_group_info, i) = kgid; @@ -491,7 +497,8 @@ static int rsc_parse(struct cache_detail *cd, len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; - status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, GFP_KERNEL); + status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, + NULL, GFP_KERNEL); if (status) goto out; @@ -499,8 +506,10 @@ static int rsc_parse(struct cache_detail *cd, len = qword_get(&mesg, buf, mlen); if (len > 0) { rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL); - if (!rsci.cred.cr_principal) + if (!rsci.cred.cr_principal) { + status = -ENOMEM; goto out; + } } } @@ -817,13 +826,17 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) * The server uses base of head iovec as read pointer, while the * client uses separate pointer. */ static int -unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) { int stat = -EINVAL; u32 integ_len, maj_stat; struct xdr_netobj mic; struct xdr_buf integ_buf; + /* Did we already verify the signature on the original pass through? */ + if (rqstp->rq_deferred) + return 0; + integ_len = svc_getnl(&buf->head[0]); if (integ_len & 3) return stat; @@ -846,6 +859,8 @@ unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) goto out; if (svc_getnl(&buf->head[0]) != seq) goto out; + /* trim off the mic at the end before returning */ + xdr_buf_trim(buf, mic.len + 4); stat = 0; out: kfree(mic.data); @@ -975,13 +990,10 @@ gss_write_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp, } static inline int -gss_read_verf(struct rpc_gss_wire_cred *gc, - struct kvec *argv, __be32 *authp, - struct xdr_netobj *in_handle, - struct xdr_netobj *in_token) +gss_read_common_verf(struct rpc_gss_wire_cred *gc, + struct kvec *argv, __be32 *authp, + struct xdr_netobj *in_handle) { - struct xdr_netobj tmpobj; - /* Read the verifier; should be NULL: */ *authp = rpc_autherr_badverf; if (argv->iov_len < 2 * 4) @@ -997,6 +1009,23 @@ gss_read_verf(struct rpc_gss_wire_cred *gc, if (dup_netobj(in_handle, &gc->gc_ctx)) return SVC_CLOSE; *authp = rpc_autherr_badverf; + + return 0; +} + +static inline int +gss_read_verf(struct rpc_gss_wire_cred *gc, + struct kvec *argv, __be32 *authp, + struct xdr_netobj *in_handle, + struct xdr_netobj *in_token) +{ + struct xdr_netobj tmpobj; + int res; + + res = gss_read_common_verf(gc, argv, authp, in_handle); + if (res) + return res; + if (svc_safe_getnetobj(argv, &tmpobj)) { kfree(in_handle->data); return SVC_DENIED; @@ -1009,6 +1038,40 @@ gss_read_verf(struct rpc_gss_wire_cred *gc, return 0; } +/* Ok this is really heavily depending on a set of semantics in + * how rqstp is set up by svc_recv and pages laid down by the + * server when reading a request. 
We are basically guaranteed that + * the token lays all down linearly across a set of pages, starting + * at iov_base in rq_arg.head[0] which happens to be the first of a + * set of pages stored in rq_pages[]. + * rq_arg.head[0].iov_base will provide us the page_base to pass + * to the upcall. + */ +static inline int +gss_read_proxy_verf(struct svc_rqst *rqstp, + struct rpc_gss_wire_cred *gc, __be32 *authp, + struct xdr_netobj *in_handle, + struct gssp_in_token *in_token) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + u32 inlen; + int res; + + res = gss_read_common_verf(gc, argv, authp, in_handle); + if (res) + return res; + + inlen = svc_getnl(argv); + if (inlen > (argv->iov_len + rqstp->rq_arg.page_len)) + return SVC_DENIED; + + in_token->pages = rqstp->rq_pages; + in_token->page_base = (ulong)argv->iov_base & ~PAGE_MASK; + in_token->page_len = inlen; + + return 0; +} + static inline int gss_write_resv(struct kvec *resv, size_t size_limit, struct xdr_netobj *out_handle, struct xdr_netobj *out_token, @@ -1036,7 +1099,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit, * the upcall results are available, write the verifier and result. * Otherwise, drop the request pending an answer to the upcall. */ -static int svcauth_gss_handle_init(struct svc_rqst *rqstp, +static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc, __be32 *authp) { struct kvec *argv = &rqstp->rq_arg.head[0]; @@ -1076,6 +1139,287 @@ out: return ret; } +static int gss_proxy_save_rsc(struct cache_detail *cd, + struct gssp_upcall_data *ud, + uint64_t *handle) +{ + struct rsc rsci, *rscp = NULL; + static atomic64_t ctxhctr; + long long ctxh; + struct gss_api_mech *gm = NULL; + time_t expiry; + int status = -EINVAL; + + memset(&rsci, 0, sizeof(rsci)); + /* context handle */ + status = -ENOMEM; + /* the handle needs to be just a unique id, + * use a static counter */ + ctxh = atomic64_inc_return(&ctxhctr); + + /* make a copy for the caller */ + *handle = ctxh; + + /* make a copy for the rsc cache */ + if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t))) + goto out; + rscp = rsc_lookup(cd, &rsci); + if (!rscp) + goto out; + + /* creds */ + if (!ud->found_creds) { + /* userspace seem buggy, we should always get at least a + * mapping to nobody */ + dprintk("RPC: No creds found, marking Negative!\n"); + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + } else { + + /* steal creds */ + rsci.cred = ud->creds; + memset(&ud->creds, 0, sizeof(struct svc_cred)); + + status = -EOPNOTSUPP; + /* get mech handle from OID */ + gm = gss_mech_get_by_OID(&ud->mech_oid); + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + status = gss_import_sec_context(ud->out_handle.data, + ud->out_handle.len, + gm, &rsci.mechctx, + &expiry, GFP_KERNEL); + if (status) + goto out; + } + + rsci.h.expiry_time = expiry; + rscp = rsc_update(cd, &rsci, rscp); + status = 0; +out: + gss_mech_put(gm); + rsc_free(&rsci); + if (rscp) + cache_put(&rscp->h, cd); + else + status = -ENOMEM; + return status; +} + +static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, + struct rpc_gss_wire_cred *gc, __be32 *authp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + struct xdr_netobj cli_handle; + struct gssp_upcall_data ud; + uint64_t handle; + int status; + int ret; + struct net *net = rqstp->rq_xprt->xpt_net; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + memset(&ud, 0, sizeof(ud)); + ret = gss_read_proxy_verf(rqstp, gc, authp, + &ud.in_handle, &ud.in_token); + if (ret) + return ret; + + 
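/* Default to closing the connection: every failure below jumps to + * the "out" label, which frees the upcall data before returning. */ +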
ret = SVC_CLOSE; + + /* Perform synchronous upcall to gss-proxy */ + status = gssp_accept_sec_context_upcall(net, &ud); + if (status) + goto out; + + dprintk("RPC: svcauth_gss: gss major status = %d\n", + ud.major_status); + + switch (ud.major_status) { + case GSS_S_CONTINUE_NEEDED: + cli_handle = ud.out_handle; + break; + case GSS_S_COMPLETE: + status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); + if (status) + goto out; + cli_handle.data = (u8 *)&handle; + cli_handle.len = sizeof(handle); + break; + default: + ret = SVC_CLOSE; + goto out; + } + + /* Got an answer to the upcall; use it: */ + if (gss_write_init_verf(sn->rsc_cache, rqstp, + &cli_handle, &ud.major_status)) + goto out; + if (gss_write_resv(resv, PAGE_SIZE, + &cli_handle, &ud.out_token, + ud.major_status, ud.minor_status)) + goto out; + + ret = SVC_COMPLETE; +out: + gssp_free_upcall_data(&ud); + return ret; +} + +static DEFINE_SPINLOCK(use_gssp_lock); + +static bool use_gss_proxy(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + if (sn->use_gss_proxy != -1) + return sn->use_gss_proxy; + spin_lock(&use_gssp_lock); + /* + * If you wanted gss-proxy, you should have said so before + * starting to accept requests: + */ + sn->use_gss_proxy = 0; + spin_unlock(&use_gssp_lock); + return 0; +} + +#ifdef CONFIG_PROC_FS + +static int set_gss_proxy(struct net *net, int type) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + int ret = 0; + + WARN_ON_ONCE(type != 0 && type != 1); + spin_lock(&use_gssp_lock); + if (sn->use_gss_proxy == -1 || sn->use_gss_proxy == type) + sn->use_gss_proxy = type; + else + ret = -EBUSY; + spin_unlock(&use_gssp_lock); + wake_up(&sn->gssp_wq); + return ret; +} + +static inline bool gssp_ready(struct sunrpc_net *sn) +{ + switch (sn->use_gss_proxy) { + case -1: + return false; + case 0: + return true; + case 1: + return sn->gssp_clnt; + } + WARN_ON_ONCE(1); + return false; +} + +static int wait_for_gss_proxy(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn)); +} + + +static ssize_t write_gssp(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct net *net = PDE_DATA(file->f_path.dentry->d_inode); + char tbuf[20]; + unsigned long i; + int res; + + if (*ppos || count > sizeof(tbuf)-1) + return -EINVAL; + if (copy_from_user(tbuf, buf, count)) + return -EFAULT; + + tbuf[count] = 0; + res = kstrtoul(tbuf, 0, &i); + if (res) + return res; + if (i != 1) + return -EINVAL; + res = set_gss_proxy(net, 1); + if (res) + return res; + res = set_gssp_clnt(net); + if (res) + return res; + return count; +} + +static ssize_t read_gssp(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct net *net = PDE_DATA(file->f_path.dentry->d_inode); + unsigned long p = *ppos; + char tbuf[10]; + size_t len; + int ret; + + ret = wait_for_gss_proxy(net); + if (ret) + return ret; + + snprintf(tbuf, sizeof(tbuf), "%d\n", use_gss_proxy(net)); + len = strlen(tbuf); + if (p >= len) + return 0; + len -= p; + if (len > count) + len = count; + if (copy_to_user(buf, (void *)(tbuf+p), len)) + return -EFAULT; + *ppos += len; + return len; +} + +static const struct file_operations use_gss_proxy_ops = { + .open = nonseekable_open, + .write = write_gssp, + .read = read_gssp, +}; + +static int create_use_gss_proxy_proc_entry(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + struct proc_dir_entry **p = &sn->use_gssp_proc; + +
sn->use_gss_proxy = -1; + *p = proc_create_data("use-gss-proxy", S_IFREG|S_IRUSR|S_IWUSR, + sn->proc_net_rpc, + &use_gss_proxy_ops, net); + if (!*p) + return -ENOMEM; + init_gssp_clnt(sn); + return 0; +} + +static void destroy_use_gss_proxy_proc_entry(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + if (sn->use_gssp_proc) { + remove_proc_entry("use-gss-proxy", sn->proc_net_rpc); + clear_gssp_clnt(sn); + } +} +#else /* CONFIG_PROC_FS */ + +static int create_use_gss_proxy_proc_entry(struct net *net) +{ + return 0; +} + +static void destroy_use_gss_proxy_proc_entry(struct net *net) {} + +#endif /* CONFIG_PROC_FS */ + /* * Accept an rpcsec packet. * If context establishment, punt to user space @@ -1142,7 +1486,10 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: - return svcauth_gss_handle_init(rqstp, gc, authp); + if (use_gss_proxy(SVC_NET(rqstp))) + return svcauth_gss_proxy_init(rqstp, gc, authp); + else + return svcauth_gss_legacy_init(rqstp, gc, authp); case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: /* Look up the context, and check the verifier: */ @@ -1190,7 +1537,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) /* placeholders for length and seq. number: */ svc_putnl(resv, 0); svc_putnl(resv, 0); - if (unwrap_integ_data(&rqstp->rq_arg, + if (unwrap_integ_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; break; @@ -1208,7 +1555,9 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) svcdata->rsci = rsci; cache_get(&rsci->h); rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor( - rsci->mechctx->mech_type, gc->gc_svc); + rsci->mechctx->mech_type, + GSS_C_QOP_DEFAULT, + gc->gc_svc); ret = SVC_OK; goto out; } @@ -1517,7 +1866,12 @@ gss_svc_init_net(struct net *net) rv = rsi_cache_create_net(net); if (rv) goto out1; + rv = create_use_gss_proxy_proc_entry(net); + if (rv) + goto out2; return 0; +out2: + destroy_use_gss_proxy_proc_entry(net); out1: rsc_cache_destroy_net(net); return rv; @@ -1526,6 +1880,7 @@ out1: void gss_svc_shutdown_net(struct net *net) { + destroy_use_gss_proxy_proc_entry(net); rsi_cache_destroy_net(net); rsc_cache_destroy_net(net); } diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 52c5abdee211..dc37021fc3e5 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -18,8 +18,8 @@ struct unx_cred { struct rpc_cred uc_base; - gid_t uc_gid; - gid_t uc_gids[NFS_NGROUPS]; + kgid_t uc_gid; + kgid_t uc_gids[NFS_NGROUPS]; }; #define uc_uid uc_base.cr_uid @@ -65,7 +65,8 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) unsigned int i; dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", - acred->uid, acred->gid); + from_kuid(&init_user_ns, acred->uid), + from_kgid(&init_user_ns, acred->gid)); if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS))) return ERR_PTR(-ENOMEM); @@ -79,13 +80,10 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) groups = NFS_NGROUPS; cred->uc_gid = acred->gid; - for (i = 0; i < groups; i++) { - gid_t gid; - gid = from_kgid(&init_user_ns, GROUP_AT(acred->group_info, i)); - cred->uc_gids[i] = gid; - } + for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); if (i < NFS_NGROUPS) - cred->uc_gids[i] = NOGROUP; + cred->uc_gids[i] = INVALID_GID; return &cred->uc_base; } @@ -123,21 +121,17 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) unsigned int i; - if 
(cred->uc_uid != acred->uid || cred->uc_gid != acred->gid) + if (!uid_eq(cred->uc_uid, acred->uid) || !gid_eq(cred->uc_gid, acred->gid)) return 0; if (acred->group_info != NULL) groups = acred->group_info->ngroups; if (groups > NFS_NGROUPS) groups = NFS_NGROUPS; - for (i = 0; i < groups ; i++) { - gid_t gid; - gid = from_kgid(&init_user_ns, GROUP_AT(acred->group_info, i)); - if (cred->uc_gids[i] != gid) + for (i = 0; i < groups ; i++) + if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i))) return 0; - } - if (groups < NFS_NGROUPS && - cred->uc_gids[groups] != NOGROUP) + if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) return 0; return 1; } @@ -163,11 +157,11 @@ unx_marshal(struct rpc_task *task, __be32 *p) */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); - *p++ = htonl((u32) cred->uc_uid); - *p++ = htonl((u32) cred->uc_gid); + *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid)); hold = p++; - for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++) - *p++ = htonl((u32) cred->uc_gids[i]); + for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++) + *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i])); *hold = htonl(p - hold - 1); /* gid array length */ *base = htonl((p - base - 1) << 2); /* cred length */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 9afa4393c217..80fe5c86efd1 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -196,9 +196,9 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_update); static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h) { - if (!cd->cache_upcall) - return -EINVAL; - return cd->cache_upcall(cd, h); + if (cd->cache_upcall) + return cd->cache_upcall(cd, h); + return sunrpc_cache_pipe_upcall(cd, h); } static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h) @@ -670,13 +670,13 @@ static void cache_revisit_request(struct cache_head *item) { struct cache_deferred_req *dreq; struct list_head pending; - struct hlist_node *lp, *tmp; + struct hlist_node *tmp; int hash = DFR_HASH(item); INIT_LIST_HEAD(&pending); spin_lock(&cache_defer_lock); - hlist_for_each_entry_safe(dreq, lp, tmp, &cache_defer_hash[hash], hash) + hlist_for_each_entry_safe(dreq, tmp, &cache_defer_hash[hash], hash) if (dreq->item == item) { __unhash_deferred_req(dreq); list_add(&dreq->recent, &pending); @@ -750,12 +750,24 @@ struct cache_reader { int offset; /* if non-0, we have a refcnt on next request */ }; +static int cache_request(struct cache_detail *detail, + struct cache_request *crq) +{ + char *bp = crq->buf; + int len = PAGE_SIZE; + + detail->cache_request(detail, crq->item, &bp, &len); + if (len < 0) + return -EAGAIN; + return PAGE_SIZE - len; +} + static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { struct cache_reader *rp = filp->private_data; struct cache_request *rq; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int err; if (count == 0) @@ -784,6 +796,13 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rq->readers++; spin_unlock(&queue_lock); + if (rq->len == 0) { + err = cache_request(cd, rq); + if (err < 0) + goto out; + rq->len = err; + } + if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; spin_lock(&queue_lock); @@ -886,7 +905,7 @@ static ssize_t cache_write(struct file *filp, const char __user *buf, struct cache_detail *cd) { struct 
address_space *mapping = filp->f_mapping; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); ssize_t ret = -EINVAL; if (!cd->cache_parse) @@ -967,8 +986,10 @@ static int cache_open(struct inode *inode, struct file *filp, nonseekable_open(inode, filp); if (filp->f_mode & FMODE_READ) { rp = kmalloc(sizeof(*rp), GFP_KERNEL); - if (!rp) + if (!rp) { + module_put(cd->owner); return -ENOMEM; + } rp->offset = 0; rp->q.reader = 1; atomic_inc(&cd->readers); @@ -1140,17 +1161,14 @@ static bool cache_listeners_exist(struct cache_detail *detail) * * Each request is at most one page long. */ -int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h, - void (*cache_request)(struct cache_detail *, - struct cache_head *, - char **, - int *)) +int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) { char *buf; struct cache_request *crq; - char *bp; - int len; + + if (!detail->cache_request) + return -EINVAL; if (!cache_listeners_exist(detail)) { warn_no_listener(detail); @@ -1167,19 +1185,10 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h, return -EAGAIN; } - bp = buf; len = PAGE_SIZE; - - cache_request(detail, h, &bp, &len); - - if (len < 0) { - kfree(buf); - kfree(crq); - return -EAGAIN; - } crq->q.reader = 0; crq->item = cache_get(h); crq->buf = buf; - crq->len = PAGE_SIZE - len; + crq->len = 0; crq->readers = 0; spin_lock(&queue_lock); list_add_tail(&crq->q.list, &detail->queue); @@ -1201,7 +1210,6 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall); * key and content are both parsed by cache */ -#define isodigit(c) (isdigit(c) && c <= '7') int qword_get(char **bpp, char *dest, int bufsize) { /* return bytes copied, or -1 on error */ @@ -1454,7 +1462,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, static ssize_t cache_read_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + struct cache_detail *cd = PDE_DATA(file_inode(filp)); return cache_read(filp, buf, count, ppos, cd); } @@ -1462,14 +1470,14 @@ static ssize_t cache_read_procfs(struct file *filp, char __user *buf, static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + struct cache_detail *cd = PDE_DATA(file_inode(filp)); return cache_write(filp, buf, count, ppos, cd); } static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait) { - struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + struct cache_detail *cd = PDE_DATA(file_inode(filp)); return cache_poll(filp, wait, cd); } @@ -1477,22 +1485,22 @@ static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait) static long cache_ioctl_procfs(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; - struct cache_detail *cd = PDE(inode)->data; + struct inode *inode = file_inode(filp); + struct cache_detail *cd = PDE_DATA(inode); return cache_ioctl(inode, filp, cmd, arg, cd); } static int cache_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE(inode)->data; + struct cache_detail *cd = PDE_DATA(inode); return cache_open(inode, filp, cd); } static int cache_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE(inode)->data; + struct cache_detail *cd = PDE_DATA(inode); return 
cache_release(inode, filp, cd); } @@ -1510,14 +1518,14 @@ static const struct file_operations cache_file_operations_procfs = { static int content_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE(inode)->data; + struct cache_detail *cd = PDE_DATA(inode); return content_open(inode, filp, cd); } static int content_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE(inode)->data; + struct cache_detail *cd = PDE_DATA(inode); return content_release(inode, filp, cd); } @@ -1531,14 +1539,14 @@ static const struct file_operations content_file_operations_procfs = { static int open_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE(inode)->data; + struct cache_detail *cd = PDE_DATA(inode); return open_flush(inode, filp, cd); } static int release_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE(inode)->data; + struct cache_detail *cd = PDE_DATA(inode); return release_flush(inode, filp, cd); } @@ -1546,7 +1554,7 @@ static int release_flush_procfs(struct inode *inode, struct file *filp) static ssize_t read_flush_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + struct cache_detail *cd = PDE_DATA(file_inode(filp)); return read_flush(filp, buf, count, ppos, cd); } @@ -1555,7 +1563,7 @@ static ssize_t write_flush_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + struct cache_detail *cd = PDE_DATA(file_inode(filp)); return write_flush(filp, buf, count, ppos, cd); } @@ -1605,7 +1613,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) if (p == NULL) goto out_nomem; - if (cd->cache_upcall || cd->cache_parse) { + if (cd->cache_request || cd->cache_parse) { p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, cd->u.procfs.proc_ent, &cache_file_operations_procfs, cd); @@ -1614,7 +1622,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) goto out_nomem; } if (cd->cache_show) { - p = proc_create_data("content", S_IFREG|S_IRUSR|S_IWUSR, + p = proc_create_data("content", S_IFREG|S_IRUSR, cd->u.procfs.proc_ent, &content_file_operations_procfs, cd); cd->u.procfs.content_ent = p; @@ -1686,7 +1694,7 @@ EXPORT_SYMBOL_GPL(cache_destroy_net); static ssize_t cache_read_pipefs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + struct cache_detail *cd = RPC_I(file_inode(filp))->private; return cache_read(filp, buf, count, ppos, cd); } @@ -1694,14 +1702,14 @@ static ssize_t cache_read_pipefs(struct file *filp, char __user *buf, static ssize_t cache_write_pipefs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + struct cache_detail *cd = RPC_I(file_inode(filp))->private; return cache_write(filp, buf, count, ppos, cd); } static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait) { - struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + struct cache_detail *cd = RPC_I(file_inode(filp))->private; return cache_poll(filp, wait, cd); } @@ -1709,7 +1717,7 @@ static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait) static long cache_ioctl_pipefs(struct file *filp, unsigned int cmd, unsigned 
long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct cache_detail *cd = RPC_I(inode)->private; return cache_ioctl(inode, filp, cmd, arg, cd); @@ -1778,7 +1786,7 @@ static int release_flush_pipefs(struct inode *inode, struct file *filp) static ssize_t read_flush_pipefs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + struct cache_detail *cd = RPC_I(file_inode(filp))->private; return read_flush(filp, buf, count, ppos, cd); } @@ -1787,7 +1795,7 @@ static ssize_t write_flush_pipefs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + struct cache_detail *cd = RPC_I(file_inode(filp))->private; return write_flush(filp, buf, count, ppos, cd); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 507b5e84fbdb..5a750b9c3640 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -33,6 +33,7 @@ #include <linux/rcupdate.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/bc_xprt.h> @@ -303,10 +304,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru err = rpciod_up(); if (err) goto out_no_rpciod; - err = -EINVAL; - if (!xprt) - goto out_no_xprt; + err = -EINVAL; if (args->version >= program->nrvers) goto out_err; version = program->version[args->version]; @@ -361,7 +360,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru auth = rpcauth_create(args->authflavor, clnt); if (IS_ERR(auth)) { - printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", + dprintk("RPC: Couldn't create auth handle (flavor %u)\n", args->authflavor); err = PTR_ERR(auth); goto out_no_auth; @@ -381,10 +380,9 @@ out_no_principal: out_no_stats: kfree(clnt); out_err: - xprt_put(xprt); -out_no_xprt: rpciod_down(); out_no_rpciod: + xprt_put(xprt); return ERR_PTR(err); } @@ -413,6 +411,10 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) }; char servername[48]; + if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS) + xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS; + if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT) + xprtargs.flags |= XPRT_CREATE_NO_IDLE_TIMEOUT; /* * If the caller chooses not to specify a hostname, whip * up a string representation of the passed-in address. 
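The two new RPC_CLNT_CREATE_* flags above are simply translated into the corresponding XPRT_CREATE_* transport flags at client-creation time. As a rough sketch of how a caller might request them: only rpc_create(), struct rpc_create_args and the two flags come from this patch series; the surrounding function, its parameters, and the other argument values are hypothetical.

/* Hypothetical caller sketch (not part of this patch). */
static struct rpc_clnt *example_create_clnt(struct net *net,
					    struct sockaddr *sap, size_t salen,
					    const struct rpc_program *prog)
{
	struct rpc_create_args args = {
		.net		= net,
		.protocol	= XPRT_TRANSPORT_TCP,
		.address	= sap,
		.addrsize	= salen,
		.program	= prog,
		.version	= 0,
		.authflavor	= RPC_AUTH_UNIX,
		/* grow the slot table on demand; never autodisconnect */
		.flags		= RPC_CLNT_CREATE_INFINITE_SLOTS |
				  RPC_CLNT_CREATE_NO_IDLE_TIMEOUT,
	};

	/* .servername is left unset; rpc_create() derives one from @sap */
	return rpc_create(&args);
}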
@@ -511,7 +513,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new = rpc_new_client(args, xprt); if (IS_ERR(new)) { err = PTR_ERR(new); - goto out_put; + goto out_err; } atomic_inc(&clnt->cl_count); @@ -524,8 +526,6 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_chatty = clnt->cl_chatty; return new; -out_put: - xprt_put(xprt); out_err: dprintk("RPC: %s: returned error %d\n", __func__, err); return ERR_PTR(err); @@ -683,6 +683,7 @@ rpc_release_client(struct rpc_clnt *clnt) if (atomic_dec_and_test(&clnt->cl_count)) rpc_free_auth(clnt); } +EXPORT_SYMBOL_GPL(rpc_release_client); /** * rpc_bind_new_program - bind a new RPC program to an existing client @@ -1196,6 +1197,21 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) EXPORT_SYMBOL_GPL(rpc_max_payload); /** + * rpc_get_timeout - Get timeout for transport in units of HZ + * @clnt: RPC client to query + */ +unsigned long rpc_get_timeout(struct rpc_clnt *clnt) +{ + unsigned long ret; + + rcu_read_lock(); + ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval; + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_get_timeout); + +/** * rpc_force_rebind - force transport to check that remote port is unchanged * @clnt: client to rebind * @@ -1290,6 +1306,8 @@ call_reserve(struct rpc_task *task) xprt_reserve(task); } +static void call_retry_reserve(struct rpc_task *task); + /* * 1b. Grok the result of xprt_reserve() */ @@ -1331,7 +1349,7 @@ call_reserveresult(struct rpc_task *task) case -ENOMEM: rpc_delay(task, HZ >> 2); case -EAGAIN: /* woken up; retry */ - task->tk_action = call_reserve; + task->tk_action = call_retry_reserve; return; case -EIO: /* probably a shutdown */ break; @@ -1344,6 +1362,19 @@ call_reserveresult(struct rpc_task *task) } /* + * 1c. Retry reserving an RPC call slot + */ +static void +call_retry_reserve(struct rpc_task *task) +{ + dprint_status(task); + + task->tk_status = 0; + task->tk_action = call_reserveresult; + xprt_retry_reserve(task); +} + +/* * 2. 
Bind and/or refresh the credentials */ static void @@ -1400,7 +1431,7 @@ call_allocate(struct rpc_task *task) { unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack; struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt = req->rq_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; dprint_status(task); @@ -1508,7 +1539,7 @@ rpc_xdr_encode(struct rpc_task *task) static void call_bind(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; dprint_status(task); @@ -1602,7 +1633,7 @@ retry_timeout: static void call_connect(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; dprintk("RPC: %5u call_connect xprt %p %s connected\n", task->tk_pid, xprt, @@ -1628,22 +1659,26 @@ call_connect_status(struct rpc_task *task) dprint_status(task); - task->tk_status = 0; - if (status >= 0 || status == -EAGAIN) { - clnt->cl_stats->netreconn++; - task->tk_action = call_transmit; - return; - } - trace_rpc_connect_status(task, status); switch (status) { /* if soft mounted, test if we've timed out */ case -ETIMEDOUT: task->tk_action = call_timeout; - break; - default: - rpc_exit(task, -EIO); + return; + case -ECONNREFUSED: + case -ECONNRESET: + case -ENETUNREACH: + if (RPC_IS_SOFTCONN(task)) + break; + /* retry with existing socket, after a delay */ + case 0: + case -EAGAIN: + task->tk_status = 0; + clnt->cl_stats->netreconn++; + task->tk_action = call_transmit; + return; } + rpc_exit(task, status); } /* @@ -1685,7 +1720,7 @@ call_transmit(struct rpc_task *task) if (rpc_reply_expected(task)) return; task->tk_action = rpc_exit_task; - rpc_wake_up_queued_task(&task->tk_xprt->pending, task); + rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task); } /* @@ -1784,7 +1819,7 @@ call_bc_transmit(struct rpc_task *task) */ printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); - xprt_conditional_disconnect(task->tk_xprt, + xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; default: @@ -1836,7 +1871,7 @@ call_status(struct rpc_task *task) case -ETIMEDOUT: task->tk_action = call_timeout; if (task->tk_client->cl_discrtry) - xprt_conditional_disconnect(task->tk_xprt, + xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; case -ECONNRESET: @@ -1991,7 +2026,7 @@ out_retry: if (task->tk_rqstp == req) { req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) - xprt_conditional_disconnect(task->tk_xprt, + xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); } } @@ -2005,7 +2040,7 @@ rpc_encode_header(struct rpc_task *task) /* FIXME: check buffer size? 
*/ - p = xprt_skip_transport_header(task->tk_xprt, p); + p = xprt_skip_transport_header(req->rq_xprt, p); *p++ = req->rq_xid; /* XID */ *p++ = htonl(RPC_CALL); /* CALL */ *p++ = htonl(RPC_VERSION); /* RPC version */ diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index ce7bd449173d..7111a4c9113b 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -23,6 +23,12 @@ struct sunrpc_net { struct rpc_clnt *rpcb_local_clnt4; spinlock_t rpcb_clnt_lock; unsigned int rpcb_users; + + struct mutex gssp_lock; + wait_queue_head_t gssp_wq; + struct rpc_clnt *gssp_clnt; + int use_gss_proxy; + struct proc_dir_entry *use_gssp_proc; }; extern int sunrpc_net_id; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index fd10981ea792..a9129f8d7070 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -284,7 +284,7 @@ out: static ssize_t rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct rpc_pipe *pipe; struct rpc_pipe_msg *msg; int res = 0; @@ -328,7 +328,7 @@ out_unlock: static ssize_t rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int res; mutex_lock(&inode->i_mutex); @@ -342,7 +342,7 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of static unsigned int rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct rpc_inode *rpci = RPC_I(inode); unsigned int mask = POLLOUT | POLLWRNORM; @@ -360,7 +360,7 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) static long rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct rpc_pipe *pipe; int len; @@ -830,7 +830,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, * responses to upcalls. They will result in calls to @msg->downcall. * * The @private argument passed here will be available to all these methods - * from the file pointer, via RPC_I(file->f_dentry->d_inode)->private. + * from the file pointer, via RPC_I(file_inode(file))->private. */ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) @@ -1174,6 +1174,8 @@ static struct file_system_type rpc_pipe_fs_type = { .mount = rpc_mount, .kill_sb = rpc_kill_sb, }; +MODULE_ALIAS_FS("rpc_pipefs"); +MODULE_ALIAS("rpc_pipefs"); static void init_once(void *foo) @@ -1218,6 +1220,3 @@ void unregister_rpc_pipefs(void) kmem_cache_destroy(rpc_inode_cachep); unregister_filesystem(&rpc_pipe_fs_type); } - -/* Make 'mount -t rpc_pipefs ...' autoload this module. 
*/ -MODULE_ALIAS("rpc_pipefs"); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 795a0f4e920b..3df764dc330c 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -26,6 +26,7 @@ #include <net/ipv6.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/xprtsock.h> diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fb20f25ddec9..f8529fc8e542 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -180,6 +180,8 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); task->tk_waitqueue = queue; queue->qlen++; + /* barrier matches the read in rpc_wake_up_task_queue_locked() */ + smp_wmb(); rpc_set_queued(task); dprintk("RPC: %5u added to queue %p \"%s\"\n", @@ -430,8 +432,11 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task */ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) { - if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) - __rpc_do_wake_up_task(queue, task); + if (RPC_IS_QUEUED(task)) { + smp_rmb(); + if (task->tk_waitqueue == queue) + __rpc_do_wake_up_task(queue, task); + } } /* diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index bc2068ee795b..21b75cb08c03 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -64,7 +64,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) { static int rpc_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rpc_proc_show, PDE(inode)->data); + return single_open(file, rpc_proc_show, PDE_DATA(inode)); } static const struct file_operations rpc_proc_fops = { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index dbf12ac5ecb7..89a588b4478b 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -515,15 +515,6 @@ EXPORT_SYMBOL_GPL(svc_create_pooled); void svc_shutdown_net(struct svc_serv *serv, struct net *net) { - /* - * The set of xprts (contained in the sv_tempsocks and - * sv_permsocks lists) is now constant, since it is modified - * only by accepting new sockets (done by service threads in - * svc_recv) or aging old ones (done by sv_temptimer), or - * configuration changes (excluded by whatever locking the - * caller is using--nfsd_mutex in the case of nfsd). So it's - * safe to traverse those lists and shut everything down: - */ svc_close_net(serv, net); if (serv->sv_shutdown) @@ -1042,6 +1033,7 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net) /* * dprintk the given error with the address of the client that caused it. */ +#ifdef RPC_DEBUG static __printf(2, 3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) { @@ -1058,6 +1050,9 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) va_end(args); } +#else +static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {} +#endif /* * Common routine for processing the RPC request. diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b8e47fac7315..80a6640f329b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -499,7 +499,8 @@ void svc_wake_up(struct svc_serv *serv) rqstp->rq_xprt = NULL; */ wake_up(&rqstp->rq_wait); - } + } else + pool->sp_task_pending = 1; spin_unlock_bh(&pool->sp_lock); } } @@ -634,7 +635,13 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) * long for cache updates. 
*/ rqstp->rq_chandle.thread_wait = 1*HZ; + pool->sp_task_pending = 0; } else { + if (pool->sp_task_pending) { + pool->sp_task_pending = 0; + spin_unlock_bh(&pool->sp_lock); + return ERR_PTR(-EAGAIN); + } /* No data pending. Go to sleep */ svc_thread_enqueue(pool, rqstp); @@ -856,7 +863,6 @@ static void svc_age_temp_xprts(unsigned long closure) struct svc_serv *serv = (struct svc_serv *)closure; struct svc_xprt *xprt; struct list_head *le, *next; - LIST_HEAD(to_be_aged); dprintk("svc_age_temp_xprts\n"); @@ -877,25 +883,15 @@ static void svc_age_temp_xprts(unsigned long closure) if (atomic_read(&xprt->xpt_ref.refcount) > 1 || test_bit(XPT_BUSY, &xprt->xpt_flags)) continue; - svc_xprt_get(xprt); - list_move(le, &to_be_aged); + list_del_init(le); set_bit(XPT_CLOSE, &xprt->xpt_flags); set_bit(XPT_DETACHED, &xprt->xpt_flags); - } - spin_unlock_bh(&serv->sv_lock); - - while (!list_empty(&to_be_aged)) { - le = to_be_aged.next; - /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ - list_del_init(le); - xprt = list_entry(le, struct svc_xprt, xpt_list); - dprintk("queuing xprt %p for closing\n", xprt); /* a thread will dequeue and close it soon */ svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); } + spin_unlock_bh(&serv->sv_lock); mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } @@ -959,21 +955,24 @@ void svc_close_xprt(struct svc_xprt *xprt) } EXPORT_SYMBOL_GPL(svc_close_xprt); -static void svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) +static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) { struct svc_xprt *xprt; + int ret = 0; spin_lock(&serv->sv_lock); list_for_each_entry(xprt, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; + ret++; set_bit(XPT_CLOSE, &xprt->xpt_flags); - set_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); } spin_unlock(&serv->sv_lock); + return ret; } -static void svc_clear_pools(struct svc_serv *serv, struct net *net) +static struct svc_xprt *svc_dequeue_net(struct svc_serv *serv, struct net *net) { struct svc_pool *pool; struct svc_xprt *xprt; @@ -988,42 +987,46 @@ static void svc_clear_pools(struct svc_serv *serv, struct net *net) if (xprt->xpt_net != net) continue; list_del_init(&xprt->xpt_ready); + spin_unlock_bh(&pool->sp_lock); + return xprt; } spin_unlock_bh(&pool->sp_lock); } + return NULL; } -static void svc_clear_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) +static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) { struct svc_xprt *xprt; - struct svc_xprt *tmp; - LIST_HEAD(victims); - spin_lock(&serv->sv_lock); - list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { - if (xprt->xpt_net != net) - continue; - list_move(&xprt->xpt_list, &victims); - } - spin_unlock(&serv->sv_lock); - - list_for_each_entry_safe(xprt, tmp, &victims, xpt_list) + while ((xprt = svc_dequeue_net(serv, net))) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_delete_xprt(xprt); + } } +/* + * Server threads may still be running (especially in the case where the + * service is still running in other network namespaces). + * + * So we shut down sockets the same way we would on a running server, by + * setting XPT_CLOSE, enqueuing, and letting a thread pick it up to do + * the close. 
If no such threads are + * running, svc_clean_up_xprts() does a simple version of a + * server's main event loop; in the case where there are other + * threads, we may need to wait a little while and then check again to + * see if they're done. + */ void svc_close_net(struct svc_serv *serv, struct net *net) { - svc_close_list(serv, &serv->sv_tempsocks, net); - svc_close_list(serv, &serv->sv_permsocks, net); + int delay = 0; - svc_clear_pools(serv, net); - /* - * At this point the sp_sockets lists will stay empty, since - * svc_xprt_enqueue will not add new entries without taking the - * sp_lock and checking XPT_BUSY. - */ - svc_clear_list(serv, &serv->sv_tempsocks, net); - svc_clear_list(serv, &serv->sv_permsocks, net); + while (svc_close_list(serv, &serv->sv_permsocks, net) + + svc_close_list(serv, &serv->sv_tempsocks, net)) { + + svc_clean_up_xprts(serv, net); + msleep(delay++); + } } /* diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 7963569fc04f..2af7b0cba43a 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -138,13 +138,12 @@ auth_domain_lookup(char *name, struct auth_domain *new) { struct auth_domain *hp; struct hlist_head *head; - struct hlist_node *np; head = &auth_domain_table[hash_str(name, DN_HASHBITS)]; spin_lock(&auth_domain_lock); - hlist_for_each_entry(hp, np, head, hash) { + hlist_for_each_entry(hp, head, hash) { if (strcmp(hp->name, name)==0) { kref_get(&hp->ref); spin_unlock(&auth_domain_lock); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 4d0129203733..c3f9e1ef7f53 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -6,6 +6,7 @@ #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/gss_api.h> +#include <linux/sunrpc/addr.h> #include <linux/err.h> #include <linux/seq_file.h> #include <linux/hash.h> @@ -17,7 +18,6 @@ #include <linux/user_namespace.h> #define RPCDBG_FACILITY RPCDBG_AUTH -#include <linux/sunrpc/clnt.h> #include "netns.h" @@ -157,11 +157,6 @@ static void ip_map_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h) -{ - return sunrpc_cache_pipe_upcall(cd, h, ip_map_request); -} - static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr); static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry); @@ -415,10 +410,15 @@ svcauth_unix_info_release(struct svc_xprt *xpt) struct unix_gid { struct cache_head h; - uid_t uid; + kuid_t uid; struct group_info *gi; }; +static int unix_gid_hash(kuid_t uid) +{ + return hash_long(from_kuid(&init_user_ns, uid), GID_HASHBITS); +} + static void unix_gid_put(struct kref *kref) { struct cache_head *item = container_of(kref, struct cache_head, ref); @@ -433,7 +433,7 @@ static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew) { struct unix_gid *orig = container_of(corig, struct unix_gid, h); struct unix_gid *new = container_of(cnew, struct unix_gid, h); - return orig->uid == new->uid; + return uid_eq(orig->uid, new->uid); } static void unix_gid_init(struct cache_head *cnew, struct cache_head *citem) { @@ -465,23 +465,19 @@ static void unix_gid_request(struct cache_detail *cd, char tuid[20]; struct unix_gid *ug = container_of(h, struct unix_gid, h); - snprintf(tuid, 20, "%u", ug->uid); + snprintf(tuid, 20, "%u", from_kuid(&init_user_ns, ug->uid)); qword_add(bpp, blen, tuid); (*bpp)[-1] = '\n'; } -static
int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h) -{ - return sunrpc_cache_pipe_upcall(cd, h, unix_gid_request); -} - -static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, uid_t uid); +static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid); static int unix_gid_parse(struct cache_detail *cd, char *mesg, int mlen) { /* uid expiry Ngid gid0 gid1 ... gidN-1 */ - int uid; + int id; + kuid_t uid; int gids; int rv; int i; @@ -493,9 +489,12 @@ static int unix_gid_parse(struct cache_detail *cd, return -EINVAL; mesg[mlen-1] = 0; - rv = get_int(&mesg, &uid); + rv = get_int(&mesg, &id); if (rv) return -EINVAL; + uid = make_kuid(&init_user_ns, id); + if (!uid_valid(uid)) + return -EINVAL; ug.uid = uid; expiry = get_expiry(&mesg); @@ -530,7 +529,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.h.expiry_time = expiry; ch = sunrpc_cache_update(cd, &ug.h, &ugp->h, - hash_long(uid, GID_HASHBITS)); + unix_gid_hash(uid)); if (!ch) err = -ENOMEM; else { @@ -549,7 +548,7 @@ static int unix_gid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { - struct user_namespace *user_ns = current_user_ns(); + struct user_namespace *user_ns = &init_user_ns; struct unix_gid *ug; int i; int glen; @@ -565,7 +564,7 @@ static int unix_gid_show(struct seq_file *m, else glen = 0; - seq_printf(m, "%u %d:", ug->uid, glen); + seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen); for (i = 0; i < glen; i++) seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i))); seq_printf(m, "\n"); @@ -577,7 +576,7 @@ static struct cache_detail unix_gid_cache_template = { .hash_size = GID_HASHMAX, .name = "auth.unix.gid", .cache_put = unix_gid_put, - .cache_upcall = unix_gid_upcall, + .cache_request = unix_gid_request, .cache_parse = unix_gid_parse, .cache_show = unix_gid_show, .match = unix_gid_match, @@ -615,20 +614,20 @@ void unix_gid_cache_destroy(struct net *net) cache_destroy_net(cd, net); } -static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, uid_t uid) +static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid) { struct unix_gid ug; struct cache_head *ch; ug.uid = uid; - ch = sunrpc_cache_lookup(cd, &ug.h, hash_long(uid, GID_HASHBITS)); + ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid)); if (ch) return container_of(ch, struct unix_gid, h); else return NULL; } -static struct group_info *unix_gid_find(uid_t uid, struct svc_rqst *rqstp) +static struct group_info *unix_gid_find(kuid_t uid, struct svc_rqst *rqstp) { struct unix_gid *ug; struct group_info *gi; @@ -750,8 +749,8 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) } /* Signal that mapping to nobody uid/gid is required */ - cred->cr_uid = (uid_t) -1; - cred->cr_gid = (gid_t) -1; + cred->cr_uid = INVALID_UID; + cred->cr_gid = INVALID_GID; cred->cr_group_info = groups_alloc(0); if (cred->cr_group_info == NULL) return SVC_CLOSE; /* kmalloc failure - client must retry */ @@ -812,8 +811,10 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */ argv->iov_len -= slen*4; - cred->cr_uid = svc_getnl(argv); /* uid */ - cred->cr_gid = svc_getnl(argv); /* gid */ + cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */ + cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */ + if (!uid_valid(cred->cr_uid) || !gid_valid(cred->cr_gid)) + goto badcred; slen = svc_getnl(argv); /* gids length */ if (slen > 16 || (len -= (slen + 2)*4) < 0) 
goto badcred; @@ -874,7 +875,7 @@ static struct cache_detail ip_map_cache_template = { .hash_size = IP_HASHMAX, .name = "auth.unix.ip", .cache_put = ip_map_put, - .cache_upcall = ip_map_upcall, + .cache_request = ip_map_request, .cache_parse = ip_map_parse, .cache_show = ip_map_show, .match = ip_map_match, diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 56055632f151..75edcfad6e26 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -879,6 +879,47 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, } EXPORT_SYMBOL_GPL(xdr_buf_subsegment); +/** + * xdr_buf_trim - lop at most "len" bytes off the end of "buf" + * @buf: buf to be trimmed + * @len: number of bytes to reduce "buf" by + * + * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note + * that it's possible that we'll trim less than that amount if the xdr_buf is + * too small, or if (for instance) it's all in the head and the parser has + * already read too far into it. + */ +void xdr_buf_trim(struct xdr_buf *buf, unsigned int len) +{ + size_t cur; + unsigned int trim = len; + + if (buf->tail[0].iov_len) { + cur = min_t(size_t, buf->tail[0].iov_len, trim); + buf->tail[0].iov_len -= cur; + trim -= cur; + if (!trim) + goto fix_len; + } + + if (buf->page_len) { + cur = min_t(unsigned int, buf->page_len, trim); + buf->page_len -= cur; + trim -= cur; + if (!trim) + goto fix_len; + } + + if (buf->head[0].iov_len) { + cur = min_t(size_t, buf->head[0].iov_len, trim); + buf->head[0].iov_len -= cur; + trim -= cur; + } +fix_len: + buf->len -= (len - trim); +} +EXPORT_SYMBOL_GPL(xdr_buf_trim); + static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) { unsigned int this_len; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 33811db8788a..095363eee764 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -430,21 +430,23 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) */ void xprt_release_rqst_cong(struct rpc_task *task) { - __xprt_put_cong(task->tk_xprt, task->tk_rqstp); + struct rpc_rqst *req = task->tk_rqstp; + + __xprt_put_cong(req->rq_xprt, req); } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); /** * xprt_adjust_cwnd - adjust transport congestion window + * @xprt: pointer to xprt * @task: recently completed RPC request used to adjust window * @result: result code of completed RPC request * * We use a time-smoothed congestion estimator to avoid heavy oscillation. */ -void xprt_adjust_cwnd(struct rpc_task *task, int result) +void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result) { struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = task->tk_xprt; unsigned long cwnd = xprt->cwnd; if (result >= 0 && cwnd <= xprt->cong) { @@ -485,13 +487,17 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); * xprt_wait_for_buffer_space - wait for transport output buffer to clear * @task: task to be put to sleep * @action: function pointer to be executed after wait + * + * Note that we only set the timer for the case of RPC_IS_SOFT(), since + * we don't in general want to force a socket disconnection due to + * an incomplete RPC call transmission. */ void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - task->tk_timeout = req->rq_timeout; + task->tk_timeout = RPC_IS_SOFT(task) ? 
req->rq_timeout : 0; rpc_sleep_on(&xprt->pending, task, action); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); @@ -695,7 +701,7 @@ out_abort: */ void xprt_connect(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid, xprt, (xprt_connected(xprt) ? "is" : "is not")); @@ -722,13 +728,13 @@ void xprt_connect(struct rpc_task *task) if (xprt_test_and_set_connecting(xprt)) return; xprt->stat.connect_start = jiffies; - xprt->ops->connect(task); + xprt->ops->connect(xprt, task); } } static void xprt_connect_status(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; if (task->tk_status == 0) { xprt->stat.connect_count++; @@ -832,7 +838,7 @@ static void xprt_timer(struct rpc_task *task) spin_lock_bh(&xprt->transport_lock); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) - xprt->ops->timer(task); + xprt->ops->timer(xprt, task); } else task->tk_status = 0; spin_unlock_bh(&xprt->transport_lock); @@ -942,6 +948,34 @@ void xprt_transmit(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); } +static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) +{ + set_bit(XPRT_CONGESTED, &xprt->state); + rpc_sleep_on(&xprt->backlog, task, NULL); +} + +static void xprt_wake_up_backlog(struct rpc_xprt *xprt) +{ + if (rpc_wake_up_next(&xprt->backlog) == NULL) + clear_bit(XPRT_CONGESTED, &xprt->state); +} + +static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task) +{ + bool ret = false; + + if (!test_bit(XPRT_CONGESTED, &xprt->state)) + goto out; + spin_lock(&xprt->reserve_lock); + if (test_bit(XPRT_CONGESTED, &xprt->state)) { + rpc_sleep_on(&xprt->backlog, task, NULL); + ret = true; + } + spin_unlock(&xprt->reserve_lock); +out: + return ret; +} + static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); @@ -986,7 +1020,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) task->tk_status = -ENOMEM; break; case -EAGAIN: - rpc_sleep_on(&xprt->backlog, task, NULL); + xprt_add_backlog(xprt, task); dprintk("RPC: waiting for request slot\n"); default: task->tk_status = -EAGAIN; @@ -1022,7 +1056,7 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) memset(req, 0, sizeof(*req)); /* mark unused */ list_add(&req->rq_list, &xprt->free); } - rpc_wake_up_next(&xprt->backlog); + xprt_wake_up_backlog(xprt); spin_unlock(&xprt->reserve_lock); } @@ -1086,12 +1120,39 @@ EXPORT_SYMBOL_GPL(xprt_free); * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation * - * If no more slots are available, place the task on the transport's + * If the transport is marked as being congested, or if no more + * slots are available, place the task on the transport's * backlog queue. 
*/ void xprt_reserve(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt; + + task->tk_status = 0; + if (task->tk_rqstp != NULL) + return; + + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + rcu_read_lock(); + xprt = rcu_dereference(task->tk_client->cl_xprt); + if (!xprt_throttle_congested(xprt, task)) + xprt->ops->alloc_slot(xprt, task); + rcu_read_unlock(); +} + +/** + * xprt_retry_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation + * + * If no more slots are available, place the task on the transport's + * backlog queue. + * Note that the only difference with xprt_reserve is that we now + * ignore the value of the XPRT_CONGESTED flag. + */ +void xprt_retry_reserve(struct rpc_task *task) +{ + struct rpc_xprt *xprt; task->tk_status = 0; if (task->tk_rqstp != NULL) @@ -1099,7 +1160,10 @@ void xprt_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; + rcu_read_lock(); + xprt = rcu_dereference(task->tk_client->cl_xprt); xprt->ops->alloc_slot(xprt, task); + rcu_read_unlock(); } static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) @@ -1236,6 +1300,8 @@ found: -PTR_ERR(xprt)); goto out; } + if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) + xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) setup_timer(&xprt->timer, xprt_init_autodisconnect, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 558fbab574f0..e03725bfe2b8 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -171,7 +171,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); int nsegs, nchunks = 0; unsigned int pos; struct rpcrdma_mr_seg *seg = req->rl_segments; @@ -366,7 +366,7 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) int rpcrdma_marshal_req(struct rpc_rqst *rqst) { - struct rpc_xprt *xprt = rqst->rq_task->tk_xprt; + struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c9aa7a35f3bf..794312f22b9b 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -51,6 +51,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/seq_file.h> +#include <linux/sunrpc/addr.h> #include "xprt_rdma.h" @@ -426,9 +427,8 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) } static void -xprt_rdma_connect(struct rpc_task *task) +xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { - struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); if (r_xprt->rx_ep.rep_connected != 0) { @@ -475,7 +475,7 @@ xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) static void * xprt_rdma_allocate(struct rpc_task *task, size_t size) { - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; struct rpcrdma_req *req, *nreq; req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); @@ -627,7 +627,7 @@ static int xprt_rdma_send_request(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpc_xprt *xprt = task->tk_xprt; + struct 
rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 745973b729af..93726560eaa8 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1086,7 +1086,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, case RPCRDMA_MEMWINDOWS: /* Allocate one extra request's worth, for full cycling */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - r->r.mw = ib_alloc_mw(ia->ri_pd); + r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1); if (IS_ERR(r->r.mw)) { rc = PTR_ERR(r->r.mw); dprintk("RPC: %s: ib_alloc_mw" @@ -1673,12 +1673,12 @@ rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, *nsegs = 1; rpcrdma_map_one(ia, seg, writing); - param.mr = ia->ri_bind_mem; + param.bind_info.mr = ia->ri_bind_mem; param.wr_id = 0ULL; /* no send cookie */ - param.addr = seg->mr_dma; - param.length = seg->mr_len; + param.bind_info.addr = seg->mr_dma; + param.bind_info.length = seg->mr_len; param.send_flags = 0; - param.mw_access_flags = mem_priv; + param.bind_info.mw_access_flags = mem_priv; DECR_CQCOUNT(&r_xprt->rx_ep); rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); @@ -1690,7 +1690,7 @@ rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, rpcrdma_unmap_one(ia, seg); } else { seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.addr; + seg->mr_base = param.bind_info.addr; seg->mr_nsegs = 1; } return rc; @@ -1706,10 +1706,10 @@ rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, int rc; BUG_ON(seg->mr_nsegs != 1); - param.mr = ia->ri_bind_mem; - param.addr = 0ULL; /* unbind */ - param.length = 0; - param.mw_access_flags = 0; + param.bind_info.mr = ia->ri_bind_mem; + param.bind_info.addr = 0ULL; /* unbind */ + param.bind_info.length = 0; + param.bind_info.mw_access_flags = 0; if (*r) { param.wr_id = (u64) (unsigned long) *r; param.send_flags = IB_SEND_SIGNALED; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9a66c95b5837..cc1445dc1d1a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -235,13 +235,13 @@ struct rpcrdma_create_data_internal { }; #define RPCRDMA_INLINE_READ_THRESHOLD(rq) \ - (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize) + (rpcx_to_rdmad(rq->rq_xprt).inline_rsize) #define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\ - (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize) + (rpcx_to_rdmad(rq->rq_xprt).inline_wsize) #define RPCRDMA_INLINE_PAD_VALUE(rq)\ - rpcx_to_rdmad(rq->rq_task->tk_xprt).padding + rpcx_to_rdmad(rq->rq_xprt).padding /* * Statistics for RPCRDMA diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 68b0a81c31d5..ffd50348a509 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -33,6 +33,7 @@ #include <linux/udp.h> #include <linux/tcp.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprtsock.h> @@ -770,7 +771,7 @@ static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) goto out_release; if (req->rq_bytes_sent == req->rq_snd_buf.len) goto out_release; - set_bit(XPRT_CLOSE_WAIT, &task->tk_xprt->state); + set_bit(XPRT_CLOSE_WAIT, &xprt->state); out_release: xprt_release_xprt(xprt, task); } @@ -848,6 +849,14 @@ static void xs_tcp_close(struct rpc_xprt *xprt) xs_tcp_shutdown(xprt); } +static void 
xs_local_destroy(struct rpc_xprt *xprt) +{ + xs_close(xprt); + xs_free_peer_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); +} + /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport @@ -861,10 +870,7 @@ static void xs_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&transport->connect_worker); - xs_close(xprt); - xs_free_peer_addresses(xprt); - xprt_free(xprt); - module_put(THIS_MODULE); + xs_local_destroy(xprt); } static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) @@ -1005,7 +1011,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS); - xprt_adjust_cwnd(task, copied); + xprt_adjust_cwnd(xprt, task, copied); xprt_complete_rqst(task, copied); out_unlock: @@ -1646,9 +1652,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t * * Adjust the congestion window after a retransmit timeout has occurred. */ -static void xs_udp_timer(struct rpc_task *task) +static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) { - xprt_adjust_cwnd(task, -ETIMEDOUT); + xprt_adjust_cwnd(xprt, task, -ETIMEDOUT); } static unsigned short xs_get_random_port(void) @@ -1731,7 +1737,9 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) */ static void xs_local_rpcbind(struct rpc_task *task) { - xprt_set_bound(task->tk_xprt); + rcu_read_lock(); + xprt_set_bound(rcu_dereference(task->tk_client->cl_xprt)); + rcu_read_unlock(); } static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) @@ -1865,13 +1873,9 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, * @xprt: RPC transport to connect * @transport: socket transport to connect * @create_sock: function to create a socket of the correct type - * - * Invoked by a work queue tasklet. */ -static void xs_local_setup_socket(struct work_struct *work) +static int xs_local_setup_socket(struct sock_xprt *transport) { - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; struct socket *sock; int status = -EIO; @@ -1916,6 +1920,30 @@ out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); current->flags &= ~PF_FSTRANS; + return status; +} + +static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + int ret; + + if (RPC_IS_ASYNC(task)) { + /* + * We want the AF_LOCAL connect to be resolved in the + * filesystem namespace of the process making the rpc + * call. Thus we connect synchronously. + * + * If we want to support asynchronous AF_LOCAL calls, + * we'll need to figure out how to pass a namespace to + * connect. + */ + rpc_exit(task, -ENOTCONN); + return; + } + ret = xs_local_setup_socket(transport); + if (ret && !RPC_IS_SOFTCONN(task)) + msleep_interruptible(15000); } #ifdef CONFIG_SUNRPC_SWAP @@ -2179,10 +2207,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) */ xs_tcp_force_close(xprt); break; - case -ECONNREFUSED: - case -ECONNRESET: - case -ENETUNREACH: - /* retry with existing socket, after a delay */ case 0: case -EINPROGRESS: case -EALREADY: @@ -2193,6 +2217,10 @@ static void xs_tcp_setup_socket(struct work_struct *work) /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. 
 */ + case -ECONNREFUSED: + case -ECONNRESET: + case -ENETUNREACH: + /* retry with existing socket, after a delay */ goto out; } out_eagain: @@ -2205,6 +2233,7 @@ out: /** * xs_connect - connect a socket to a remote endpoint + * @xprt: pointer to transport structure * @task: address of RPC task that manages state of connect request * * TCP: If the remote end dropped the connection, delay reconnecting. @@ -2216,9 +2245,8 @@ out: * If a UDP socket connect fails, the delay behavior here prevents * retry floods (hard mounts). */ -static void xs_connect(struct rpc_task *task) +static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { @@ -2453,13 +2481,13 @@ static struct rpc_xprt_ops xs_local_ops = { .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, - .connect = xs_connect, + .connect = xs_local_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .send_request = xs_local_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, - .destroy = xs_destroy, + .destroy = xs_local_destroy, .print_stats = xs_local_print_stats, }; @@ -2626,9 +2654,10 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) goto out_err; } xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_local_setup_socket); xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); + ret = ERR_PTR(xs_local_setup_socket(transport)); + if (ret) + goto out_err; break; default: ret = ERR_PTR(-EAFNOSUPPORT); @@ -2741,9 +2770,13 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) struct rpc_xprt *xprt; struct sock_xprt *transport; struct rpc_xprt *ret; + unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries; + + if (args->flags & XPRT_CREATE_INFINITE_SLOTS) + max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, - xprt_max_tcp_slot_table_entries); + max_slot_table_size); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index bc41bd31eadc..c890848f9d56 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -3,8 +3,8 @@ # menuconfig TIPC - tristate "The TIPC Protocol (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "The TIPC Protocol" + depends on INET ---help--- The Transparent Inter Process Communication (TIPC) protocol is specially designed for intra-cluster communication. This protocol @@ -31,3 +31,10 @@ config TIPC_PORTS Setting this to a smaller value saves some memory, setting it higher allows for more ports. + +config TIPC_MEDIA_IB + bool "InfiniBand media type support" + depends on TIPC && INFINIBAND_IPOIB + help + Saying Y here will enable support for running TIPC on + IP-over-InfiniBand devices. 
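The new symbol is consumed a few hunks below: net/tipc/bearer.h grows stub definitions so that net/tipc/core.c can call the InfiniBand start/stop hooks unconditionally, whether or not the media type is built in. The gating idiom, quoted from that bearer.h hunk:

#ifdef CONFIG_TIPC_MEDIA_IB
int tipc_ib_media_start(void);
void tipc_ib_media_stop(void);
#else
static inline int tipc_ib_media_start(void) { return 0; }
static inline void tipc_ib_media_stop(void) { return; }
#endif

With the stubs in place, tipc_core_start_net() simply calls tipc_ib_media_start() after tipc_eth_media_start() and unwinds through tipc_core_stop_net() if either fails.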
diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 6cd55d671d3a..4df8e02d9008 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -9,3 +9,5 @@ tipc-y += addr.o bcast.o bearer.o config.o \ name_distr.o subscr.o name_table.o net.o \ netlink.o node.o node_subscr.o port.o ref.o \ socket.o log.o eth_media.o + +tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 54f89f90ac33..e5f3da507823 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -584,8 +584,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, { int bp_index; - /* - * Prepare broadcast link message for reliable transmission, + /* Prepare broadcast link message for reliable transmission, * if first time trying to send it; * preparation is skipped for broadcast link protocol messages * since they are sent in an unreliable manner and don't need it @@ -611,30 +610,43 @@ static int tipc_bcbearer_send(struct sk_buff *buf, for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) { struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary; struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary; + struct tipc_bearer *b = p; + struct sk_buff *tbuf; if (!p) - break; /* no more bearers to try */ + break; /* No more bearers to try */ + + if (tipc_bearer_blocked(p)) { + if (!s || tipc_bearer_blocked(s)) + continue; /* Can't use either bearer */ + b = s; + } - tipc_nmap_diff(&bcbearer->remains, &p->nodes, &bcbearer->remains_new); + tipc_nmap_diff(&bcbearer->remains, &b->nodes, + &bcbearer->remains_new); if (bcbearer->remains_new.count == bcbearer->remains.count) - continue; /* bearer pair doesn't add anything */ + continue; /* Nothing added by bearer pair */ - if (!tipc_bearer_blocked(p)) - tipc_bearer_send(p, buf, &p->media->bcast_addr); - else if (s && !tipc_bearer_blocked(s)) - /* unable to send on primary bearer */ - tipc_bearer_send(s, buf, &s->media->bcast_addr); - else - /* unable to send on either bearer */ - continue; + if (bp_index == 0) { + /* Use original buffer for first bearer */ + tipc_bearer_send(b, buf, &b->bcast_addr); + } else { + /* Avoid concurrent buffer access */ + tbuf = pskb_copy(buf, GFP_ATOMIC); + if (!tbuf) + break; + tipc_bearer_send(b, tbuf, &b->bcast_addr); + kfree_skb(tbuf); /* Bearer keeps a clone */ + } + /* Swap bearers for next packet */ if (s) { bcbearer->bpairs[bp_index].primary = s; bcbearer->bpairs[bp_index].secondary = p; } if (bcbearer->remains_new.count == 0) - break; /* all targets reached */ + break; /* All targets reached */ bcbearer->remains = bcbearer->remains_new; } @@ -774,6 +786,7 @@ void tipc_bclink_init(void) bcl->owner = &bclink->node; bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); + spin_lock_init(&bcbearer->bearer.lock); bcl->b_ptr = &bcbearer->bearer; bcl->state = WORKING_WORKING; strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index aa62f93a9127..cb29ef7ba2f0 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -39,7 +39,7 @@ #include "bearer.h" #include "discover.h" -#define MAX_ADDR_STR 32 +#define MAX_ADDR_STR 60 static struct tipc_media *media_list[MAX_MEDIA]; static u32 media_count; @@ -89,9 +89,6 @@ int tipc_register_media(struct tipc_media *m_ptr) if ((strlen(m_ptr->name) + 1) > TIPC_MAX_MEDIA_NAME) goto exit; - if ((m_ptr->bcast_addr.media_id != m_ptr->type_id) || - !m_ptr->bcast_addr.broadcast) - goto exit; if (m_ptr->priority > TIPC_MAX_LINK_PRI) goto exit; if ((m_ptr->tolerance < TIPC_MIN_LINK_TOL) || @@ 
-407,7 +404,7 @@ restart: INIT_LIST_HEAD(&b_ptr->links); spin_lock_init(&b_ptr->lock); - res = tipc_disc_create(b_ptr, &m_ptr->bcast_addr, disc_domain); + res = tipc_disc_create(b_ptr, &b_ptr->bcast_addr, disc_domain); if (res) { bearer_disable(b_ptr); pr_warn("Bearer <%s> rejected, discovery object creation failed\n", diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 39f1192d04bf..09c869adcfcf 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -56,6 +56,7 @@ * Identifiers of supported TIPC media types */ #define TIPC_MEDIA_TYPE_ETH 1 +#define TIPC_MEDIA_TYPE_IB 2 /** * struct tipc_media_addr - destination address used by TIPC bearers @@ -77,7 +78,6 @@ struct tipc_bearer; * @enable_bearer: routine which enables a bearer * @disable_bearer: routine which disables a bearer * @addr2str: routine which converts media address to string - * @str2addr: routine which converts media address from string * @addr2msg: routine which converts media address to protocol message area * @msg2addr: routine which converts media address from protocol message area * @bcast_addr: media address used in broadcasting @@ -94,10 +94,9 @@ struct tipc_media { int (*enable_bearer)(struct tipc_bearer *b_ptr); void (*disable_bearer)(struct tipc_bearer *b_ptr); int (*addr2str)(struct tipc_media_addr *a, char *str_buf, int str_size); - int (*str2addr)(struct tipc_media_addr *a, char *str_buf); int (*addr2msg)(struct tipc_media_addr *a, char *msg_area); - int (*msg2addr)(struct tipc_media_addr *a, char *msg_area); - struct tipc_media_addr bcast_addr; + int (*msg2addr)(const struct tipc_bearer *b_ptr, + struct tipc_media_addr *a, char *msg_area); u32 priority; u32 tolerance; u32 window; @@ -136,6 +135,7 @@ struct tipc_bearer { char name[TIPC_MAX_BEARER_NAME]; spinlock_t lock; struct tipc_media *media; + struct tipc_media_addr bcast_addr; u32 priority; u32 window; u32 tolerance; @@ -175,6 +175,14 @@ int tipc_disable_bearer(const char *name); int tipc_eth_media_start(void); void tipc_eth_media_stop(void); +#ifdef CONFIG_TIPC_MEDIA_IB +int tipc_ib_media_start(void); +void tipc_ib_media_stop(void); +#else +static inline int tipc_ib_media_start(void) { return 0; } +static inline void tipc_ib_media_stop(void) { return; } +#endif + int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); diff --git a/net/tipc/core.c b/net/tipc/core.c index fc05cecd7481..7ec2c1eb94f1 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -82,6 +82,7 @@ static void tipc_core_stop_net(void) { tipc_net_stop(); tipc_eth_media_stop(); + tipc_ib_media_stop(); } /** @@ -93,8 +94,15 @@ int tipc_core_start_net(unsigned long addr) tipc_net_start(addr); res = tipc_eth_media_start(); - if (res) - tipc_core_stop_net(); + if (res < 0) + goto err; + res = tipc_ib_media_start(); + if (res < 0) + goto err; + return res; + +err: + tipc_core_stop_net(); return res; } diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 1074b9587e81..eedff58d0387 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -129,7 +129,7 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr) int link_fully_up; media_addr.broadcast = 1; - b_ptr->media->msg2addr(&media_addr, msg_media_addr(msg)); + b_ptr->media->msg2addr(b_ptr, &media_addr, msg_media_addr(msg)); kfree_skb(buf); /* Ensure message from node is valid and communication is permitted */ diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 
2132c1ef2951..120a676a3360 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -77,12 +77,13 @@ static struct notifier_block notifier = { * Media-dependent "value" field stores MAC address in first 6 bytes * and zeroes out the remaining bytes. */ -static void eth_media_addr_set(struct tipc_media_addr *a, char *mac) +static void eth_media_addr_set(const struct tipc_bearer *tb_ptr, + struct tipc_media_addr *a, char *mac) { memcpy(a->value, mac, ETH_ALEN); memset(a->value + ETH_ALEN, 0, sizeof(a->value) - ETH_ALEN); a->media_id = TIPC_MEDIA_TYPE_ETH; - a->broadcast = !memcmp(mac, eth_media_info.bcast_addr.value, ETH_ALEN); + a->broadcast = !memcmp(mac, tb_ptr->bcast_addr.value, ETH_ALEN); } /** @@ -110,6 +111,7 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, skb_reset_network_header(clone); clone->dev = dev; + clone->protocol = htons(ETH_P_TIPC); dev_hard_header(clone, dev, ETH_P_TIPC, dest->value, dev->dev_addr, clone->len); dev_queue_xmit(clone); @@ -201,9 +203,13 @@ static int enable_bearer(struct tipc_bearer *tb_ptr) /* Associate TIPC bearer with Ethernet bearer */ eb_ptr->bearer = tb_ptr; tb_ptr->usr_handle = (void *)eb_ptr; + memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value)); + memcpy(tb_ptr->bcast_addr.value, dev->broadcast, ETH_ALEN); + tb_ptr->bcast_addr.media_id = TIPC_MEDIA_TYPE_ETH; + tb_ptr->bcast_addr.broadcast = 1; tb_ptr->mtu = dev->mtu; tb_ptr->blocked = 0; - eth_media_addr_set(&tb_ptr->addr, (char *)dev->dev_addr); + eth_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr); return 0; } @@ -302,25 +308,6 @@ static int eth_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) } /** - * eth_str2addr - convert string to Ethernet address - */ -static int eth_str2addr(struct tipc_media_addr *a, char *str_buf) -{ - char mac[ETH_ALEN]; - int r; - - r = sscanf(str_buf, "%02x:%02x:%02x:%02x:%02x:%02x", - (u32 *)&mac[0], (u32 *)&mac[1], (u32 *)&mac[2], - (u32 *)&mac[3], (u32 *)&mac[4], (u32 *)&mac[5]); - - if (r != ETH_ALEN) - return 1; - - eth_media_addr_set(a, mac); - return 0; -} - -/** * eth_addr2msg - convert Ethernet address format to message header format */ static int eth_addr2msg(struct tipc_media_addr *a, char *msg_area) @@ -334,12 +321,13 @@ static int eth_addr2msg(struct tipc_media_addr *a, char *msg_area) /** * eth_msg2addr - convert message header address format to Ethernet format */ -static int eth_msg2addr(struct tipc_media_addr *a, char *msg_area) +static int eth_msg2addr(const struct tipc_bearer *tb_ptr, + struct tipc_media_addr *a, char *msg_area) { if (msg_area[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_ETH) return 1; - eth_media_addr_set(a, msg_area + ETH_ADDR_OFFSET); + eth_media_addr_set(tb_ptr, a, msg_area + ETH_ADDR_OFFSET); return 0; } @@ -351,11 +339,8 @@ static struct tipc_media eth_media_info = { .send_msg = send_msg, .enable_bearer = enable_bearer, .disable_bearer = disable_bearer, .addr2str = eth_addr2str, - .str2addr = eth_str2addr, .addr2msg = eth_addr2msg, .msg2addr = eth_msg2addr, - .bcast_addr = { { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - TIPC_MEDIA_TYPE_ETH, 1 }, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .window = TIPC_DEF_LINK_WIN, diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c new file mode 100644 index 000000000000..2a2864c25e15 --- /dev/null +++ b/net/tipc/ib_media.c @@ -0,0 +1,387 @@ +/* + * net/tipc/ib_media.c: InfiniBand bearer support for TIPC + * + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * Based on eth_media.c, which carries the following 
copyright notice: + * + * Copyright (c) 2001-2007, Ericsson AB + * Copyright (c) 2005-2008, 2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/if_infiniband.h> +#include "core.h" +#include "bearer.h" + +#define MAX_IB_BEARERS MAX_BEARERS + +/** + * struct ib_bearer - InfiniBand bearer data structure + * @bearer: ptr to associated "generic" bearer structure + * @dev: ptr to associated InfiniBand network device + * @tipc_packet_type: used in binding TIPC to InfiniBand driver + * @setup: work item used when enabling bearer + * @cleanup: work item used when disabling bearer + */ + +struct ib_bearer { + struct tipc_bearer *bearer; + struct net_device *dev; + struct packet_type tipc_packet_type; + struct work_struct setup; + struct work_struct cleanup; +}; + +static struct tipc_media ib_media_info; +static struct ib_bearer ib_bearers[MAX_IB_BEARERS]; +static int ib_started; + +/** + * ib_media_addr_set - initialize InfiniBand media address structure + * + * Media-dependent "value" field stores the 20-byte InfiniBand hardware address. 
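+ * (INFINIBAND_ALEN is 20: an IPoIB hardware address packs a 4-byte field carrying the queue pair number ahead of the 16-byte port GID, so the copy below fills the whole field and, unlike the Ethernet variant, leaves no trailing bytes to zero; the BUILD_BUG_ON guards that size assumption.)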
+ */ +static void ib_media_addr_set(const struct tipc_bearer *tb_ptr, + struct tipc_media_addr *a, char *mac) +{ + BUILD_BUG_ON(sizeof(a->value) < INFINIBAND_ALEN); + memcpy(a->value, mac, INFINIBAND_ALEN); + a->media_id = TIPC_MEDIA_TYPE_IB; + a->broadcast = !memcmp(mac, tb_ptr->bcast_addr.value, INFINIBAND_ALEN); +} + +/** + * send_msg - send a TIPC message out over an InfiniBand interface + */ +static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, + struct tipc_media_addr *dest) +{ + struct sk_buff *clone; + struct net_device *dev; + int delta; + + clone = skb_clone(buf, GFP_ATOMIC); + if (!clone) + return 0; + + dev = ((struct ib_bearer *)(tb_ptr->usr_handle))->dev; + delta = dev->hard_header_len - skb_headroom(buf); + + if ((delta > 0) && + pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { + kfree_skb(clone); + return 0; + } + + skb_reset_network_header(clone); + clone->dev = dev; + clone->protocol = htons(ETH_P_TIPC); + dev_hard_header(clone, dev, ETH_P_TIPC, dest->value, + dev->dev_addr, clone->len); + dev_queue_xmit(clone); + return 0; +} + +/** + * recv_msg - handle incoming TIPC message from an InfiniBand interface + * + * Accept only packets explicitly sent to this node, or broadcast packets; + * ignores packets sent using InfiniBand multicast, and traffic sent to other + * nodes (which can happen if interface is running in promiscuous mode). + */ +static int recv_msg(struct sk_buff *buf, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct ib_bearer *ib_ptr = (struct ib_bearer *)pt->af_packet_priv; + + if (!net_eq(dev_net(dev), &init_net)) { + kfree_skb(buf); + return 0; + } + + if (likely(ib_ptr->bearer)) { + if (likely(buf->pkt_type <= PACKET_BROADCAST)) { + buf->next = NULL; + tipc_recv_msg(buf, ib_ptr->bearer); + return 0; + } + } + kfree_skb(buf); + return 0; +} + +/** + * setup_bearer - setup association between InfiniBand bearer and interface + */ +static void setup_bearer(struct work_struct *work) +{ + struct ib_bearer *ib_ptr = + container_of(work, struct ib_bearer, setup); + + dev_add_pack(&ib_ptr->tipc_packet_type); +} + +/** + * enable_bearer - attach TIPC bearer to an InfiniBand interface + */ +static int enable_bearer(struct tipc_bearer *tb_ptr) +{ + struct net_device *dev = NULL; + struct net_device *pdev = NULL; + struct ib_bearer *ib_ptr = &ib_bearers[0]; + struct ib_bearer *stop = &ib_bearers[MAX_IB_BEARERS]; + char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; + int pending_dev = 0; + + /* Find unused InfiniBand bearer structure */ + while (ib_ptr->dev) { + if (!ib_ptr->bearer) + pending_dev++; + if (++ib_ptr == stop) + return pending_dev ? 
-EAGAIN : -EDQUOT; + } + + /* Find device with specified name */ + read_lock(&dev_base_lock); + for_each_netdev(&init_net, pdev) { + if (!strncmp(pdev->name, driver_name, IFNAMSIZ)) { + dev = pdev; + dev_hold(dev); + break; + } + } + read_unlock(&dev_base_lock); + if (!dev) + return -ENODEV; + + /* Create InfiniBand bearer for device */ + ib_ptr->dev = dev; + ib_ptr->tipc_packet_type.type = htons(ETH_P_TIPC); + ib_ptr->tipc_packet_type.dev = dev; + ib_ptr->tipc_packet_type.func = recv_msg; + ib_ptr->tipc_packet_type.af_packet_priv = ib_ptr; + INIT_LIST_HEAD(&(ib_ptr->tipc_packet_type.list)); + INIT_WORK(&ib_ptr->setup, setup_bearer); + schedule_work(&ib_ptr->setup); + + /* Associate TIPC bearer with InfiniBand bearer */ + ib_ptr->bearer = tb_ptr; + tb_ptr->usr_handle = (void *)ib_ptr; + memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value)); + memcpy(tb_ptr->bcast_addr.value, dev->broadcast, INFINIBAND_ALEN); + tb_ptr->bcast_addr.media_id = TIPC_MEDIA_TYPE_IB; + tb_ptr->bcast_addr.broadcast = 1; + tb_ptr->mtu = dev->mtu; + tb_ptr->blocked = 0; + ib_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr); + return 0; +} + +/** + * cleanup_bearer - break association between InfiniBand bearer and interface + * + * This routine must be invoked from a work queue because it can sleep. + */ +static void cleanup_bearer(struct work_struct *work) +{ + struct ib_bearer *ib_ptr = + container_of(work, struct ib_bearer, cleanup); + + dev_remove_pack(&ib_ptr->tipc_packet_type); + dev_put(ib_ptr->dev); + ib_ptr->dev = NULL; +} + +/** + * disable_bearer - detach TIPC bearer from an InfiniBand interface + * + * Mark InfiniBand bearer as inactive so that incoming buffers are thrown away, + * then get worker thread to complete bearer cleanup. (Can't do cleanup + * here because cleanup code needs to sleep and caller holds spinlocks.) + */ +static void disable_bearer(struct tipc_bearer *tb_ptr) +{ + struct ib_bearer *ib_ptr = (struct ib_bearer *)tb_ptr->usr_handle; + + ib_ptr->bearer = NULL; + INIT_WORK(&ib_ptr->cleanup, cleanup_bearer); + schedule_work(&ib_ptr->cleanup); +} + +/** + * recv_notification - handle device updates from OS + * + * Change the state of the InfiniBand bearer (if any) associated with the + * specified device. 
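+ * (In the switch below, carrier, MTU, and address changes only block or reactivate the bearer, while NETDEV_UNREGISTER and NETDEV_CHANGENAME disable it outright.)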
+ */ +static int recv_notification(struct notifier_block *nb, unsigned long evt, + void *dv) +{ + struct net_device *dev = (struct net_device *)dv; + struct ib_bearer *ib_ptr = &ib_bearers[0]; + struct ib_bearer *stop = &ib_bearers[MAX_IB_BEARERS]; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + while ((ib_ptr->dev != dev)) { + if (++ib_ptr == stop) + return NOTIFY_DONE; /* couldn't find device */ + } + if (!ib_ptr->bearer) + return NOTIFY_DONE; /* bearer had been disabled */ + + ib_ptr->bearer->mtu = dev->mtu; + + switch (evt) { + case NETDEV_CHANGE: + if (netif_carrier_ok(dev)) + tipc_continue(ib_ptr->bearer); + else + tipc_block_bearer(ib_ptr->bearer->name); + break; + case NETDEV_UP: + tipc_continue(ib_ptr->bearer); + break; + case NETDEV_DOWN: + tipc_block_bearer(ib_ptr->bearer->name); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + tipc_block_bearer(ib_ptr->bearer->name); + tipc_continue(ib_ptr->bearer); + break; + case NETDEV_UNREGISTER: + case NETDEV_CHANGENAME: + tipc_disable_bearer(ib_ptr->bearer->name); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block notifier = { + .notifier_call = recv_notification, + .priority = 0, +}; + +/** + * ib_addr2str - convert InfiniBand address to string + */ +static int ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) +{ + if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */ + return 1; + + sprintf(str_buf, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:" + "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", + a->value[0], a->value[1], a->value[2], a->value[3], + a->value[4], a->value[5], a->value[6], a->value[7], + a->value[8], a->value[9], a->value[10], a->value[11], + a->value[12], a->value[13], a->value[14], a->value[15], + a->value[16], a->value[17], a->value[18], a->value[19]); + + return 0; +} + +/** + * ib_addr2msg - convert InfiniBand address format to message header format + */ +static int ib_addr2msg(struct tipc_media_addr *a, char *msg_area) +{ + memset(msg_area, 0, TIPC_MEDIA_ADDR_SIZE); + msg_area[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_IB; + memcpy(msg_area, a->value, INFINIBAND_ALEN); + return 0; +} + +/** + * ib_msg2addr - convert message header address format to InfiniBand format + */ +static int ib_msg2addr(const struct tipc_bearer *tb_ptr, + struct tipc_media_addr *a, char *msg_area) +{ + ib_media_addr_set(tb_ptr, a, msg_area); + return 0; +} + +/* + * InfiniBand media registration info + */ +static struct tipc_media ib_media_info = { + .send_msg = send_msg, + .enable_bearer = enable_bearer, + .disable_bearer = disable_bearer, + .addr2str = ib_addr2str, + .addr2msg = ib_addr2msg, + .msg2addr = ib_msg2addr, + .priority = TIPC_DEF_LINK_PRI, + .tolerance = TIPC_DEF_LINK_TOL, + .window = TIPC_DEF_LINK_WIN, + .type_id = TIPC_MEDIA_TYPE_IB, + .name = "ib" +}; + +/** + * tipc_ib_media_start - activate InfiniBand bearer support + * + * Register InfiniBand media type with TIPC bearer code. Also register + * with OS for notifications about device state changes. 
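+ * (The media type is registered before the notifier, and ib_started is set only once both steps have succeeded, which also stops a second start attempt from registering everything twice.)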
 + */ +int tipc_ib_media_start(void) +{ + int res; + + if (ib_started) + return -EINVAL; + + res = tipc_register_media(&ib_media_info); + if (res) + return res; + + res = register_netdevice_notifier(&notifier); + if (!res) + ib_started = 1; + return res; +} + +/** + * tipc_ib_media_stop - deactivate InfiniBand bearer support + */ +void tipc_ib_media_stop(void) +{ + if (!ib_started) + return; + + flush_scheduled_work(); + unregister_netdevice_notifier(&notifier); + ib_started = 0; +} diff --git a/net/tipc/link.c b/net/tipc/link.c index daa6080a2a0c..a80feee5197a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2306,8 +2306,11 @@ static int link_recv_changeover_msg(struct tipc_link **l_ptr, struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf); u32 msg_typ = msg_type(tunnel_msg); u32 msg_count = msg_msgcnt(tunnel_msg); + u32 bearer_id = msg_bearer_id(tunnel_msg); - dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)]; + if (bearer_id >= MAX_BEARERS) + goto exit; + dest_link = (*l_ptr)->owner->links[bearer_id]; if (!dest_link) goto exit; if (dest_link == *l_ptr) { @@ -2521,14 +2524,16 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm); u32 msg_sz = msg_size(imsg); u32 fragm_sz = msg_data_sz(fragm); - u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz); + u32 exp_fragm_cnt; u32 max = TIPC_MAX_USER_MSG_SIZE + NAMED_H_SIZE; + if (msg_type(imsg) == TIPC_MCAST_MSG) max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE; - if (msg_size(imsg) > max) { + if (fragm_sz == 0 || msg_size(imsg) > max) { kfree_skb(fbuf); return 0; } + exp_fragm_cnt = msg_sz / fragm_sz + !!(msg_sz % fragm_sz); pbuf = tipc_buf_acquire(msg_size(imsg)); if (pbuf != NULL) { pbuf->next = *pending; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 46754779fd3d..24b167914311 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -473,11 +473,10 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, static struct name_seq *nametbl_find_seq(u32 type) { struct hlist_head *seq_head; - struct hlist_node *seq_node; struct name_seq *ns; seq_head = &table.types[hash(type)]; - hlist_for_each_entry(ns, seq_node, seq_head, ns_list) { + hlist_for_each_entry(ns, seq_head, ns_list) { if (ns->type == type) return ns; } @@ -853,7 +852,6 @@ static int nametbl_list(char *buf, int len, u32 depth_info, u32 type, u32 lowbound, u32 upbound) { struct hlist_head *seq_head; - struct hlist_node *seq_node; struct name_seq *seq; int all_types; int ret = 0; @@ -873,7 +871,7 @@ static int nametbl_list(char *buf, int len, u32 depth_info, upbound = ~0; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { seq_head = &table.types[i]; - hlist_for_each_entry(seq, seq_node, seq_head, ns_list) { + hlist_for_each_entry(seq, seq_head, ns_list) { ret += nameseq_list(seq, buf + ret, len - ret, depth, seq->type, lowbound, upbound, i); @@ -889,7 +887,7 @@ static int nametbl_list(char *buf, int len, u32 depth_info, ret += nametbl_header(buf + ret, len - ret, depth); i = hash(type); seq_head = &table.types[i]; - hlist_for_each_entry(seq, seq_node, seq_head, ns_list) { + hlist_for_each_entry(seq, seq_head, ns_list) { if (seq->type == type) { ret += nameseq_list(seq, buf + ret, len - ret, depth, type, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 6675914dc592..8bcd4985d0fb 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -44,7 +44,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) struct nlmsghdr *rep_nlh; struct nlmsghdr 
*req_nlh = info->nlhdr; struct tipc_genlmsghdr *req_userhdr = info->userhdr; - int hdr_space = NLMSG_SPACE(GENL_HDRLEN + TIPC_GENL_HDRLEN); + int hdr_space = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN); u16 cmd; if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN))) @@ -53,8 +53,8 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) cmd = req_userhdr->cmd; rep_buf = tipc_cfg_do_cmd(req_userhdr->dest, cmd, - NLMSG_DATA(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN, - NLMSG_PAYLOAD(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN), + nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN, + nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN), hdr_space); if (rep_buf) { diff --git a/net/tipc/node.c b/net/tipc/node.c index 48f39dd3eae8..6e6c434872e8 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -69,12 +69,11 @@ static unsigned int tipc_hashfn(u32 addr) struct tipc_node *tipc_node_find(u32 addr) { struct tipc_node *node; - struct hlist_node *pos; if (unlikely(!in_own_cluster_exact(addr))) return NULL; - hlist_for_each_entry(node, pos, &node_htable[tipc_hashfn(addr)], hash) { + hlist_for_each_entry(node, &node_htable[tipc_hashfn(addr)], hash) { if (node->addr == addr) return node; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9b4e4833a484..515ce38e4f4c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -43,7 +43,8 @@ #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ -#define OVERLOAD_LIMIT_BASE 10000 +#define CONN_OVERLOAD_LIMIT ((TIPC_FLOW_CONTROL_WIN * 2 + 1) * \ + SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ struct tipc_sock { @@ -129,19 +130,6 @@ static void advance_rx_queue(struct sock *sk) } /** - * discard_rx_queue - discard all buffers in socket receive queue - * - * Caller must hold socket lock - */ -static void discard_rx_queue(struct sock *sk) -{ - struct sk_buff *buf; - - while ((buf = __skb_dequeue(&sk->sk_receive_queue))) - kfree_skb(buf); -} - -/** * reject_rx_queue - reject all buffers in socket receive queue * * Caller must hold socket lock @@ -215,7 +203,6 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol, sock_init_data(sock, sk); sk->sk_backlog_rcv = backlog_rcv; - sk->sk_rcvbuf = TIPC_FLOW_CONTROL_WIN * 2 * TIPC_MAX_USER_MSG_SIZE * 2; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; tipc_sk(sk)->p = tp_ptr; @@ -292,7 +279,7 @@ static int release(struct socket *sock) res = tipc_deleteport(tport->ref); /* Discard any remaining (connection-based) messages in receive queue */ - discard_rx_queue(sk); + __skb_queue_purge(&sk->sk_receive_queue); /* Reject any messages that accumulated in backlog queue */ sock->state = SS_DISCONNECTING; @@ -516,8 +503,7 @@ static int send_msg(struct kiocb *iocb, struct socket *sock, if (unlikely((m->msg_namelen < sizeof(*dest)) || (dest->family != AF_TIPC))) return -EINVAL; - if ((total_len > TIPC_MAX_USER_MSG_SIZE) || - (m->msg_iovlen > (unsigned int)INT_MAX)) + if (total_len > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; if (iocb) @@ -625,8 +611,7 @@ static int send_packet(struct kiocb *iocb, struct socket *sock, if (unlikely(dest)) return send_msg(iocb, sock, m, total_len); - if ((total_len > TIPC_MAX_USER_MSG_SIZE) || - (m->msg_iovlen > (unsigned int)INT_MAX)) + if (total_len > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; if (iocb) @@ -711,8 +696,7 @@ static int send_stream(struct kiocb *iocb, struct socket *sock, goto exit; } - if ((total_len 
 > (unsigned int)INT_MAX) || - (m->msg_iovlen > (unsigned int)INT_MAX)) { + if (total_len > (unsigned int)INT_MAX) { res = -EMSGSIZE; goto exit; } @@ -806,6 +790,7 @@ static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) if (addr) { addr->family = AF_TIPC; addr->addrtype = TIPC_ADDR_ID; + memset(&addr->addr, 0, sizeof(addr->addr)); addr->addr.id.ref = msg_origport(msg); addr->addr.id.node = msg_orignode(msg); addr->addr.name.domain = 0; /* could leave uninitialized */ @@ -920,6 +905,9 @@ static int recv_msg(struct kiocb *iocb, struct socket *sock, goto exit; } + /* will be updated in set_orig_addr() if needed */ + m->msg_namelen = 0; + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); restart: @@ -1029,6 +1017,9 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, goto exit; } + /* will be updated in set_orig_addr() if needed */ + m->msg_namelen = 0; + target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); @@ -1155,34 +1146,6 @@ static void tipc_data_ready(struct sock *sk, int len) } /** - * rx_queue_full - determine if receive queue can accept another message - * @msg: message to be added to queue - * @queue_size: current size of queue - * @base: nominal maximum size of queue - * - * Returns 1 if queue is unable to accept message, 0 otherwise - */ -static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base) -{ - u32 threshold; - u32 imp = msg_importance(msg); - - if (imp == TIPC_LOW_IMPORTANCE) - threshold = base; - else if (imp == TIPC_MEDIUM_IMPORTANCE) - threshold = base * 2; - else if (imp == TIPC_HIGH_IMPORTANCE) - threshold = base * 100; - else - return 0; - - if (msg_connected(msg)) - threshold *= 4; - - return queue_size >= threshold; -} - -/** * filter_connect - Handle all incoming messages for a connection-based socket * @tsock: TIPC socket * @msg: message @@ -1260,6 +1223,36 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) } /** + * rcvbuf_limit - get the proper overload limit of socket receive queue + * @sk: socket + * @buf: message + * + * For all connection-oriented messages, irrespective of importance, + * the default overload value (i.e. 67MB) is set as the limit. 
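+ * (That figure follows from CONN_OVERLOAD_LIMIT above: taking the usual TIPC_FLOW_CONTROL_WIN of 512 and TIPC_MAX_USER_MSG_SIZE of 66000, (512 * 2 + 1) * SKB_TRUESIZE(66000) comes to roughly 67MB. The connectionless figures below are sk_rcvbuf << (importance + 5) evaluated for a 160KiB sk_rcvbuf, each level doubling the previous one.)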
 + * + * For all connectionless messages, by default new queue limits are + * as below: + * + * TIPC_LOW_IMPORTANCE (5MB) + * TIPC_MEDIUM_IMPORTANCE (10MB) + * TIPC_HIGH_IMPORTANCE (20MB) + * TIPC_CRITICAL_IMPORTANCE (40MB) + * + * Returns overload limit according to corresponding message importance + */ +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + unsigned int limit; + + if (msg_connected(msg)) + limit = CONN_OVERLOAD_LIMIT; + else + limit = sk->sk_rcvbuf << (msg_importance(msg) + 5); + return limit; +} + +/** * filter_rcv - validate incoming message * @sk: socket * @buf: message @@ -1275,7 +1268,7 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) { struct socket *sock = sk->sk_socket; struct tipc_msg *msg = buf_msg(buf); - u32 recv_q_len; + unsigned int limit = rcvbuf_limit(sk, buf); u32 res = TIPC_OK; /* Reject message if it is wrong sort of message for socket */ @@ -1292,15 +1285,13 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) } /* Reject message if there isn't room to queue it */ - recv_q_len = skb_queue_len(&sk->sk_receive_queue); - if (unlikely(recv_q_len >= (OVERLOAD_LIMIT_BASE / 2))) { - if (rx_queue_full(msg, recv_q_len, OVERLOAD_LIMIT_BASE / 2)) - return TIPC_ERR_OVERLOAD; - } + if (sk_rmem_alloc_get(sk) + buf->truesize >= limit) + return TIPC_ERR_OVERLOAD; - /* Enqueue message (finally!) */ + /* Enqueue message */ TIPC_SKB_CB(buf)->handle = 0; __skb_queue_tail(&sk->sk_receive_queue, buf); + skb_set_owner_r(buf, sk); sk->sk_data_ready(sk, 0); return TIPC_OK; @@ -1349,7 +1340,7 @@ static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf) if (!sock_owned_by_user(sk)) { res = filter_rcv(sk, buf); } else { - if (sk_add_backlog(sk, buf, sk->sk_rcvbuf)) + if (sk_add_backlog(sk, buf, rcvbuf_limit(sk, buf))) res = TIPC_ERR_OVERLOAD; else res = TIPC_OK; @@ -1583,6 +1574,7 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags) } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); + skb_set_owner_r(buf, new_sk); } release_sock(new_sk); @@ -1637,7 +1629,7 @@ restart: case SS_DISCONNECTING: /* Discard any unreceived messages */ - discard_rx_queue(sk); + __skb_queue_purge(&sk->sk_receive_queue); /* Wake up anyone sleeping in poll */ sk->sk_state_change(sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5b5c876c80e9..826e09938bff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -263,9 +263,8 @@ static struct sock *__unix_find_socket_byname(struct net *net, int len, int type, unsigned int hash) { struct sock *s; - struct hlist_node *node; - sk_for_each(s, node, &unix_socket_table[hash ^ type]) { + sk_for_each(s, &unix_socket_table[hash ^ type]) { struct unix_sock *u = unix_sk(s); if (!net_eq(sock_net(s), net)) @@ -298,10 +297,9 @@ static inline struct sock *unix_find_socket_byname(struct net *net, static struct sock *unix_find_socket_byinode(struct inode *i) { struct sock *s; - struct hlist_node *node; spin_lock(&unix_table_lock); - sk_for_each(s, node, + sk_for_each(s, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->path.dentry; @@ -384,7 +382,7 @@ static void unix_sock_destructor(struct sock *sk) #endif } -static int unix_release_sock(struct sock *sk, int embrion) +static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); struct path path; @@ -453,8 +451,6 @@ static int unix_release_sock(struct sock *sk, int 
embrion) if (unix_tot_inflight) unix_gc(); /* Garbage collect fds */ - - return 0; } static void init_peercred(struct sock *sk) @@ -701,9 +697,10 @@ static int unix_release(struct socket *sock) if (!sk) return 0; + unix_release_sock(sk, 0); sock->sk = NULL; - return unix_release_sock(sk, 0); + return 0; } static int unix_autobind(struct socket *sock) @@ -1343,7 +1340,6 @@ static void unix_destruct_scm(struct sk_buff *skb) struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); scm.pid = UNIXCB(skb).pid; - scm.cred = UNIXCB(skb).cred; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); @@ -1394,8 +1390,8 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen int err = 0; UNIXCB(skb).pid = get_pid(scm->pid); - if (scm->cred) - UNIXCB(skb).cred = get_cred(scm->cred); + UNIXCB(skb).uid = scm->creds.uid; + UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1412,13 +1408,13 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, const struct sock *other) { - if (UNIXCB(skb).cred) + if (UNIXCB(skb).pid) return; if (test_bit(SOCK_PASSCRED, &sock->flags) || !other->sk_socket || test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); - UNIXCB(skb).cred = get_current_cred(); + current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } @@ -1822,7 +1818,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); } - scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(siocb->scm, skb); if (!(flags & MSG_PEEK)) { @@ -1862,10 +1858,10 @@ out: } /* - * Sleep until data has arrive. But check for races.. + * Sleep until more data has arrived. But check for races.. 
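+ * ("last" is the tail of the receive queue as the caller last saw it; waking only when skb_peek_tail() moves past it stops a reader that has already peeked beyond all queued data, via MSG_PEEK with an offset, from being woken by a queue that is non-empty yet holds nothing new.)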
*/ - -static long unix_stream_data_wait(struct sock *sk, long timeo) +static long unix_stream_data_wait(struct sock *sk, long timeo, + struct sk_buff *last) { DEFINE_WAIT(wait); @@ -1874,7 +1870,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo) for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (!skb_queue_empty(&sk->sk_receive_queue) || + if (skb_peek_tail(&sk->sk_receive_queue) != last || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || @@ -1893,8 +1889,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo) return timeo; } - - static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) @@ -1939,14 +1933,12 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; } - skip = sk_peek_offset(sk, flags); - do { int chunk; - struct sk_buff *skb; + struct sk_buff *skb, *last; unix_state_lock(sk); - skb = skb_peek(&sk->sk_receive_queue); + last = skb = skb_peek(&sk->sk_receive_queue); again: if (skb == NULL) { unix_sk(sk)->recursion_level = 0; @@ -1969,7 +1961,7 @@ again: break; mutex_unlock(&u->readlock); - timeo = unix_stream_data_wait(sk, timeo); + timeo = unix_stream_data_wait(sk, timeo, last); if (signal_pending(current) || mutex_lock_interruptible(&u->readlock)) { @@ -1983,10 +1975,13 @@ again: break; } - if (skip >= skb->len) { + skip = sk_peek_offset(sk, flags); + while (skip >= skb->len) { skip -= skb->len; + last = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); - goto again; + if (!skb) + goto again; } unix_state_unlock(sk); @@ -1994,11 +1989,12 @@ again: if (check_creds) { /* Never glue messages from different writers */ if ((UNIXCB(skb).pid != siocb->scm->pid) || - (UNIXCB(skb).cred != siocb->scm->cred)) + !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) || + !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid)) break; - } else { + } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ - scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); check_creds = 1; } @@ -2199,7 +2195,9 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= POLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? 
POLLPRI : 0); + if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) @@ -2402,7 +2400,7 @@ static int __net_init unix_net_init(struct net *net) goto out; #ifdef CONFIG_PROC_FS - if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) { + if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) { unix_sysctl_unregister(net); goto out; } @@ -2415,7 +2413,7 @@ out: static void __net_exit unix_net_exit(struct net *net) { unix_sysctl_unregister(net); - proc_net_remove(net, "unix"); + remove_proc_entry("unix", net->proc_net); } static struct pernet_operations unix_net_ops = { @@ -2426,9 +2424,8 @@ static struct pernet_operations unix_net_ops = { static int __init af_unix_init(void) { int rc = -1; - struct sk_buff *dummy_skb; - BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb)); rc = proto_register(&unix_proto, 1); if (rc != 0) { diff --git a/net/unix/diag.c b/net/unix/diag.c index 5ac19dc1d5e4..d591091603bf 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -192,10 +192,9 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) slot < ARRAY_SIZE(unix_socket_table); s_num = 0, slot++) { struct sock *sk; - struct hlist_node *node; num = 0; - sk_for_each(sk, node, &unix_socket_table[slot]) { + sk_for_each(sk, &unix_socket_table[slot]) { if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) @@ -226,9 +225,7 @@ static struct sock *unix_lookup_by_ino(int ino) spin_lock(&unix_table_lock); for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { - struct hlist_node *node; - - sk_for_each(sk, node, &unix_socket_table[i]) + sk_for_each(sk, &unix_socket_table[i]) if (ino == sock_i_ino(sk)) { sock_hold(sk); spin_unlock(&unix_table_lock); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index b6f4b994eb35..9bc73f87f64a 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -99,7 +99,7 @@ unsigned int unix_tot_inflight; struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); /* * Socket ? 
 @@ -185,7 +185,7 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), * have been added to the queues after * starting the garbage collection */ - if (u->gc_candidate) { + if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; func(u); } @@ -254,7 +254,7 @@ static void inc_inflight_move_tail(struct unix_sock *u) * of the list, so that it's checked even if it was already * passed over */ - if (u->gc_maybe_cycle) + if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) list_move_tail(&u->link, &gc_candidates); } @@ -315,8 +315,8 @@ void unix_gc(void) BUG_ON(total_refs < inflight_refs); if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); - u->gc_candidate = 1; - u->gc_maybe_cycle = 1; + __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); + __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); } } @@ -344,7 +344,7 @@ void unix_gc(void) if (atomic_long_read(&u->inflight) > 0) { list_move_tail(&u->link, &not_cycle_list); - u->gc_maybe_cycle = 0; + __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); } } @@ -356,7 +356,7 @@ void unix_gc(void) */ while (!list_empty(&not_cycle_list)) { u = list_entry(not_cycle_list.next, struct unix_sock, link); - u->gc_candidate = 0; + __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); list_move_tail(&u->link, &gc_inflight_list); } diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig new file mode 100644 index 000000000000..b5fa7e40cdcb --- /dev/null +++ b/net/vmw_vsock/Kconfig @@ -0,0 +1,28 @@ +# +# Vsock protocol +# + +config VSOCKETS + tristate "Virtual Socket protocol" + help + Virtual Socket Protocol is a socket protocol similar to TCP/IP + allowing communication between Virtual Machines and the hypervisor + or host. + + You should also select one or more hypervisor-specific transports + below. + + To compile this driver as a module, choose M here: the module + will be called vsock. If unsure, say N. + +config VMWARE_VMCI_VSOCKETS + tristate "VMware VMCI transport for Virtual Sockets" + depends on VSOCKETS && VMWARE_VMCI + help + This module implements a VMCI transport for Virtual Sockets. + + Enable this transport if your Virtual Machine runs on a VMware + hypervisor. + + To compile this driver as a module, choose M here: the module + will be called vmw_vsock_vmci_transport. If unsure, say N. diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile new file mode 100644 index 000000000000..2ce52d70f224 --- /dev/null +++ b/net/vmw_vsock/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_VSOCKETS) += vsock.o +obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o + +vsock-y += af_vsock.o vsock_addr.o + +vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ + vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c new file mode 100644 index 000000000000..3f77f42a3b58 --- /dev/null +++ b/net/vmw_vsock/af_vsock.c @@ -0,0 +1,2014 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +/* Implementation notes: + * + * - There are two kinds of sockets: those created by user action (such as + * calling socket(2)) and those created by incoming connection request packets. + * + * - There are two "global" tables, one for bound sockets (sockets that have + * specified an address that they are responsible for) and one for connected + * sockets (sockets that have established a connection with another socket). + * These tables are "global" in that all sockets on the system are placed + * within them. - Note, though, that the bound table contains an extra entry + * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in + * that list. The bound table is used solely for lookup of sockets when packets + * are received and that's not necessary for SOCK_DGRAM sockets since we create + * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM + * sockets out of the bound hash buckets will reduce the chance of collisions + * when looking for SOCK_STREAM sockets and prevents us from having to check the + * socket type in the hash table lookups. + * + * - Sockets created by user action will either be "client" sockets that + * initiate a connection or "server" sockets that listen for connections; we do + * not support simultaneous connects (two "client" sockets connecting). + * + * - "Server" sockets are referred to as listener sockets throughout this + * implementation because they are in the SS_LISTEN state. When a connection + * request is received (the second kind of socket mentioned above), we create a + * new socket and refer to it as a pending socket. These pending sockets are + * placed on the pending connection list of the listener socket. When future + * packets are received for the address the listener socket is bound to, we + * check if the source of the packet is from one that has an existing pending + * connection. If it does, we process the packet for the pending socket. When + * that socket reaches the connected state, it is removed from the listener + * socket's pending list and enqueued in the listener socket's accept queue. + * Callers of accept(2) will accept connected sockets from the listener socket's + * accept queue. If the socket cannot be accepted for some reason then it is + * marked rejected. Once the connection is accepted, it is owned by the user + * process and the responsibility for cleanup falls with that user process. + * + * - It is possible that these pending sockets will never reach the connected + * state; in fact, we may never receive another packet after the connection + * request. Because of this, we must schedule a cleanup function to run in the + * future, after some amount of time passes where a connection should have been + * established. This function ensures that the socket is off all lists so it + * cannot be retrieved, then drops all references to the socket so it is cleaned + * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this + * function will also cleanup rejected sockets, those that reach the connected + * state but leave it before they have been accepted. + * + * - Sockets created by user action will be cleaned up when the user process + * calls close(2), causing our release implementation to be called. Our release + * implementation will perform some cleanup then drop the last reference so our + * sk_destruct implementation is invoked. Our sk_destruct implementation will + * perform additional cleanup that's common for both types of sockets. 
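+ * + * (The timeout-driven cleanup described above is implemented by + * vsock_pending_work(), further down in this file.)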
+ * + * - A socket's reference count is what ensures that the structure won't be + * freed. Each entry in a list (such as the "global" bound and connected tables + * and the listener socket's pending list and connected queue) ensures a + * reference. When we defer work until process context and pass a socket as our + * argument, we must ensure the reference count is increased to ensure the + * socket isn't freed before the function is run; the deferred function will + * then drop the reference. + */ + +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/cred.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/net.h> +#include <linux/poll.h> +#include <linux/skbuff.h> +#include <linux/smp.h> +#include <linux/socket.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <net/sock.h> + +#include "af_vsock.h" + +static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); +static void vsock_sk_destruct(struct sock *sk); +static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +/* Protocol family. */ +static struct proto vsock_proto = { + .name = "AF_VSOCK", + .owner = THIS_MODULE, + .obj_size = sizeof(struct vsock_sock), +}; + +/* The default peer timeout indicates how long we will wait for a peer response + * to a control message. + */ +#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) + +#define SS_LISTEN 255 + +static const struct vsock_transport *transport; +static DEFINE_MUTEX(vsock_register_mutex); + +/**** EXPORTS ****/ + +/* Get the ID of the local context. This is transport dependent. */ + +int vm_sockets_get_local_cid(void) +{ + return transport->get_local_cid(); +} +EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); + +/**** UTILS ****/ + +/* Each bound VSocket is stored in the bind hash table and each connected + * VSocket is stored in the connected hash table. + * + * Unbound sockets are all put on the same list attached to the end of the hash + * table (vsock_unbound_sockets). Bound sockets are added to the hash table in + * the bucket that their local address hashes to (vsock_bound_sockets(addr) + * represents the list that addr hashes to). + * + * Specifically, we initialize the vsock_bind_table array to a size of + * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through + * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and + * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function + * mods with VSOCK_HASH_SIZE - 1 to ensure this. + */ +#define VSOCK_HASH_SIZE 251 +#define MAX_PORT_RETRIES 24 + +#define VSOCK_HASH(addr) ((addr)->svm_port % (VSOCK_HASH_SIZE - 1)) +#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) +#define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) + +/* XXX This can probably be implemented in a better way. 
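+ * (As defined below, the hash mixes only the peer's CID with the local port, so every connection from one peer to one local port lands in the same bucket.)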
*/ +#define VSOCK_CONN_HASH(src, dst) \ + (((src)->svm_cid ^ (dst)->svm_port) % (VSOCK_HASH_SIZE - 1)) +#define vsock_connected_sockets(src, dst) \ + (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) +#define vsock_connected_sockets_vsk(vsk) \ + vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) + +static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +static struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +static DEFINE_SPINLOCK(vsock_table_lock); + +static void vsock_init_tables(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) + INIT_LIST_HEAD(&vsock_bind_table[i]); + + for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) + INIT_LIST_HEAD(&vsock_connected_table[i]); +} + +static void __vsock_insert_bound(struct list_head *list, + struct vsock_sock *vsk) +{ + sock_hold(&vsk->sk); + list_add(&vsk->bound_table, list); +} + +static void __vsock_insert_connected(struct list_head *list, + struct vsock_sock *vsk) +{ + sock_hold(&vsk->sk); + list_add(&vsk->connected_table, list); +} + +static void __vsock_remove_bound(struct vsock_sock *vsk) +{ + list_del_init(&vsk->bound_table); + sock_put(&vsk->sk); +} + +static void __vsock_remove_connected(struct vsock_sock *vsk) +{ + list_del_init(&vsk->connected_table); + sock_put(&vsk->sk); +} + +static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) +{ + struct vsock_sock *vsk; + + list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) + if (addr->svm_port == vsk->local_addr.svm_port) + return sk_vsock(vsk); + + return NULL; +} + +static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, + struct sockaddr_vm *dst) +{ + struct vsock_sock *vsk; + + list_for_each_entry(vsk, vsock_connected_sockets(src, dst), + connected_table) { + if (vsock_addr_equals_addr(src, &vsk->remote_addr) && + dst->svm_port == vsk->local_addr.svm_port) { + return sk_vsock(vsk); + } + } + + return NULL; +} + +static bool __vsock_in_bound_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->bound_table); +} + +static bool __vsock_in_connected_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->connected_table); +} + +static void vsock_insert_unbound(struct vsock_sock *vsk) +{ + spin_lock_bh(&vsock_table_lock); + __vsock_insert_bound(vsock_unbound_sockets, vsk); + spin_unlock_bh(&vsock_table_lock); +} + +void vsock_insert_connected(struct vsock_sock *vsk) +{ + struct list_head *list = vsock_connected_sockets( + &vsk->remote_addr, &vsk->local_addr); + + spin_lock_bh(&vsock_table_lock); + __vsock_insert_connected(list, vsk); + spin_unlock_bh(&vsock_table_lock); +} +EXPORT_SYMBOL_GPL(vsock_insert_connected); + +void vsock_remove_bound(struct vsock_sock *vsk) +{ + spin_lock_bh(&vsock_table_lock); + __vsock_remove_bound(vsk); + spin_unlock_bh(&vsock_table_lock); +} +EXPORT_SYMBOL_GPL(vsock_remove_bound); + +void vsock_remove_connected(struct vsock_sock *vsk) +{ + spin_lock_bh(&vsock_table_lock); + __vsock_remove_connected(vsk); + spin_unlock_bh(&vsock_table_lock); +} +EXPORT_SYMBOL_GPL(vsock_remove_connected); + +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) +{ + struct sock *sk; + + spin_lock_bh(&vsock_table_lock); + sk = __vsock_find_bound_socket(addr); + if (sk) + sock_hold(sk); + + spin_unlock_bh(&vsock_table_lock); + + return sk; +} +EXPORT_SYMBOL_GPL(vsock_find_bound_socket); + +struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, + struct sockaddr_vm *dst) +{ + struct sock *sk; + + spin_lock_bh(&vsock_table_lock); + sk = 
__vsock_find_connected_socket(src, dst);
+	if (sk)
+		sock_hold(sk);
+
+	spin_unlock_bh(&vsock_table_lock);
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
+
+static bool vsock_in_bound_table(struct vsock_sock *vsk)
+{
+	bool ret;
+
+	spin_lock_bh(&vsock_table_lock);
+	ret = __vsock_in_bound_table(vsk);
+	spin_unlock_bh(&vsock_table_lock);
+
+	return ret;
+}
+
+static bool vsock_in_connected_table(struct vsock_sock *vsk)
+{
+	bool ret;
+
+	spin_lock_bh(&vsock_table_lock);
+	ret = __vsock_in_connected_table(vsk);
+	spin_unlock_bh(&vsock_table_lock);
+
+	return ret;
+}
+
+void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
+{
+	int i;
+
+	spin_lock_bh(&vsock_table_lock);
+
+	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
+		struct vsock_sock *vsk;
+		list_for_each_entry(vsk, &vsock_connected_table[i],
+				    connected_table)
+			fn(sk_vsock(vsk));
+	}
+
+	spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
+
+void vsock_add_pending(struct sock *listener, struct sock *pending)
+{
+	struct vsock_sock *vlistener;
+	struct vsock_sock *vpending;
+
+	vlistener = vsock_sk(listener);
+	vpending = vsock_sk(pending);
+
+	sock_hold(pending);
+	sock_hold(listener);
+	list_add_tail(&vpending->pending_links, &vlistener->pending_links);
+}
+EXPORT_SYMBOL_GPL(vsock_add_pending);
+
+void vsock_remove_pending(struct sock *listener, struct sock *pending)
+{
+	struct vsock_sock *vpending = vsock_sk(pending);
+
+	list_del_init(&vpending->pending_links);
+	sock_put(listener);
+	sock_put(pending);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_pending);
+
+void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
+{
+	struct vsock_sock *vlistener;
+	struct vsock_sock *vconnected;
+
+	vlistener = vsock_sk(listener);
+	vconnected = vsock_sk(connected);
+
+	sock_hold(connected);
+	sock_hold(listener);
+	list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
+}
+EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
+
+static struct sock *vsock_dequeue_accept(struct sock *listener)
+{
+	struct vsock_sock *vlistener;
+	struct vsock_sock *vconnected;
+
+	vlistener = vsock_sk(listener);
+
+	if (list_empty(&vlistener->accept_queue))
+		return NULL;
+
+	vconnected = list_entry(vlistener->accept_queue.next,
+				struct vsock_sock, accept_queue);
+
+	list_del_init(&vconnected->accept_queue);
+	sock_put(listener);
+	/* The caller will need a reference on the connected socket so we let
+	 * it call sock_put().
+	 */
+
+	return sk_vsock(vconnected);
+}
+
+static bool vsock_is_accept_queue_empty(struct sock *sk)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	return list_empty(&vsk->accept_queue);
+}
+
+static bool vsock_is_pending(struct sock *sk)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	return !list_empty(&vsk->pending_links);
+}
+
+static int vsock_send_shutdown(struct sock *sk, int mode)
+{
+	return transport->shutdown(vsock_sk(sk), mode);
+}
+
+void vsock_pending_work(struct work_struct *work)
+{
+	struct sock *sk;
+	struct sock *listener;
+	struct vsock_sock *vsk;
+	bool cleanup;
+
+	vsk = container_of(work, struct vsock_sock, dwork.work);
+	sk = sk_vsock(vsk);
+	listener = vsk->listener;
+	cleanup = true;
+
+	lock_sock(listener);
+	lock_sock(sk);
+
+	if (vsock_is_pending(sk)) {
+		vsock_remove_pending(listener, sk);
+	} else if (!vsk->rejected) {
+		/* We are not on the pending list and accept() did not reject
+		 * us, so we must have been accepted by our user process.  We
+		 * just need to drop our references to the sockets and be on
+		 * our way.
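+		 *
+		 * (Editor's aside, not part of the applied patch: the
+		 * listener bookkeeping this worker unwinds, roughly:
+		 *
+		 *	request received  ->  child on listener's pending_links
+		 *	handshake done    ->  child moved to accept_queue
+		 *	accept() rejected ->  child->rejected set, torn down
+		 *			      here)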
+ */ + cleanup = false; + goto out; + } + + listener->sk_ack_backlog--; + + /* We need to remove ourself from the global connected sockets list so + * incoming packets can't find this socket, and to reduce the reference + * count. + */ + if (vsock_in_connected_table(vsk)) + vsock_remove_connected(vsk); + + sk->sk_state = SS_FREE; + +out: + release_sock(sk); + release_sock(listener); + if (cleanup) + sock_put(sk); + + sock_put(sk); + sock_put(listener); +} +EXPORT_SYMBOL_GPL(vsock_pending_work); + +/**** SOCKET OPERATIONS ****/ + +static int __vsock_bind_stream(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + static u32 port = LAST_RESERVED_PORT + 1; + struct sockaddr_vm new_addr; + + vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); + + if (addr->svm_port == VMADDR_PORT_ANY) { + bool found = false; + unsigned int i; + + for (i = 0; i < MAX_PORT_RETRIES; i++) { + if (port <= LAST_RESERVED_PORT) + port = LAST_RESERVED_PORT + 1; + + new_addr.svm_port = port++; + + if (!__vsock_find_bound_socket(&new_addr)) { + found = true; + break; + } + } + + if (!found) + return -EADDRNOTAVAIL; + } else { + /* If port is in reserved range, ensure caller + * has necessary privileges. + */ + if (addr->svm_port <= LAST_RESERVED_PORT && + !capable(CAP_NET_BIND_SERVICE)) { + return -EACCES; + } + + if (__vsock_find_bound_socket(&new_addr)) + return -EADDRINUSE; + } + + vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); + + /* Remove stream sockets from the unbound list and add them to the hash + * table for easy lookup by its address. The unbound list is simply an + * extra entry at the end of the hash table, a trick used by AF_UNIX. + */ + __vsock_remove_bound(vsk); + __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); + + return 0; +} + +static int __vsock_bind_dgram(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return transport->dgram_bind(vsk, addr); +} + +static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) +{ + struct vsock_sock *vsk = vsock_sk(sk); + u32 cid; + int retval; + + /* First ensure this socket isn't already bound. */ + if (vsock_addr_bound(&vsk->local_addr)) + return -EINVAL; + + /* Now bind to the provided address or select appropriate values if + * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that + * like AF_INET prevents binding to a non-local IP address (in most + * cases), we only allow binding to the local CID. + */ + cid = transport->get_local_cid(); + if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY) + return -EADDRNOTAVAIL; + + switch (sk->sk_socket->type) { + case SOCK_STREAM: + spin_lock_bh(&vsock_table_lock); + retval = __vsock_bind_stream(vsk, addr); + spin_unlock_bh(&vsock_table_lock); + break; + + case SOCK_DGRAM: + retval = __vsock_bind_dgram(vsk, addr); + break; + + default: + retval = -EINVAL; + break; + } + + return retval; +} + +struct sock *__vsock_create(struct net *net, + struct socket *sock, + struct sock *parent, + gfp_t priority, + unsigned short type) +{ + struct sock *sk; + struct vsock_sock *psk; + struct vsock_sock *vsk; + + sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + + /* sk->sk_type is normally set in sock_init_data, but only if sock is + * non-NULL. We make sure that our sockets always have a type by + * setting it here if needed. 
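+	 *
+	 * Editor's illustration, not part of the applied patch: from user
+	 * space the autobind rules in __vsock_bind_stream() above reduce
+	 * to the sketch below (assuming the sockaddr_vm definitions from
+	 * the uapi header); VMADDR_PORT_ANY asks the kernel for an
+	 * ephemeral port above LAST_RESERVED_PORT (1023):
+	 *
+	 *	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+	 *	struct sockaddr_vm addr = {
+	 *		.svm_family = AF_VSOCK,
+	 *		.svm_cid    = VMADDR_CID_ANY,
+	 *		.svm_port   = VMADDR_PORT_ANY,
+	 *	};
+	 *	bind(fd, (struct sockaddr *)&addr, sizeof(addr));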
+ */ + if (!sock) + sk->sk_type = type; + + vsk = vsock_sk(sk); + vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); + vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); + + sk->sk_destruct = vsock_sk_destruct; + sk->sk_backlog_rcv = vsock_queue_rcv_skb; + sk->sk_state = 0; + sock_reset_flag(sk, SOCK_DONE); + + INIT_LIST_HEAD(&vsk->bound_table); + INIT_LIST_HEAD(&vsk->connected_table); + vsk->listener = NULL; + INIT_LIST_HEAD(&vsk->pending_links); + INIT_LIST_HEAD(&vsk->accept_queue); + vsk->rejected = false; + vsk->sent_request = false; + vsk->ignore_connecting_rst = false; + vsk->peer_shutdown = 0; + + psk = parent ? vsock_sk(parent) : NULL; + if (parent) { + vsk->trusted = psk->trusted; + vsk->owner = get_cred(psk->owner); + vsk->connect_timeout = psk->connect_timeout; + } else { + vsk->trusted = capable(CAP_NET_ADMIN); + vsk->owner = get_current_cred(); + vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; + } + + if (transport->init(vsk, psk) < 0) { + sk_free(sk); + return NULL; + } + + if (sock) + vsock_insert_unbound(vsk); + + return sk; +} +EXPORT_SYMBOL_GPL(__vsock_create); + +static void __vsock_release(struct sock *sk) +{ + if (sk) { + struct sk_buff *skb; + struct sock *pending; + struct vsock_sock *vsk; + + vsk = vsock_sk(sk); + pending = NULL; /* Compiler warning. */ + + if (vsock_in_bound_table(vsk)) + vsock_remove_bound(vsk); + + if (vsock_in_connected_table(vsk)) + vsock_remove_connected(vsk); + + transport->release(vsk); + + lock_sock(sk); + sock_orphan(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + while ((skb = skb_dequeue(&sk->sk_receive_queue))) + kfree_skb(skb); + + /* Clean up any sockets that never were accepted. */ + while ((pending = vsock_dequeue_accept(sk)) != NULL) { + __vsock_release(pending); + sock_put(pending); + } + + release_sock(sk); + sock_put(sk); + } +} + +static void vsock_sk_destruct(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + transport->destruct(vsk); + + /* When clearing these addresses, there's no need to set the family and + * possibly register the address family with the kernel. 
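+	 *
+	 * Editor's aside, not part of the applied patch: the deferred-work
+	 * reference pattern described at the top of this file shows up in
+	 * vsock_stream_connect()/vsock_connect_timeout() below and reduces
+	 * to:
+	 *
+	 *	sock_hold(sk);
+	 *	INIT_DELAYED_WORK(&vsk->dwork, vsock_connect_timeout);
+	 *	schedule_delayed_work(&vsk->dwork, timeout);
+	 *
+	 * with the worker dropping the reference via sock_put(sk) when done.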
+ */ + vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); + vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); + + put_cred(vsk->owner); +} + +static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int err; + + err = sock_queue_rcv_skb(sk, skb); + if (err) + kfree_skb(skb); + + return err; +} + +s64 vsock_stream_has_data(struct vsock_sock *vsk) +{ + return transport->stream_has_data(vsk); +} +EXPORT_SYMBOL_GPL(vsock_stream_has_data); + +s64 vsock_stream_has_space(struct vsock_sock *vsk) +{ + return transport->stream_has_space(vsk); +} +EXPORT_SYMBOL_GPL(vsock_stream_has_space); + +static int vsock_release(struct socket *sock) +{ + __vsock_release(sock->sk); + sock->sk = NULL; + sock->state = SS_FREE; + + return 0; +} + +static int +vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + int err; + struct sock *sk; + struct sockaddr_vm *vm_addr; + + sk = sock->sk; + + if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) + return -EINVAL; + + lock_sock(sk); + err = __vsock_bind(sk, vm_addr); + release_sock(sk); + + return err; +} + +static int vsock_getname(struct socket *sock, + struct sockaddr *addr, int *addr_len, int peer) +{ + int err; + struct sock *sk; + struct vsock_sock *vsk; + struct sockaddr_vm *vm_addr; + + sk = sock->sk; + vsk = vsock_sk(sk); + err = 0; + + lock_sock(sk); + + if (peer) { + if (sock->state != SS_CONNECTED) { + err = -ENOTCONN; + goto out; + } + vm_addr = &vsk->remote_addr; + } else { + vm_addr = &vsk->local_addr; + } + + if (!vm_addr) { + err = -EINVAL; + goto out; + } + + /* sys_getsockname() and sys_getpeername() pass us a + * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately + * that macro is defined in socket.c instead of .h, so we hardcode its + * value here. + */ + BUILD_BUG_ON(sizeof(*vm_addr) > 128); + memcpy(addr, vm_addr, sizeof(*vm_addr)); + *addr_len = sizeof(*vm_addr); + +out: + release_sock(sk); + return err; +} + +static int vsock_shutdown(struct socket *sock, int mode) +{ + int err; + struct sock *sk; + + /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses + * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode + * here like the other address families do. Note also that the + * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), + * which is what we want. + */ + mode++; + + if ((mode & ~SHUTDOWN_MASK) || !mode) + return -EINVAL; + + /* If this is a STREAM socket and it is not connected then bail out + * immediately. If it is a DGRAM socket then we must first kick the + * socket so that it wakes up from any sleeping calls, for example + * recv(), and then afterwards return the error. + */ + + sk = sock->sk; + if (sock->state == SS_UNCONNECTED) { + err = -ENOTCONN; + if (sk->sk_type == SOCK_STREAM) + return err; + } else { + sock->state = SS_DISCONNECTING; + err = 0; + } + + /* Receive and send shutdowns are treated alike. 
*/ + mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); + if (mode) { + lock_sock(sk); + sk->sk_shutdown |= mode; + sk->sk_state_change(sk); + release_sock(sk); + + if (sk->sk_type == SOCK_STREAM) { + sock_reset_flag(sk, SOCK_DONE); + vsock_send_shutdown(sk, mode); + } + } + + return err; +} + +static unsigned int vsock_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk; + unsigned int mask; + struct vsock_sock *vsk; + + sk = sock->sk; + vsk = vsock_sk(sk); + + poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + if (sk->sk_err) + /* Signify that there has been an error on this socket. */ + mask |= POLLERR; + + /* INET sockets treat local write shutdown and peer write shutdown as a + * case of POLLHUP set. + */ + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + ((sk->sk_shutdown & SEND_SHUTDOWN) && + (vsk->peer_shutdown & SEND_SHUTDOWN))) { + mask |= POLLHUP; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN || + vsk->peer_shutdown & SEND_SHUTDOWN) { + mask |= POLLRDHUP; + } + + if (sock->type == SOCK_DGRAM) { + /* For datagram sockets we can read if there is something in + * the queue and write as long as the socket isn't shutdown for + * sending. + */ + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) { + mask |= POLLIN | POLLRDNORM; + } + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + } else if (sock->type == SOCK_STREAM) { + lock_sock(sk); + + /* Listening sockets that have connections in their accept + * queue can be read. + */ + if (sk->sk_state == SS_LISTEN + && !vsock_is_accept_queue_empty(sk)) + mask |= POLLIN | POLLRDNORM; + + /* If there is something in the queue then we can read. */ + if (transport->stream_is_active(vsk) && + !(sk->sk_shutdown & RCV_SHUTDOWN)) { + bool data_ready_now = false; + int ret = transport->notify_poll_in( + vsk, 1, &data_ready_now); + if (ret < 0) { + mask |= POLLERR; + } else { + if (data_ready_now) + mask |= POLLIN | POLLRDNORM; + + } + } + + /* Sockets whose connections have been closed, reset, or + * terminated should also be considered read, and we check the + * shutdown flag for that. + */ + if (sk->sk_shutdown & RCV_SHUTDOWN || + vsk->peer_shutdown & SEND_SHUTDOWN) { + mask |= POLLIN | POLLRDNORM; + } + + /* Connected sockets that can produce data can be written. */ + if (sk->sk_state == SS_CONNECTED) { + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + bool space_avail_now = false; + int ret = transport->notify_poll_out( + vsk, 1, &space_avail_now); + if (ret < 0) { + mask |= POLLERR; + } else { + if (space_avail_now) + /* Remove POLLWRBAND since INET + * sockets are not setting it. + */ + mask |= POLLOUT | POLLWRNORM; + + } + } + } + + /* Simulate INET socket poll behaviors, which sets + * POLLOUT|POLLWRNORM when peer is closed and nothing to read, + * but local send is not shutdown. + */ + if (sk->sk_state == SS_UNCONNECTED) { + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) + mask |= POLLOUT | POLLWRNORM; + + } + + release_sock(sk); + } + + return mask; +} + +static int vsock_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + int err; + struct sock *sk; + struct vsock_sock *vsk; + struct sockaddr_vm *remote_addr; + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* For now, MSG_DONTWAIT is always assumed... 
*/ + err = 0; + sk = sock->sk; + vsk = vsock_sk(sk); + + lock_sock(sk); + + if (!vsock_addr_bound(&vsk->local_addr)) { + struct sockaddr_vm local_addr; + + vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); + err = __vsock_bind(sk, &local_addr); + if (err != 0) + goto out; + + } + + /* If the provided message contains an address, use that. Otherwise + * fall back on the socket's remote handle (if it has been connected). + */ + if (msg->msg_name && + vsock_addr_cast(msg->msg_name, msg->msg_namelen, + &remote_addr) == 0) { + /* Ensure this address is of the right type and is a valid + * destination. + */ + + if (remote_addr->svm_cid == VMADDR_CID_ANY) + remote_addr->svm_cid = transport->get_local_cid(); + + if (!vsock_addr_bound(remote_addr)) { + err = -EINVAL; + goto out; + } + } else if (sock->state == SS_CONNECTED) { + remote_addr = &vsk->remote_addr; + + if (remote_addr->svm_cid == VMADDR_CID_ANY) + remote_addr->svm_cid = transport->get_local_cid(); + + /* XXX Should connect() or this function ensure remote_addr is + * bound? + */ + if (!vsock_addr_bound(&vsk->remote_addr)) { + err = -EINVAL; + goto out; + } + } else { + err = -EINVAL; + goto out; + } + + if (!transport->dgram_allow(remote_addr->svm_cid, + remote_addr->svm_port)) { + err = -EINVAL; + goto out; + } + + err = transport->dgram_enqueue(vsk, remote_addr, msg->msg_iov, len); + +out: + release_sock(sk); + return err; +} + +static int vsock_dgram_connect(struct socket *sock, + struct sockaddr *addr, int addr_len, int flags) +{ + int err; + struct sock *sk; + struct vsock_sock *vsk; + struct sockaddr_vm *remote_addr; + + sk = sock->sk; + vsk = vsock_sk(sk); + + err = vsock_addr_cast(addr, addr_len, &remote_addr); + if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { + lock_sock(sk); + vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, + VMADDR_PORT_ANY); + sock->state = SS_UNCONNECTED; + release_sock(sk); + return 0; + } else if (err != 0) + return -EINVAL; + + lock_sock(sk); + + if (!vsock_addr_bound(&vsk->local_addr)) { + struct sockaddr_vm local_addr; + + vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); + err = __vsock_bind(sk, &local_addr); + if (err != 0) + goto out; + + } + + if (!transport->dgram_allow(remote_addr->svm_cid, + remote_addr->svm_port)) { + err = -EINVAL; + goto out; + } + + memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); + sock->state = SS_CONNECTED; + +out: + release_sock(sk); + return err; +} + +static int vsock_dgram_recvmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + return transport->dgram_dequeue(kiocb, vsock_sk(sock->sk), msg, len, + flags); +} + +static const struct proto_ops vsock_dgram_ops = { + .family = PF_VSOCK, + .owner = THIS_MODULE, + .release = vsock_release, + .bind = vsock_bind, + .connect = vsock_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = vsock_getname, + .poll = vsock_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = vsock_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = vsock_dgram_sendmsg, + .recvmsg = vsock_dgram_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static void vsock_connect_timeout(struct work_struct *work) +{ + struct sock *sk; + struct vsock_sock *vsk; + + vsk = container_of(work, struct vsock_sock, dwork.work); + sk = sk_vsock(vsk); + + lock_sock(sk); + if (sk->sk_state == SS_CONNECTING && + (sk->sk_shutdown != 
SHUTDOWN_MASK)) { + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = ETIMEDOUT; + sk->sk_error_report(sk); + } + release_sock(sk); + + sock_put(sk); +} + +static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) +{ + int err; + struct sock *sk; + struct vsock_sock *vsk; + struct sockaddr_vm *remote_addr; + long timeout; + DEFINE_WAIT(wait); + + err = 0; + sk = sock->sk; + vsk = vsock_sk(sk); + + lock_sock(sk); + + /* XXX AF_UNSPEC should make us disconnect like AF_INET. */ + switch (sock->state) { + case SS_CONNECTED: + err = -EISCONN; + goto out; + case SS_DISCONNECTING: + err = -EINVAL; + goto out; + case SS_CONNECTING: + /* This continues on so we can move sock into the SS_CONNECTED + * state once the connection has completed (at which point err + * will be set to zero also). Otherwise, we will either wait + * for the connection or return -EALREADY should this be a + * non-blocking call. + */ + err = -EALREADY; + break; + default: + if ((sk->sk_state == SS_LISTEN) || + vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { + err = -EINVAL; + goto out; + } + + /* The hypervisor and well-known contexts do not have socket + * endpoints. + */ + if (!transport->stream_allow(remote_addr->svm_cid, + remote_addr->svm_port)) { + err = -ENETUNREACH; + goto out; + } + + /* Set the remote address that we are connecting to. */ + memcpy(&vsk->remote_addr, remote_addr, + sizeof(vsk->remote_addr)); + + /* Autobind this socket to the local address if necessary. */ + if (!vsock_addr_bound(&vsk->local_addr)) { + struct sockaddr_vm local_addr; + + vsock_addr_init(&local_addr, VMADDR_CID_ANY, + VMADDR_PORT_ANY); + err = __vsock_bind(sk, &local_addr); + if (err != 0) + goto out; + + } + + sk->sk_state = SS_CONNECTING; + + err = transport->connect(vsk); + if (err < 0) + goto out; + + /* Mark sock as connecting and set the error code to in + * progress in case this is a non-blocking connect. + */ + sock->state = SS_CONNECTING; + err = -EINPROGRESS; + } + + /* The receive path will handle all communication until we are able to + * enter the connected state. Here we wait for the connection to be + * completed or a notification of an error. + */ + timeout = vsk->connect_timeout; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { + if (flags & O_NONBLOCK) { + /* If we're not going to block, we schedule a timeout + * function to generate a timeout on the connection + * attempt, in case the peer doesn't respond in a + * timely manner. We hold on to the socket until the + * timeout fires. + */ + sock_hold(sk); + INIT_DELAYED_WORK(&vsk->dwork, + vsock_connect_timeout); + schedule_delayed_work(&vsk->dwork, timeout); + + /* Skip ahead to preserve error code set above. 
*/ + goto out_wait; + } + + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + goto out_wait_error; + } else if (timeout == 0) { + err = -ETIMEDOUT; + goto out_wait_error; + } + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } + + if (sk->sk_err) { + err = -sk->sk_err; + goto out_wait_error; + } else + err = 0; + +out_wait: + finish_wait(sk_sleep(sk), &wait); +out: + release_sock(sk); + return err; + +out_wait_error: + sk->sk_state = SS_UNCONNECTED; + sock->state = SS_UNCONNECTED; + goto out_wait; +} + +static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *listener; + int err; + struct sock *connected; + struct vsock_sock *vconnected; + long timeout; + DEFINE_WAIT(wait); + + err = 0; + listener = sock->sk; + + lock_sock(listener); + + if (sock->type != SOCK_STREAM) { + err = -EOPNOTSUPP; + goto out; + } + + if (listener->sk_state != SS_LISTEN) { + err = -EINVAL; + goto out; + } + + /* Wait for children sockets to appear; these are the new sockets + * created upon connection establishment. + */ + timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); + prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); + + while ((connected = vsock_dequeue_accept(listener)) == NULL && + listener->sk_err == 0) { + release_sock(listener); + timeout = schedule_timeout(timeout); + lock_sock(listener); + + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + goto out_wait; + } else if (timeout == 0) { + err = -EAGAIN; + goto out_wait; + } + + prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); + } + + if (listener->sk_err) + err = -listener->sk_err; + + if (connected) { + listener->sk_ack_backlog--; + + lock_sock(connected); + vconnected = vsock_sk(connected); + + /* If the listener socket has received an error, then we should + * reject this socket and return. Note that we simply mark the + * socket rejected, drop our reference, and let the cleanup + * function handle the cleanup; the fact that we found it in + * the listener's accept queue guarantees that the cleanup + * function hasn't run yet. 
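+		 *
+		 * Editor's illustration, not part of the applied patch: a
+		 * minimal user-space counterpart of this accept path (port
+		 * 1234 is an arbitrary unprivileged example):
+		 *
+		 *	int srv = socket(AF_VSOCK, SOCK_STREAM, 0);
+		 *	struct sockaddr_vm a = {
+		 *		.svm_family = AF_VSOCK,
+		 *		.svm_cid    = VMADDR_CID_ANY,
+		 *		.svm_port   = 1234,
+		 *	};
+		 *	bind(srv, (struct sockaddr *)&a, sizeof(a));
+		 *	listen(srv, 8);
+		 *	int client = accept(srv, NULL, NULL);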
+ */ + if (err) { + vconnected->rejected = true; + release_sock(connected); + sock_put(connected); + goto out_wait; + } + + newsock->state = SS_CONNECTED; + sock_graft(connected, newsock); + release_sock(connected); + sock_put(connected); + } + +out_wait: + finish_wait(sk_sleep(listener), &wait); +out: + release_sock(listener); + return err; +} + +static int vsock_listen(struct socket *sock, int backlog) +{ + int err; + struct sock *sk; + struct vsock_sock *vsk; + + sk = sock->sk; + + lock_sock(sk); + + if (sock->type != SOCK_STREAM) { + err = -EOPNOTSUPP; + goto out; + } + + if (sock->state != SS_UNCONNECTED) { + err = -EINVAL; + goto out; + } + + vsk = vsock_sk(sk); + + if (!vsock_addr_bound(&vsk->local_addr)) { + err = -EINVAL; + goto out; + } + + sk->sk_max_ack_backlog = backlog; + sk->sk_state = SS_LISTEN; + + err = 0; + +out: + release_sock(sk); + return err; +} + +static int vsock_stream_setsockopt(struct socket *sock, + int level, + int optname, + char __user *optval, + unsigned int optlen) +{ + int err; + struct sock *sk; + struct vsock_sock *vsk; + u64 val; + + if (level != AF_VSOCK) + return -ENOPROTOOPT; + +#define COPY_IN(_v) \ + do { \ + if (optlen < sizeof(_v)) { \ + err = -EINVAL; \ + goto exit; \ + } \ + if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \ + err = -EFAULT; \ + goto exit; \ + } \ + } while (0) + + err = 0; + sk = sock->sk; + vsk = vsock_sk(sk); + + lock_sock(sk); + + switch (optname) { + case SO_VM_SOCKETS_BUFFER_SIZE: + COPY_IN(val); + transport->set_buffer_size(vsk, val); + break; + + case SO_VM_SOCKETS_BUFFER_MAX_SIZE: + COPY_IN(val); + transport->set_max_buffer_size(vsk, val); + break; + + case SO_VM_SOCKETS_BUFFER_MIN_SIZE: + COPY_IN(val); + transport->set_min_buffer_size(vsk, val); + break; + + case SO_VM_SOCKETS_CONNECT_TIMEOUT: { + struct timeval tv; + COPY_IN(tv); + if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && + tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { + vsk->connect_timeout = tv.tv_sec * HZ + + DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ)); + if (vsk->connect_timeout == 0) + vsk->connect_timeout = + VSOCK_DEFAULT_CONNECT_TIMEOUT; + + } else { + err = -ERANGE; + } + break; + } + + default: + err = -ENOPROTOOPT; + break; + } + +#undef COPY_IN + +exit: + release_sock(sk); + return err; +} + +static int vsock_stream_getsockopt(struct socket *sock, + int level, int optname, + char __user *optval, + int __user *optlen) +{ + int err; + int len; + struct sock *sk; + struct vsock_sock *vsk; + u64 val; + + if (level != AF_VSOCK) + return -ENOPROTOOPT; + + err = get_user(len, optlen); + if (err != 0) + return err; + +#define COPY_OUT(_v) \ + do { \ + if (len < sizeof(_v)) \ + return -EINVAL; \ + \ + len = sizeof(_v); \ + if (copy_to_user(optval, &_v, len) != 0) \ + return -EFAULT; \ + \ + } while (0) + + err = 0; + sk = sock->sk; + vsk = vsock_sk(sk); + + switch (optname) { + case SO_VM_SOCKETS_BUFFER_SIZE: + val = transport->get_buffer_size(vsk); + COPY_OUT(val); + break; + + case SO_VM_SOCKETS_BUFFER_MAX_SIZE: + val = transport->get_max_buffer_size(vsk); + COPY_OUT(val); + break; + + case SO_VM_SOCKETS_BUFFER_MIN_SIZE: + val = transport->get_min_buffer_size(vsk); + COPY_OUT(val); + break; + + case SO_VM_SOCKETS_CONNECT_TIMEOUT: { + struct timeval tv; + tv.tv_sec = vsk->connect_timeout / HZ; + tv.tv_usec = + (vsk->connect_timeout - + tv.tv_sec * HZ) * (1000000 / HZ); + COPY_OUT(tv); + break; + } + default: + return -ENOPROTOOPT; + } + + err = put_user(len, optlen); + if (err != 0) + return -EFAULT; + +#undef COPY_OUT + + return 0; +} + +static int 
vsock_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk; + struct vsock_sock *vsk; + ssize_t total_written; + long timeout; + int err; + struct vsock_transport_send_notify_data send_data; + + DEFINE_WAIT(wait); + + sk = sock->sk; + vsk = vsock_sk(sk); + total_written = 0; + err = 0; + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + lock_sock(sk); + + /* Callers should not provide a destination with stream sockets. */ + if (msg->msg_namelen) { + err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP; + goto out; + } + + /* Send data only if both sides are not shutdown in the direction. */ + if (sk->sk_shutdown & SEND_SHUTDOWN || + vsk->peer_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; + goto out; + } + + if (sk->sk_state != SS_CONNECTED || + !vsock_addr_bound(&vsk->local_addr)) { + err = -ENOTCONN; + goto out; + } + + if (!vsock_addr_bound(&vsk->remote_addr)) { + err = -EDESTADDRREQ; + goto out; + } + + /* Wait for room in the produce queue to enqueue our user's data. */ + timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + err = transport->notify_send_init(vsk, &send_data); + if (err < 0) + goto out; + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + while (total_written < len) { + ssize_t written; + + while (vsock_stream_has_space(vsk) == 0 && + sk->sk_err == 0 && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + !(vsk->peer_shutdown & RCV_SHUTDOWN)) { + + /* Don't wait for non-blocking sockets. */ + if (timeout == 0) { + err = -EAGAIN; + goto out_wait; + } + + err = transport->notify_send_pre_block(vsk, &send_data); + if (err < 0) + goto out_wait; + + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + goto out_wait; + } else if (timeout == 0) { + err = -EAGAIN; + goto out_wait; + } + + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + } + + /* These checks occur both as part of and after the loop + * conditional since we need to check before and after + * sleeping. + */ + if (sk->sk_err) { + err = -sk->sk_err; + goto out_wait; + } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || + (vsk->peer_shutdown & RCV_SHUTDOWN)) { + err = -EPIPE; + goto out_wait; + } + + err = transport->notify_send_pre_enqueue(vsk, &send_data); + if (err < 0) + goto out_wait; + + /* Note that enqueue will only write as many bytes as are free + * in the produce queue, so we don't need to ensure len is + * smaller than the queue size. It is the caller's + * responsibility to check how many bytes we were able to send. 
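+		 *
+		 * Editor's illustration, not part of the applied patch: the
+		 * same rule holds one level up, so user space should loop on
+		 * short writes from send(), e.g. with O_NONBLOCK:
+		 *
+		 *	size_t off = 0;
+		 *	while (off < len) {
+		 *		ssize_t n = send(fd, buf + off, len - off, 0);
+		 *		if (n < 0)
+		 *			break;	(or retry on EINTR/EAGAIN)
+		 *		off += n;
+		 *	}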
+		 */
+
+		written = transport->stream_enqueue(
+				vsk, msg->msg_iov,
+				len - total_written);
+		if (written < 0) {
+			err = -ENOMEM;
+			goto out_wait;
+		}
+
+		total_written += written;
+
+		err = transport->notify_send_post_enqueue(
+				vsk, written, &send_data);
+		if (err < 0)
+			goto out_wait;
+
+	}
+
+out_wait:
+	if (total_written > 0)
+		err = total_written;
+	finish_wait(sk_sleep(sk), &wait);
+out:
+	release_sock(sk);
+	return err;
+}
+
+
+static int
+vsock_stream_recvmsg(struct kiocb *kiocb,
+		     struct socket *sock,
+		     struct msghdr *msg, size_t len, int flags)
+{
+	struct sock *sk;
+	struct vsock_sock *vsk;
+	int err;
+	size_t target;
+	ssize_t copied;
+	long timeout;
+	struct vsock_transport_recv_notify_data recv_data;
+
+	DEFINE_WAIT(wait);
+
+	sk = sock->sk;
+	vsk = vsock_sk(sk);
+	err = 0;
+
+	msg->msg_namelen = 0;
+
+	lock_sock(sk);
+
+	if (sk->sk_state != SS_CONNECTED) {
+		/* Recvmsg is supposed to return 0 if a peer performs an
+		 * orderly shutdown. Differentiate between that case and when a
+		 * peer has not connected or a local shutdown occurred with the
+		 * SOCK_DONE flag.
+		 */
+		if (sock_flag(sk, SOCK_DONE))
+			err = 0;
+		else
+			err = -ENOTCONN;
+
+		goto out;
+	}
+
+	if (flags & MSG_OOB) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* We don't check peer_shutdown flag here since peer may actually shut
+	 * down, but there can be data in the queue that a local socket can
+	 * receive.
+	 */
+	if (sk->sk_shutdown & RCV_SHUTDOWN) {
+		err = 0;
+		goto out;
+	}
+
+	/* It is valid on Linux to pass in a zero-length receive buffer. This
+	 * is not an error. We may as well bail out now.
+	 */
+	if (!len) {
+		err = 0;
+		goto out;
+	}
+
+	/* We must not copy less than target bytes into the user's buffer
+	 * before returning successfully, so we wait for the consume queue to
+	 * have that much data to consume before dequeueing. Note that this
+	 * makes it impossible to handle cases where target is greater than the
+	 * queue size.
+	 */
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+	if (target >= transport->stream_rcvhiwat(vsk)) {
+		err = -ENOMEM;
+		goto out;
+	}
+	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	copied = 0;
+
+	err = transport->notify_recv_init(vsk, target, &recv_data);
+	if (err < 0)
+		goto out;
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+	while (1) {
+		s64 ready = vsock_stream_has_data(vsk);
+
+		if (ready < 0) {
+			/* Invalid queue pair content. XXX This should be
+			 * changed to a connection reset in a later change.
+			 */
+
+			err = -ENOMEM;
+			goto out_wait;
+		} else if (ready > 0) {
+			ssize_t read;
+
+			err = transport->notify_recv_pre_dequeue(
+					vsk, target, &recv_data);
+			if (err < 0)
+				break;
+
+			read = transport->stream_dequeue(
+					vsk, msg->msg_iov,
+					len - copied, flags);
+			if (read < 0) {
+				err = -ENOMEM;
+				break;
+			}
+
+			copied += read;
+
+			err = transport->notify_recv_post_dequeue(
+					vsk, target, read,
+					!(flags & MSG_PEEK), &recv_data);
+			if (err < 0)
+				goto out_wait;
+
+			if (read >= target || flags & MSG_PEEK)
+				break;
+
+			target -= read;
+		} else {
+			if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN)
+			    || (vsk->peer_shutdown & SEND_SHUTDOWN)) {
+				break;
+			}
+			/* Don't wait for non-blocking sockets.
*/ + if (timeout == 0) { + err = -EAGAIN; + break; + } + + err = transport->notify_recv_pre_block( + vsk, target, &recv_data); + if (err < 0) + break; + + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + break; + } else if (timeout == 0) { + err = -EAGAIN; + break; + } + + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + } + } + + if (sk->sk_err) + err = -sk->sk_err; + else if (sk->sk_shutdown & RCV_SHUTDOWN) + err = 0; + + if (copied > 0) { + /* We only do these additional bookkeeping/notification steps + * if we actually copied something out of the queue pair + * instead of just peeking ahead. + */ + + if (!(flags & MSG_PEEK)) { + /* If the other side has shutdown for sending and there + * is nothing more to read, then modify the socket + * state. + */ + if (vsk->peer_shutdown & SEND_SHUTDOWN) { + if (vsock_stream_has_data(vsk) <= 0) { + sk->sk_state = SS_UNCONNECTED; + sock_set_flag(sk, SOCK_DONE); + sk->sk_state_change(sk); + } + } + } + err = copied; + } + +out_wait: + finish_wait(sk_sleep(sk), &wait); +out: + release_sock(sk); + return err; +} + +static const struct proto_ops vsock_stream_ops = { + .family = PF_VSOCK, + .owner = THIS_MODULE, + .release = vsock_release, + .bind = vsock_bind, + .connect = vsock_stream_connect, + .socketpair = sock_no_socketpair, + .accept = vsock_accept, + .getname = vsock_getname, + .poll = vsock_poll, + .ioctl = sock_no_ioctl, + .listen = vsock_listen, + .shutdown = vsock_shutdown, + .setsockopt = vsock_stream_setsockopt, + .getsockopt = vsock_stream_getsockopt, + .sendmsg = vsock_stream_sendmsg, + .recvmsg = vsock_stream_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static int vsock_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + if (!sock) + return -EINVAL; + + if (protocol && protocol != PF_VSOCK) + return -EPROTONOSUPPORT; + + switch (sock->type) { + case SOCK_DGRAM: + sock->ops = &vsock_dgram_ops; + break; + case SOCK_STREAM: + sock->ops = &vsock_stream_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + sock->state = SS_UNCONNECTED; + + return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 
0 : -ENOMEM; +} + +static const struct net_proto_family vsock_family_ops = { + .family = AF_VSOCK, + .create = vsock_create, + .owner = THIS_MODULE, +}; + +static long vsock_dev_do_ioctl(struct file *filp, + unsigned int cmd, void __user *ptr) +{ + u32 __user *p = ptr; + int retval = 0; + + switch (cmd) { + case IOCTL_VM_SOCKETS_GET_LOCAL_CID: + if (put_user(transport->get_local_cid(), p) != 0) + retval = -EFAULT; + break; + + default: + pr_err("Unknown ioctl %d\n", cmd); + retval = -EINVAL; + } + + return retval; +} + +static long vsock_dev_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); +} + +#ifdef CONFIG_COMPAT +static long vsock_dev_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); +} +#endif + +static const struct file_operations vsock_device_ops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vsock_dev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vsock_dev_compat_ioctl, +#endif + .open = nonseekable_open, +}; + +static struct miscdevice vsock_device = { + .name = "vsock", + .fops = &vsock_device_ops, +}; + +static int __vsock_core_init(void) +{ + int err; + + vsock_init_tables(); + + vsock_device.minor = MISC_DYNAMIC_MINOR; + err = misc_register(&vsock_device); + if (err) { + pr_err("Failed to register misc device\n"); + return -ENOENT; + } + + err = proto_register(&vsock_proto, 1); /* we want our slab */ + if (err) { + pr_err("Cannot register vsock protocol\n"); + goto err_misc_deregister; + } + + err = sock_register(&vsock_family_ops); + if (err) { + pr_err("could not register af_vsock (%d) address family: %d\n", + AF_VSOCK, err); + goto err_unregister_proto; + } + + return 0; + +err_unregister_proto: + proto_unregister(&vsock_proto); +err_misc_deregister: + misc_deregister(&vsock_device); + return err; +} + +int vsock_core_init(const struct vsock_transport *t) +{ + int retval = mutex_lock_interruptible(&vsock_register_mutex); + if (retval) + return retval; + + if (transport) { + retval = -EBUSY; + goto out; + } + + transport = t; + retval = __vsock_core_init(); + if (retval) + transport = NULL; + +out: + mutex_unlock(&vsock_register_mutex); + return retval; +} +EXPORT_SYMBOL_GPL(vsock_core_init); + +void vsock_core_exit(void) +{ + mutex_lock(&vsock_register_mutex); + + misc_deregister(&vsock_device); + sock_unregister(AF_VSOCK); + proto_unregister(&vsock_proto); + + /* We do not want the assignment below re-ordered. */ + mb(); + transport = NULL; + + mutex_unlock(&vsock_register_mutex); +} +EXPORT_SYMBOL_GPL(vsock_core_exit); + +MODULE_AUTHOR("VMware, Inc."); +MODULE_DESCRIPTION("VMware Virtual Socket Family"); +MODULE_VERSION("1.0.0.0-k"); +MODULE_LICENSE("GPL v2"); diff --git a/net/vmw_vsock/af_vsock.h b/net/vmw_vsock/af_vsock.h new file mode 100644 index 000000000000..7d64d3609ec9 --- /dev/null +++ b/net/vmw_vsock/af_vsock.h @@ -0,0 +1,175 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
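+ *
+ * Editor's aside on the misc-device interface defined in af_vsock.c
+ * above, not part of the applied patch: user space can query the local
+ * context ID without creating a socket, assuming the uapi
+ * <linux/vm_sockets.h> definitions:
+ *
+ *	int fd = open("/dev/vsock", O_RDONLY);
+ *	unsigned int cid;
+ *	if (fd >= 0 && ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid) == 0)
+ *		printf("local CID %u\n", cid);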
+ */
+
+#ifndef __AF_VSOCK_H__
+#define __AF_VSOCK_H__
+
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/vm_sockets.h>
+
+#include "vsock_addr.h"
+
+#define LAST_RESERVED_PORT 1023
+
+#define vsock_sk(__sk)    ((struct vsock_sock *)__sk)
+#define sk_vsock(__vsk)   (&(__vsk)->sk)
+
+struct vsock_sock {
+	/* sk must be the first member. */
+	struct sock sk;
+	struct sockaddr_vm local_addr;
+	struct sockaddr_vm remote_addr;
+	/* Links for the global tables of bound and connected sockets. */
+	struct list_head bound_table;
+	struct list_head connected_table;
+	/* Accessed without the socket lock held. This means it can never be
+	 * modified outside of socket create or destruct.
+	 */
+	bool trusted;
+	bool cached_peer_allow_dgram;	/* Dgram communication allowed to
+					 * cached peer?
+					 */
+	u32 cached_peer;  /* Context ID of last dgram destination check. */
+	const struct cred *owner;
+	/* Rest are SOCK_STREAM only. */
+	long connect_timeout;
+	/* Listening socket that this came from. */
+	struct sock *listener;
+	/* Used for pending list and accept queue during connection handshake.
+	 * The listening socket is the head for both lists. Sockets created
+	 * for connection requests are placed in the pending list until they
+	 * are connected, at which point they are put in the accept queue list
+	 * so they can be accepted in accept(). If accept() cannot accept the
+	 * connection, it is marked as rejected so the cleanup function knows
+	 * to clean up the socket.
+	 */
+	struct list_head pending_links;
+	struct list_head accept_queue;
+	bool rejected;
+	struct delayed_work dwork;
+	u32 peer_shutdown;
+	bool sent_request;
+	bool ignore_connecting_rst;
+
+	/* Private to transport. */
+	void *trans;
+};
+
+s64 vsock_stream_has_data(struct vsock_sock *vsk);
+s64 vsock_stream_has_space(struct vsock_sock *vsk);
+void vsock_pending_work(struct work_struct *work);
+struct sock *__vsock_create(struct net *net,
+			    struct socket *sock,
+			    struct sock *parent,
+			    gfp_t priority, unsigned short type);
+
+/**** TRANSPORT ****/
+
+struct vsock_transport_recv_notify_data {
+	u64 data1; /* Transport-defined. */
+	u64 data2; /* Transport-defined. */
+	bool notify_on_block;
+};
+
+struct vsock_transport_send_notify_data {
+	u64 data1; /* Transport-defined. */
+	u64 data2; /* Transport-defined. */
+};
+
+struct vsock_transport {
+	/* Initialize/tear-down socket. */
+	int (*init)(struct vsock_sock *, struct vsock_sock *);
+	void (*destruct)(struct vsock_sock *);
+	void (*release)(struct vsock_sock *);
+
+	/* Connections. */
+	int (*connect)(struct vsock_sock *);
+
+	/* DGRAM. */
+	int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
+	int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk,
+			     struct msghdr *msg, size_t len, int flags);
+	int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
+			     struct iovec *, size_t len);
+	bool (*dgram_allow)(u32 cid, u32 port);
+
+	/* STREAM. */
+	/* TODO: stream_bind() */
+	ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *,
+				  size_t len, int flags);
+	ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *,
+				  size_t len);
+	s64 (*stream_has_data)(struct vsock_sock *);
+	s64 (*stream_has_space)(struct vsock_sock *);
+	u64 (*stream_rcvhiwat)(struct vsock_sock *);
+	bool (*stream_is_active)(struct vsock_sock *);
+	bool (*stream_allow)(u32 cid, u32 port);
+
+	/* Notification.
*/ + int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); + int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); + int (*notify_recv_init)(struct vsock_sock *, size_t, + struct vsock_transport_recv_notify_data *); + int (*notify_recv_pre_block)(struct vsock_sock *, size_t, + struct vsock_transport_recv_notify_data *); + int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, + struct vsock_transport_recv_notify_data *); + int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, + ssize_t, bool, struct vsock_transport_recv_notify_data *); + int (*notify_send_init)(struct vsock_sock *, + struct vsock_transport_send_notify_data *); + int (*notify_send_pre_block)(struct vsock_sock *, + struct vsock_transport_send_notify_data *); + int (*notify_send_pre_enqueue)(struct vsock_sock *, + struct vsock_transport_send_notify_data *); + int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, + struct vsock_transport_send_notify_data *); + + /* Shutdown. */ + int (*shutdown)(struct vsock_sock *, int); + + /* Buffer sizes. */ + void (*set_buffer_size)(struct vsock_sock *, u64); + void (*set_min_buffer_size)(struct vsock_sock *, u64); + void (*set_max_buffer_size)(struct vsock_sock *, u64); + u64 (*get_buffer_size)(struct vsock_sock *); + u64 (*get_min_buffer_size)(struct vsock_sock *); + u64 (*get_max_buffer_size)(struct vsock_sock *); + + /* Addressing. */ + u32 (*get_local_cid)(void); +}; + +/**** CORE ****/ + +int vsock_core_init(const struct vsock_transport *t); +void vsock_core_exit(void); + +/**** UTILS ****/ + +void vsock_release_pending(struct sock *pending); +void vsock_add_pending(struct sock *listener, struct sock *pending); +void vsock_remove_pending(struct sock *listener, struct sock *pending); +void vsock_enqueue_accept(struct sock *listener, struct sock *connected); +void vsock_insert_connected(struct vsock_sock *vsk); +void vsock_remove_bound(struct vsock_sock *vsk); +void vsock_remove_connected(struct vsock_sock *vsk); +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); +struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, + struct sockaddr_vm *dst); +void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); + +#endif /* __AF_VSOCK_H__ */ diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c new file mode 100644 index 000000000000..daff75200e25 --- /dev/null +++ b/net/vmw_vsock/vmci_transport.c @@ -0,0 +1,2175 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
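+ *
+ * Editor's sketch, not part of the applied patch: a transport such as
+ * this file plugs into the core by filling in the struct vsock_transport
+ * operations declared in af_vsock.h above and handing it to
+ * vsock_core_init(); the names below are hypothetical:
+ *
+ *	static const struct vsock_transport my_transport = {
+ *		.init          = my_socket_init,
+ *		.destruct      = my_socket_destruct,
+ *		.release       = my_socket_release,
+ *		.connect       = my_connect,
+ *		.shutdown      = my_shutdown,
+ *		.get_local_cid = my_get_local_cid,
+ *		... plus the dgram, stream, notify and buffer-size ops ...
+ *	};
+ *
+ *	static int __init my_init(void)
+ *	{
+ *		return vsock_core_init(&my_transport);
+ *	}
+ *
+ *	static void __exit my_exit(void)
+ *	{
+ *		vsock_core_exit();
+ *	}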
+ */ + +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/cred.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/net.h> +#include <linux/poll.h> +#include <linux/skbuff.h> +#include <linux/smp.h> +#include <linux/socket.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <net/sock.h> + +#include "af_vsock.h" +#include "vmci_transport_notify.h" + +static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg); +static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg); +static void vmci_transport_peer_attach_cb(u32 sub_id, + const struct vmci_event_data *ed, + void *client_data); +static void vmci_transport_peer_detach_cb(u32 sub_id, + const struct vmci_event_data *ed, + void *client_data); +static void vmci_transport_recv_pkt_work(struct work_struct *work); +static int vmci_transport_recv_listen(struct sock *sk, + struct vmci_transport_packet *pkt); +static int vmci_transport_recv_connecting_server( + struct sock *sk, + struct sock *pending, + struct vmci_transport_packet *pkt); +static int vmci_transport_recv_connecting_client( + struct sock *sk, + struct vmci_transport_packet *pkt); +static int vmci_transport_recv_connecting_client_negotiate( + struct sock *sk, + struct vmci_transport_packet *pkt); +static int vmci_transport_recv_connecting_client_invalid( + struct sock *sk, + struct vmci_transport_packet *pkt); +static int vmci_transport_recv_connected(struct sock *sk, + struct vmci_transport_packet *pkt); +static bool vmci_transport_old_proto_override(bool *old_pkt_proto); +static u16 vmci_transport_new_proto_supported_versions(void); +static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto, + bool old_pkt_proto); + +struct vmci_transport_recv_pkt_info { + struct work_struct work; + struct sock *sk; + struct vmci_transport_packet pkt; +}; + +static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID, + VMCI_INVALID_ID }; +static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; + +static int PROTOCOL_OVERRIDE = -1; + +#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128 +#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144 +#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144 + +/* The default peer timeout indicates how long we will wait for a peer response + * to a control message. + */ +#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) + +#define SS_LISTEN 255 + +/* Helper function to convert from a VMCI error code to a VSock error code. */ + +static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) +{ + int err; + + switch (vmci_error) { + case VMCI_ERROR_NO_MEM: + err = ENOMEM; + break; + case VMCI_ERROR_DUPLICATE_ENTRY: + case VMCI_ERROR_ALREADY_EXISTS: + err = EADDRINUSE; + break; + case VMCI_ERROR_NO_ACCESS: + err = EPERM; + break; + case VMCI_ERROR_NO_RESOURCES: + err = ENOBUFS; + break; + case VMCI_ERROR_INVALID_RESOURCE: + err = EHOSTUNREACH; + break; + case VMCI_ERROR_INVALID_ARGS: + default: + err = EINVAL; + } + + return err > 0 ? 
-err : err; +} + +static u32 vmci_transport_peer_rid(u32 peer_cid) +{ + if (VMADDR_CID_HYPERVISOR == peer_cid) + return VMCI_TRANSPORT_HYPERVISOR_PACKET_RID; + + return VMCI_TRANSPORT_PACKET_RID; +} + +static inline void +vmci_transport_packet_init(struct vmci_transport_packet *pkt, + struct sockaddr_vm *src, + struct sockaddr_vm *dst, + u8 type, + u64 size, + u64 mode, + struct vmci_transport_waiting_info *wait, + u16 proto, + struct vmci_handle handle) +{ + /* We register the stream control handler as an any cid handle so we + * must always send from a source address of VMADDR_CID_ANY + */ + pkt->dg.src = vmci_make_handle(VMADDR_CID_ANY, + VMCI_TRANSPORT_PACKET_RID); + pkt->dg.dst = vmci_make_handle(dst->svm_cid, + vmci_transport_peer_rid(dst->svm_cid)); + pkt->dg.payload_size = sizeof(*pkt) - sizeof(pkt->dg); + pkt->version = VMCI_TRANSPORT_PACKET_VERSION; + pkt->type = type; + pkt->src_port = src->svm_port; + pkt->dst_port = dst->svm_port; + memset(&pkt->proto, 0, sizeof(pkt->proto)); + memset(&pkt->_reserved2, 0, sizeof(pkt->_reserved2)); + + switch (pkt->type) { + case VMCI_TRANSPORT_PACKET_TYPE_INVALID: + pkt->u.size = 0; + break; + + case VMCI_TRANSPORT_PACKET_TYPE_REQUEST: + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE: + pkt->u.size = size; + break; + + case VMCI_TRANSPORT_PACKET_TYPE_OFFER: + case VMCI_TRANSPORT_PACKET_TYPE_ATTACH: + pkt->u.handle = handle; + break; + + case VMCI_TRANSPORT_PACKET_TYPE_WROTE: + case VMCI_TRANSPORT_PACKET_TYPE_READ: + case VMCI_TRANSPORT_PACKET_TYPE_RST: + pkt->u.size = 0; + break; + + case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN: + pkt->u.mode = mode; + break; + + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ: + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE: + memcpy(&pkt->u.wait, wait, sizeof(pkt->u.wait)); + break; + + case VMCI_TRANSPORT_PACKET_TYPE_REQUEST2: + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2: + pkt->u.size = size; + pkt->proto = proto; + break; + } +} + +static inline void +vmci_transport_packet_get_addresses(struct vmci_transport_packet *pkt, + struct sockaddr_vm *local, + struct sockaddr_vm *remote) +{ + vsock_addr_init(local, pkt->dg.dst.context, pkt->dst_port); + vsock_addr_init(remote, pkt->dg.src.context, pkt->src_port); +} + +static int +__vmci_transport_send_control_pkt(struct vmci_transport_packet *pkt, + struct sockaddr_vm *src, + struct sockaddr_vm *dst, + enum vmci_transport_packet_type type, + u64 size, + u64 mode, + struct vmci_transport_waiting_info *wait, + u16 proto, + struct vmci_handle handle, + bool convert_error) +{ + int err; + + vmci_transport_packet_init(pkt, src, dst, type, size, mode, wait, + proto, handle); + err = vmci_datagram_send(&pkt->dg); + if (convert_error && (err < 0)) + return vmci_transport_error_to_vsock_error(err); + + return err; +} + +static int +vmci_transport_reply_control_pkt_fast(struct vmci_transport_packet *pkt, + enum vmci_transport_packet_type type, + u64 size, + u64 mode, + struct vmci_transport_waiting_info *wait, + struct vmci_handle handle) +{ + struct vmci_transport_packet reply; + struct sockaddr_vm src, dst; + + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) { + return 0; + } else { + vmci_transport_packet_get_addresses(pkt, &src, &dst); + return __vmci_transport_send_control_pkt(&reply, &src, &dst, + type, + size, mode, wait, + VSOCK_PROTO_INVALID, + handle, true); + } +} + +static int +vmci_transport_send_control_pkt_bh(struct sockaddr_vm *src, + struct sockaddr_vm *dst, + enum vmci_transport_packet_type type, + u64 size, + u64 mode, + struct vmci_transport_waiting_info *wait, + 
struct vmci_handle handle) +{ + /* Note that it is safe to use a single packet across all CPUs since + * two tasklets of the same type are guaranteed to not ever run + * simultaneously. If that ever changes, or VMCI stops using tasklets, + * we can use per-cpu packets. + */ + static struct vmci_transport_packet pkt; + + return __vmci_transport_send_control_pkt(&pkt, src, dst, type, + size, mode, wait, + VSOCK_PROTO_INVALID, handle, + false); +} + +static int +vmci_transport_send_control_pkt(struct sock *sk, + enum vmci_transport_packet_type type, + u64 size, + u64 mode, + struct vmci_transport_waiting_info *wait, + u16 proto, + struct vmci_handle handle) +{ + struct vmci_transport_packet *pkt; + struct vsock_sock *vsk; + int err; + + vsk = vsock_sk(sk); + + if (!vsock_addr_bound(&vsk->local_addr)) + return -EINVAL; + + if (!vsock_addr_bound(&vsk->remote_addr)) + return -EINVAL; + + pkt = kmalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return -ENOMEM; + + err = __vmci_transport_send_control_pkt(pkt, &vsk->local_addr, + &vsk->remote_addr, type, size, + mode, wait, proto, handle, + true); + kfree(pkt); + + return err; +} + +static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst, + struct sockaddr_vm *src, + struct vmci_transport_packet *pkt) +{ + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) + return 0; + return vmci_transport_send_control_pkt_bh( + dst, src, + VMCI_TRANSPORT_PACKET_TYPE_RST, 0, + 0, NULL, VMCI_INVALID_HANDLE); +} + +static int vmci_transport_send_reset(struct sock *sk, + struct vmci_transport_packet *pkt) +{ + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) + return 0; + return vmci_transport_send_control_pkt(sk, + VMCI_TRANSPORT_PACKET_TYPE_RST, + 0, 0, NULL, VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +static int vmci_transport_send_negotiate(struct sock *sk, size_t size) +{ + return vmci_transport_send_control_pkt( + sk, + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE, + size, 0, NULL, + VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +static int vmci_transport_send_negotiate2(struct sock *sk, size_t size, + u16 version) +{ + return vmci_transport_send_control_pkt( + sk, + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2, + size, 0, NULL, version, + VMCI_INVALID_HANDLE); +} + +static int vmci_transport_send_qp_offer(struct sock *sk, + struct vmci_handle handle) +{ + return vmci_transport_send_control_pkt( + sk, VMCI_TRANSPORT_PACKET_TYPE_OFFER, 0, + 0, NULL, + VSOCK_PROTO_INVALID, handle); +} + +static int vmci_transport_send_attach(struct sock *sk, + struct vmci_handle handle) +{ + return vmci_transport_send_control_pkt( + sk, VMCI_TRANSPORT_PACKET_TYPE_ATTACH, + 0, 0, NULL, VSOCK_PROTO_INVALID, + handle); +} + +static int vmci_transport_reply_reset(struct vmci_transport_packet *pkt) +{ + return vmci_transport_reply_control_pkt_fast( + pkt, + VMCI_TRANSPORT_PACKET_TYPE_RST, + 0, 0, NULL, + VMCI_INVALID_HANDLE); +} + +static int vmci_transport_send_invalid_bh(struct sockaddr_vm *dst, + struct sockaddr_vm *src) +{ + return vmci_transport_send_control_pkt_bh( + dst, src, + VMCI_TRANSPORT_PACKET_TYPE_INVALID, + 0, 0, NULL, VMCI_INVALID_HANDLE); +} + +int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst, + struct sockaddr_vm *src) +{ + return vmci_transport_send_control_pkt_bh( + dst, src, + VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0, + 0, NULL, VMCI_INVALID_HANDLE); +} + +int vmci_transport_send_read_bh(struct sockaddr_vm *dst, + struct sockaddr_vm *src) +{ + return vmci_transport_send_control_pkt_bh( + dst, src, + VMCI_TRANSPORT_PACKET_TYPE_READ, 0, + 0, NULL, 
VMCI_INVALID_HANDLE); +} + +int vmci_transport_send_wrote(struct sock *sk) +{ + return vmci_transport_send_control_pkt( + sk, VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0, + 0, NULL, VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +int vmci_transport_send_read(struct sock *sk) +{ + return vmci_transport_send_control_pkt( + sk, VMCI_TRANSPORT_PACKET_TYPE_READ, 0, + 0, NULL, VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +int vmci_transport_send_waiting_write(struct sock *sk, + struct vmci_transport_waiting_info *wait) +{ + return vmci_transport_send_control_pkt( + sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE, + 0, 0, wait, VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +int vmci_transport_send_waiting_read(struct sock *sk, + struct vmci_transport_waiting_info *wait) +{ + return vmci_transport_send_control_pkt( + sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ, + 0, 0, wait, VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +static int vmci_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + return vmci_transport_send_control_pkt( + &vsk->sk, + VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN, + 0, mode, NULL, + VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +static int vmci_transport_send_conn_request(struct sock *sk, size_t size) +{ + return vmci_transport_send_control_pkt(sk, + VMCI_TRANSPORT_PACKET_TYPE_REQUEST, + size, 0, NULL, + VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); +} + +static int vmci_transport_send_conn_request2(struct sock *sk, size_t size, + u16 version) +{ + return vmci_transport_send_control_pkt( + sk, VMCI_TRANSPORT_PACKET_TYPE_REQUEST2, + size, 0, NULL, version, + VMCI_INVALID_HANDLE); +} + +static struct sock *vmci_transport_get_pending( + struct sock *listener, + struct vmci_transport_packet *pkt) +{ + struct vsock_sock *vlistener; + struct vsock_sock *vpending; + struct sock *pending; + struct sockaddr_vm src; + + vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port); + + vlistener = vsock_sk(listener); + + list_for_each_entry(vpending, &vlistener->pending_links, + pending_links) { + if (vsock_addr_equals_addr(&src, &vpending->remote_addr) && + pkt->dst_port == vpending->local_addr.svm_port) { + pending = sk_vsock(vpending); + sock_hold(pending); + goto found; + } + } + + pending = NULL; +found: + return pending; + +} + +static void vmci_transport_release_pending(struct sock *pending) +{ + sock_put(pending); +} + +/* We allow two kinds of sockets to communicate with a restricted VM: 1) + * trusted sockets 2) sockets from applications running as the same user as the + * VM (this is only true for the host side and only when using hosted products) + */ + +static bool vmci_transport_is_trusted(struct vsock_sock *vsock, u32 peer_cid) +{ + return vsock->trusted || + vmci_is_context_owner(peer_cid, vsock->owner->uid); +} + +/* We allow sending datagrams to and receiving datagrams from a restricted VM + * only if it is trusted as described in vmci_transport_is_trusted. 
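+ *
+ * Editor's summary of the checks below, not part of the applied patch:
+ *
+ *	peer is VMADDR_CID_HYPERVISOR            -> allow
+ *	socket is trusted (CAP_NET_ADMIN owner)  -> allow
+ *	owner's uid also owns the peer context   -> allow
+ *	peer context has the RESTRICTED flag     -> deny
+ *	otherwise                                -> allow
+ *
+ * The verdict is cached per socket for the last peer CID checked
+ * (cached_peer / cached_peer_allow_dgram).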
+ */ + +static bool vmci_transport_allow_dgram(struct vsock_sock *vsock, u32 peer_cid) +{ + if (VMADDR_CID_HYPERVISOR == peer_cid) + return true; + + if (vsock->cached_peer != peer_cid) { + vsock->cached_peer = peer_cid; + if (!vmci_transport_is_trusted(vsock, peer_cid) && + (vmci_context_get_priv_flags(peer_cid) & + VMCI_PRIVILEGE_FLAG_RESTRICTED)) { + vsock->cached_peer_allow_dgram = false; + } else { + vsock->cached_peer_allow_dgram = true; + } + } + + return vsock->cached_peer_allow_dgram; +} + +static int +vmci_transport_queue_pair_alloc(struct vmci_qp **qpair, + struct vmci_handle *handle, + u64 produce_size, + u64 consume_size, + u32 peer, u32 flags, bool trusted) +{ + int err = 0; + + if (trusted) { + /* Try to allocate our queue pair as trusted. This will only + * work if vsock is running in the host. + */ + + err = vmci_qpair_alloc(qpair, handle, produce_size, + consume_size, + peer, flags, + VMCI_PRIVILEGE_FLAG_TRUSTED); + if (err != VMCI_ERROR_NO_ACCESS) + goto out; + + } + + err = vmci_qpair_alloc(qpair, handle, produce_size, consume_size, + peer, flags, VMCI_NO_PRIVILEGE_FLAGS); +out: + if (err < 0) { + pr_err("Could not attach to queue pair with %d\n", + err); + err = vmci_transport_error_to_vsock_error(err); + } + + return err; +} + +static int +vmci_transport_datagram_create_hnd(u32 resource_id, + u32 flags, + vmci_datagram_recv_cb recv_cb, + void *client_data, + struct vmci_handle *out_handle) +{ + int err = 0; + + /* Try to allocate our datagram handler as trusted. This will only work + * if vsock is running in the host. + */ + + err = vmci_datagram_create_handle_priv(resource_id, flags, + VMCI_PRIVILEGE_FLAG_TRUSTED, + recv_cb, + client_data, out_handle); + + if (err == VMCI_ERROR_NO_ACCESS) + err = vmci_datagram_create_handle(resource_id, flags, + recv_cb, client_data, + out_handle); + + return err; +} + +/* This is invoked as part of a tasklet that's scheduled when the VMCI + * interrupt fires. This is run in bottom-half context and if it ever needs to + * sleep it should defer that work to a work queue. + */ + +static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg) +{ + struct sock *sk; + size_t size; + struct sk_buff *skb; + struct vsock_sock *vsk; + + sk = (struct sock *)data; + + /* This handler is privileged when this module is running on the host. + * We will get datagrams from all endpoints (even VMs that are in a + * restricted context). If we get one from a restricted context then + * the destination socket must be trusted. + * + * NOTE: We access the socket struct without holding the lock here. + * This is ok because the field we are interested in is never modified + * outside of the create and destruct socket functions. + */ + vsk = vsock_sk(sk); + if (!vmci_transport_allow_dgram(vsk, dg->src.context)) + return VMCI_ERROR_NO_ACCESS; + + size = VMCI_DG_SIZE(dg); + + /* Attach the packet to the socket's receive queue as an sk_buff. */ + skb = alloc_skb(size, GFP_ATOMIC); + if (skb) { + /* sk_receive_skb() will do a sock_put(), so hold here. 
*/ + sock_hold(sk); + skb_put(skb, size); + memcpy(skb->data, dg, size); + sk_receive_skb(sk, skb, 0); + } + + return VMCI_SUCCESS; +} + +static bool vmci_transport_stream_allow(u32 cid, u32 port) +{ + static const u32 non_socket_contexts[] = { + VMADDR_CID_RESERVED, + }; + int i; + + BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts)); + + for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) { + if (cid == non_socket_contexts[i]) + return false; + } + + return true; +} + +/* This is invoked as part of a tasklet that's scheduled when the VMCI + * interrupt fires. This is run in bottom-half context but it defers most of + * its work to the packet handling work queue. + */ + +static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) +{ + struct sock *sk; + struct sockaddr_vm dst; + struct sockaddr_vm src; + struct vmci_transport_packet *pkt; + struct vsock_sock *vsk; + bool bh_process_pkt; + int err; + + sk = NULL; + err = VMCI_SUCCESS; + bh_process_pkt = false; + + /* Ignore incoming packets from contexts without sockets, or resources + * that aren't vsock implementations. + */ + + if (!vmci_transport_stream_allow(dg->src.context, -1) + || vmci_transport_peer_rid(dg->src.context) != dg->src.resource) + return VMCI_ERROR_NO_ACCESS; + + if (VMCI_DG_SIZE(dg) < sizeof(*pkt)) + /* Drop datagrams that do not contain full VSock packets. */ + return VMCI_ERROR_INVALID_ARGS; + + pkt = (struct vmci_transport_packet *)dg; + + /* Find the socket that should handle this packet. First we look for a + * connected socket and if there is none we look for a socket bound to + * the destination address. + */ + vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port); + vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port); + + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + /* We could not find a socket for this specified + * address. If this packet is a RST, we just drop it. + * If it is another packet, we send a RST. Note that + * we do not send a RST reply to RSTs so that we do not + * continually send RSTs between two endpoints. + * + * Note that since this is a reply, dst is src and src + * is dst. + */ + if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0) + pr_err("unable to send reset\n"); + + err = VMCI_ERROR_NOT_FOUND; + goto out; + } + } + + /* If the received packet type is beyond all types known to this + * implementation, reply with an invalid message. Hopefully this will + * help when implementing backwards compatibility in the future. + */ + if (pkt->type >= VMCI_TRANSPORT_PACKET_TYPE_MAX) { + vmci_transport_send_invalid_bh(&dst, &src); + err = VMCI_ERROR_INVALID_ARGS; + goto out; + } + + /* This handler is privileged when this module is running on the host. + * We will get datagram connect requests from all endpoints (even VMs + * that are in a restricted context). If we get one from a restricted + * context then the destination socket must be trusted. + * + * NOTE: We access the socket struct without holding the lock here. + * This is ok because the field we are interested in is never modified + * outside of the create and destruct socket functions. + */ + vsk = vsock_sk(sk); + if (!vmci_transport_allow_dgram(vsk, pkt->dg.src.context)) { + err = VMCI_ERROR_NO_ACCESS; + goto out; + } + + /* We do most everything in a work queue, but let's fast path the + * notification of reads and writes to help data transfer performance. 
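Stepping out of the kernel for a moment: the skb queueing in vmci_transport_recv_dgram_cb() above is exactly what completes a plain recv() on a datagram vsock. A runnable userspace sketch of that consumer (the port is an arbitrary example; the AF_VSOCK fallback define is only for older headers):

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

#ifndef AF_VSOCK
#define AF_VSOCK 40
#endif

int main(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_ANY,
		.svm_port = 9999,	/* arbitrary example port */
	};
	char buf[256];
	int fd = socket(AF_VSOCK, SOCK_DGRAM, 0);

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* Blocks until the callback above queues an skb for us. */
	ssize_t n = recv(fd, buf, sizeof(buf), 0);
	if (n >= 0)
		printf("received %zd bytes\n", n);
	return n < 0;
}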
+ * We can only do this if there is no process context code executing + * for this socket since that may change the state. + */ + bh_lock_sock(sk); + + if (!sock_owned_by_user(sk)) { + /* The local context ID may be out of date, update it. */ + vsk->local_addr.svm_cid = dst.svm_cid; + + if (sk->sk_state == SS_CONNECTED) + vmci_trans(vsk)->notify_ops->handle_notify_pkt( + sk, pkt, true, &dst, &src, + &bh_process_pkt); + } + + bh_unlock_sock(sk); + + if (!bh_process_pkt) { + struct vmci_transport_recv_pkt_info *recv_pkt_info; + + recv_pkt_info = kmalloc(sizeof(*recv_pkt_info), GFP_ATOMIC); + if (!recv_pkt_info) { + if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0) + pr_err("unable to send reset\n"); + + err = VMCI_ERROR_NO_MEM; + goto out; + } + + recv_pkt_info->sk = sk; + memcpy(&recv_pkt_info->pkt, pkt, sizeof(recv_pkt_info->pkt)); + INIT_WORK(&recv_pkt_info->work, vmci_transport_recv_pkt_work); + + schedule_work(&recv_pkt_info->work); + /* Clear sk so that the reference count incremented by one of + * the Find functions above is not decremented below. We need + * that reference count for the packet handler we've scheduled + * to run. + */ + sk = NULL; + } + +out: + if (sk) + sock_put(sk); + + return err; +} + +static void vmci_transport_peer_attach_cb(u32 sub_id, + const struct vmci_event_data *e_data, + void *client_data) +{ + struct sock *sk = client_data; + const struct vmci_event_payload_qp *e_payload; + struct vsock_sock *vsk; + + e_payload = vmci_event_data_const_payload(e_data); + + vsk = vsock_sk(sk); + + /* We don't ask for delayed CBs when we subscribe to this event (we + * pass 0 as flags to vmci_event_subscribe()). VMCI makes no + * guarantees in that case about what context we might be running in, + * so it could be BH or process, blockable or non-blockable. So we + * need to account for all possible contexts here. + */ + local_bh_disable(); + bh_lock_sock(sk); + + /* XXX This is lame, we should provide a way to lookup sockets by + * qp_handle. + */ + if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, + e_payload->handle)) { + /* XXX This doesn't do anything, but in the future we may want + * to set a flag here to verify the attach really did occur and + * we weren't just sent a datagram claiming it was. + */ + goto out; + } + +out: + bh_unlock_sock(sk); + local_bh_enable(); +} + +static void vmci_transport_handle_detach(struct sock *sk) +{ + struct vsock_sock *vsk; + + vsk = vsock_sk(sk); + if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { + sock_set_flag(sk, SOCK_DONE); + + /* On a detach the peer will not be sending or receiving + * anymore. + */ + vsk->peer_shutdown = SHUTDOWN_MASK; + + /* We should not be sending anymore since the peer won't be + * there to receive, but we can still receive if there is data + * left in our consume queue. + */ + if (vsock_stream_has_data(vsk) <= 0) { + if (sk->sk_state == SS_CONNECTING) { + /* The peer may detach from a queue pair while + * we are still in the connecting state, i.e., + * if the peer VM is killed after attaching to + * a queue pair, but before we complete the + * handshake. In that case, we treat the detach + * event like a reset. 
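The bh_lock_sock()/sock_owned_by_user() pair above is the standard idiom for opportunistic bottom-half processing: touch socket state only when no process context currently owns the socket, otherwise fall through to deferral. The skeleton, with a hypothetical helper standing in for the notify call:

static void vmci_bh_fast_path(struct sock *sk)
{
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		handle_in_bh(sk);	/* hypothetical; stands in for
					 * notify_ops->handle_notify_pkt()
					 */
	bh_unlock_sock(sk);

	/* Whatever was not handled here must be pushed to a work queue,
	 * as the code above does when bh_process_pkt stays false.
	 */
}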
+ */ + + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = ECONNRESET; + sk->sk_error_report(sk); + return; + } + sk->sk_state = SS_UNCONNECTED; + } + sk->sk_state_change(sk); + } +} + +static void vmci_transport_peer_detach_cb(u32 sub_id, + const struct vmci_event_data *e_data, + void *client_data) +{ + struct sock *sk = client_data; + const struct vmci_event_payload_qp *e_payload; + struct vsock_sock *vsk; + + e_payload = vmci_event_data_const_payload(e_data); + vsk = vsock_sk(sk); + if (vmci_handle_is_invalid(e_payload->handle)) + return; + + /* Same rules for locking as for peer_attach_cb(). */ + local_bh_disable(); + bh_lock_sock(sk); + + /* XXX This is lame, we should provide a way to lookup sockets by + * qp_handle. + */ + if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, + e_payload->handle)) + vmci_transport_handle_detach(sk); + + bh_unlock_sock(sk); + local_bh_enable(); +} + +static void vmci_transport_qp_resumed_cb(u32 sub_id, + const struct vmci_event_data *e_data, + void *client_data) +{ + vsock_for_each_connected_socket(vmci_transport_handle_detach); +} + +static void vmci_transport_recv_pkt_work(struct work_struct *work) +{ + struct vmci_transport_recv_pkt_info *recv_pkt_info; + struct vmci_transport_packet *pkt; + struct sock *sk; + + recv_pkt_info = + container_of(work, struct vmci_transport_recv_pkt_info, work); + sk = recv_pkt_info->sk; + pkt = &recv_pkt_info->pkt; + + lock_sock(sk); + + /* The local context ID may be out of date. */ + vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context; + + switch (sk->sk_state) { + case SS_LISTEN: + vmci_transport_recv_listen(sk, pkt); + break; + case SS_CONNECTING: + /* Processing of pending connections for servers goes through + * the listening socket, so see vmci_transport_recv_listen() + * for that path. + */ + vmci_transport_recv_connecting_client(sk, pkt); + break; + case SS_CONNECTED: + vmci_transport_recv_connected(sk, pkt); + break; + default: + /* Because this function does not run in the same context as + * vmci_transport_recv_stream_cb it is possible that the + * socket has closed. We need to let the other side know or it + * could be sitting in a connect and hang forever. Send a + * reset to prevent that. + */ + vmci_transport_send_reset(sk, pkt); + goto out; + } + +out: + release_sock(sk); + kfree(recv_pkt_info); + /* Release reference obtained in the stream callback when we fetched + * this socket out of the bound or connected list. + */ + sock_put(sk); +} + +static int vmci_transport_recv_listen(struct sock *sk, + struct vmci_transport_packet *pkt) +{ + struct sock *pending; + struct vsock_sock *vpending; + int err; + u64 qp_size; + bool old_request = false; + bool old_pkt_proto = false; + + err = 0; + + /* Because we are in the listen state, we could be receiving a packet + * for ourself or any previous connection requests that we received. + * If it's the latter, we try to find a socket in our list of pending + * connections and, if we do, call the appropriate handler for the + * state that that socket is in. Otherwise we try to service the + * connection request. + */ + pending = vmci_transport_get_pending(sk, pkt); + if (pending) { + lock_sock(pending); + + /* The local context ID may be out of date. 
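vmci_transport_recv_pkt_work() above follows the classic BH-to-process-context deferral shape. Reduced to a skeleton (hypothetical names; the real record also carries a copy of the packet):

struct deferred_pkt {
	struct work_struct work;
	struct sock *sk;
};

static void deferred_pkt_fn(struct work_struct *work)
{
	struct deferred_pkt *dp =
		container_of(work, struct deferred_pkt, work);

	lock_sock(dp->sk);	/* process context: sleeping is fine */
	/* ... dispatch on dp->sk->sk_state, as above ... */
	release_sock(dp->sk);

	sock_put(dp->sk);	/* drop the reference taken in BH context */
	kfree(dp);
}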
*/ + vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context; + + switch (pending->sk_state) { + case SS_CONNECTING: + err = vmci_transport_recv_connecting_server(sk, + pending, + pkt); + break; + default: + vmci_transport_send_reset(pending, pkt); + err = -EINVAL; + } + + if (err < 0) + vsock_remove_pending(sk, pending); + + release_sock(pending); + vmci_transport_release_pending(pending); + + return err; + } + + /* The listen state only accepts connection requests. Reply with a + * reset unless we received a reset. + */ + + if (!(pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST || + pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2)) { + vmci_transport_reply_reset(pkt); + return -EINVAL; + } + + if (pkt->u.size == 0) { + vmci_transport_reply_reset(pkt); + return -EINVAL; + } + + /* If this socket can't accommodate this connection request, we send a + * reset. Otherwise we create and initialize a child socket and reply + * with a connection negotiation. + */ + if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) { + vmci_transport_reply_reset(pkt); + return -ECONNREFUSED; + } + + pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type); + if (!pending) { + vmci_transport_send_reset(sk, pkt); + return -ENOMEM; + } + + vpending = vsock_sk(pending); + + vsock_addr_init(&vpending->local_addr, pkt->dg.dst.context, + pkt->dst_port); + vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context, + pkt->src_port); + + /* If the proposed size fits within our min/max, accept it. Otherwise + * propose our own size. + */ + if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size && + pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) { + qp_size = pkt->u.size; + } else { + qp_size = vmci_trans(vpending)->queue_pair_size; + } + + /* Figure out if we are using old or new requests based on the + * overrides pkt types sent by our peer. + */ + if (vmci_transport_old_proto_override(&old_pkt_proto)) { + old_request = old_pkt_proto; + } else { + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST) + old_request = true; + else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2) + old_request = false; + + } + + if (old_request) { + /* Handle a REQUEST (or override) */ + u16 version = VSOCK_PROTO_INVALID; + if (vmci_transport_proto_to_notify_struct( + pending, &version, true)) + err = vmci_transport_send_negotiate(pending, qp_size); + else + err = -EINVAL; + + } else { + /* Handle a REQUEST2 (or override) */ + int proto_int = pkt->proto; + int pos; + u16 active_proto_version = 0; + + /* The list of possible protocols is the intersection of all + * protocols the client supports ... plus all the protocols we + * support. + */ + proto_int &= vmci_transport_new_proto_supported_versions(); + + /* We choose the highest possible protocol version and use that + * one. 
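The version pick that follows is compact enough to restate as a standalone helper (a sketch; same fls() idiom as the code below):

static u16 pick_highest_version(u16 theirs, u16 ours)
{
	int pos = fls(theirs & ours);	/* highest set bit of the overlap */

	return pos ? (u16)(1 << (pos - 1)) : VSOCK_PROTO_INVALID;
}

With today's single VSOCK_PROTO_PKT_ON_NOTIFY bit the intersection is trivial, but the scheme extends unchanged as version bits are added.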
+ */ + pos = fls(proto_int); + if (pos) { + active_proto_version = (1 << (pos - 1)); + if (vmci_transport_proto_to_notify_struct( + pending, &active_proto_version, false)) + err = vmci_transport_send_negotiate2(pending, + qp_size, + active_proto_version); + else + err = -EINVAL; + + } else { + err = -EINVAL; + } + } + + if (err < 0) { + vmci_transport_send_reset(sk, pkt); + sock_put(pending); + err = vmci_transport_error_to_vsock_error(err); + goto out; + } + + vsock_add_pending(sk, pending); + sk->sk_ack_backlog++; + + pending->sk_state = SS_CONNECTING; + vmci_trans(vpending)->produce_size = + vmci_trans(vpending)->consume_size = qp_size; + vmci_trans(vpending)->queue_pair_size = qp_size; + + vmci_trans(vpending)->notify_ops->process_request(pending); + + /* We might never receive another message for this socket and it's not + * connected to any process, so we have to ensure it gets cleaned up + * ourself. Our delayed work function will take care of that. Note + * that we do not ever cancel this function since we have few + * guarantees about its state when calling cancel_delayed_work(). + * Instead we hold a reference on the socket for that function and make + * it capable of handling cases where it needs to do nothing but + * release that reference. + */ + vpending->listener = sk; + sock_hold(sk); + sock_hold(pending); + INIT_DELAYED_WORK(&vpending->dwork, vsock_pending_work); + schedule_delayed_work(&vpending->dwork, HZ); + +out: + return err; +} + +static int +vmci_transport_recv_connecting_server(struct sock *listener, + struct sock *pending, + struct vmci_transport_packet *pkt) +{ + struct vsock_sock *vpending; + struct vmci_handle handle; + struct vmci_qp *qpair; + bool is_local; + u32 flags; + u32 detach_sub_id; + int err; + int skerr; + + vpending = vsock_sk(pending); + detach_sub_id = VMCI_INVALID_ID; + + switch (pkt->type) { + case VMCI_TRANSPORT_PACKET_TYPE_OFFER: + if (vmci_handle_is_invalid(pkt->u.handle)) { + vmci_transport_send_reset(pending, pkt); + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + break; + default: + /* Close and cleanup the connection. */ + vmci_transport_send_reset(pending, pkt); + skerr = EPROTO; + err = pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST ? 0 : -EINVAL; + goto destroy; + } + + /* In order to complete the connection we need to attach to the offered + * queue pair and send an attach notification. We also subscribe to the + * detach event so we know when our peer goes away, and we do that + * before attaching so we don't miss an event. If all this succeeds, + * we update our state and wakeup anything waiting in accept() for a + * connection. + */ + + /* We don't care about attach since we ensure the other side has + * attached by specifying the ATTACH_ONLY flag below. + */ + err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, + vmci_transport_peer_detach_cb, + pending, &detach_sub_id); + if (err < VMCI_SUCCESS) { + vmci_transport_send_reset(pending, pkt); + err = vmci_transport_error_to_vsock_error(err); + skerr = -err; + goto destroy; + } + + vmci_trans(vpending)->detach_sub_id = detach_sub_id; + + /* Now attach to the queue pair the client created. */ + handle = pkt->u.handle; + + /* vpending->local_addr always has a context id so we do not need to + * worry about VMADDR_CID_ANY in this case. + */ + is_local = + vpending->remote_addr.svm_cid == vpending->local_addr.svm_cid; + flags = VMCI_QPFLAG_ATTACH_ONLY; + flags |= is_local ? 
VMCI_QPFLAG_LOCAL : 0; + + err = vmci_transport_queue_pair_alloc( + &qpair, + &handle, + vmci_trans(vpending)->produce_size, + vmci_trans(vpending)->consume_size, + pkt->dg.src.context, + flags, + vmci_transport_is_trusted( + vpending, + vpending->remote_addr.svm_cid)); + if (err < 0) { + vmci_transport_send_reset(pending, pkt); + skerr = -err; + goto destroy; + } + + vmci_trans(vpending)->qp_handle = handle; + vmci_trans(vpending)->qpair = qpair; + + /* When we send the attach message, we must be ready to handle incoming + * control messages on the newly connected socket. So we move the + * pending socket to the connected state before sending the attach + * message. Otherwise, an incoming packet triggered by the attach being + * received by the peer may be processed concurrently with what happens + * below after sending the attach message, and that incoming packet + * will find the listening socket instead of the (currently) pending + * socket. Note that enqueueing the socket increments the reference + * count, so even if a reset comes before the connection is accepted, + * the socket will be valid until it is removed from the queue. + * + * If we fail sending the attach below, we remove the socket from the + * connected list and move the socket to SS_UNCONNECTED before + * releasing the lock, so a pending slow path processing of an incoming + * packet will not see the socket in the connected state in that case. + */ + pending->sk_state = SS_CONNECTED; + + vsock_insert_connected(vpending); + + /* Notify our peer of our attach. */ + err = vmci_transport_send_attach(pending, handle); + if (err < 0) { + vsock_remove_connected(vpending); + pr_err("Could not send attach\n"); + vmci_transport_send_reset(pending, pkt); + err = vmci_transport_error_to_vsock_error(err); + skerr = -err; + goto destroy; + } + + /* We have a connection. Move the now connected socket from the + * listener's pending list to the accept queue so callers of accept() + * can find it. + */ + vsock_remove_pending(listener, pending); + vsock_enqueue_accept(listener, pending); + + /* Callers of accept() will be waiting on the listening socket, not + * the pending socket. + */ + listener->sk_state_change(listener); + + return 0; + +destroy: + pending->sk_err = skerr; + pending->sk_state = SS_UNCONNECTED; + /* As long as we drop our reference, all necessary cleanup will happen + * when the cleanup function drops its reference and our destruct + * implementation is called. Note that since the listen handler will + * remove pending from the pending list upon our failure, the cleanup + * function won't drop the additional reference, which is why we do it + * here. + */ + sock_put(pending); + + return err; +} + +static int +vmci_transport_recv_connecting_client(struct sock *sk, + struct vmci_transport_packet *pkt) +{ + struct vsock_sock *vsk; + int err; + int skerr; + + vsk = vsock_sk(sk); + + switch (pkt->type) { + case VMCI_TRANSPORT_PACKET_TYPE_ATTACH: + if (vmci_handle_is_invalid(pkt->u.handle) || + !vmci_handle_is_equal(pkt->u.handle, + vmci_trans(vsk)->qp_handle)) { + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + + /* Signify the socket is connected and wakeup the waiter in + * connect(). Also place the socket in the connected table for + * accounting (it can already be found since it's in the bound + * table). 
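For contrast with the kernel-side bookkeeping above, this is all a server ever sees of it: the sk_state_change() on the listener is what wakes accept(). A runnable userspace sketch (arbitrary port; the AF_VSOCK fallback define is only for older headers):

#include <sys/socket.h>
#include <linux/vm_sockets.h>

#ifndef AF_VSOCK
#define AF_VSOCK 40
#endif

int main(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_ANY,
		.svm_port = 1234,	/* arbitrary example port */
	};
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;
	if (listen(fd, 8) < 0)
		return 1;

	/* Returns once the transport has moved a pending socket to the
	 * accept queue and signalled the listener.
	 */
	int conn = accept(fd, NULL, NULL);
	return conn < 0;
}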
+ */ + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + + break; + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE: + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2: + if (pkt->u.size == 0 + || pkt->dg.src.context != vsk->remote_addr.svm_cid + || pkt->src_port != vsk->remote_addr.svm_port + || !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle) + || vmci_trans(vsk)->qpair + || vmci_trans(vsk)->produce_size != 0 + || vmci_trans(vsk)->consume_size != 0 + || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID + || vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { + skerr = EPROTO; + err = -EINVAL; + + goto destroy; + } + + err = vmci_transport_recv_connecting_client_negotiate(sk, pkt); + if (err) { + skerr = -err; + goto destroy; + } + + break; + case VMCI_TRANSPORT_PACKET_TYPE_INVALID: + err = vmci_transport_recv_connecting_client_invalid(sk, pkt); + if (err) { + skerr = -err; + goto destroy; + } + + break; + case VMCI_TRANSPORT_PACKET_TYPE_RST: + /* Older versions of the linux code (WS 6.5 / ESX 4.0) used to + * continue processing here after they sent an INVALID packet. + * This meant that we got a RST after the INVALID. We ignore a + * RST after an INVALID. The common code doesn't send the RST + * ... so we can hang if an old version of the common code + * fails between getting a REQUEST and sending an OFFER back. + * Not much we can do about it... except hope that it doesn't + * happen. + */ + if (vsk->ignore_connecting_rst) { + vsk->ignore_connecting_rst = false; + } else { + skerr = ECONNRESET; + err = 0; + goto destroy; + } + + break; + default: + /* Close and cleanup the connection. */ + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + + return 0; + +destroy: + vmci_transport_send_reset(sk, pkt); + + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int vmci_transport_recv_connecting_client_negotiate( + struct sock *sk, + struct vmci_transport_packet *pkt) +{ + int err; + struct vsock_sock *vsk; + struct vmci_handle handle; + struct vmci_qp *qpair; + u32 attach_sub_id; + u32 detach_sub_id; + bool is_local; + u32 flags; + bool old_proto = true; + bool old_pkt_proto; + u16 version; + + vsk = vsock_sk(sk); + handle = VMCI_INVALID_HANDLE; + attach_sub_id = VMCI_INVALID_ID; + detach_sub_id = VMCI_INVALID_ID; + + /* If we have gotten here then we should be past the point where old + * linux vsock could have sent the bogus rst. + */ + vsk->sent_request = false; + vsk->ignore_connecting_rst = false; + + /* Verify that we're OK with the proposed queue pair size */ + if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size || + pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) { + err = -EINVAL; + goto destroy; + } + + /* At this point we know the CID the peer is using to talk to us. */ + + if (vsk->local_addr.svm_cid == VMADDR_CID_ANY) + vsk->local_addr.svm_cid = pkt->dg.dst.context; + + /* Setup the notify ops to be the highest supported version that both + * the server and the client support. 
+ */ + + if (vmci_transport_old_proto_override(&old_pkt_proto)) { + old_proto = old_pkt_proto; + } else { + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE) + old_proto = true; + else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2) + old_proto = false; + + } + + if (old_proto) + version = VSOCK_PROTO_INVALID; + else + version = pkt->proto; + + if (!vmci_transport_proto_to_notify_struct(sk, &version, old_proto)) { + err = -EINVAL; + goto destroy; + } + + /* Subscribe to attach and detach events first. + * + * XXX We attach once for each queue pair created for now so it is easy + * to find the socket (it's provided), but later we should only + * subscribe once and add a way to lookup sockets by queue pair handle. + */ + err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH, + vmci_transport_peer_attach_cb, + sk, &attach_sub_id); + if (err < VMCI_SUCCESS) { + err = vmci_transport_error_to_vsock_error(err); + goto destroy; + } + + err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, + vmci_transport_peer_detach_cb, + sk, &detach_sub_id); + if (err < VMCI_SUCCESS) { + err = vmci_transport_error_to_vsock_error(err); + goto destroy; + } + + /* Make VMCI select the handle for us. */ + handle = VMCI_INVALID_HANDLE; + is_local = vsk->remote_addr.svm_cid == vsk->local_addr.svm_cid; + flags = is_local ? VMCI_QPFLAG_LOCAL : 0; + + err = vmci_transport_queue_pair_alloc(&qpair, + &handle, + pkt->u.size, + pkt->u.size, + vsk->remote_addr.svm_cid, + flags, + vmci_transport_is_trusted( + vsk, + vsk-> + remote_addr.svm_cid)); + if (err < 0) + goto destroy; + + err = vmci_transport_send_qp_offer(sk, handle); + if (err < 0) { + err = vmci_transport_error_to_vsock_error(err); + goto destroy; + } + + vmci_trans(vsk)->qp_handle = handle; + vmci_trans(vsk)->qpair = qpair; + + vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = + pkt->u.size; + + vmci_trans(vsk)->attach_sub_id = attach_sub_id; + vmci_trans(vsk)->detach_sub_id = detach_sub_id; + + vmci_trans(vsk)->notify_ops->process_negotiate(sk); + + return 0; + +destroy: + if (attach_sub_id != VMCI_INVALID_ID) + vmci_event_unsubscribe(attach_sub_id); + + if (detach_sub_id != VMCI_INVALID_ID) + vmci_event_unsubscribe(detach_sub_id); + + if (!vmci_handle_is_invalid(handle)) + vmci_qpair_detach(&qpair); + + return err; +} + +static int +vmci_transport_recv_connecting_client_invalid(struct sock *sk, + struct vmci_transport_packet *pkt) +{ + int err = 0; + struct vsock_sock *vsk = vsock_sk(sk); + + if (vsk->sent_request) { + vsk->sent_request = false; + vsk->ignore_connecting_rst = true; + + err = vmci_transport_send_conn_request( + sk, vmci_trans(vsk)->queue_pair_size); + if (err < 0) + err = vmci_transport_error_to_vsock_error(err); + else + err = 0; + + } + + return err; +} + +static int vmci_transport_recv_connected(struct sock *sk, + struct vmci_transport_packet *pkt) +{ + struct vsock_sock *vsk; + bool pkt_processed = false; + + /* In cases where we are closing the connection, it's sufficient to + * mark the state change (and maybe error) and wake up any waiting + * threads. Since this is a connected socket, it's owned by a user + * process and will be cleaned up when the failure is passed back on + * the current or next system call. Our system call implementations + * must therefore check for error and state changes on entry and when + * being awoken. 
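The whole client-side negotiation above is driven by a single blocking connect() in userspace; the REQUEST/NEGOTIATE/OFFER/ATTACH exchange happens underneath it. A runnable sketch of that caller (CID and port are example values matching the server sketch earlier):

#include <sys/socket.h>
#include <linux/vm_sockets.h>

#ifndef AF_VSOCK
#define AF_VSOCK 40
#endif

int main(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_HOST,	/* example peer: the host */
		.svm_port = 1234,		/* must match the listener */
	};
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;

	/* Sleeps until the handshake above completes (ATTACH received)
	 * or fails (RST / negotiation error).
	 */
	return connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0;
}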
+ */ + switch (pkt->type) { + case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN: + if (pkt->u.mode) { + vsk = vsock_sk(sk); + + vsk->peer_shutdown |= pkt->u.mode; + sk->sk_state_change(sk); + } + break; + + case VMCI_TRANSPORT_PACKET_TYPE_RST: + vsk = vsock_sk(sk); + /* It is possible that we sent our peer a message (e.g a + * WAITING_READ) right before we got notified that the peer had + * detached. If that happens then we can get a RST pkt back + * from our peer even though there is data available for us to + * read. In that case, don't shutdown the socket completely but + * instead allow the local client to finish reading data off + * the queuepair. Always treat a RST pkt in connected mode like + * a clean shutdown. + */ + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + + sk->sk_state_change(sk); + break; + + default: + vsk = vsock_sk(sk); + vmci_trans(vsk)->notify_ops->handle_notify_pkt( + sk, pkt, false, NULL, NULL, + &pkt_processed); + if (!pkt_processed) + return -EINVAL; + + break; + } + + return 0; +} + +static int vmci_transport_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + vsk->trans = kmalloc(sizeof(struct vmci_transport), GFP_KERNEL); + if (!vsk->trans) + return -ENOMEM; + + vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE; + vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; + vmci_trans(vsk)->qpair = NULL; + vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0; + vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id = + VMCI_INVALID_ID; + vmci_trans(vsk)->notify_ops = NULL; + if (psk) { + vmci_trans(vsk)->queue_pair_size = + vmci_trans(psk)->queue_pair_size; + vmci_trans(vsk)->queue_pair_min_size = + vmci_trans(psk)->queue_pair_min_size; + vmci_trans(vsk)->queue_pair_max_size = + vmci_trans(psk)->queue_pair_max_size; + } else { + vmci_trans(vsk)->queue_pair_size = + VMCI_TRANSPORT_DEFAULT_QP_SIZE; + vmci_trans(vsk)->queue_pair_min_size = + VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN; + vmci_trans(vsk)->queue_pair_max_size = + VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX; + } + + return 0; +} + +static void vmci_transport_destruct(struct vsock_sock *vsk) +{ + if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) { + vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id); + vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID; + } + + if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { + vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id); + vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID; + } + + if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { + vmci_qpair_detach(&vmci_trans(vsk)->qpair); + vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; + vmci_trans(vsk)->produce_size = 0; + vmci_trans(vsk)->consume_size = 0; + } + + if (vmci_trans(vsk)->notify_ops) + vmci_trans(vsk)->notify_ops->socket_destruct(vsk); + + kfree(vsk->trans); + vsk->trans = NULL; +} + +static void vmci_transport_release(struct vsock_sock *vsk) +{ + if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) { + vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle); + vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE; + } +} + +static int vmci_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + u32 port; + u32 flags; + int err; + + /* VMCI will select a resource ID for us if we provide + * VMCI_INVALID_ID. + */ + port = addr->svm_port == VMADDR_PORT_ANY ? 
+ VMCI_INVALID_ID : addr->svm_port; + + if (port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + flags = addr->svm_cid == VMADDR_CID_ANY ? + VMCI_FLAG_ANYCID_DG_HND : 0; + + err = vmci_transport_datagram_create_hnd(port, flags, + vmci_transport_recv_dgram_cb, + &vsk->sk, + &vmci_trans(vsk)->dg_handle); + if (err < VMCI_SUCCESS) + return vmci_transport_error_to_vsock_error(err); + vsock_addr_init(&vsk->local_addr, addr->svm_cid, + vmci_trans(vsk)->dg_handle.resource); + + return 0; +} + +static int vmci_transport_dgram_enqueue( + struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct iovec *iov, + size_t len) +{ + int err; + struct vmci_datagram *dg; + + if (len > VMCI_MAX_DG_PAYLOAD_SIZE) + return -EMSGSIZE; + + if (!vmci_transport_allow_dgram(vsk, remote_addr->svm_cid)) + return -EPERM; + + /* Allocate a buffer for the user's message and our packet header. */ + dg = kmalloc(len + sizeof(*dg), GFP_KERNEL); + if (!dg) + return -ENOMEM; + + memcpy_fromiovec(VMCI_DG_PAYLOAD(dg), iov, len); + + dg->dst = vmci_make_handle(remote_addr->svm_cid, + remote_addr->svm_port); + dg->src = vmci_make_handle(vsk->local_addr.svm_cid, + vsk->local_addr.svm_port); + dg->payload_size = len; + + err = vmci_datagram_send(dg); + kfree(dg); + if (err < 0) + return vmci_transport_error_to_vsock_error(err); + + return err - sizeof(*dg); +} + +static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, + struct vsock_sock *vsk, + struct msghdr *msg, size_t len, + int flags) +{ + int err; + int noblock; + struct vmci_datagram *dg; + size_t payload_len; + struct sk_buff *skb; + + noblock = flags & MSG_DONTWAIT; + + if (flags & MSG_OOB || flags & MSG_ERRQUEUE) + return -EOPNOTSUPP; + + msg->msg_namelen = 0; + + /* Retrieve the head sk_buff from the socket's receive queue. */ + err = 0; + skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err); + if (err) + return err; + + if (!skb) + return -EAGAIN; + + dg = (struct vmci_datagram *)skb->data; + if (!dg) + /* err is 0, meaning we read zero bytes. */ + goto out; + + payload_len = dg->payload_size; + /* Ensure the sk_buff matches the payload size claimed in the packet. */ + if (payload_len != skb->len - sizeof(*dg)) { + err = -EINVAL; + goto out; + } + + if (payload_len > len) { + payload_len = len; + msg->msg_flags |= MSG_TRUNC; + } + + /* Place the datagram payload in the user's iovec. */ + err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov, + payload_len); + if (err) + goto out; + + if (msg->msg_name) { + struct sockaddr_vm *vm_addr; + + /* Provide the address of the sender. */ + vm_addr = (struct sockaddr_vm *)msg->msg_name; + vsock_addr_init(vm_addr, dg->src.context, dg->src.resource); + msg->msg_namelen = sizeof(*vm_addr); + } + err = payload_len; + +out: + skb_free_datagram(&vsk->sk, skb); + return err; +} + +static bool vmci_transport_dgram_allow(u32 cid, u32 port) +{ + if (cid == VMADDR_CID_HYPERVISOR) { + /* Registrations of PBRPC Servers do not modify VMX/Hypervisor + * state and are allowed. 
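The enqueue path above (vmci_datagram header prepended, vmci_datagram_send(), EMSGSIZE cap) pairs with the earlier receiver sketch; here is the matching sender, again as a runnable userspace illustration with example values:

#include <sys/socket.h>
#include <linux/vm_sockets.h>

#ifndef AF_VSOCK
#define AF_VSOCK 40
#endif

int main(void)
{
	struct sockaddr_vm dst = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_HOST,	/* example peer */
		.svm_port = 9999,	/* matches the receiver sketch */
	};
	const char msg[] = "ping";
	int fd = socket(AF_VSOCK, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	/* Lands in vmci_transport_dgram_enqueue(), which prepends the
	 * struct vmci_datagram header before handing it to VMCI.
	 */
	return sendto(fd, msg, sizeof(msg), 0,
		      (struct sockaddr *)&dst, sizeof(dst)) < 0;
}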
+ */ + return port == VMCI_UNITY_PBRPC_REGISTER; + } + + return true; +} + +static int vmci_transport_connect(struct vsock_sock *vsk) +{ + int err; + bool old_pkt_proto = false; + struct sock *sk = &vsk->sk; + + if (vmci_transport_old_proto_override(&old_pkt_proto) && + old_pkt_proto) { + err = vmci_transport_send_conn_request( + sk, vmci_trans(vsk)->queue_pair_size); + if (err < 0) { + sk->sk_state = SS_UNCONNECTED; + return err; + } + } else { + int supported_proto_versions = + vmci_transport_new_proto_supported_versions(); + err = vmci_transport_send_conn_request2( + sk, vmci_trans(vsk)->queue_pair_size, + supported_proto_versions); + if (err < 0) { + sk->sk_state = SS_UNCONNECTED; + return err; + } + + vsk->sent_request = true; + } + + return err; +} + +static ssize_t vmci_transport_stream_dequeue( + struct vsock_sock *vsk, + struct iovec *iov, + size_t len, + int flags) +{ + if (flags & MSG_PEEK) + return vmci_qpair_peekv(vmci_trans(vsk)->qpair, iov, len, 0); + else + return vmci_qpair_dequev(vmci_trans(vsk)->qpair, iov, len, 0); +} + +static ssize_t vmci_transport_stream_enqueue( + struct vsock_sock *vsk, + struct iovec *iov, + size_t len) +{ + return vmci_qpair_enquev(vmci_trans(vsk)->qpair, iov, len, 0); +} + +static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk) +{ + return vmci_qpair_consume_buf_ready(vmci_trans(vsk)->qpair); +} + +static s64 vmci_transport_stream_has_space(struct vsock_sock *vsk) +{ + return vmci_qpair_produce_free_space(vmci_trans(vsk)->qpair); +} + +static u64 vmci_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + return vmci_trans(vsk)->consume_size; +} + +static bool vmci_transport_stream_is_active(struct vsock_sock *vsk) +{ + return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle); +} + +static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk) +{ + return vmci_trans(vsk)->queue_pair_size; +} + +static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + return vmci_trans(vsk)->queue_pair_min_size; +} + +static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + return vmci_trans(vsk)->queue_pair_max_size; +} + +static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + if (val < vmci_trans(vsk)->queue_pair_min_size) + vmci_trans(vsk)->queue_pair_min_size = val; + if (val > vmci_trans(vsk)->queue_pair_max_size) + vmci_trans(vsk)->queue_pair_max_size = val; + vmci_trans(vsk)->queue_pair_size = val; +} + +static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk, + u64 val) +{ + if (val > vmci_trans(vsk)->queue_pair_size) + vmci_trans(vsk)->queue_pair_size = val; + vmci_trans(vsk)->queue_pair_min_size = val; +} + +static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk, + u64 val) +{ + if (val < vmci_trans(vsk)->queue_pair_size) + vmci_trans(vsk)->queue_pair_size = val; + vmci_trans(vsk)->queue_pair_max_size = val; +} + +static int vmci_transport_notify_poll_in( + struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + return vmci_trans(vsk)->notify_ops->poll_in( + &vsk->sk, target, data_ready_now); +} + +static int vmci_transport_notify_poll_out( + struct vsock_sock *vsk, + size_t target, + bool *space_available_now) +{ + return vmci_trans(vsk)->notify_ops->poll_out( + &vsk->sk, target, space_available_now); +} + +static int vmci_transport_notify_recv_init( + struct vsock_sock *vsk, + size_t target, + struct vsock_transport_recv_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->recv_init( + &vsk->sk, target, + (struct 
vmci_transport_recv_notify_data *)data); +} + +static int vmci_transport_notify_recv_pre_block( + struct vsock_sock *vsk, + size_t target, + struct vsock_transport_recv_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->recv_pre_block( + &vsk->sk, target, + (struct vmci_transport_recv_notify_data *)data); +} + +static int vmci_transport_notify_recv_pre_dequeue( + struct vsock_sock *vsk, + size_t target, + struct vsock_transport_recv_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->recv_pre_dequeue( + &vsk->sk, target, + (struct vmci_transport_recv_notify_data *)data); +} + +static int vmci_transport_notify_recv_post_dequeue( + struct vsock_sock *vsk, + size_t target, + ssize_t copied, + bool data_read, + struct vsock_transport_recv_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->recv_post_dequeue( + &vsk->sk, target, copied, data_read, + (struct vmci_transport_recv_notify_data *)data); +} + +static int vmci_transport_notify_send_init( + struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->send_init( + &vsk->sk, + (struct vmci_transport_send_notify_data *)data); +} + +static int vmci_transport_notify_send_pre_block( + struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->send_pre_block( + &vsk->sk, + (struct vmci_transport_send_notify_data *)data); +} + +static int vmci_transport_notify_send_pre_enqueue( + struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->send_pre_enqueue( + &vsk->sk, + (struct vmci_transport_send_notify_data *)data); +} + +static int vmci_transport_notify_send_post_enqueue( + struct vsock_sock *vsk, + ssize_t written, + struct vsock_transport_send_notify_data *data) +{ + return vmci_trans(vsk)->notify_ops->send_post_enqueue( + &vsk->sk, written, + (struct vmci_transport_send_notify_data *)data); +} + +static bool vmci_transport_old_proto_override(bool *old_pkt_proto) +{ + if (PROTOCOL_OVERRIDE != -1) { + if (PROTOCOL_OVERRIDE == 0) + *old_pkt_proto = true; + else + *old_pkt_proto = false; + + pr_info("Proto override in use\n"); + return true; + } + + return false; +} + +static bool vmci_transport_proto_to_notify_struct(struct sock *sk, + u16 *proto, + bool old_pkt_proto) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (old_pkt_proto) { + if (*proto != VSOCK_PROTO_INVALID) { + pr_err("Can't set both an old and new protocol\n"); + return false; + } + vmci_trans(vsk)->notify_ops = &vmci_transport_notify_pkt_ops; + goto exit; + } + + switch (*proto) { + case VSOCK_PROTO_PKT_ON_NOTIFY: + vmci_trans(vsk)->notify_ops = + &vmci_transport_notify_pkt_q_state_ops; + break; + default: + pr_err("Unknown notify protocol version\n"); + return false; + } + +exit: + vmci_trans(vsk)->notify_ops->socket_init(sk); + return true; +} + +static u16 vmci_transport_new_proto_supported_versions(void) +{ + if (PROTOCOL_OVERRIDE != -1) + return PROTOCOL_OVERRIDE; + + return VSOCK_PROTO_ALL_SUPPORTED; +} + +static u32 vmci_transport_get_local_cid(void) +{ + return vmci_get_context_id(); +} + +static struct vsock_transport vmci_transport = { + .init = vmci_transport_socket_init, + .destruct = vmci_transport_destruct, + .release = vmci_transport_release, + .connect = vmci_transport_connect, + .dgram_bind = vmci_transport_dgram_bind, + .dgram_dequeue = vmci_transport_dgram_dequeue, + .dgram_enqueue = vmci_transport_dgram_enqueue, + .dgram_allow = vmci_transport_dgram_allow, + 
.stream_dequeue = vmci_transport_stream_dequeue, + .stream_enqueue = vmci_transport_stream_enqueue, + .stream_has_data = vmci_transport_stream_has_data, + .stream_has_space = vmci_transport_stream_has_space, + .stream_rcvhiwat = vmci_transport_stream_rcvhiwat, + .stream_is_active = vmci_transport_stream_is_active, + .stream_allow = vmci_transport_stream_allow, + .notify_poll_in = vmci_transport_notify_poll_in, + .notify_poll_out = vmci_transport_notify_poll_out, + .notify_recv_init = vmci_transport_notify_recv_init, + .notify_recv_pre_block = vmci_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = vmci_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = vmci_transport_notify_recv_post_dequeue, + .notify_send_init = vmci_transport_notify_send_init, + .notify_send_pre_block = vmci_transport_notify_send_pre_block, + .notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue, + .shutdown = vmci_transport_shutdown, + .set_buffer_size = vmci_transport_set_buffer_size, + .set_min_buffer_size = vmci_transport_set_min_buffer_size, + .set_max_buffer_size = vmci_transport_set_max_buffer_size, + .get_buffer_size = vmci_transport_get_buffer_size, + .get_min_buffer_size = vmci_transport_get_min_buffer_size, + .get_max_buffer_size = vmci_transport_get_max_buffer_size, + .get_local_cid = vmci_transport_get_local_cid, +}; + +static int __init vmci_transport_init(void) +{ + int err; + + /* Create the datagram handle that we will use to send and receive all + * VSocket control messages for this context. + */ + err = vmci_transport_datagram_create_hnd(VMCI_TRANSPORT_PACKET_RID, + VMCI_FLAG_ANYCID_DG_HND, + vmci_transport_recv_stream_cb, + NULL, + &vmci_transport_stream_handle); + if (err < VMCI_SUCCESS) { + pr_err("Unable to create datagram handle. (%d)\n", err); + return vmci_transport_error_to_vsock_error(err); + } + + err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED, + vmci_transport_qp_resumed_cb, + NULL, &vmci_transport_qp_resumed_sub_id); + if (err < VMCI_SUCCESS) { + pr_err("Unable to subscribe to resumed event. 
(%d)\n", err); + err = vmci_transport_error_to_vsock_error(err); + vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; + goto err_destroy_stream_handle; + } + + err = vsock_core_init(&vmci_transport); + if (err < 0) + goto err_unsubscribe; + + return 0; + +err_unsubscribe: + vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id); +err_destroy_stream_handle: + vmci_datagram_destroy_handle(vmci_transport_stream_handle); + return err; +} +module_init(vmci_transport_init); + +static void __exit vmci_transport_exit(void) +{ + if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) { + if (vmci_datagram_destroy_handle( + vmci_transport_stream_handle) != VMCI_SUCCESS) + pr_err("Couldn't destroy datagram handle\n"); + vmci_transport_stream_handle = VMCI_INVALID_HANDLE; + } + + if (vmci_transport_qp_resumed_sub_id != VMCI_INVALID_ID) { + vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id); + vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; + } + + vsock_core_exit(); +} +module_exit(vmci_transport_exit); + +MODULE_AUTHOR("VMware, Inc."); +MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("vmware_vsock"); +MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h new file mode 100644 index 000000000000..fd88ea8924e4 --- /dev/null +++ b/net/vmw_vsock/vmci_transport.h @@ -0,0 +1,142 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _VMCI_TRANSPORT_H_ +#define _VMCI_TRANSPORT_H_ + +#include <linux/vmw_vmci_defs.h> +#include <linux/vmw_vmci_api.h> + +#include "vsock_addr.h" +#include "af_vsock.h" + +/* If the packet format changes in a release then this should change too. */ +#define VMCI_TRANSPORT_PACKET_VERSION 1 + +/* The resource ID on which control packets are sent. */ +#define VMCI_TRANSPORT_PACKET_RID 1 + +/* The resource ID on which control packets are sent to the hypervisor. */ +#define VMCI_TRANSPORT_HYPERVISOR_PACKET_RID 15 + +#define VSOCK_PROTO_INVALID 0 +#define VSOCK_PROTO_PKT_ON_NOTIFY (1 << 0) +#define VSOCK_PROTO_ALL_SUPPORTED (VSOCK_PROTO_PKT_ON_NOTIFY) + +#define vmci_trans(_vsk) ((struct vmci_transport *)((_vsk)->trans)) + +enum vmci_transport_packet_type { + VMCI_TRANSPORT_PACKET_TYPE_INVALID = 0, + VMCI_TRANSPORT_PACKET_TYPE_REQUEST, + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE, + VMCI_TRANSPORT_PACKET_TYPE_OFFER, + VMCI_TRANSPORT_PACKET_TYPE_ATTACH, + VMCI_TRANSPORT_PACKET_TYPE_WROTE, + VMCI_TRANSPORT_PACKET_TYPE_READ, + VMCI_TRANSPORT_PACKET_TYPE_RST, + VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN, + VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE, + VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ, + VMCI_TRANSPORT_PACKET_TYPE_REQUEST2, + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2, + VMCI_TRANSPORT_PACKET_TYPE_MAX +}; + +struct vmci_transport_waiting_info { + u64 generation; + u64 offset; +}; + +/* Control packet type for STREAM sockets. DGRAMs have no control packets nor + * special packet header for data packets, they are just raw VMCI DGRAM + * messages. 
For STREAMs, control packets are sent over the control channel + * while data is written and read directly from queue pairs with no packet + * format. + */ +struct vmci_transport_packet { + struct vmci_datagram dg; + u8 version; + u8 type; + u16 proto; + u32 src_port; + u32 dst_port; + u32 _reserved2; + union { + u64 size; + u64 mode; + struct vmci_handle handle; + struct vmci_transport_waiting_info wait; + } u; +}; + +struct vmci_transport_notify_pkt { + u64 write_notify_window; + u64 write_notify_min_window; + bool peer_waiting_read; + bool peer_waiting_write; + bool peer_waiting_write_detected; + bool sent_waiting_read; + bool sent_waiting_write; + struct vmci_transport_waiting_info peer_waiting_read_info; + struct vmci_transport_waiting_info peer_waiting_write_info; + u64 produce_q_generation; + u64 consume_q_generation; +}; + +struct vmci_transport_notify_pkt_q_state { + u64 write_notify_window; + u64 write_notify_min_window; + bool peer_waiting_write; + bool peer_waiting_write_detected; +}; + +union vmci_transport_notify { + struct vmci_transport_notify_pkt pkt; + struct vmci_transport_notify_pkt_q_state pkt_q_state; +}; + +/* Our transport-specific data. */ +struct vmci_transport { + /* For DGRAMs. */ + struct vmci_handle dg_handle; + /* For STREAMs. */ + struct vmci_handle qp_handle; + struct vmci_qp *qpair; + u64 produce_size; + u64 consume_size; + u64 queue_pair_size; + u64 queue_pair_min_size; + u64 queue_pair_max_size; + u32 attach_sub_id; + u32 detach_sub_id; + union vmci_transport_notify notify; + struct vmci_transport_notify_ops *notify_ops; +}; + +int vmci_transport_register(void); +void vmci_transport_unregister(void); + +int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst, + struct sockaddr_vm *src); +int vmci_transport_send_read_bh(struct sockaddr_vm *dst, + struct sockaddr_vm *src); +int vmci_transport_send_wrote(struct sock *sk); +int vmci_transport_send_read(struct sock *sk); +int vmci_transport_send_waiting_write(struct sock *sk, + struct vmci_transport_waiting_info *wait); +int vmci_transport_send_waiting_read(struct sock *sk, + struct vmci_transport_waiting_info *wait); + +#endif diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c new file mode 100644 index 000000000000..9a730744e7bc --- /dev/null +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -0,0 +1,680 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2009-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/stddef.h> +#include <net/sock.h> + +#include "vmci_transport_notify.h" + +#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name) + +static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + bool retval; + u64 notify_limit; + + if (!PKT_FIELD(vsk, peer_waiting_write)) + return false; + +#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL + /* When the sender blocks, we take that as a sign that the sender is + * faster than the receiver. 
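Back in the header for a moment: since struct vmci_transport_packet is a wire format shared with peers, its size must never drift. A compile-time pin of the layout would look like the sketch below; the 56 is an assumption derived from 8-byte handles, 16 bytes of scalar fields and the 16-byte waiting-info union, and is not asserted anywhere in this patch.

/* Sketch only: pin the wire size so an accidental field change breaks
 * the build instead of the protocol.
 */
static inline void vmci_transport_packet_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct vmci_transport_packet) != 56);
}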
To reduce the transmit rate of the sender, + * we delay the sending of the read notification by decreasing the + * write_notify_window. The notification is delayed until the number of + * bytes used in the queue drops below the write_notify_window. + */ + + if (!PKT_FIELD(vsk, peer_waiting_write_detected)) { + PKT_FIELD(vsk, peer_waiting_write_detected) = true; + if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) { + PKT_FIELD(vsk, write_notify_window) = + PKT_FIELD(vsk, write_notify_min_window); + } else { + PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE; + if (PKT_FIELD(vsk, write_notify_window) < + PKT_FIELD(vsk, write_notify_min_window)) + PKT_FIELD(vsk, write_notify_window) = + PKT_FIELD(vsk, write_notify_min_window); + + } + } + notify_limit = vmci_trans(vsk)->consume_size - + PKT_FIELD(vsk, write_notify_window); +#else + notify_limit = 0; +#endif + + /* For now we ignore the wait information and just see if the free + * space exceeds the notify limit. Note that improving this function + * to be more intelligent will not require a protocol change and will + * retain compatibility between endpoints with mixed versions of this + * function. + * + * The notify_limit is used to delay notifications in the case where + * flow control is enabled. Below the test is expressed in terms of + * free space in the queue: if free_space > ConsumeSize - + * write_notify_window then notify An alternate way of expressing this + * is to rewrite the expression to use the data ready in the receive + * queue: if write_notify_window > bufferReady then notify as + * free_space == ConsumeSize - bufferReady. + */ + retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) > + notify_limit; +#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL + if (retval) { + /* + * Once we notify the peer, we reset the detected flag so the + * next wait will again cause a decrease in the window size. + */ + + PKT_FIELD(vsk, peer_waiting_write_detected) = false; + } +#endif + return retval; +#else + return true; +#endif +} + +static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + if (!PKT_FIELD(vsk, peer_waiting_read)) + return false; + + /* For now we ignore the wait information and just see if there is any + * data for our peer to read. Note that improving this function to be + * more intelligent will not require a protocol change and will retain + * compatibility between endpoints with mixed versions of this + * function. 
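The two phrasings of the notify test in the comment above really are the same predicate; spelled out as a helper (sketch, with bufferReady written as bytes_ready):

static bool should_send_read_notify(u64 consume_size, u64 bytes_ready,
				    u64 write_notify_window)
{
	u64 free_space = consume_size - bytes_ready;

	/* Equivalent to: write_notify_window > bytes_ready */
	return free_space > consume_size - write_notify_window;
}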
+ */ + return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0; +#else + return true; +#endif +} + +static void +vmci_transport_handle_waiting_read(struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, + struct sockaddr_vm *src) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + struct vsock_sock *vsk; + + vsk = vsock_sk(sk); + + PKT_FIELD(vsk, peer_waiting_read) = true; + memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait, + sizeof(PKT_FIELD(vsk, peer_waiting_read_info))); + + if (vmci_transport_notify_waiting_read(vsk)) { + bool sent; + + if (bottom_half) + sent = vmci_transport_send_wrote_bh(dst, src) > 0; + else + sent = vmci_transport_send_wrote(sk) > 0; + + if (sent) + PKT_FIELD(vsk, peer_waiting_read) = false; + } +#endif +} + +static void +vmci_transport_handle_waiting_write(struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, + struct sockaddr_vm *src) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + struct vsock_sock *vsk; + + vsk = vsock_sk(sk); + + PKT_FIELD(vsk, peer_waiting_write) = true; + memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait, + sizeof(PKT_FIELD(vsk, peer_waiting_write_info))); + + if (vmci_transport_notify_waiting_write(vsk)) { + bool sent; + + if (bottom_half) + sent = vmci_transport_send_read_bh(dst, src) > 0; + else + sent = vmci_transport_send_read(sk) > 0; + + if (sent) + PKT_FIELD(vsk, peer_waiting_write) = false; + } +#endif +} + +static void +vmci_transport_handle_read(struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, struct sockaddr_vm *src) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + struct vsock_sock *vsk; + + vsk = vsock_sk(sk); + PKT_FIELD(vsk, sent_waiting_write) = false; +#endif + + sk->sk_write_space(sk); +} + +static bool send_waiting_read(struct sock *sk, u64 room_needed) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + struct vsock_sock *vsk; + struct vmci_transport_waiting_info waiting_info; + u64 tail; + u64 head; + u64 room_left; + bool ret; + + vsk = vsock_sk(sk); + + if (PKT_FIELD(vsk, sent_waiting_read)) + return true; + + if (PKT_FIELD(vsk, write_notify_window) < + vmci_trans(vsk)->consume_size) + PKT_FIELD(vsk, write_notify_window) = + min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE, + vmci_trans(vsk)->consume_size); + + vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head); + room_left = vmci_trans(vsk)->consume_size - head; + if (room_needed >= room_left) { + waiting_info.offset = room_needed - room_left; + waiting_info.generation = + PKT_FIELD(vsk, consume_q_generation) + 1; + } else { + waiting_info.offset = head + room_needed; + waiting_info.generation = PKT_FIELD(vsk, consume_q_generation); + } + + ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0; + if (ret) + PKT_FIELD(vsk, sent_waiting_read) = true; + + return ret; +#else + return true; +#endif +} + +static bool send_waiting_write(struct sock *sk, u64 room_needed) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + struct vsock_sock *vsk; + struct vmci_transport_waiting_info waiting_info; + u64 tail; + u64 head; + u64 room_left; + bool ret; + + vsk = vsock_sk(sk); + + if (PKT_FIELD(vsk, sent_waiting_write)) + return true; + + vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head); + room_left = vmci_trans(vsk)->produce_size - tail; + if (room_needed + 1 >= room_left) { + /* Wraps around to current generation. 
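The generation/offset arithmetic in send_waiting_read() above translates "wake me when N more bytes fit" into an absolute position in the ring, bumping the generation when the target wraps. As a standalone helper (sketch mirroring that code):

struct wait_target {	/* sketch of the (generation, offset) pair */
	u64 generation;
	u64 offset;
};

static struct wait_target read_wait_point(u64 head, u64 consume_size,
					  u64 generation, u64 room_needed)
{
	struct wait_target t;
	u64 room_left = consume_size - head;

	if (room_needed >= room_left) {
		/* Target lies past the end of the ring, so it is only
		 * reached after head wraps: next generation.
		 */
		t.offset = room_needed - room_left;
		t.generation = generation + 1;
	} else {
		t.offset = head + room_needed;
		t.generation = generation;
	}
	return t;
}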
*/ + waiting_info.offset = room_needed + 1 - room_left; + waiting_info.generation = PKT_FIELD(vsk, produce_q_generation); + } else { + waiting_info.offset = tail + room_needed + 1; + waiting_info.generation = + PKT_FIELD(vsk, produce_q_generation) - 1; + } + + ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0; + if (ret) + PKT_FIELD(vsk, sent_waiting_write) = true; + + return ret; +#else + return true; +#endif +} + +static int vmci_transport_send_read_notification(struct sock *sk) +{ + struct vsock_sock *vsk; + bool sent_read; + unsigned int retries; + int err; + + vsk = vsock_sk(sk); + sent_read = false; + retries = 0; + err = 0; + + if (vmci_transport_notify_waiting_write(vsk)) { + /* Notify the peer that we have read, retrying the send on + * failure up to our maximum value. XXX For now we just log + * the failure, but later we should schedule a work item to + * handle the resend until it succeeds. That would require + * keeping track of work items in the vsk and cleaning them up + * upon socket close. + */ + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && + !sent_read && + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { + err = vmci_transport_send_read(sk); + if (err >= 0) + sent_read = true; + + retries++; + } + + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) + pr_err("%p unable to send read notify to peer\n", sk); + else +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + PKT_FIELD(vsk, peer_waiting_write) = false; +#endif + + } + return err; +} + +static void +vmci_transport_handle_wrote(struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, struct sockaddr_vm *src) +{ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + struct vsock_sock *vsk = vsock_sk(sk); + PKT_FIELD(vsk, sent_waiting_read) = false; +#endif + sk->sk_data_ready(sk, 0); +} + +static void vmci_transport_notify_pkt_socket_init(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE; + PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE; + PKT_FIELD(vsk, peer_waiting_read) = false; + PKT_FIELD(vsk, peer_waiting_write) = false; + PKT_FIELD(vsk, peer_waiting_write_detected) = false; + PKT_FIELD(vsk, sent_waiting_read) = false; + PKT_FIELD(vsk, sent_waiting_write) = false; + PKT_FIELD(vsk, produce_q_generation) = 0; + PKT_FIELD(vsk, consume_q_generation) = 0; + + memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0, + sizeof(PKT_FIELD(vsk, peer_waiting_read_info))); + memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0, + sizeof(PKT_FIELD(vsk, peer_waiting_write_info))); +} + +static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk) +{ +} + +static int +vmci_transport_notify_pkt_poll_in(struct sock *sk, + size_t target, bool *data_ready_now) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (vsock_stream_has_data(vsk)) { + *data_ready_now = true; + } else { + /* We can't read right now because there is nothing in the + * queue. Ask for notifications when there is something to + * read. 
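The XXX comment above describes a deliberately simple recovery strategy: resend the best-effort datagram up to a fixed cap rather than parking it on a work item. A standalone sketch of that loop; try_send() is a made-up stub in place of vmci_transport_send_read()/vmci_transport_send_wrote():

#include <stdbool.h>
#include <stdio.h>

#define MAX_DGRAM_RESENDS 10

/* Stub transport call: fail twice, then succeed, to exercise the loop. */
static int try_send(void)
{
	static int calls;

	return ++calls < 3 ? -1 : 0;
}

static bool send_with_retries(void)
{
	bool sent = false;
	unsigned int retries = 0;

	while (!sent && retries < MAX_DGRAM_RESENDS) {
		if (try_send() >= 0)
			sent = true;
		retries++;
	}
	if (!sent)
		fprintf(stderr, "unable to send notification\n");
	return sent;
}

int main(void)
{
	return send_with_retries() ? 0 : 1;
}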
+ */ + if (sk->sk_state == SS_CONNECTED) { + if (!send_waiting_read(sk, 1)) + return -1; + + } + *data_ready_now = false; + } + + return 0; +} + +static int +vmci_transport_notify_pkt_poll_out(struct sock *sk, + size_t target, bool *space_avail_now) +{ + s64 produce_q_free_space; + struct vsock_sock *vsk = vsock_sk(sk); + + produce_q_free_space = vsock_stream_has_space(vsk); + if (produce_q_free_space > 0) { + *space_avail_now = true; + return 0; + } else if (produce_q_free_space == 0) { + /* This is a connected socket but we can't currently send data. + * Notify the peer that we are waiting if the queue is full. We + * only send a waiting write if the queue is full because + * otherwise we end up in an infinite WAITING_WRITE, READ, + * WAITING_WRITE, READ, etc. loop. Treat failing to send the + * notification as a socket error, passing that back through + * the mask. + */ + if (!send_waiting_write(sk, 1)) + return -1; + + *space_avail_now = false; + } + + return 0; +} + +static int +vmci_transport_notify_pkt_recv_init( + struct sock *sk, + size_t target, + struct vmci_transport_recv_notify_data *data) +{ + struct vsock_sock *vsk = vsock_sk(sk); + +#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY + data->consume_head = 0; + data->produce_tail = 0; +#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL + data->notify_on_block = false; + + if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) { + PKT_FIELD(vsk, write_notify_min_window) = target + 1; + if (PKT_FIELD(vsk, write_notify_window) < + PKT_FIELD(vsk, write_notify_min_window)) { + /* If the current window is smaller than the new + * minimal window size, we need to reevaluate whether + * we need to notify the sender. If the number of ready + * bytes is smaller than the new window, we need to + * send a notification to the sender before we block. + */ + + PKT_FIELD(vsk, write_notify_window) = + PKT_FIELD(vsk, write_notify_min_window); + data->notify_on_block = true; + } + } +#endif +#endif + + return 0; +} + +static int +vmci_transport_notify_pkt_recv_pre_block( + struct sock *sk, + size_t target, + struct vmci_transport_recv_notify_data *data) +{ + int err = 0; + + /* Notify our peer that we are waiting for data to read. */ + if (!send_waiting_read(sk, target)) { + err = -EHOSTUNREACH; + return err; + } +#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL + if (data->notify_on_block) { + err = vmci_transport_send_read_notification(sk); + if (err < 0) + return err; + + data->notify_on_block = false; + } +#endif + + return err; +} + +static int +vmci_transport_notify_pkt_recv_pre_dequeue( + struct sock *sk, + size_t target, + struct vmci_transport_recv_notify_data *data) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + /* Now consume up to len bytes from the queue. Note that since we have + * the socket locked we should copy at least the ready bytes. + */ +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, + &data->produce_tail, + &data->consume_head); +#endif + + return 0; +} + +static int +vmci_transport_notify_pkt_recv_post_dequeue( + struct sock *sk, + size_t target, + ssize_t copied, + bool data_read, + struct vmci_transport_recv_notify_data *data) +{ + struct vsock_sock *vsk; + int err; + + vsk = vsock_sk(sk); + err = 0; + + if (data_read) { +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + /* Detect a wrap-around to maintain queue generation. Note + * that this is safe since we hold the socket lock across the + * two queue pair operations.
+ */ + if (copied >= + vmci_trans(vsk)->consume_size - data->consume_head) + PKT_FIELD(vsk, consume_q_generation)++; +#endif + + err = vmci_transport_send_read_notification(sk); + if (err < 0) + return err; + + } + return err; +} + +static int +vmci_transport_notify_pkt_send_init( + struct sock *sk, + struct vmci_transport_send_notify_data *data) +{ +#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY + data->consume_head = 0; + data->produce_tail = 0; +#endif + + return 0; +} + +static int +vmci_transport_notify_pkt_send_pre_block( + struct sock *sk, + struct vmci_transport_send_notify_data *data) +{ + /* Notify our peer that we are waiting for room to write. */ + if (!send_waiting_write(sk, 1)) + return -EHOSTUNREACH; + + return 0; +} + +static int +vmci_transport_notify_pkt_send_pre_enqueue( + struct sock *sk, + struct vmci_transport_send_notify_data *data) +{ + struct vsock_sock *vsk = vsock_sk(sk); + +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, + &data->produce_tail, + &data->consume_head); +#endif + + return 0; +} + +static int +vmci_transport_notify_pkt_send_post_enqueue( + struct sock *sk, + ssize_t written, + struct vmci_transport_send_notify_data *data) +{ + int err = 0; + struct vsock_sock *vsk; + bool sent_wrote = false; + int retries = 0; + + vsk = vsock_sk(sk); + +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + /* Detect a wrap-around to maintain queue generation. Note that this + * is safe since we hold the socket lock across the two queue pair + * operations. + */ + if (written >= vmci_trans(vsk)->produce_size - data->produce_tail) + PKT_FIELD(vsk, produce_q_generation)++; + +#endif + + if (vmci_transport_notify_waiting_read(vsk)) { + /* Notify the peer that we have written, retrying the send on + * failure up to our maximum value. See the XXX comment for the + * corresponding piece of code in StreamRecvmsg() for potential + * improvements. 
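Both wrap-around checks above apply the same rule: the generation counter is bumped exactly when a transfer reaches the end of the ring, so (generation, index) pairs stay totally ordered across laps. A self-contained sketch, assuming plain integers for the qpair indexes and single-lap transfers:

#include <assert.h>
#include <stdint.h>

struct ring_pos {
	uint64_t index;		/* head or tail, in [0, size) */
	uint64_t generation;	/* bumped once per completed lap */
};

/* Mirrors "if (copied >= consume_size - head) generation++" above;
 * nbytes is assumed to be at most one full ring (size). */
static void advance(struct ring_pos *p, uint64_t size, uint64_t nbytes)
{
	if (nbytes >= size - p->index)
		p->generation++;
	p->index = (p->index + nbytes) % size;
}

int main(void)
{
	struct ring_pos head = { .index = 4000, .generation = 0 };

	advance(&head, 4096, 200);	/* crosses the end: new lap */
	assert(head.generation == 1 && head.index == 104);

	advance(&head, 4096, 100);	/* stays within the lap */
	assert(head.generation == 1 && head.index == 204);
	return 0;
}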
+ */ + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && + !sent_wrote && + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { + err = vmci_transport_send_wrote(sk); + if (err >= 0) + sent_wrote = true; + + retries++; + } + + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { + pr_err("%p unable to send wrote notify to peer\n", sk); + return err; + } else { +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) + PKT_FIELD(vsk, peer_waiting_read) = false; +#endif + } + } + return err; +} + +static void +vmci_transport_notify_pkt_handle_pkt( + struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, + struct sockaddr_vm *src, bool *pkt_processed) +{ + bool processed = false; + + switch (pkt->type) { + case VMCI_TRANSPORT_PACKET_TYPE_WROTE: + vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src); + processed = true; + break; + case VMCI_TRANSPORT_PACKET_TYPE_READ: + vmci_transport_handle_read(sk, pkt, bottom_half, dst, src); + processed = true; + break; + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE: + vmci_transport_handle_waiting_write(sk, pkt, bottom_half, + dst, src); + processed = true; + break; + + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ: + vmci_transport_handle_waiting_read(sk, pkt, bottom_half, + dst, src); + processed = true; + break; + } + + if (pkt_processed) + *pkt_processed = processed; +} + +static void vmci_transport_notify_pkt_process_request(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; + if (vmci_trans(vsk)->consume_size < + PKT_FIELD(vsk, write_notify_min_window)) + PKT_FIELD(vsk, write_notify_min_window) = + vmci_trans(vsk)->consume_size; +} + +static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; + if (vmci_trans(vsk)->consume_size < + PKT_FIELD(vsk, write_notify_min_window)) + PKT_FIELD(vsk, write_notify_min_window) = + vmci_trans(vsk)->consume_size; +} + +/* Socket control packet based operations. */ +struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = { + vmci_transport_notify_pkt_socket_init, + vmci_transport_notify_pkt_socket_destruct, + vmci_transport_notify_pkt_poll_in, + vmci_transport_notify_pkt_poll_out, + vmci_transport_notify_pkt_handle_pkt, + vmci_transport_notify_pkt_recv_init, + vmci_transport_notify_pkt_recv_pre_block, + vmci_transport_notify_pkt_recv_pre_dequeue, + vmci_transport_notify_pkt_recv_post_dequeue, + vmci_transport_notify_pkt_send_init, + vmci_transport_notify_pkt_send_pre_block, + vmci_transport_notify_pkt_send_pre_enqueue, + vmci_transport_notify_pkt_send_post_enqueue, + vmci_transport_notify_pkt_process_request, + vmci_transport_notify_pkt_process_negotiate, +}; diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h new file mode 100644 index 000000000000..7df793249b6c --- /dev/null +++ b/net/vmw_vsock/vmci_transport_notify.h @@ -0,0 +1,83 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2009-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef __VMCI_TRANSPORT_NOTIFY_H__ +#define __VMCI_TRANSPORT_NOTIFY_H__ + +#include <linux/types.h> +#include <linux/vmw_vmci_defs.h> +#include <linux/vmw_vmci_api.h> +#include <linux/vm_sockets.h> + +#include "vmci_transport.h" + +/* Comment this out to compare with old protocol. */ +#define VSOCK_OPTIMIZATION_WAITING_NOTIFY 1 +#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) +/* Comment this out to remove flow control for "new" protocol */ +#define VSOCK_OPTIMIZATION_FLOW_CONTROL 1 +#endif + +#define VMCI_TRANSPORT_MAX_DGRAM_RESENDS 10 + +struct vmci_transport_recv_notify_data { + u64 consume_head; + u64 produce_tail; + bool notify_on_block; +}; + +struct vmci_transport_send_notify_data { + u64 consume_head; + u64 produce_tail; +}; + +/* Socket notification callbacks. */ +struct vmci_transport_notify_ops { + void (*socket_init) (struct sock *sk); + void (*socket_destruct) (struct vsock_sock *vsk); + int (*poll_in) (struct sock *sk, size_t target, + bool *data_ready_now); + int (*poll_out) (struct sock *sk, size_t target, + bool *space_avail_now); + void (*handle_notify_pkt) (struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, struct sockaddr_vm *dst, + struct sockaddr_vm *src, + bool *pkt_processed); + int (*recv_init) (struct sock *sk, size_t target, + struct vmci_transport_recv_notify_data *data); + int (*recv_pre_block) (struct sock *sk, size_t target, + struct vmci_transport_recv_notify_data *data); + int (*recv_pre_dequeue) (struct sock *sk, size_t target, + struct vmci_transport_recv_notify_data *data); + int (*recv_post_dequeue) (struct sock *sk, size_t target, + ssize_t copied, bool data_read, + struct vmci_transport_recv_notify_data *data); + int (*send_init) (struct sock *sk, + struct vmci_transport_send_notify_data *data); + int (*send_pre_block) (struct sock *sk, + struct vmci_transport_send_notify_data *data); + int (*send_pre_enqueue) (struct sock *sk, + struct vmci_transport_send_notify_data *data); + int (*send_post_enqueue) (struct sock *sk, ssize_t written, + struct vmci_transport_send_notify_data *data); + void (*process_request) (struct sock *sk); + void (*process_negotiate) (struct sock *sk); +}; + +extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops; +extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops; + +#endif /* __VMCI_TRANSPORT_NOTIFY_H__ */ diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c new file mode 100644 index 000000000000..622bd7aa1016 --- /dev/null +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -0,0 +1,438 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2009-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
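The header above routes every notification decision through a table of function pointers, which is what lets the transport swap protocol variants (pkt vs. pkt_q_state) per socket. A reduced, hypothetical sketch of that dispatch pattern; the two-member table and all names here are illustrative, not the driver's:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct notify_ops {
	void (*socket_init)(void *sk);
	int (*poll_in)(void *sk, size_t target, bool *data_ready_now);
};

static void pkt_socket_init(void *sk)
{
	(void)sk;	/* would reset per-socket notify state */
}

static int pkt_poll_in(void *sk, size_t target, bool *data_ready_now)
{
	(void)sk;
	(void)target;
	*data_ready_now = false;	/* would query the consume queue */
	return 0;
}

static const struct notify_ops pkt_ops = {
	.socket_init = pkt_socket_init,
	.poll_in = pkt_poll_in,
};

int main(void)
{
	/* The real code picks the table during version negotiation. */
	const struct notify_ops *ops = &pkt_ops;
	bool ready;

	ops->socket_init(NULL);
	ops->poll_in(NULL, 1, &ready);
	printf("data ready: %d\n", ready);
	return 0;
}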
+ */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/stddef.h> +#include <net/sock.h> + +#include "vmci_transport_notify.h" + +#define PKT_FIELD(vsk, field_name) \ + (vmci_trans(vsk)->notify.pkt_q_state.field_name) + +static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk) +{ + bool retval; + u64 notify_limit; + + if (!PKT_FIELD(vsk, peer_waiting_write)) + return false; + + /* When the sender blocks, we take that as a sign that the sender is + * faster than the receiver. To reduce the transmit rate of the sender, + * we delay the sending of the read notification by decreasing the + * write_notify_window. The notification is delayed until the number of + * bytes used in the queue drops below the write_notify_window. + */ + + if (!PKT_FIELD(vsk, peer_waiting_write_detected)) { + PKT_FIELD(vsk, peer_waiting_write_detected) = true; + if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) { + PKT_FIELD(vsk, write_notify_window) = + PKT_FIELD(vsk, write_notify_min_window); + } else { + PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE; + if (PKT_FIELD(vsk, write_notify_window) < + PKT_FIELD(vsk, write_notify_min_window)) + PKT_FIELD(vsk, write_notify_window) = + PKT_FIELD(vsk, write_notify_min_window); + + } + } + notify_limit = vmci_trans(vsk)->consume_size - + PKT_FIELD(vsk, write_notify_window); + + /* The notify_limit is used to delay notifications in the case where + * flow control is enabled. Below the test is expressed in terms of + * free space in the queue: if free_space > ConsumeSize - + * write_notify_window then notify. An alternate way of expressing this + * is to rewrite the expression to use the data ready in the receive + * queue: if write_notify_window > bufferReady then notify, as + * free_space == ConsumeSize - bufferReady. + */ + + retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) > + notify_limit; + + if (retval) { + /* Once we notify the peer, we reset the detected flag so the + * next wait will again cause a decrease in the window size. + */ + + PKT_FIELD(vsk, peer_waiting_write_detected) = false; + } + return retval; +} + +static void +vmci_transport_handle_read(struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, struct sockaddr_vm *src) +{ + sk->sk_write_space(sk); +} + +static void +vmci_transport_handle_wrote(struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, struct sockaddr_vm *src) +{ + sk->sk_data_ready(sk, 0); +} + +static void vsock_block_update_write_window(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (PKT_FIELD(vsk, write_notify_window) < vmci_trans(vsk)->consume_size) + PKT_FIELD(vsk, write_notify_window) = + min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE, + vmci_trans(vsk)->consume_size); +} + +static int vmci_transport_send_read_notification(struct sock *sk) +{ + struct vsock_sock *vsk; + bool sent_read; + unsigned int retries; + int err; + + vsk = vsock_sk(sk); + sent_read = false; + retries = 0; + err = 0; + + if (vmci_transport_notify_waiting_write(vsk)) { + /* Notify the peer that we have read, retrying the send on + * failure up to our maximum value. XXX For now we just log + * the failure, but later we should schedule a work item to + * handle the resend until it succeeds. That would require + * keeping track of work items in the vsk and cleaning them up + * upon socket close.
+ */ + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && + !sent_read && + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { + err = vmci_transport_send_read(sk); + if (err >= 0) + sent_read = true; + + retries++; + } + + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_read) + pr_err("%p unable to send read notification to peer\n", + sk); + else + PKT_FIELD(vsk, peer_waiting_write) = false; + + } + return err; +} + +static void vmci_transport_notify_pkt_socket_init(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE; + PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE; + PKT_FIELD(vsk, peer_waiting_write) = false; + PKT_FIELD(vsk, peer_waiting_write_detected) = false; +} + +static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk) +{ + PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE; + PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE; + PKT_FIELD(vsk, peer_waiting_write) = false; + PKT_FIELD(vsk, peer_waiting_write_detected) = false; +} + +static int +vmci_transport_notify_pkt_poll_in(struct sock *sk, + size_t target, bool *data_ready_now) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (vsock_stream_has_data(vsk)) { + *data_ready_now = true; + } else { + /* We can't read right now because there is nothing in the + * queue. Ask for notifications when there is something to + * read. + */ + if (sk->sk_state == SS_CONNECTED) + vsock_block_update_write_window(sk); + *data_ready_now = false; + } + + return 0; +} + +static int +vmci_transport_notify_pkt_poll_out(struct sock *sk, + size_t target, bool *space_avail_now) +{ + s64 produce_q_free_space; + struct vsock_sock *vsk = vsock_sk(sk); + + produce_q_free_space = vsock_stream_has_space(vsk); + if (produce_q_free_space > 0) { + *space_avail_now = true; + return 0; + } else if (produce_q_free_space == 0) { + /* This is a connected socket but we can't currently send data. + * Nothing else to do. + */ + *space_avail_now = false; + } + + return 0; +} + +static int +vmci_transport_notify_pkt_recv_init( + struct sock *sk, + size_t target, + struct vmci_transport_recv_notify_data *data) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + data->consume_head = 0; + data->produce_tail = 0; + data->notify_on_block = false; + + if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) { + PKT_FIELD(vsk, write_notify_min_window) = target + 1; + if (PKT_FIELD(vsk, write_notify_window) < + PKT_FIELD(vsk, write_notify_min_window)) { + /* If the current window is smaller than the new + * minimal window size, we need to reevaluate whether + * we need to notify the sender. If the number of ready + * bytes is smaller than the new window, we need to + * send a notification to the sender before we block.
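Taken together, the shrink path in vmci_transport_notify_waiting_write() and the growth in vsock_block_update_write_window() form a page-at-a-time window policy clamped to [write_notify_min_window, consume_size]. A standalone sketch of that policy, assuming a 4096-byte page purely for illustration:

#include <assert.h>
#include <stdint.h>

#define PAGE_SZ 4096ULL

struct window {
	uint64_t cur;		/* write_notify_window */
	uint64_t min;		/* write_notify_min_window */
	uint64_t consume_size;
};

/* Peer seen waiting to write: shrink to slow the sender down. */
static void window_shrink(struct window *w)
{
	if (w->cur < PAGE_SZ || w->cur - PAGE_SZ < w->min)
		w->cur = w->min;
	else
		w->cur -= PAGE_SZ;
}

/* Reader about to block: grow so the sender is notified sooner. */
static void window_grow(struct window *w)
{
	uint64_t grown = w->cur + PAGE_SZ;

	w->cur = grown < w->consume_size ? grown : w->consume_size;
}

int main(void)
{
	struct window w = { .cur = 8192, .min = 4096, .consume_size = 65536 };

	window_shrink(&w);	/* 8192 -> 4096, the floor */
	assert(w.cur == 4096);
	window_grow(&w);	/* 4096 -> 8192 */
	assert(w.cur == 8192);
	return 0;
}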
+ */ + + PKT_FIELD(vsk, write_notify_window) = + PKT_FIELD(vsk, write_notify_min_window); + data->notify_on_block = true; + } + } + + return 0; +} + +static int +vmci_transport_notify_pkt_recv_pre_block( + struct sock *sk, + size_t target, + struct vmci_transport_recv_notify_data *data) +{ + int err = 0; + + vsock_block_update_write_window(sk); + + if (data->notify_on_block) { + err = vmci_transport_send_read_notification(sk); + if (err < 0) + return err; + data->notify_on_block = false; + } + + return err; +} + +static int +vmci_transport_notify_pkt_recv_post_dequeue( + struct sock *sk, + size_t target, + ssize_t copied, + bool data_read, + struct vmci_transport_recv_notify_data *data) +{ + struct vsock_sock *vsk; + int err; + bool was_full = false; + u64 free_space; + + vsk = vsock_sk(sk); + err = 0; + + if (data_read) { + smp_mb(); + + free_space = + vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair); + was_full = free_space == copied; + + if (was_full) + PKT_FIELD(vsk, peer_waiting_write) = true; + + err = vmci_transport_send_read_notification(sk); + if (err < 0) + return err; + + /* See the comment in + * vmci_transport_notify_pkt_send_post_enqueue(). + */ + sk->sk_data_ready(sk, 0); + } + + return err; +} + +static int +vmci_transport_notify_pkt_send_init( + struct sock *sk, + struct vmci_transport_send_notify_data *data) +{ + data->consume_head = 0; + data->produce_tail = 0; + + return 0; +} + +static int +vmci_transport_notify_pkt_send_post_enqueue( + struct sock *sk, + ssize_t written, + struct vmci_transport_send_notify_data *data) +{ + int err = 0; + struct vsock_sock *vsk; + bool sent_wrote = false; + bool was_empty; + int retries = 0; + + vsk = vsock_sk(sk); + + smp_mb(); + + was_empty = + vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) == written; + if (was_empty) { + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && + !sent_wrote && + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { + err = vmci_transport_send_wrote(sk); + if (err >= 0) + sent_wrote = true; + + retries++; + } + } + + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_wrote) { + pr_err("%p unable to send wrote notification to peer\n", + sk); + return err; + } + + return err; +} + +static void +vmci_transport_notify_pkt_handle_pkt( + struct sock *sk, + struct vmci_transport_packet *pkt, + bool bottom_half, + struct sockaddr_vm *dst, + struct sockaddr_vm *src, bool *pkt_processed) +{ + bool processed = false; + + switch (pkt->type) { + case VMCI_TRANSPORT_PACKET_TYPE_WROTE: + vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src); + processed = true; + break; + case VMCI_TRANSPORT_PACKET_TYPE_READ: + vmci_transport_handle_read(sk, pkt, bottom_half, dst, src); + processed = true; + break; + } + + if (pkt_processed) + *pkt_processed = processed; +} + +static void vmci_transport_notify_pkt_process_request(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; + if (vmci_trans(vsk)->consume_size < + PKT_FIELD(vsk, write_notify_min_window)) + PKT_FIELD(vsk, write_notify_min_window) = + vmci_trans(vsk)->consume_size; +} + +static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; + if (vmci_trans(vsk)->consume_size < + PKT_FIELD(vsk, write_notify_min_window)) + PKT_FIELD(vsk, write_notify_min_window) = + vmci_trans(vsk)->consume_size; +} + +static int +vmci_transport_notify_pkt_recv_pre_dequeue( + 
struct sock *sk, + size_t target, + struct vmci_transport_recv_notify_data *data) +{ + return 0; /* NOP for QState. */ +} + +static int +vmci_transport_notify_pkt_send_pre_block( + struct sock *sk, + struct vmci_transport_send_notify_data *data) +{ + return 0; /* NOP for QState. */ +} + +static int +vmci_transport_notify_pkt_send_pre_enqueue( + struct sock *sk, + struct vmci_transport_send_notify_data *data) +{ + return 0; /* NOP for QState. */ +} + +/* Socket always on control packet based operations. */ +struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = { + vmci_transport_notify_pkt_socket_init, + vmci_transport_notify_pkt_socket_destruct, + vmci_transport_notify_pkt_poll_in, + vmci_transport_notify_pkt_poll_out, + vmci_transport_notify_pkt_handle_pkt, + vmci_transport_notify_pkt_recv_init, + vmci_transport_notify_pkt_recv_pre_block, + vmci_transport_notify_pkt_recv_pre_dequeue, + vmci_transport_notify_pkt_recv_post_dequeue, + vmci_transport_notify_pkt_send_init, + vmci_transport_notify_pkt_send_pre_block, + vmci_transport_notify_pkt_send_pre_enqueue, + vmci_transport_notify_pkt_send_post_enqueue, + vmci_transport_notify_pkt_process_request, + vmci_transport_notify_pkt_process_negotiate, +}; diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c new file mode 100644 index 000000000000..ec2611b4ea0e --- /dev/null +++ b/net/vmw_vsock/vsock_addr.c @@ -0,0 +1,76 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
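The q_state variant above signals only on edges: recv_post_dequeue() sends READ when free_space == copied (the queue had been full) and send_post_enqueue() sends WROTE when buf_ready == written (it had been empty), so steady-state traffic produces no control packets at all. A small sketch of the two predicates, with plain counters standing in for the qpair queries:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* After enqueueing `written` bytes, buf_ready is the total readable;
 * the queue was empty beforehand iff all of it is what we just wrote. */
static bool should_send_wrote(uint64_t buf_ready, uint64_t written)
{
	return buf_ready == written;
}

/* After dequeueing `copied` bytes, free_space is the room available;
 * the queue was full beforehand iff all of it is what we just freed. */
static bool should_send_read(uint64_t free_space, uint64_t copied)
{
	return free_space == copied;
}

int main(void)
{
	assert(should_send_wrote(512, 512));	/* empty -> non-empty */
	assert(!should_send_wrote(1024, 512));	/* reader already behind */
	assert(should_send_read(256, 256));	/* full -> non-full */
	return 0;
}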
+ */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/stddef.h> +#include <net/sock.h> + +#include "vsock_addr.h" + +void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port) +{ + memset(addr, 0, sizeof(*addr)); + addr->svm_family = AF_VSOCK; + addr->svm_cid = cid; + addr->svm_port = port; +} +EXPORT_SYMBOL_GPL(vsock_addr_init); + +int vsock_addr_validate(const struct sockaddr_vm *addr) +{ + if (!addr) + return -EFAULT; + + if (addr->svm_family != AF_VSOCK) + return -EAFNOSUPPORT; + + if (addr->svm_zero[0] != 0) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(vsock_addr_validate); + +bool vsock_addr_bound(const struct sockaddr_vm *addr) +{ + return addr->svm_port != VMADDR_PORT_ANY; +} +EXPORT_SYMBOL_GPL(vsock_addr_bound); + +void vsock_addr_unbind(struct sockaddr_vm *addr) +{ + vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); +} +EXPORT_SYMBOL_GPL(vsock_addr_unbind); + +bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, + const struct sockaddr_vm *other) +{ + return addr->svm_cid == other->svm_cid && + addr->svm_port == other->svm_port; +} +EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); + +int vsock_addr_cast(const struct sockaddr *addr, + size_t len, struct sockaddr_vm **out_addr) +{ + if (len < sizeof(**out_addr)) + return -EFAULT; + + *out_addr = (struct sockaddr_vm *)addr; + return vsock_addr_validate(*out_addr); +} +EXPORT_SYMBOL_GPL(vsock_addr_cast); diff --git a/net/vmw_vsock/vsock_addr.h b/net/vmw_vsock/vsock_addr.h new file mode 100644 index 000000000000..9ccd5316eac0 --- /dev/null +++ b/net/vmw_vsock/vsock_addr.h @@ -0,0 +1,30 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _VSOCK_ADDR_H_ +#define _VSOCK_ADDR_H_ + +#include <linux/vm_sockets.h> + +void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port); +int vsock_addr_validate(const struct sockaddr_vm *addr); +bool vsock_addr_bound(const struct sockaddr_vm *addr); +void vsock_addr_unbind(struct sockaddr_vm *addr); +bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, + const struct sockaddr_vm *other); +int vsock_addr_cast(const struct sockaddr *addr, size_t len, + struct sockaddr_vm **out_addr); + +#endif diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig deleted file mode 100644 index a157a2e64e18..000000000000 --- a/net/wanrouter/Kconfig +++ /dev/null @@ -1,27 +0,0 @@ -# -# Configuration for WAN router -# - -config WAN_ROUTER - tristate "WAN router (DEPRECATED)" - depends on EXPERIMENTAL - ---help--- - Wide Area Networks (WANs), such as X.25, frame relay and leased - lines, are used to interconnect Local Area Networks (LANs) over vast - distances with data transfer rates significantly higher than those - achievable with commonly used asynchronous modem connections. - Usually, a quite expensive external device called a `WAN router' is - needed to connect to a WAN. - - As an alternative, WAN routing can be built into the Linux kernel. 
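For context on the vsock_addr helpers just added: vsock_addr_init() performs exactly the zero-then-fill that a userspace caller does before bind(). A hypothetical example against the <linux/vm_sockets.h> uapi (port 1234 is an arbitrary choice):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm addr;
	int fd;

	/* Mirror vsock_addr_init(): zero the struct, set family/CID/port. */
	memset(&addr, 0, sizeof(addr));
	addr.svm_family = AF_VSOCK;
	addr.svm_cid = VMADDR_CID_ANY;
	addr.svm_port = 1234;

	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		return 1;
	}
	puts("bound to vsock port 1234");
	return 0;
}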
- With relatively inexpensive WAN interface cards available on the - market, a perfectly usable router can be built for less than half - the price of an external router. If you have one of those cards and - wish to use your Linux box as a WAN router, say Y here and also to - the WAN driver for your card, below. You will then need the - wan-tools package which is available from <ftp://ftp.sangoma.com/>. - - To compile WAN routing support as a module, choose M here: the - module will be called wanrouter. - - If unsure, say N. diff --git a/net/wanrouter/Makefile b/net/wanrouter/Makefile deleted file mode 100644 index 4da14bc48078..000000000000 --- a/net/wanrouter/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for the Linux WAN router layer. -# - -obj-$(CONFIG_WAN_ROUTER) += wanrouter.o - -wanrouter-y := wanproc.o wanmain.o diff --git a/net/wanrouter/patchlevel b/net/wanrouter/patchlevel deleted file mode 100644 index c043eea7767e..000000000000 --- a/net/wanrouter/patchlevel +++ /dev/null @@ -1 +0,0 @@ -2.2.1 diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c deleted file mode 100644 index 2ab785064b7e..000000000000 --- a/net/wanrouter/wanmain.c +++ /dev/null @@ -1,782 +0,0 @@ -/***************************************************************************** -* wanmain.c WAN Multiprotocol Router Module. Main code. -* -* This module is completely hardware-independent and provides -* the following common services for the WAN Link Drivers: -* o WAN device management (registering, unregistering) -* o Network interface management -* o Physical connection management (dial-up, incoming calls) -* o Logical connection management (switched virtual circuits) -* o Protocol encapsulation/decapsulation -* -* Author: Gideon Hack -* -* Copyright: (c) 1995-1999 Sangoma Technologies Inc. -* -* This program is free software; you can redistribute it and/or -* modify it under the terms of the GNU General Public License -* as published by the Free Software Foundation; either version -* 2 of the License, or (at your option) any later version. -* ============================================================================ -* Nov 24, 2000 Nenad Corbic Updated for 2.4.X kernels -* Nov 07, 2000 Nenad Corbic Fixed the Mulit-Port PPP for kernels 2.2.16 and -* greater. -* Aug 2, 2000 Nenad Corbic Block the Multi-Port PPP from running on -* kernels 2.2.16 or greater. The SyncPPP -* has changed. -* Jul 13, 2000 Nenad Corbic Added SyncPPP support -* Added extra debugging in device_setup(). -* Oct 01, 1999 Gideon Hack Update for s514 PCI card -* Dec 27, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) -* Jan 16, 1997 Gene Kozin router_devlist made public -* Jan 31, 1997 Alan Cox Hacked it about a bit for 2.1 -* Jun 27, 1997 Alan Cox realigned with vendor code -* Oct 15, 1997 Farhan Thawar changed wan_encapsulate to add a pad byte of 0 -* Apr 20, 1998 Alan Cox Fixed 2.1 symbols -* May 17, 1998 K. Baranowski Fixed SNAP encapsulation in wan_encapsulate -* Dec 15, 1998 Arnaldo Melo support for firmwares of up to 128000 bytes -* check wandev->setup return value -* Dec 22, 1998 Arnaldo Melo vmalloc/vfree used in device_setup to allocate -* kernel memory and copy configuration data to -* kernel space (for big firmwares) -* Jun 02, 1999 Gideon Hack Updates for Linux 2.0.X and 2.2.X kernels. -*****************************************************************************/ - -#include <linux/stddef.h> /* offsetof(), etc. 
*/ -#include <linux/capability.h> -#include <linux/errno.h> /* return codes */ -#include <linux/kernel.h> -#include <linux/module.h> /* support for loadable modules */ -#include <linux/slab.h> /* kmalloc(), kfree() */ -#include <linux/mutex.h> -#include <linux/mm.h> -#include <linux/string.h> /* inline mem*, str* functions */ - -#include <asm/byteorder.h> /* htons(), etc. */ -#include <linux/wanrouter.h> /* WAN router API definitions */ - -#include <linux/vmalloc.h> /* vmalloc, vfree */ -#include <asm/uaccess.h> /* copy_to/from_user */ -#include <linux/init.h> /* __initfunc et al. */ - -#define DEV_TO_SLAVE(dev) (*((struct net_device **)netdev_priv(dev))) - -/* - * Function Prototypes - */ - -/* - * WAN device IOCTL handlers - */ - -static DEFINE_MUTEX(wanrouter_mutex); -static int wanrouter_device_setup(struct wan_device *wandev, - wandev_conf_t __user *u_conf); -static int wanrouter_device_stat(struct wan_device *wandev, - wandev_stat_t __user *u_stat); -static int wanrouter_device_shutdown(struct wan_device *wandev); -static int wanrouter_device_new_if(struct wan_device *wandev, - wanif_conf_t __user *u_conf); -static int wanrouter_device_del_if(struct wan_device *wandev, - char __user *u_name); - -/* - * Miscellaneous - */ - -static struct wan_device *wanrouter_find_device(char *name); -static int wanrouter_delete_interface(struct wan_device *wandev, char *name); -static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) - __acquires(lock); -static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) - __releases(lock); - - - -/* - * Global Data - */ - -static char wanrouter_fullname[] = "Sangoma WANPIPE Router"; -static char wanrouter_copyright[] = "(c) 1995-2000 Sangoma Technologies Inc."; -static char wanrouter_modname[] = ROUTER_NAME; /* short module name */ -struct wan_device* wanrouter_router_devlist; /* list of registered devices */ - -/* - * Organize Unique Identifiers for encapsulation/decapsulation - */ - -#if 0 -static unsigned char wanrouter_oui_ether[] = { 0x00, 0x00, 0x00 }; -static unsigned char wanrouter_oui_802_2[] = { 0x00, 0x80, 0xC2 }; -#endif - -static int __init wanrouter_init(void) -{ - int err; - - printk(KERN_INFO "%s v%u.%u %s\n", - wanrouter_fullname, ROUTER_VERSION, ROUTER_RELEASE, - wanrouter_copyright); - - err = wanrouter_proc_init(); - if (err) - printk(KERN_INFO "%s: can't create entry in proc filesystem!\n", - wanrouter_modname); - - return err; -} - -static void __exit wanrouter_cleanup (void) -{ - wanrouter_proc_cleanup(); -} - -/* - * This is just plain dumb. We should move the bugger to drivers/net/wan, - * slap it first in directory and make it module_init(). The only reason - * for subsys_initcall() here is that net goes after drivers (why, BTW?) - */ -subsys_initcall(wanrouter_init); -module_exit(wanrouter_cleanup); - -/* - * Kernel APIs - */ - -/* - * Register WAN device. - * o verify device credentials - * o create an entry for the device in the /proc/net/router directory - * o initialize internally maintained fields of the wan_device structure - * o link device data space to a singly-linked list - * o if it's the first device, then start kernel 'thread' - * o increment module use count - * - * Return: - * 0 Ok - * < 0 error. 
- * - * Context: process - */ - - -int register_wan_device(struct wan_device *wandev) -{ - int err, namelen; - - if ((wandev == NULL) || (wandev->magic != ROUTER_MAGIC) || - (wandev->name == NULL)) - return -EINVAL; - - namelen = strlen(wandev->name); - if (!namelen || (namelen > WAN_DRVNAME_SZ)) - return -EINVAL; - - if (wanrouter_find_device(wandev->name)) - return -EEXIST; - -#ifdef WANDEBUG - printk(KERN_INFO "%s: registering WAN device %s\n", - wanrouter_modname, wandev->name); -#endif - - /* - * Register /proc directory entry - */ - err = wanrouter_proc_add(wandev); - if (err) { - printk(KERN_INFO - "%s: can't create /proc/net/router/%s entry!\n", - wanrouter_modname, wandev->name); - return err; - } - - /* - * Initialize fields of the wan_device structure maintained by the - * router and update local data. - */ - - wandev->ndev = 0; - wandev->dev = NULL; - wandev->next = wanrouter_router_devlist; - wanrouter_router_devlist = wandev; - return 0; -} - -/* - * Unregister WAN device. - * o shut down device - * o unlink device data space from the linked list - * o delete device entry in the /proc/net/router directory - * o decrement module use count - * - * Return: 0 Ok - * <0 error. - * Context: process - */ - - -int unregister_wan_device(char *name) -{ - struct wan_device *wandev, *prev; - - if (name == NULL) - return -EINVAL; - - for (wandev = wanrouter_router_devlist, prev = NULL; - wandev && strcmp(wandev->name, name); - prev = wandev, wandev = wandev->next) - ; - if (wandev == NULL) - return -ENODEV; - -#ifdef WANDEBUG - printk(KERN_INFO "%s: unregistering WAN device %s\n", - wanrouter_modname, name); -#endif - - if (wandev->state != WAN_UNCONFIGURED) - wanrouter_device_shutdown(wandev); - - if (prev) - prev->next = wandev->next; - else - wanrouter_router_devlist = wandev->next; - - wanrouter_proc_delete(wandev); - return 0; -} - -#if 0 - -/* - * Encapsulate packet. - * - * Return: encapsulation header size - * < 0 - unsupported Ethertype - * - * Notes: - * 1. This function may be called on interrupt context. - */ - - -int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev, - unsigned short type) -{ - int hdr_len = 0; - - switch (type) { - case ETH_P_IP: /* IP datagram encapsulation */ - hdr_len += 1; - skb_push(skb, 1); - skb->data[0] = NLPID_IP; - break; - - case ETH_P_IPX: /* SNAP encapsulation */ - case ETH_P_ARP: - hdr_len += 7; - skb_push(skb, 7); - skb->data[0] = 0; - skb->data[1] = NLPID_SNAP; - skb_copy_to_linear_data_offset(skb, 2, wanrouter_oui_ether, - sizeof(wanrouter_oui_ether)); - *((unsigned short*)&skb->data[5]) = htons(type); - break; - - default: /* Unknown packet type */ - printk(KERN_INFO - "%s: unsupported Ethertype 0x%04X on interface %s!\n", - wanrouter_modname, type, dev->name); - hdr_len = -EINVAL; - } - return hdr_len; -} - - -/* - * Decapsulate packet. - * - * Return: Ethertype (in network order) - * 0 unknown encapsulation - * - * Notes: - * 1. This function may be called on interrupt context. - */ - - -__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) -{ - int cnt = skb->data[0] ? 
0 : 1; /* there may be a pad present */ - __be16 ethertype; - - switch (skb->data[cnt]) { - case NLPID_IP: /* IP datagramm */ - ethertype = htons(ETH_P_IP); - cnt += 1; - break; - - case NLPID_SNAP: /* SNAP encapsulation */ - if (memcmp(&skb->data[cnt + 1], wanrouter_oui_ether, - sizeof(wanrouter_oui_ether))){ - printk(KERN_INFO - "%s: unsupported SNAP OUI %02X-%02X-%02X " - "on interface %s!\n", wanrouter_modname, - skb->data[cnt+1], skb->data[cnt+2], - skb->data[cnt+3], dev->name); - return 0; - } - ethertype = *((__be16*)&skb->data[cnt+4]); - cnt += 6; - break; - - /* add other protocols, e.g. CLNP, ESIS, ISIS, if needed */ - - default: - printk(KERN_INFO - "%s: unsupported NLPID 0x%02X on interface %s!\n", - wanrouter_modname, skb->data[cnt], dev->name); - return 0; - } - skb->protocol = ethertype; - skb->pkt_type = PACKET_HOST; /* Physically point to point */ - skb_pull(skb, cnt); - skb_reset_mac_header(skb); - return ethertype; -} - -#endif /* 0 */ - -/* - * WAN device IOCTL. - * o find WAN device associated with this node - * o execute requested action or pass command to the device driver - */ - -long wanrouter_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - int err = 0; - struct proc_dir_entry *dent; - struct wan_device *wandev; - void __user *data = (void __user *)arg; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if ((cmd >> 8) != ROUTER_IOCTL) - return -EINVAL; - - dent = PDE(inode); - if ((dent == NULL) || (dent->data == NULL)) - return -EINVAL; - - wandev = dent->data; - if (wandev->magic != ROUTER_MAGIC) - return -EINVAL; - - mutex_lock(&wanrouter_mutex); - switch (cmd) { - case ROUTER_SETUP: - err = wanrouter_device_setup(wandev, data); - break; - - case ROUTER_DOWN: - err = wanrouter_device_shutdown(wandev); - break; - - case ROUTER_STAT: - err = wanrouter_device_stat(wandev, data); - break; - - case ROUTER_IFNEW: - err = wanrouter_device_new_if(wandev, data); - break; - - case ROUTER_IFDEL: - err = wanrouter_device_del_if(wandev, data); - break; - - case ROUTER_IFSTAT: - break; - - default: - if ((cmd >= ROUTER_USER) && - (cmd <= ROUTER_USER_MAX) && - wandev->ioctl) - err = wandev->ioctl(wandev, cmd, arg); - else err = -EINVAL; - } - mutex_unlock(&wanrouter_mutex); - return err; -} - -/* - * WAN Driver IOCTL Handlers - */ - -/* - * Setup WAN link device. - * o verify user address space - * o allocate kernel memory and copy configuration data to kernel space - * o if configuration data includes extension, copy it to kernel space too - * o call driver's setup() entry point - */ - -static int wanrouter_device_setup(struct wan_device *wandev, - wandev_conf_t __user *u_conf) -{ - void *data = NULL; - wandev_conf_t *conf; - int err = -EINVAL; - - if (wandev->setup == NULL) { /* Nothing to do ? 
*/ - printk(KERN_INFO "%s: ERROR, No setup script: wandev->setup()\n", - wandev->name); - return 0; - } - - conf = kmalloc(sizeof(wandev_conf_t), GFP_KERNEL); - if (conf == NULL){ - printk(KERN_INFO "%s: ERROR, Failed to allocate kernel memory !\n", - wandev->name); - return -ENOBUFS; - } - - if (copy_from_user(conf, u_conf, sizeof(wandev_conf_t))) { - printk(KERN_INFO "%s: Failed to copy user config data to kernel space!\n", - wandev->name); - kfree(conf); - return -EFAULT; - } - - if (conf->magic != ROUTER_MAGIC) { - kfree(conf); - printk(KERN_INFO "%s: ERROR, Invalid MAGIC Number\n", - wandev->name); - return -EINVAL; - } - - if (conf->data_size && conf->data) { - if (conf->data_size > 128000) { - printk(KERN_INFO - "%s: ERROR, Invalid firmware data size %i !\n", - wandev->name, conf->data_size); - kfree(conf); - return -EINVAL; - } - - data = vmalloc(conf->data_size); - if (!data) { - printk(KERN_INFO - "%s: ERROR, Failed allocate kernel memory !\n", - wandev->name); - kfree(conf); - return -ENOBUFS; - } - if (!copy_from_user(data, conf->data, conf->data_size)) { - conf->data = data; - err = wandev->setup(wandev, conf); - } else { - printk(KERN_INFO - "%s: ERROR, Failed to copy from user data !\n", - wandev->name); - err = -EFAULT; - } - vfree(data); - } else { - printk(KERN_INFO - "%s: ERROR, No firmware found ! Firmware size = %i !\n", - wandev->name, conf->data_size); - } - - kfree(conf); - return err; -} - -/* - * Shutdown WAN device. - * o delete all not opened logical channels for this device - * o call driver's shutdown() entry point - */ - -static int wanrouter_device_shutdown(struct wan_device *wandev) -{ - struct net_device *dev; - int err=0; - - if (wandev->state == WAN_UNCONFIGURED) - return 0; - - printk(KERN_INFO "\n%s: Shutting Down!\n",wandev->name); - - for (dev = wandev->dev; dev;) { - err = wanrouter_delete_interface(wandev, dev->name); - if (err) - return err; - /* The above function deallocates the current dev - * structure. Therefore, we cannot use netdev_priv(dev) - * as the next element: wandev->dev points to the - * next element */ - dev = wandev->dev; - } - - if (wandev->ndev) - return -EBUSY; /* there are opened interfaces */ - - if (wandev->shutdown) - err=wandev->shutdown(wandev); - - return err; -} - -/* - * Get WAN device status & statistics. - */ - -static int wanrouter_device_stat(struct wan_device *wandev, - wandev_stat_t __user *u_stat) -{ - wandev_stat_t stat; - - memset(&stat, 0, sizeof(stat)); - - /* Ask device driver to update device statistics */ - if ((wandev->state != WAN_UNCONFIGURED) && wandev->update) - wandev->update(wandev); - - /* Fill out structure */ - stat.ndev = wandev->ndev; - stat.state = wandev->state; - - if (copy_to_user(u_stat, &stat, sizeof(stat))) - return -EFAULT; - - return 0; -} - -/* - * Create new WAN interface. 
- * o verify user address space - * o copy configuration data to kernel address space - * o allocate network interface data space - * o call driver's new_if() entry point - * o make sure there is no interface name conflict - * o register network interface - */ - -static int wanrouter_device_new_if(struct wan_device *wandev, - wanif_conf_t __user *u_conf) -{ - wanif_conf_t *cnf; - struct net_device *dev = NULL; - int err; - - if ((wandev->state == WAN_UNCONFIGURED) || (wandev->new_if == NULL)) - return -ENODEV; - - cnf = kmalloc(sizeof(wanif_conf_t), GFP_KERNEL); - if (!cnf) - return -ENOBUFS; - - err = -EFAULT; - if (copy_from_user(cnf, u_conf, sizeof(wanif_conf_t))) - goto out; - - err = -EINVAL; - if (cnf->magic != ROUTER_MAGIC) - goto out; - - if (cnf->config_id == WANCONFIG_MPPP) { - printk(KERN_INFO "%s: Wanpipe Mulit-Port PPP support has not been compiled in!\n", - wandev->name); - err = -EPROTONOSUPPORT; - goto out; - } else { - err = wandev->new_if(wandev, dev, cnf); - } - - if (!err) { - /* Register network interface. This will invoke init() - * function supplied by the driver. If device registered - * successfully, add it to the interface list. - */ - -#ifdef WANDEBUG - printk(KERN_INFO "%s: registering interface %s...\n", - wanrouter_modname, dev->name); -#endif - - err = register_netdev(dev); - if (!err) { - struct net_device *slave = NULL; - unsigned long smp_flags=0; - - lock_adapter_irq(&wandev->lock, &smp_flags); - - if (wandev->dev == NULL) { - wandev->dev = dev; - } else { - for (slave=wandev->dev; - DEV_TO_SLAVE(slave); - slave = DEV_TO_SLAVE(slave)) - DEV_TO_SLAVE(slave) = dev; - } - ++wandev->ndev; - - unlock_adapter_irq(&wandev->lock, &smp_flags); - err = 0; /* done !!! */ - goto out; - } - if (wandev->del_if) - wandev->del_if(wandev, dev); - free_netdev(dev); - } - -out: - kfree(cnf); - return err; -} - - -/* - * Delete WAN logical channel. - * o verify user address space - * o copy configuration data to kernel address space - */ - -static int wanrouter_device_del_if(struct wan_device *wandev, char __user *u_name) -{ - char name[WAN_IFNAME_SZ + 1]; - int err = 0; - - if (wandev->state == WAN_UNCONFIGURED) - return -ENODEV; - - memset(name, 0, sizeof(name)); - - if (copy_from_user(name, u_name, WAN_IFNAME_SZ)) - return -EFAULT; - - err = wanrouter_delete_interface(wandev, name); - if (err) - return err; - - /* If last interface being deleted, shutdown card - * This helps with administration at leaf nodes - * (You can tell if the person at the other end of the phone - * has an interface configured) and avoids DoS vulnerabilities - * in binary driver files - this fixes a problem with the current - * Sangoma driver going into strange states when all the network - * interfaces are deleted and the link irrecoverably disconnected. - */ - - if (!wandev->ndev && wandev->shutdown) - err = wandev->shutdown(wandev); - - return err; -} - -/* - * Miscellaneous Functions - */ - -/* - * Find WAN device by name. - * Return pointer to the WAN device data space or NULL if device not found. - */ - -static struct wan_device *wanrouter_find_device(char *name) -{ - struct wan_device *wandev; - - for (wandev = wanrouter_router_devlist; - wandev && strcmp(wandev->name, name); - wandev = wandev->next); - return wandev; -} - -/* - * Delete WAN logical channel identified by its name. 
- * o find logical channel by its name - * o call driver's del_if() entry point - * o unregister network interface - * o unlink channel data space from linked list of channels - * o release channel data space - * - * Return: 0 success - * -ENODEV channel not found. - * -EBUSY interface is open - * - * Note: If (force != 0), then device will be destroyed even if interface - * associated with it is open. It's caller's responsibility to make - * sure that opened interfaces are not removed! - */ - -static int wanrouter_delete_interface(struct wan_device *wandev, char *name) -{ - struct net_device *dev = NULL, *prev = NULL; - unsigned long smp_flags=0; - - lock_adapter_irq(&wandev->lock, &smp_flags); - dev = wandev->dev; - prev = NULL; - while (dev && strcmp(name, dev->name)) { - struct net_device **slave = netdev_priv(dev); - prev = dev; - dev = *slave; - } - unlock_adapter_irq(&wandev->lock, &smp_flags); - - if (dev == NULL) - return -ENODEV; /* interface not found */ - - if (netif_running(dev)) - return -EBUSY; /* interface in use */ - - if (wandev->del_if) - wandev->del_if(wandev, dev); - - lock_adapter_irq(&wandev->lock, &smp_flags); - if (prev) { - struct net_device **prev_slave = netdev_priv(prev); - struct net_device **slave = netdev_priv(dev); - - *prev_slave = *slave; - } else { - struct net_device **slave = netdev_priv(dev); - wandev->dev = *slave; - } - --wandev->ndev; - unlock_adapter_irq(&wandev->lock, &smp_flags); - - printk(KERN_INFO "%s: unregistering '%s'\n", wandev->name, dev->name); - - unregister_netdev(dev); - - free_netdev(dev); - - return 0; -} - -static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) - __acquires(lock) -{ - spin_lock_irqsave(lock, *smp_flags); -} - - -static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) - __releases(lock) -{ - spin_unlock_irqrestore(lock, *smp_flags); -} - -EXPORT_SYMBOL(register_wan_device); -EXPORT_SYMBOL(unregister_wan_device); - -MODULE_LICENSE("GPL"); - -/* - * End - */ diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c deleted file mode 100644 index c43612ee96bb..000000000000 --- a/net/wanrouter/wanproc.c +++ /dev/null @@ -1,380 +0,0 @@ -/***************************************************************************** -* wanproc.c WAN Router Module. /proc filesystem interface. -* -* This module is completely hardware-independent and provides -* access to the router using Linux /proc filesystem. -* -* Author: Gideon Hack -* -* Copyright: (c) 1995-1999 Sangoma Technologies Inc. -* -* This program is free software; you can redistribute it and/or -* modify it under the terms of the GNU General Public License -* as published by the Free Software Foundation; either version -* 2 of the License, or (at your option) any later version. -* ============================================================================ -* Jun 02, 1999 Gideon Hack Updates for Linux 2.2.X kernels. -* Jun 29, 1997 Alan Cox Merged with 1.0.3 vendor code -* Jan 29, 1997 Gene Kozin v1.0.1. Implemented /proc read routines -* Jan 30, 1997 Alan Cox Hacked around for 2.1 -* Dec 13, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) -*****************************************************************************/ - -#include <linux/init.h> /* __initfunc et al. */ -#include <linux/stddef.h> /* offsetof(), etc. 
*/ -#include <linux/errno.h> /* return codes */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/wanrouter.h> /* WAN router API definitions */ -#include <linux/seq_file.h> -#include <linux/mutex.h> - -#include <net/net_namespace.h> -#include <asm/io.h> - -#define PROC_STATS_FORMAT "%30s: %12lu\n" - -/****** Defines and Macros **************************************************/ - -#define PROT_DECODE(prot) ((prot == WANCONFIG_FR) ? " FR" :\ - (prot == WANCONFIG_X25) ? " X25" : \ - (prot == WANCONFIG_PPP) ? " PPP" : \ - (prot == WANCONFIG_CHDLC) ? " CHDLC": \ - (prot == WANCONFIG_MPPP) ? " MPPP" : \ - " Unknown" ) - -/****** Function Prototypes *************************************************/ - -#ifdef CONFIG_PROC_FS - -/* Miscellaneous */ - -/* - * Structures for interfacing with the /proc filesystem. - * Router creates its own directory /proc/net/router with the following - * entries: - * config device configuration - * status global device statistics - * <device> entry for each WAN device - */ - -/* - * Generic /proc/net/router/<file> file and inode operations - */ - -/* - * /proc/net/router - */ - -static DEFINE_MUTEX(config_mutex); -static struct proc_dir_entry *proc_router; - -/* Strings */ - -/* - * Interface functions - */ - -/****** Proc filesystem entry points ****************************************/ - -/* - * Iterator - */ -static void *r_start(struct seq_file *m, loff_t *pos) -{ - struct wan_device *wandev; - loff_t l = *pos; - - mutex_lock(&config_mutex); - if (!l--) - return SEQ_START_TOKEN; - for (wandev = wanrouter_router_devlist; l-- && wandev; - wandev = wandev->next) - ; - return wandev; -} - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct wan_device *wandev = v; - (*pos)++; - return (v == SEQ_START_TOKEN) ? wanrouter_router_devlist : wandev->next; -} - -static void r_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&config_mutex); -} - -static int config_show(struct seq_file *m, void *v) -{ - struct wan_device *p = v; - if (v == SEQ_START_TOKEN) { - seq_puts(m, "Device name | port |IRQ|DMA| mem.addr |" - "mem.size|option1|option2|option3|option4\n"); - return 0; - } - if (!p->state) - return 0; - seq_printf(m, "%-15s|0x%-4X|%3u|%3u| 0x%-8lX |0x%-6X|%7u|%7u|%7u|%7u\n", - p->name, p->ioport, p->irq, p->dma, p->maddr, p->msize, - p->hw_opt[0], p->hw_opt[1], p->hw_opt[2], p->hw_opt[3]); - return 0; -} - -static int status_show(struct seq_file *m, void *v) -{ - struct wan_device *p = v; - if (v == SEQ_START_TOKEN) { - seq_puts(m, "Device name |protocol|station|interface|" - "clocking|baud rate| MTU |ndev|link state\n"); - return 0; - } - if (!p->state) - return 0; - seq_printf(m, "%-15s|%-8s| %-7s| %-9s|%-8s|%9u|%5u|%3u |", - p->name, - PROT_DECODE(p->config_id), - p->config_id == WANCONFIG_FR ? - (p->station ? "Node" : "CPE") : - (p->config_id == WANCONFIG_X25 ? - (p->station ? "DCE" : "DTE") : - ("N/A")), - p->interface ? "V.35" : "RS-232", - p->clocking ? 
"internal" : "external", - p->bps, - p->mtu, - p->ndev); - - switch (p->state) { - case WAN_UNCONFIGURED: - seq_printf(m, "%-12s\n", "unconfigured"); - break; - case WAN_DISCONNECTED: - seq_printf(m, "%-12s\n", "disconnected"); - break; - case WAN_CONNECTING: - seq_printf(m, "%-12s\n", "connecting"); - break; - case WAN_CONNECTED: - seq_printf(m, "%-12s\n", "connected"); - break; - default: - seq_printf(m, "%-12s\n", "invalid"); - break; - } - return 0; -} - -static const struct seq_operations config_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = config_show, -}; - -static const struct seq_operations status_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = status_show, -}; - -static int config_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &config_op); -} - -static int status_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &status_op); -} - -static const struct file_operations config_fops = { - .owner = THIS_MODULE, - .open = config_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations status_fops = { - .owner = THIS_MODULE, - .open = status_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int wandev_show(struct seq_file *m, void *v) -{ - struct wan_device *wandev = m->private; - - if (wandev->magic != ROUTER_MAGIC) - return 0; - - if (!wandev->state) { - seq_puts(m, "device is not configured!\n"); - return 0; - } - - /* Update device statistics */ - if (wandev->update) { - int err = wandev->update(wandev); - if (err == -EAGAIN) { - seq_puts(m, "Device is busy!\n"); - return 0; - } - if (err) { - seq_puts(m, "Device is not configured!\n"); - return 0; - } - } - - seq_printf(m, PROC_STATS_FORMAT, - "total packets received", wandev->stats.rx_packets); - seq_printf(m, PROC_STATS_FORMAT, - "total packets transmitted", wandev->stats.tx_packets); - seq_printf(m, PROC_STATS_FORMAT, - "total bytes received", wandev->stats.rx_bytes); - seq_printf(m, PROC_STATS_FORMAT, - "total bytes transmitted", wandev->stats.tx_bytes); - seq_printf(m, PROC_STATS_FORMAT, - "bad packets received", wandev->stats.rx_errors); - seq_printf(m, PROC_STATS_FORMAT, - "packet transmit problems", wandev->stats.tx_errors); - seq_printf(m, PROC_STATS_FORMAT, - "received frames dropped", wandev->stats.rx_dropped); - seq_printf(m, PROC_STATS_FORMAT, - "transmit frames dropped", wandev->stats.tx_dropped); - seq_printf(m, PROC_STATS_FORMAT, - "multicast packets received", wandev->stats.multicast); - seq_printf(m, PROC_STATS_FORMAT, - "transmit collisions", wandev->stats.collisions); - seq_printf(m, PROC_STATS_FORMAT, - "receive length errors", wandev->stats.rx_length_errors); - seq_printf(m, PROC_STATS_FORMAT, - "receiver overrun errors", wandev->stats.rx_over_errors); - seq_printf(m, PROC_STATS_FORMAT, - "CRC errors", wandev->stats.rx_crc_errors); - seq_printf(m, PROC_STATS_FORMAT, - "frame format errors (aborts)", wandev->stats.rx_frame_errors); - seq_printf(m, PROC_STATS_FORMAT, - "receiver fifo overrun", wandev->stats.rx_fifo_errors); - seq_printf(m, PROC_STATS_FORMAT, - "receiver missed packet", wandev->stats.rx_missed_errors); - seq_printf(m, PROC_STATS_FORMAT, - "aborted frames transmitted", wandev->stats.tx_aborted_errors); - return 0; -} - -static int wandev_open(struct inode *inode, struct file *file) -{ - return single_open(file, wandev_show, PDE(inode)->data); -} - -static const struct file_operations wandev_fops = { - 
.owner = THIS_MODULE, - .open = wandev_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .unlocked_ioctl = wanrouter_ioctl, -}; - -/* - * Initialize router proc interface. - */ - -int __init wanrouter_proc_init(void) -{ - struct proc_dir_entry *p; - proc_router = proc_mkdir(ROUTER_NAME, init_net.proc_net); - if (!proc_router) - goto fail; - - p = proc_create("config", S_IRUGO, proc_router, &config_fops); - if (!p) - goto fail_config; - p = proc_create("status", S_IRUGO, proc_router, &status_fops); - if (!p) - goto fail_stat; - return 0; -fail_stat: - remove_proc_entry("config", proc_router); -fail_config: - remove_proc_entry(ROUTER_NAME, init_net.proc_net); -fail: - return -ENOMEM; -} - -/* - * Clean up router proc interface. - */ - -void wanrouter_proc_cleanup(void) -{ - remove_proc_entry("config", proc_router); - remove_proc_entry("status", proc_router); - remove_proc_entry(ROUTER_NAME, init_net.proc_net); -} - -/* - * Add directory entry for WAN device. - */ - -int wanrouter_proc_add(struct wan_device* wandev) -{ - if (wandev->magic != ROUTER_MAGIC) - return -EINVAL; - - wandev->dent = proc_create(wandev->name, S_IRUGO, - proc_router, &wandev_fops); - if (!wandev->dent) - return -ENOMEM; - wandev->dent->data = wandev; - return 0; -} - -/* - * Delete directory entry for WAN device. - */ -int wanrouter_proc_delete(struct wan_device* wandev) -{ - if (wandev->magic != ROUTER_MAGIC) - return -EINVAL; - remove_proc_entry(wandev->name, proc_router); - return 0; -} - -#else - -/* - * No /proc - output stubs - */ - -int __init wanrouter_proc_init(void) -{ - return 0; -} - -void wanrouter_proc_cleanup(void) -{ -} - -int wanrouter_proc_add(struct wan_device *wandev) -{ - return 0; -} - -int wanrouter_proc_delete(struct wan_device *wandev) -{ - return 0; -} - -#endif - -/* - * End - */ - diff --git a/net/wireless/chan.c b/net/wireless/chan.c index a7990bb16529..fd556ac05fdb 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -76,6 +76,10 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) return false; if (!chandef->center_freq2) return false; + /* adjacent is not allowed -- that's a 160 MHz channel */ + if (chandef->center_freq1 - chandef->center_freq2 == 80 || + chandef->center_freq2 - chandef->center_freq1 == 80) + return false; break; case NL80211_CHAN_WIDTH_80: if (chandef->center_freq1 != control_freq + 30 && @@ -143,6 +147,32 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, } } +static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) +{ + int width; + + switch (c->width) { + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_20_NOHT: + width = 20; + break; + case NL80211_CHAN_WIDTH_40: + width = 40; + break; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_80: + width = 80; + break; + case NL80211_CHAN_WIDTH_160: + width = 160; + break; + default: + WARN_ON_ONCE(1); + return -1; + } + return width; +} + const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, const struct cfg80211_chan_def *c2) @@ -188,6 +218,93 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, } EXPORT_SYMBOL(cfg80211_chandef_compatible); +static void cfg80211_set_chans_dfs_state(struct wiphy *wiphy, u32 center_freq, + u32 bandwidth, + enum nl80211_dfs_state dfs_state) +{ + struct ieee80211_channel *c; + u32 freq; + + for (freq = center_freq - bandwidth/2 + 10; + freq <= center_freq + bandwidth/2 - 10; + freq += 20) { + c = ieee80211_get_channel(wiphy, 
freq); + if (!c || !(c->flags & IEEE80211_CHAN_RADAR)) + continue; + + c->dfs_state = dfs_state; + c->dfs_state_entered = jiffies; + } +} + +void cfg80211_set_dfs_state(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + enum nl80211_dfs_state dfs_state) +{ + int width; + + if (WARN_ON(!cfg80211_chandef_valid(chandef))) + return; + + width = cfg80211_chandef_get_width(chandef); + if (width < 0) + return; + + cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq1, + width, dfs_state); + + if (!chandef->center_freq2) + return; + cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq2, + width, dfs_state); +} + +static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, + u32 center_freq, + u32 bandwidth) +{ + struct ieee80211_channel *c; + u32 freq; + + for (freq = center_freq - bandwidth/2 + 10; + freq <= center_freq + bandwidth/2 - 10; + freq += 20) { + c = ieee80211_get_channel(wiphy, freq); + if (!c) + return -EINVAL; + + if (c->flags & IEEE80211_CHAN_RADAR) + return 1; + } + return 0; +} + + +int cfg80211_chandef_dfs_required(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width; + int r; + + if (WARN_ON(!cfg80211_chandef_valid(chandef))) + return -EINVAL; + + width = cfg80211_chandef_get_width(chandef); + if (width < 0) + return -EINVAL; + + r = cfg80211_get_chans_dfs_required(wiphy, chandef->center_freq1, + width); + if (r) + return r; + + if (!chandef->center_freq2) + return 0; + + return cfg80211_get_chans_dfs_required(wiphy, chandef->center_freq2, + width); +} + static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, u32 center_freq, u32 bandwidth, u32 prohibited_flags) @@ -199,7 +316,16 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, freq <= center_freq + bandwidth/2 - 10; freq += 20) { c = ieee80211_get_channel(wiphy, freq); - if (!c || c->flags & prohibited_flags) + if (!c) + return false; + + /* check for radar flags */ + if ((prohibited_flags & c->flags & IEEE80211_CHAN_RADAR) && + (c->dfs_state != NL80211_DFS_AVAILABLE)) + return false; + + /* check for the other flags */ + if (c->flags & prohibited_flags & ~IEEE80211_CHAN_RADAR) return false; } @@ -249,6 +375,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80: if (!vht_cap->vht_supported) return false; + prohibited_flags |= IEEE80211_CHAN_NO_80MHZ; width = 80; break; case NL80211_CHAN_WIDTH_160: @@ -256,6 +383,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, return false; if (!(vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ)) return false; + prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; width = 160; break; default: @@ -263,7 +391,16 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, return false; } - /* TODO: missing regulatory check on 80/160 bandwidth */ + /* + * TODO: What if there are only certain 80/160/80+80 MHz channels + * allowed by the driver, or only certain combinations? + * For 40 MHz the driver can set the NO_HT40 flags, but for + * 80/160 MHz and in particular 80+80 MHz this isn't really + * feasible and we only have NO_80MHZ/NO_160MHZ so far but + * no way to cover 80+80 MHz or more complex restrictions. + * Note that such restrictions also need to be advertised to + * userspace, for example for P2P channel selection. 
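+ * A concrete illustration of the gap: cfg80211_chandef_valid()
+ * earlier in this file already rejects 80+80 segments whose centre
+ * frequencies are only 80 MHz apart (that is really a 160 MHz
+ * channel), but a device that can only pair specific non-adjacent
+ * 80 MHz segments has no per-channel flag to express that today.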
+ */ if (width > 20) prohibited_flags |= IEEE80211_CHAN_NO_OFDM; @@ -340,7 +477,10 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - if (wdev->beacon_interval) { + if (wdev->cac_started) { + *chan = wdev->channel; + *chanmode = CHAN_MODE_SHARED; + } else if (wdev->beacon_interval) { *chan = wdev->channel; *chanmode = CHAN_MODE_SHARED; } diff --git a/net/wireless/core.c b/net/wireless/core.c index b677eab55b68..84c9ad7e1dca 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -57,9 +57,6 @@ struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx) { struct cfg80211_registered_device *result = NULL, *rdev; - if (!wiphy_idx_valid(wiphy_idx)) - return NULL; - assert_cfg80211_lock(); list_for_each_entry(rdev, &cfg80211_rdev_list, list) { @@ -74,10 +71,8 @@ struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx) int get_wiphy_idx(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev; - if (!wiphy) - return WIPHY_IDX_STALE; - rdev = wiphy_to_dev(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + return rdev->wiphy_idx; } @@ -86,9 +81,6 @@ struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx) { struct cfg80211_registered_device *rdev; - if (!wiphy_idx_valid(wiphy_idx)) - return NULL; - assert_cfg80211_lock(); rdev = cfg80211_rdev_by_wiphy_idx(wiphy_idx); @@ -220,6 +212,39 @@ static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) rdev_rfkill_poll(rdev); } +void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + lockdep_assert_held(&rdev->devlist_mtx); + lockdep_assert_held(&rdev->sched_scan_mtx); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)) + return; + + if (!wdev->p2p_started) + return; + + rdev_stop_p2p_device(rdev, wdev); + wdev->p2p_started = false; + + rdev->opencount--; + + if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + bool busy = work_busy(&rdev->scan_done_wk); + + /* + * If the work isn't pending or running (in which case it would + * be waiting for the lock we hold) the driver didn't properly + * cancel the scan when the interface was removed. In this case + * warn and leak the scan request object to not crash later. 
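+ * The work_busy() check above encodes that distinction: a pending
+ * or running work item will complete once we drop our locks, so
+ * ___cfg80211_scan_done(rdev, !busy) below only leaks the request
+ * in the buggy-driver case.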
+ */ + WARN_ON(!busy); + + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev, !busy); + } +} + static int cfg80211_rfkill_set_block(void *data, bool blocked) { struct cfg80211_registered_device *rdev = data; @@ -229,7 +254,8 @@ static int cfg80211_rfkill_set_block(void *data, bool blocked) return 0; rtnl_lock(); - mutex_lock(&rdev->devlist_mtx); + + /* read-only iteration need not hold the devlist_mtx */ list_for_each_entry(wdev, &rdev->wdev_list, list) { if (wdev->netdev) { @@ -239,18 +265,18 @@ static int cfg80211_rfkill_set_block(void *data, bool blocked) /* otherwise, check iftype */ switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: - if (!wdev->p2p_started) - break; - rdev_stop_p2p_device(rdev, wdev); - wdev->p2p_started = false; - rdev->opencount--; + /* but this requires it */ + mutex_lock(&rdev->devlist_mtx); + mutex_lock(&rdev->sched_scan_mtx); + cfg80211_stop_p2p_device(rdev, wdev); + mutex_unlock(&rdev->sched_scan_mtx); + mutex_unlock(&rdev->devlist_mtx); break; default: break; } } - mutex_unlock(&rdev->devlist_mtx); rtnl_unlock(); return 0; @@ -309,7 +335,7 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) rdev->wiphy_idx = wiphy_counter++; - if (unlikely(!wiphy_idx_valid(rdev->wiphy_idx))) { + if (unlikely(rdev->wiphy_idx < 0)) { wiphy_counter--; mutex_unlock(&cfg80211_mutex); /* ugh, wrapped! */ @@ -332,6 +358,8 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) INIT_LIST_HEAD(&rdev->bss_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results); + INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, + cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT rdev->wiphy.wext = &cfg80211_wext_handler; #endif @@ -390,8 +418,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) c = &wiphy->iface_combinations[i]; - /* Combinations with just one interface aren't real */ - if (WARN_ON(c->max_interfaces < 2)) + /* + * Combinations with just one interface aren't real, + * however we make an exception for DFS. + */ + if (WARN_ON((c->max_interfaces < 2) && !c->radar_detect_widths)) return -EINVAL; /* Need at least one channel */ @@ -406,6 +437,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) CFG80211_MAX_NUM_DIFFERENT_CHANNELS)) return -EINVAL; + /* DFS only works on one channel. 
*/ + if (WARN_ON(c->radar_detect_widths && + (c->num_different_channels > 1))) + return -EINVAL; + if (WARN_ON(!c->n_limits)) return -EINVAL; @@ -478,6 +514,11 @@ int wiphy_register(struct wiphy *wiphy) ETH_ALEN))) return -EINVAL; + if (WARN_ON(wiphy->max_acl_mac_addrs && + (!(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME) || + !rdev->ops->set_mac_acl))) + return -EINVAL; + if (wiphy->addresses) memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); @@ -690,6 +731,7 @@ void wiphy_unregister(struct wiphy *wiphy) flush_work(&rdev->scan_done_wk); cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); + cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); if (rdev->wowlan && rdev->ops->set_wakeup) rdev_set_wakeup(rdev, false); @@ -710,7 +752,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) kfree(reg); } list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) - cfg80211_put_bss(&scan->pub); + cfg80211_put_bss(&rdev->wiphy, &scan->pub); kfree(rdev); } @@ -737,17 +779,13 @@ static void wdev_cleanup_work(struct work_struct *work) wdev = container_of(work, struct wireless_dev, cleanup_work); rdev = wiphy_to_dev(wdev->wiphy); - cfg80211_lock_rdev(rdev); + mutex_lock(&rdev->sched_scan_mtx); if (WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev)) { rdev->scan_req->aborted = true; ___cfg80211_scan_done(rdev, true); } - cfg80211_unlock_rdev(rdev); - - mutex_lock(&rdev->sched_scan_mtx); - if (WARN_ON(rdev->sched_scan_req && rdev->sched_scan_req->dev == wdev->netdev)) { __cfg80211_stop_sched_scan(rdev, false); @@ -773,21 +811,19 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) return; mutex_lock(&rdev->devlist_mtx); + mutex_lock(&rdev->sched_scan_mtx); list_del_rcu(&wdev->list); rdev->devlist_generation++; switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: - if (!wdev->p2p_started) - break; - rdev_stop_p2p_device(rdev, wdev); - wdev->p2p_started = false; - rdev->opencount--; + cfg80211_stop_p2p_device(rdev, wdev); break; default: WARN_ON_ONCE(1); break; } + mutex_unlock(&rdev->sched_scan_mtx); mutex_unlock(&rdev->devlist_mtx); } EXPORT_SYMBOL(cfg80211_unregister_wdev); @@ -806,6 +842,46 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, rdev->num_running_monitor_ifaces += num; } +void cfg80211_leave(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + struct net_device *dev = wdev->netdev; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + cfg80211_leave_ibss(rdev, dev, true); + break; + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_STATION: + mutex_lock(&rdev->sched_scan_mtx); + __cfg80211_stop_sched_scan(rdev, false); + mutex_unlock(&rdev->sched_scan_mtx); + + wdev_lock(wdev); +#ifdef CONFIG_CFG80211_WEXT + kfree(wdev->wext.ie); + wdev->wext.ie = NULL; + wdev->wext.ie_len = 0; + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; +#endif + __cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, true); + cfg80211_mlme_down(rdev, dev); + wdev_unlock(wdev); + break; + case NL80211_IFTYPE_MESH_POINT: + cfg80211_leave_mesh(rdev, dev); + break; + case NL80211_IFTYPE_AP: + cfg80211_stop_ap(rdev, dev); + break; + default: + break; + } + + wdev->beacon_interval = 0; +} + static int cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ndev) @@ -874,38 +950,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, dev->priv_flags |= IFF_DONT_BRIDGE; break; case NETDEV_GOING_DOWN: - switch (wdev->iftype) { - case NL80211_IFTYPE_ADHOC: - 
cfg80211_leave_ibss(rdev, dev, true); - break; - case NL80211_IFTYPE_P2P_CLIENT: - case NL80211_IFTYPE_STATION: - mutex_lock(&rdev->sched_scan_mtx); - __cfg80211_stop_sched_scan(rdev, false); - mutex_unlock(&rdev->sched_scan_mtx); - - wdev_lock(wdev); -#ifdef CONFIG_CFG80211_WEXT - kfree(wdev->wext.ie); - wdev->wext.ie = NULL; - wdev->wext.ie_len = 0; - wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; -#endif - __cfg80211_disconnect(rdev, dev, - WLAN_REASON_DEAUTH_LEAVING, true); - cfg80211_mlme_down(rdev, dev); - wdev_unlock(wdev); - break; - case NL80211_IFTYPE_MESH_POINT: - cfg80211_leave_mesh(rdev, dev); - break; - case NL80211_IFTYPE_AP: - cfg80211_stop_ap(rdev, dev); - break; - default: - break; - } - wdev->beacon_interval = 0; + cfg80211_leave(rdev, wdev); break; case NETDEV_DOWN: cfg80211_update_iface_num(rdev, wdev->iftype, -1); @@ -928,6 +973,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_update_iface_num(rdev, wdev->iftype, 1); cfg80211_lock_rdev(rdev); mutex_lock(&rdev->devlist_mtx); + mutex_lock(&rdev->sched_scan_mtx); wdev_lock(wdev); switch (wdev->iftype) { #ifdef CONFIG_CFG80211_WEXT @@ -959,6 +1005,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, break; } wdev_unlock(wdev); + mutex_unlock(&rdev->sched_scan_mtx); rdev->opencount++; mutex_unlock(&rdev->devlist_mtx); cfg80211_unlock_rdev(rdev); @@ -1079,8 +1126,10 @@ static int __init cfg80211_init(void) goto out_fail_reg; cfg80211_wq = create_singlethread_workqueue("cfg80211"); - if (!cfg80211_wq) + if (!cfg80211_wq) { + err = -ENOMEM; goto out_fail_wq; + } return 0; diff --git a/net/wireless/core.h b/net/wireless/core.h index 3563097169cb..fd35dae547c4 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -8,7 +8,6 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/netdevice.h> -#include <linux/kref.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include <linux/rfkill.h> @@ -18,6 +17,9 @@ #include <net/cfg80211.h> #include "reg.h" + +#define WIPHY_IDX_INVALID -1 + struct cfg80211_registered_device { const struct cfg80211_ops *ops; struct list_head list; @@ -84,9 +86,14 @@ struct cfg80211_registered_device { struct cfg80211_wowlan *wowlan; + struct delayed_work dfs_update_channels_wk; + + /* netlink port which started critical protocol (0 means not started) */ + u32 crit_proto_nlportid; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ - struct wiphy wiphy __attribute__((__aligned__(NETDEV_ALIGN))); + struct wiphy wiphy __aligned(NETDEV_ALIGN); }; static inline @@ -96,13 +103,6 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) return container_of(wiphy, struct cfg80211_registered_device, wiphy); } -/* Note 0 is valid, hence phy0 */ -static inline -bool wiphy_idx_valid(int wiphy_idx) -{ - return wiphy_idx >= 0; -} - static inline void cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev) { @@ -113,6 +113,9 @@ cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev) for (i = 0; i < rdev->wowlan->n_patterns; i++) kfree(rdev->wowlan->patterns[i].mask); kfree(rdev->wowlan->patterns); + if (rdev->wowlan->tcp && rdev->wowlan->tcp->sock) + sock_release(rdev->wowlan->tcp->sock); + kfree(rdev->wowlan->tcp); kfree(rdev->wowlan); } @@ -126,17 +129,12 @@ static inline void assert_cfg80211_lock(void) lockdep_assert_held(&cfg80211_mutex); } -/* - * You can use this to mark a wiphy_idx as not having an associated wiphy. 
- * It guarantees cfg80211_rdev_by_wiphy_idx(wiphy_idx) will return NULL - */ -#define WIPHY_IDX_STALE -1 - struct cfg80211_internal_bss { struct list_head list; + struct list_head hidden_list; struct rb_node rbn; unsigned long ts; - struct kref ref; + unsigned long refcount; atomic_t hold; /* must be last because of priv member */ @@ -335,20 +333,15 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan, - const u8 *bssid, const u8 *prev_bssid, + const u8 *bssid, const u8 *ssid, int ssid_len, - const u8 *ie, int ie_len, bool use_mfp, - struct cfg80211_crypto_settings *crypt, - u32 assoc_flags, struct ieee80211_ht_cap *ht_capa, - struct ieee80211_ht_cap *ht_capa_mask); + struct cfg80211_assoc_request *req); int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, - struct net_device *dev, struct ieee80211_channel *chan, - const u8 *bssid, const u8 *prev_bssid, + struct net_device *dev, + struct ieee80211_channel *chan, + const u8 *bssid, const u8 *ssid, int ssid_len, - const u8 *ie, int ie_len, bool use_mfp, - struct cfg80211_crypto_settings *crypt, - u32 assoc_flags, struct ieee80211_ht_cap *ht_capa, - struct ieee80211_ht_cap *ht_capa_mask); + struct cfg80211_assoc_request *req); int __cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, @@ -380,6 +373,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, bool no_cck, bool dont_wait_for_ack, u64 *cookie); void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, const struct ieee80211_ht_cap *ht_capa_mask); +void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, + const struct ieee80211_vht_cap *vht_capa_mask); /* SME */ int __cfg80211_connect(struct cfg80211_registered_device *rdev, @@ -435,7 +430,24 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_iftype iftype, struct ieee80211_channel *chan, - enum cfg80211_chan_mode chanmode); + enum cfg80211_chan_mode chanmode, + u8 radar_detect); + +/** + * cfg80211_chandef_dfs_required - checks if radar detection is required + * @wiphy: the wiphy to validate against + * @chandef: the channel definition to check + * Return: 1 if radar detection is required, 0 if it is not, < 0 on error + */ +int cfg80211_chandef_dfs_required(struct wiphy *wiphy, + const struct cfg80211_chan_def *c); + +void cfg80211_set_dfs_state(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + enum nl80211_dfs_state dfs_state); + +void cfg80211_dfs_channels_update_work(struct work_struct *work); + static inline int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, @@ -443,7 +455,7 @@ cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype) { return cfg80211_can_use_iftype_chan(rdev, wdev, iftype, NULL, - CHAN_MODE_UNDEFINED); + CHAN_MODE_UNDEFINED, 0); } static inline int @@ -460,7 +472,17 @@ cfg80211_can_use_chan(struct cfg80211_registered_device *rdev, enum cfg80211_chan_mode chanmode) { return cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, - chan, chanmode); + chan, chanmode, 0); +} + +static inline unsigned int elapsed_jiffies_msecs(unsigned long start) +{ + unsigned long end = jiffies; + + if (end >= start) + return jiffies_to_msecs(end - start); + + return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 
1); } void @@ -481,6 +503,12 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num); +void cfg80211_leave(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + +void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + #define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index 48c48ffafa1d..e37862f1b127 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -15,10 +15,10 @@ static void cfg80211_get_drvinfo(struct net_device *dev, strlcpy(info->version, init_utsname()->release, sizeof(info->version)); if (wdev->wiphy->fw_version[0]) - strncpy(info->fw_version, wdev->wiphy->fw_version, + strlcpy(info->fw_version, wdev->wiphy->fw_version, sizeof(info->fw_version)); else - strncpy(info->fw_version, "N/A", sizeof(info->fw_version)); + strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); strlcpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)), sizeof(info->bus_info)); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 9b9551e4a6f9..d80e47194d49 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -37,7 +37,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid) if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); } cfg80211_hold_bss(bss_from_pub(bss)); @@ -182,7 +182,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); } wdev->current_bss = NULL; diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index 1526c211db66..dc0e59e53dbf 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -430,24 +430,23 @@ static int lib80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv) return CCMP_TK_LEN; } -static char *lib80211_ccmp_print_stats(char *p, void *priv) +static void lib80211_ccmp_print_stats(struct seq_file *m, void *priv) { struct lib80211_ccmp_data *ccmp = priv; - p += sprintf(p, "key[%d] alg=CCMP key_set=%d " - "tx_pn=%02x%02x%02x%02x%02x%02x " - "rx_pn=%02x%02x%02x%02x%02x%02x " - "format_errors=%d replays=%d decrypt_errors=%d\n", - ccmp->key_idx, ccmp->key_set, - ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2], - ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5], - ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2], - ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5], - ccmp->dot11RSNAStatsCCMPFormatErrors, - ccmp->dot11RSNAStatsCCMPReplays, - ccmp->dot11RSNAStatsCCMPDecryptErrors); - - return p; + seq_printf(m, + "key[%d] alg=CCMP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "format_errors=%d replays=%d decrypt_errors=%d\n", + ccmp->key_idx, ccmp->key_set, + ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2], + ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5], + ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2], + ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5], + ccmp->dot11RSNAStatsCCMPFormatErrors, + ccmp->dot11RSNAStatsCCMPReplays, + ccmp->dot11RSNAStatsCCMPDecryptErrors); } static struct lib80211_crypto_ops lib80211_crypt_ccmp = { diff --git 
a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index d475cfc8568f..8c90ba79e56e 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -703,30 +703,30 @@ static int lib80211_tkip_get_key(void *key, int len, u8 * seq, void *priv) return TKIP_KEY_LEN; } -static char *lib80211_tkip_print_stats(char *p, void *priv) +static void lib80211_tkip_print_stats(struct seq_file *m, void *priv) { struct lib80211_tkip_data *tkip = priv; - p += sprintf(p, "key[%d] alg=TKIP key_set=%d " - "tx_pn=%02x%02x%02x%02x%02x%02x " - "rx_pn=%02x%02x%02x%02x%02x%02x " - "replays=%d icv_errors=%d local_mic_failures=%d\n", - tkip->key_idx, tkip->key_set, - (tkip->tx_iv32 >> 24) & 0xff, - (tkip->tx_iv32 >> 16) & 0xff, - (tkip->tx_iv32 >> 8) & 0xff, - tkip->tx_iv32 & 0xff, - (tkip->tx_iv16 >> 8) & 0xff, - tkip->tx_iv16 & 0xff, - (tkip->rx_iv32 >> 24) & 0xff, - (tkip->rx_iv32 >> 16) & 0xff, - (tkip->rx_iv32 >> 8) & 0xff, - tkip->rx_iv32 & 0xff, - (tkip->rx_iv16 >> 8) & 0xff, - tkip->rx_iv16 & 0xff, - tkip->dot11RSNAStatsTKIPReplays, - tkip->dot11RSNAStatsTKIPICVErrors, - tkip->dot11RSNAStatsTKIPLocalMICFailures); - return p; + seq_printf(m, + "key[%d] alg=TKIP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "replays=%d icv_errors=%d local_mic_failures=%d\n", + tkip->key_idx, tkip->key_set, + (tkip->tx_iv32 >> 24) & 0xff, + (tkip->tx_iv32 >> 16) & 0xff, + (tkip->tx_iv32 >> 8) & 0xff, + tkip->tx_iv32 & 0xff, + (tkip->tx_iv16 >> 8) & 0xff, + tkip->tx_iv16 & 0xff, + (tkip->rx_iv32 >> 24) & 0xff, + (tkip->rx_iv32 >> 16) & 0xff, + (tkip->rx_iv32 >> 8) & 0xff, + tkip->rx_iv32 & 0xff, + (tkip->rx_iv16 >> 8) & 0xff, + tkip->rx_iv16 & 0xff, + tkip->dot11RSNAStatsTKIPReplays, + tkip->dot11RSNAStatsTKIPICVErrors, + tkip->dot11RSNAStatsTKIPLocalMICFailures); } static struct lib80211_crypto_ops lib80211_crypt_tkip = { diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c index c1304018fc1c..1c292e4ea7b6 100644 --- a/net/wireless/lib80211_crypt_wep.c +++ b/net/wireless/lib80211_crypt_wep.c @@ -253,11 +253,10 @@ static int lib80211_wep_get_key(void *key, int len, u8 * seq, void *priv) return wep->key_len; } -static char *lib80211_wep_print_stats(char *p, void *priv) +static void lib80211_wep_print_stats(struct seq_file *m, void *priv) { struct lib80211_wep_data *wep = priv; - p += sprintf(p, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len); - return p; + seq_printf(m, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len); } static struct lib80211_crypto_ops lib80211_crypt_wep = { diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index f9d6ce5cfabb..0bb93f3061a4 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -44,6 +44,10 @@ #define MESH_SYNC_NEIGHBOR_OFFSET_MAX 50 +#define MESH_DEFAULT_BEACON_INTERVAL 1000 /* in 1024 us units (=TUs) */ +#define MESH_DEFAULT_DTIM_PERIOD 2 +#define MESH_DEFAULT_AWAKE_WINDOW 10 /* in 1024 us units (=TUs) */ + const struct mesh_config default_mesh_config = { .dot11MeshRetryTimeout = MESH_RET_T, .dot11MeshConfirmTimeout = MESH_CONF_T, @@ -69,6 +73,8 @@ const struct mesh_config default_mesh_config = { .dot11MeshHWMPactivePathToRootTimeout = MESH_PATH_TO_ROOT_TIMEOUT, .dot11MeshHWMProotInterval = MESH_ROOT_INTERVAL, .dot11MeshHWMPconfirmationInterval = MESH_ROOT_CONFIRMATION_INTERVAL, + .power_mode = NL80211_MESH_POWER_ACTIVE, + .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, }; const struct mesh_setup default_mesh_setup = { @@ -79,6 +85,9 @@ 
const struct mesh_setup default_mesh_setup = { .ie = NULL, .ie_len = 0, .is_secure = false, + .user_mpm = false, + .beacon_interval = MESH_DEFAULT_BEACON_INTERVAL, + .dtim_period = MESH_DEFAULT_DTIM_PERIOD, }; int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, @@ -225,20 +234,6 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, return 0; } -void cfg80211_notify_new_peer_candidate(struct net_device *dev, - const u8 *macaddr, const u8* ie, u8 ie_len, gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - - trace_cfg80211_notify_new_peer_candidate(dev, macaddr); - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_MESH_POINT)) - return; - - nl80211_send_new_peer_candidate(wiphy_to_dev(wdev->wiphy), dev, - macaddr, ie, ie_len, gfp); -} -EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate); - static int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev) { diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 5e8123ee63fd..0c7b7dd855f6 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -58,7 +58,7 @@ void cfg80211_send_rx_assoc(struct net_device *dev, struct cfg80211_bss *bss, */ if (status_code != WLAN_STATUS_SUCCESS && wdev->conn && cfg80211_sme_failed_reassoc(wdev)) { - cfg80211_put_bss(bss); + cfg80211_put_bss(wiphy, bss); goto out; } @@ -70,7 +70,7 @@ void cfg80211_send_rx_assoc(struct net_device *dev, struct cfg80211_bss *bss, * do not call connect_result() now because the * sme will schedule work that does it later. */ - cfg80211_put_bss(bss); + cfg80211_put_bss(wiphy, bss); goto out; } @@ -108,7 +108,7 @@ void __cfg80211_send_deauth(struct net_device *dev, if (wdev->current_bss && ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) { cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; was_current = true; } @@ -164,7 +164,7 @@ void __cfg80211_send_disassoc(struct net_device *dev, ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) { cfg80211_sme_disassoc(dev, wdev->current_bss); cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; } else WARN_ON(1); @@ -187,30 +187,6 @@ void cfg80211_send_disassoc(struct net_device *dev, const u8 *buf, size_t len) } EXPORT_SYMBOL(cfg80211_send_disassoc); -void cfg80211_send_unprot_deauth(struct net_device *dev, const u8 *buf, - size_t len) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - trace_cfg80211_send_unprot_deauth(dev); - nl80211_send_unprot_deauth(rdev, dev, buf, len, GFP_ATOMIC); -} -EXPORT_SYMBOL(cfg80211_send_unprot_deauth); - -void cfg80211_send_unprot_disassoc(struct net_device *dev, const u8 *buf, - size_t len) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - trace_cfg80211_send_unprot_disassoc(dev); - nl80211_send_unprot_disassoc(rdev, dev, buf, len, GFP_ATOMIC); -} -EXPORT_SYMBOL(cfg80211_send_unprot_disassoc); - void cfg80211_send_auth_timeout(struct net_device *dev, const u8 *addr) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -324,7 +300,7 @@ int __cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, err = rdev_auth(rdev, dev, &req); out: - cfg80211_put_bss(req.bss); + 
cfg80211_put_bss(req.bss);
+	cfg80211_put_bss(&rdev->wiphy, req.bss);
 	return err;
 }
@@ -367,27 +343,38 @@ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
 		p1[i] &= p2[i];
 }
 
+/* Do a logical vht_capa &= vht_capa_mask. */
+void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
+				const struct ieee80211_vht_cap *vht_capa_mask)
+{
+	int i;
+	u8 *p1, *p2;
+	if (!vht_capa_mask) {
+		memset(vht_capa, 0, sizeof(*vht_capa));
+		return;
+	}
+
+	p1 = (u8 *)(vht_capa);
+	p2 = (u8 *)(vht_capa_mask);
+	for (i = 0; i < sizeof(*vht_capa); i++)
+		p1[i] &= p2[i];
+}
+
 int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 			  struct net_device *dev,
 			  struct ieee80211_channel *chan,
-			  const u8 *bssid, const u8 *prev_bssid,
+			  const u8 *bssid,
 			  const u8 *ssid, int ssid_len,
-			  const u8 *ie, int ie_len, bool use_mfp,
-			  struct cfg80211_crypto_settings *crypt,
-			  u32 assoc_flags, struct ieee80211_ht_cap *ht_capa,
-			  struct ieee80211_ht_cap *ht_capa_mask)
+			  struct cfg80211_assoc_request *req)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	struct cfg80211_assoc_request req;
 	int err;
 	bool was_connected = false;
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	memset(&req, 0, sizeof(req));
-
-	if (wdev->current_bss && prev_bssid &&
-	    ether_addr_equal(wdev->current_bss->pub.bssid, prev_bssid)) {
+	if (wdev->current_bss && req->prev_bssid &&
+	    ether_addr_equal(wdev->current_bss->pub.bssid, req->prev_bssid)) {
 		/*
 		 * Trying to reassociate: Allow this to proceed and let the old
 		 * association to be dropped when the new one is completed.
@@ -399,40 +386,30 @@ int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 	} else if (wdev->current_bss)
 		return -EALREADY;
 
-	req.ie = ie;
-	req.ie_len = ie_len;
-	memcpy(&req.crypto, crypt, sizeof(req.crypto));
-	req.use_mfp = use_mfp;
-	req.prev_bssid = prev_bssid;
-	req.flags = assoc_flags;
-	if (ht_capa)
-		memcpy(&req.ht_capa, ht_capa, sizeof(req.ht_capa));
-	if (ht_capa_mask)
-		memcpy(&req.ht_capa_mask, ht_capa_mask,
-		       sizeof(req.ht_capa_mask));
-	cfg80211_oper_and_ht_capa(&req.ht_capa_mask,
+	cfg80211_oper_and_ht_capa(&req->ht_capa_mask,
 				  rdev->wiphy.ht_capa_mod_mask);
+	cfg80211_oper_and_vht_capa(&req->vht_capa_mask,
+				   rdev->wiphy.vht_capa_mod_mask);
 
-	req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
-				   WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS);
-	if (!req.bss) {
+	req->bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
+				    WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS);
+	if (!req->bss) {
 		if (was_connected)
 			wdev->sme_state = CFG80211_SME_CONNECTED;
 		return -ENOENT;
 	}
 
-	err = cfg80211_can_use_chan(rdev, wdev, req.bss->channel,
-				    CHAN_MODE_SHARED);
+	err = cfg80211_can_use_chan(rdev, wdev, chan, CHAN_MODE_SHARED);
 	if (err)
 		goto out;
 
-	err = rdev_assoc(rdev, dev, &req);
+	err = rdev_assoc(rdev, dev, req);
 out:
 	if (err) {
 		if (was_connected)
 			wdev->sme_state = CFG80211_SME_CONNECTED;
-		cfg80211_put_bss(req.bss);
+		cfg80211_put_bss(&rdev->wiphy, req->bss);
 	}
 
 	return err;
@@ -441,21 +418,17 @@ out:
 int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 			struct net_device *dev,
 			struct ieee80211_channel *chan,
-			const u8 *bssid, const u8 *prev_bssid,
+			const u8 *bssid,
 			const u8 *ssid, int ssid_len,
-			const u8 *ie, int ie_len, bool use_mfp,
-			struct cfg80211_crypto_settings *crypt,
-			u32 assoc_flags, struct ieee80211_ht_cap *ht_capa,
-			struct ieee80211_ht_cap *ht_capa_mask)
+			struct cfg80211_assoc_request *req)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
 	mutex_lock(&rdev->devlist_mtx);
 	wdev_lock(wdev);
-	err = __cfg80211_mlme_assoc(rdev, dev, chan, bssid,
prev_bssid, - ssid, ssid_len, ie, ie_len, use_mfp, crypt, - assoc_flags, ht_capa, ht_capa_mask); + err = __cfg80211_mlme_assoc(rdev, dev, chan, bssid, + ssid, ssid_len, req); wdev_unlock(wdev); mutex_unlock(&rdev->devlist_mtx); @@ -514,7 +487,7 @@ static int __cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, if (wdev->sme_state != CFG80211_SME_CONNECTED) return -ENOTCONN; - if (WARN_ON(!wdev->current_bss)) + if (WARN(!wdev->current_bss, "sme_state=%d\n", wdev->sme_state)) return -ENOTCONN; memset(&req, 0, sizeof(req)); @@ -572,67 +545,11 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(&rdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; } } -void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie, - struct ieee80211_channel *chan, - unsigned int duration, gfp_t gfp) -{ - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - trace_cfg80211_ready_on_channel(wdev, cookie, chan, duration); - nl80211_send_remain_on_channel(rdev, wdev, cookie, chan, duration, gfp); -} -EXPORT_SYMBOL(cfg80211_ready_on_channel); - -void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, - struct ieee80211_channel *chan, - gfp_t gfp) -{ - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - trace_cfg80211_ready_on_channel_expired(wdev, cookie, chan); - nl80211_send_remain_on_channel_cancel(rdev, wdev, cookie, chan, gfp); -} -EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); - -void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, - struct station_info *sinfo, gfp_t gfp) -{ - struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - trace_cfg80211_new_sta(dev, mac_addr, sinfo); - nl80211_send_sta_event(rdev, dev, mac_addr, sinfo, gfp); -} -EXPORT_SYMBOL(cfg80211_new_sta); - -void cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp) -{ - struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - trace_cfg80211_del_sta(dev, mac_addr); - nl80211_send_sta_del_event(rdev, dev, mac_addr, gfp); -} -EXPORT_SYMBOL(cfg80211_del_sta); - -void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, - enum nl80211_connect_failed_reason reason, - gfp_t gfp) -{ - struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - nl80211_send_conn_failed_event(rdev, dev, mac_addr, reason, gfp); -} -EXPORT_SYMBOL(cfg80211_conn_failed); - struct cfg80211_mgmt_registration { struct list_head list; @@ -731,6 +648,11 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) spin_unlock_bh(&wdev->mgmt_registrations_lock); + if (nlportid && rdev->crit_proto_nlportid == nlportid) { + rdev->crit_proto_nlportid = 0; + rdev_crit_proto_stop(rdev, wdev); + } + if (nlportid == wdev->ap_unexpected_nlportid) wdev->ap_unexpected_nlportid = 0; } @@ -909,143 +831,122 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, } EXPORT_SYMBOL(cfg80211_rx_mgmt); -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) -{ - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - - 
trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); +void cfg80211_dfs_channels_update_work(struct work_struct *work) +{ + struct delayed_work *delayed_work; + struct cfg80211_registered_device *rdev; + struct cfg80211_chan_def chandef; + struct ieee80211_supported_band *sband; + struct ieee80211_channel *c; + struct wiphy *wiphy; + bool check_again = false; + unsigned long timeout, next_time = 0; + int bandid, i; + + delayed_work = container_of(work, struct delayed_work, work); + rdev = container_of(delayed_work, struct cfg80211_registered_device, + dfs_update_channels_wk); + wiphy = &rdev->wiphy; + + mutex_lock(&cfg80211_mutex); + for (bandid = 0; bandid < IEEE80211_NUM_BANDS; bandid++) { + sband = wiphy->bands[bandid]; + if (!sband) + continue; - /* Indicate TX status of the Action frame to user space */ - nl80211_send_mgmt_tx_status(rdev, wdev, cookie, buf, len, ack, gfp); -} -EXPORT_SYMBOL(cfg80211_mgmt_tx_status); + for (i = 0; i < sband->n_channels; i++) { + c = &sband->channels[i]; -void cfg80211_cqm_rssi_notify(struct net_device *dev, - enum nl80211_cqm_rssi_threshold_event rssi_event, - gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + if (c->dfs_state != NL80211_DFS_UNAVAILABLE) + continue; - trace_cfg80211_cqm_rssi_notify(dev, rssi_event); + timeout = c->dfs_state_entered + + IEEE80211_DFS_MIN_NOP_TIME_MS; - /* Indicate roaming trigger event to user space */ - nl80211_send_cqm_rssi_notify(rdev, dev, rssi_event, gfp); -} -EXPORT_SYMBOL(cfg80211_cqm_rssi_notify); + if (time_after_eq(jiffies, timeout)) { + c->dfs_state = NL80211_DFS_USABLE; + cfg80211_chandef_create(&chandef, c, + NL80211_CHAN_NO_HT); -void cfg80211_cqm_pktloss_notify(struct net_device *dev, - const u8 *peer, u32 num_packets, gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_radar_notify(rdev, &chandef, + NL80211_RADAR_NOP_FINISHED, + NULL, GFP_ATOMIC); + continue; + } - trace_cfg80211_cqm_pktloss_notify(dev, peer, num_packets); + if (!check_again) + next_time = timeout - jiffies; + else + next_time = min(next_time, timeout - jiffies); + check_again = true; + } + } + mutex_unlock(&cfg80211_mutex); - /* Indicate roaming trigger event to user space */ - nl80211_send_cqm_pktloss_notify(rdev, dev, peer, num_packets, gfp); + /* reschedule if there are other channels waiting to be cleared again */ + if (check_again) + queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, + next_time); } -EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify); -void cfg80211_cqm_txe_notify(struct net_device *dev, - const u8 *peer, u32 num_packets, - u32 rate, u32 intvl, gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - nl80211_send_cqm_txe_notify(rdev, dev, peer, num_packets, - rate, intvl, gfp); -} -EXPORT_SYMBOL(cfg80211_cqm_txe_notify); - -void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid, - const u8 *replay_ctr, gfp_t gfp) +void cfg80211_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + gfp_t gfp) { - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + unsigned long timeout; - trace_cfg80211_gtk_rekey_notify(dev, bssid); - 
nl80211_gtk_rekey_notify(rdev, dev, bssid, replay_ctr, gfp); -} -EXPORT_SYMBOL(cfg80211_gtk_rekey_notify); + trace_cfg80211_radar_event(wiphy, chandef); -void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, - const u8 *bssid, bool preauth, gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + /* only set the chandef supplied channel to unavailable, in + * case the radar is detected on only one of multiple channels + * spanned by the chandef. + */ + cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); - trace_cfg80211_pmksa_candidate_notify(dev, index, bssid, preauth); - nl80211_pmksa_candidate_notify(rdev, dev, index, bssid, preauth, gfp); + timeout = msecs_to_jiffies(IEEE80211_DFS_MIN_NOP_TIME_MS); + queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, + timeout); + + nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); } -EXPORT_SYMBOL(cfg80211_pmksa_candidate_notify); +EXPORT_SYMBOL(cfg80211_radar_event); -void cfg80211_ch_switch_notify(struct net_device *dev, - struct cfg80211_chan_def *chandef) +void cfg80211_cac_event(struct net_device *netdev, + enum nl80211_radar_event event, gfp_t gfp) { - struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wireless_dev *wdev = netdev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_chan_def chandef; + unsigned long timeout; - trace_cfg80211_ch_switch_notify(dev, chandef); - - wdev_lock(wdev); - - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && - wdev->iftype != NL80211_IFTYPE_P2P_GO)) - goto out; + trace_cfg80211_cac_event(netdev, event); - wdev->channel = chandef->chan; - nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL); -out: - wdev_unlock(wdev); - return; -} -EXPORT_SYMBOL(cfg80211_ch_switch_notify); + if (WARN_ON(!wdev->cac_started)) + return; -bool cfg80211_rx_spurious_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - bool ret; + if (WARN_ON(!wdev->channel)) + return; - trace_cfg80211_rx_spurious_frame(dev, addr); + cfg80211_chandef_create(&chandef, wdev->channel, NL80211_CHAN_NO_HT); - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && - wdev->iftype != NL80211_IFTYPE_P2P_GO)) { - trace_cfg80211_return_bool(false); - return false; + switch (event) { + case NL80211_RADAR_CAC_FINISHED: + timeout = wdev->cac_start_time + + msecs_to_jiffies(IEEE80211_DFS_MIN_CAC_TIME_MS); + WARN_ON(!time_after_eq(jiffies, timeout)); + cfg80211_set_dfs_state(wiphy, &chandef, NL80211_DFS_AVAILABLE); + break; + case NL80211_RADAR_CAC_ABORTED: + break; + default: + WARN_ON(1); + return; } - ret = nl80211_unexpected_frame(dev, addr, gfp); - trace_cfg80211_return_bool(ret); - return ret; -} -EXPORT_SYMBOL(cfg80211_rx_spurious_frame); + wdev->cac_started = false; -bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - bool ret; - - trace_cfg80211_rx_unexpected_4addr_frame(dev, addr); - - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && - wdev->iftype != NL80211_IFTYPE_P2P_GO && - wdev->iftype != NL80211_IFTYPE_AP_VLAN)) { - trace_cfg80211_return_bool(false); - return false; - } - ret = nl80211_unexpected_4addr_frame(dev, addr, gfp); - trace_cfg80211_return_bool(ret); - return ret; + nl80211_radar_notify(rdev, &chandef, event, netdev, gfp); } 
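The CAC/radar flow above is easiest to see end to end, so here is a minimal driver-side sketch; the example_* helpers are hypothetical, and only cfg80211_cac_event() and cfg80211_radar_event() are the entry points this patch adds:

#include <net/cfg80211.h>

/* Hypothetical driver hook: no radar seen during the whole CAC. */
static void example_cac_finished(struct net_device *netdev)
{
	/*
	 * After IEEE80211_DFS_MIN_CAC_TIME_MS of silence the channel
	 * moves to NL80211_DFS_AVAILABLE and may be used for AP/GO.
	 */
	cfg80211_cac_event(netdev, NL80211_RADAR_CAC_FINISHED, GFP_KERNEL);
}

/* Hypothetical driver hook: radar pulse detected on the chandef. */
static void example_radar_detected(struct wiphy *wiphy,
				   struct cfg80211_chan_def *chandef)
{
	/*
	 * Marks the spanned 20 MHz channels NL80211_DFS_UNAVAILABLE and
	 * arms dfs_update_channels_wk to move them back to USABLE once
	 * the non-occupancy period (IEEE80211_DFS_MIN_NOP_TIME_MS) ends.
	 */
	cfg80211_radar_event(wiphy, chandef, GFP_KERNEL);
}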
-EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame);
+EXPORT_SYMBOL(cfg80211_cac_event);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f45706adaf34..afa283841e8c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -19,6 +19,7 @@
 #include <net/genetlink.h>
 #include <net/cfg80211.h>
 #include <net/sock.h>
+#include <net/inet_connection_sock.h>
 #include "core.h"
 #include "nl80211.h"
 #include "reg.h"
@@ -365,6 +366,18 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
 	[NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 },
 	[NL80211_ATTR_P2P_OPPPS] = { .type = NLA_U8 },
+	[NL80211_ATTR_ACL_POLICY] = { .type = NLA_U32 },
+	[NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED },
+	[NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 },
+	[NL80211_ATTR_STA_EXT_CAPABILITY] = { .type = NLA_BINARY, },
+	[NL80211_ATTR_SPLIT_WIPHY_DUMP] = { .type = NLA_FLAG, },
+	[NL80211_ATTR_DISABLE_VHT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_VHT_CAPABILITY_MASK] = {
+		.len = NL80211_VHT_CAPABILITY_LEN,
+	},
+	[NL80211_ATTR_MDID] = { .type = NLA_U16 },
+	[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
+				  .len = IEEE80211_MAX_DATA_LEN },
 };
 
 /* policy for the key attributes */
@@ -397,6 +410,26 @@ nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
 	[NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST] = { .type = NLA_FLAG },
 	[NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE] = { .type = NLA_FLAG },
 	[NL80211_WOWLAN_TRIG_RFKILL_RELEASE] = { .type = NLA_FLAG },
+	[NL80211_WOWLAN_TRIG_TCP_CONNECTION] = { .type = NLA_NESTED },
 };
+
+static const struct nla_policy
+nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
+	[NL80211_WOWLAN_TCP_SRC_IPV4] = { .type = NLA_U32 },
+	[NL80211_WOWLAN_TCP_DST_IPV4] = { .type = NLA_U32 },
+	[NL80211_WOWLAN_TCP_DST_MAC] = { .len = ETH_ALEN },
+	[NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 },
+	[NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 },
+	[NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .len = 1 },
+	[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ] = {
+		.len = sizeof(struct nl80211_wowlan_tcp_data_seq)
+	},
+	[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN] = {
+		.len = sizeof(struct nl80211_wowlan_tcp_data_token)
+	},
+	[NL80211_WOWLAN_TCP_DATA_INTERVAL] = { .type = NLA_U32 },
+	[NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .len = 1 },
+	[NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 },
+};
 
 /* policy for GTK rekey offload attributes */
@@ -414,62 +447,69 @@ nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = {
 	[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 },
 };
 
-/* ifidx get helper */
-static int nl80211_get_ifidx(struct netlink_callback *cb)
+static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
+				     struct netlink_callback *cb,
+				     struct cfg80211_registered_device **rdev,
+				     struct wireless_dev **wdev)
 {
-	int res;
-
-	res = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
-			  nl80211_fam.attrbuf, nl80211_fam.maxattr,
-			  nl80211_policy);
-	if (res)
-		return res;
-
-	if (!nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX])
-		return -EINVAL;
+	int err;
 
-	res = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX]);
-	if (!res)
-		return -EINVAL;
-	return res;
-}
+	rtnl_lock();
+	mutex_lock(&cfg80211_mutex);
 
-static int nl80211_prepare_netdev_dump(struct sk_buff *skb,
-				       struct netlink_callback *cb,
-				       struct cfg80211_registered_device **rdev,
-				       struct net_device **dev)
-{
-	int ifidx = cb->args[0];
-	int err;
+	if (!cb->args[0]) {
+		err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
+				  nl80211_fam.attrbuf, nl80211_fam.maxattr,
nl80211_policy); + if (err) + goto out_unlock; - if (!ifidx) - ifidx = nl80211_get_ifidx(cb); - if (ifidx < 0) - return ifidx; + *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), + nl80211_fam.attrbuf); + if (IS_ERR(*wdev)) { + err = PTR_ERR(*wdev); + goto out_unlock; + } + *rdev = wiphy_to_dev((*wdev)->wiphy); + cb->args[0] = (*rdev)->wiphy_idx; + cb->args[1] = (*wdev)->identifier; + } else { + struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0]); + struct wireless_dev *tmp; - cb->args[0] = ifidx; + if (!wiphy) { + err = -ENODEV; + goto out_unlock; + } + *rdev = wiphy_to_dev(wiphy); + *wdev = NULL; - rtnl_lock(); + mutex_lock(&(*rdev)->devlist_mtx); + list_for_each_entry(tmp, &(*rdev)->wdev_list, list) { + if (tmp->identifier == cb->args[1]) { + *wdev = tmp; + break; + } + } + mutex_unlock(&(*rdev)->devlist_mtx); - *dev = __dev_get_by_index(sock_net(skb->sk), ifidx); - if (!*dev) { - err = -ENODEV; - goto out_rtnl; + if (!*wdev) { + err = -ENODEV; + goto out_unlock; + } } - *rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx); - if (IS_ERR(*rdev)) { - err = PTR_ERR(*rdev); - goto out_rtnl; - } + cfg80211_lock_rdev(*rdev); + mutex_unlock(&cfg80211_mutex); return 0; - out_rtnl: + out_unlock: + mutex_unlock(&cfg80211_mutex); rtnl_unlock(); return err; } -static void nl80211_finish_netdev_dump(struct cfg80211_registered_device *rdev) +static void nl80211_finish_wdev_dump(struct cfg80211_registered_device *rdev) { cfg80211_unlock_rdev(rdev); rtnl_unlock(); @@ -514,7 +554,8 @@ static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, } static int nl80211_msg_put_channel(struct sk_buff *msg, - struct ieee80211_channel *chan) + struct ieee80211_channel *chan, + bool large) { if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_FREQ, chan->center_freq)) @@ -529,9 +570,37 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, if ((chan->flags & IEEE80211_CHAN_NO_IBSS) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_IBSS)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_RADAR) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_RADAR)) - goto nla_put_failure; + if (chan->flags & IEEE80211_CHAN_RADAR) { + if (nla_put_flag(msg, NL80211_FREQUENCY_ATTR_RADAR)) + goto nla_put_failure; + if (large) { + u32 time; + + time = elapsed_jiffies_msecs(chan->dfs_state_entered); + + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_DFS_STATE, + chan->dfs_state)) + goto nla_put_failure; + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_DFS_TIME, + time)) + goto nla_put_failure; + } + } + + if (large) { + if ((chan->flags & IEEE80211_CHAN_NO_HT40MINUS) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HT40_MINUS)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_HT40PLUS) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HT40_PLUS)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_80MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_80MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_160MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_160MHZ)) + goto nla_put_failure; + } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, DBM_TO_MBM(chan->max_power))) @@ -807,7 +876,8 @@ nla_put_failure: } static int nl80211_put_iface_combinations(struct wiphy *wiphy, - struct sk_buff *msg) + struct sk_buff *msg, + bool large) { struct nlattr *nl_combis; int i, j; @@ -856,6 +926,10 @@ static int nl80211_put_iface_combinations(struct wiphy *wiphy, nla_put_u32(msg, NL80211_IFACE_COMB_MAXNUM, c->max_interfaces)) goto nla_put_failure; + if (large && + 
nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS, + c->radar_detect_widths)) + goto nla_put_failure; nla_nest_end(msg, nl_combi); } @@ -867,404 +941,615 @@ nla_put_failure: return -ENOBUFS; } -static int nl80211_send_wiphy(struct sk_buff *msg, u32 portid, u32 seq, int flags, - struct cfg80211_registered_device *dev) +#ifdef CONFIG_PM +static int nl80211_send_wowlan_tcp_caps(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) { - void *hdr; - struct nlattr *nl_bands, *nl_band; - struct nlattr *nl_freqs, *nl_freq; - struct nlattr *nl_rates, *nl_rate; - struct nlattr *nl_cmds; - enum ieee80211_band band; - struct ieee80211_channel *chan; - struct ieee80211_rate *rate; - int i; - const struct ieee80211_txrx_stypes *mgmt_stypes = - dev->wiphy.mgmt_stypes; + const struct wiphy_wowlan_tcp_support *tcp = rdev->wiphy.wowlan.tcp; + struct nlattr *nl_tcp; - hdr = nl80211hdr_put(msg, portid, seq, flags, NL80211_CMD_NEW_WIPHY); - if (!hdr) - return -1; + if (!tcp) + return 0; - if (nla_put_u32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx) || - nla_put_string(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)) || - nla_put_u32(msg, NL80211_ATTR_GENERATION, - cfg80211_rdev_list_generation) || - nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT, - dev->wiphy.retry_short) || - nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_LONG, - dev->wiphy.retry_long) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD, - dev->wiphy.frag_threshold) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD, - dev->wiphy.rts_threshold) || - nla_put_u8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS, - dev->wiphy.coverage_class) || - nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, - dev->wiphy.max_scan_ssids) || - nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS, - dev->wiphy.max_sched_scan_ssids) || - nla_put_u16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, - dev->wiphy.max_scan_ie_len) || - nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN, - dev->wiphy.max_sched_scan_ie_len) || - nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS, - dev->wiphy.max_match_sets)) - goto nla_put_failure; + nl_tcp = nla_nest_start(msg, NL80211_WOWLAN_TRIG_TCP_CONNECTION); + if (!nl_tcp) + return -ENOBUFS; - if ((dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) && - nla_put_flag(msg, NL80211_ATTR_SUPPORT_IBSS_RSN)) - goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) && - nla_put_flag(msg, NL80211_ATTR_SUPPORT_MESH_AUTH)) - goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) && - nla_put_flag(msg, NL80211_ATTR_SUPPORT_AP_UAPSD)) - goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && - nla_put_flag(msg, NL80211_ATTR_ROAM_SUPPORT)) - goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) && - nla_put_flag(msg, NL80211_ATTR_TDLS_SUPPORT)) - goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) && - nla_put_flag(msg, NL80211_ATTR_TDLS_EXTERNAL_SETUP)) - goto nla_put_failure; + if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD, + tcp->data_payload_max)) + return -ENOBUFS; - if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES, - sizeof(u32) * dev->wiphy.n_cipher_suites, - dev->wiphy.cipher_suites)) - goto nla_put_failure; + if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD, + tcp->data_payload_max)) + return -ENOBUFS; - if (nla_put_u8(msg, NL80211_ATTR_MAX_NUM_PMKIDS, - dev->wiphy.max_num_pmkids)) - goto nla_put_failure; + if (tcp->seq && nla_put_flag(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ)) + return -ENOBUFS; - if ((dev->wiphy.flags & 
WIPHY_FLAG_CONTROL_PORT_PROTOCOL) && - nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE)) - goto nla_put_failure; + if (tcp->tok && nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN, + sizeof(*tcp->tok), tcp->tok)) + return -ENOBUFS; - if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX, - dev->wiphy.available_antennas_tx) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX, - dev->wiphy.available_antennas_rx)) - goto nla_put_failure; + if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_INTERVAL, + tcp->data_interval_max)) + return -ENOBUFS; - if ((dev->wiphy.flags & WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD) && - nla_put_u32(msg, NL80211_ATTR_PROBE_RESP_OFFLOAD, - dev->wiphy.probe_resp_offload)) - goto nla_put_failure; + if (nla_put_u32(msg, NL80211_WOWLAN_TCP_WAKE_PAYLOAD, + tcp->wake_payload_max)) + return -ENOBUFS; - if ((dev->wiphy.available_antennas_tx || - dev->wiphy.available_antennas_rx) && dev->ops->get_antenna) { - u32 tx_ant = 0, rx_ant = 0; - int res; - res = rdev_get_antenna(dev, &tx_ant, &rx_ant); - if (!res) { - if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_TX, - tx_ant) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_RX, - rx_ant)) - goto nla_put_failure; - } + nla_nest_end(msg, nl_tcp); + return 0; +} + +static int nl80211_send_wowlan(struct sk_buff *msg, + struct cfg80211_registered_device *dev, + bool large) +{ + struct nlattr *nl_wowlan; + + if (!dev->wiphy.wowlan.flags && !dev->wiphy.wowlan.n_patterns) + return 0; + + nl_wowlan = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED); + if (!nl_wowlan) + return -ENOBUFS; + + if (((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_ANY) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_ANY)) || + ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_DISCONNECT) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT)) || + ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_MAGIC_PKT) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT)) || + ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED)) || + ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) || + ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) || + ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_4WAY_HANDSHAKE) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) || + ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_RFKILL_RELEASE) && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE))) + return -ENOBUFS; + + if (dev->wiphy.wowlan.n_patterns) { + struct nl80211_wowlan_pattern_support pat = { + .max_patterns = dev->wiphy.wowlan.n_patterns, + .min_pattern_len = dev->wiphy.wowlan.pattern_min_len, + .max_pattern_len = dev->wiphy.wowlan.pattern_max_len, + .max_pkt_offset = dev->wiphy.wowlan.max_pkt_offset, + }; + + if (nla_put(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN, + sizeof(pat), &pat)) + return -ENOBUFS; } - if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES, - dev->wiphy.interface_modes)) - goto nla_put_failure; + if (large && nl80211_send_wowlan_tcp_caps(dev, msg)) + return -ENOBUFS; - nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS); - if (!nl_bands) - goto nla_put_failure; + nla_nest_end(msg, nl_wowlan); - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { - if (!dev->wiphy.bands[band]) - continue; + return 0; +} +#endif - nl_band = nla_nest_start(msg, band); - if (!nl_band) - goto nla_put_failure; +static int nl80211_send_band_rateinfo(struct sk_buff *msg, + 
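The WoWLAN trigger block above maps one capability bit to one netlink flag attribute through a long chain of tests; the same mapping reads naturally as a table walk. A sketch of that form (the capability names are illustrative stand-ins, not the kernel's):

    #include <stdio.h>

    #define CAP_ANY        (1u << 0)
    #define CAP_DISCONNECT (1u << 1)
    #define CAP_MAGIC_PKT  (1u << 2)

    static const struct { unsigned bit; const char *attr; } map[] = {
        { CAP_ANY,        "TRIG_ANY" },
        { CAP_DISCONNECT, "TRIG_DISCONNECT" },
        { CAP_MAGIC_PKT,  "TRIG_MAGIC_PKT" },
    };

    int main(void)
    {
        unsigned flags = CAP_ANY | CAP_MAGIC_PKT;

        /* one flag attribute per supported capability bit */
        for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++)
            if (flags & map[i].bit)
                printf("put flag %s\n", map[i].attr);
        return 0;
    }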
struct ieee80211_supported_band *sband) +{ + struct nlattr *nl_rates, *nl_rate; + struct ieee80211_rate *rate; + int i; - /* add HT info */ - if (dev->wiphy.bands[band]->ht_cap.ht_supported && - (nla_put(msg, NL80211_BAND_ATTR_HT_MCS_SET, - sizeof(dev->wiphy.bands[band]->ht_cap.mcs), - &dev->wiphy.bands[band]->ht_cap.mcs) || - nla_put_u16(msg, NL80211_BAND_ATTR_HT_CAPA, - dev->wiphy.bands[band]->ht_cap.cap) || - nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_FACTOR, - dev->wiphy.bands[band]->ht_cap.ampdu_factor) || - nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_DENSITY, - dev->wiphy.bands[band]->ht_cap.ampdu_density))) - goto nla_put_failure; + /* add HT info */ + if (sband->ht_cap.ht_supported && + (nla_put(msg, NL80211_BAND_ATTR_HT_MCS_SET, + sizeof(sband->ht_cap.mcs), + &sband->ht_cap.mcs) || + nla_put_u16(msg, NL80211_BAND_ATTR_HT_CAPA, + sband->ht_cap.cap) || + nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_FACTOR, + sband->ht_cap.ampdu_factor) || + nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_DENSITY, + sband->ht_cap.ampdu_density))) + return -ENOBUFS; - /* add VHT info */ - if (dev->wiphy.bands[band]->vht_cap.vht_supported && - (nla_put(msg, NL80211_BAND_ATTR_VHT_MCS_SET, - sizeof(dev->wiphy.bands[band]->vht_cap.vht_mcs), - &dev->wiphy.bands[band]->vht_cap.vht_mcs) || - nla_put_u32(msg, NL80211_BAND_ATTR_VHT_CAPA, - dev->wiphy.bands[band]->vht_cap.cap))) - goto nla_put_failure; + /* add VHT info */ + if (sband->vht_cap.vht_supported && + (nla_put(msg, NL80211_BAND_ATTR_VHT_MCS_SET, + sizeof(sband->vht_cap.vht_mcs), + &sband->vht_cap.vht_mcs) || + nla_put_u32(msg, NL80211_BAND_ATTR_VHT_CAPA, + sband->vht_cap.cap))) + return -ENOBUFS; - /* add frequencies */ - nl_freqs = nla_nest_start(msg, NL80211_BAND_ATTR_FREQS); - if (!nl_freqs) - goto nla_put_failure; + /* add bitrates */ + nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); + if (!nl_rates) + return -ENOBUFS; - for (i = 0; i < dev->wiphy.bands[band]->n_channels; i++) { - nl_freq = nla_nest_start(msg, i); - if (!nl_freq) - goto nla_put_failure; + for (i = 0; i < sband->n_bitrates; i++) { + nl_rate = nla_nest_start(msg, i); + if (!nl_rate) + return -ENOBUFS; + + rate = &sband->bitrates[i]; + if (nla_put_u32(msg, NL80211_BITRATE_ATTR_RATE, + rate->bitrate)) + return -ENOBUFS; + if ((rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) && + nla_put_flag(msg, + NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE)) + return -ENOBUFS; - chan = &dev->wiphy.bands[band]->channels[i]; + nla_nest_end(msg, nl_rate); + } - if (nl80211_msg_put_channel(msg, chan)) - goto nla_put_failure; + nla_nest_end(msg, nl_rates); - nla_nest_end(msg, nl_freq); - } + return 0; +} - nla_nest_end(msg, nl_freqs); +static int +nl80211_send_mgmt_stypes(struct sk_buff *msg, + const struct ieee80211_txrx_stypes *mgmt_stypes) +{ + u16 stypes; + struct nlattr *nl_ftypes, *nl_ifs; + enum nl80211_iftype ift; + int i; - /* add bitrates */ - nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); - if (!nl_rates) - goto nla_put_failure; + if (!mgmt_stypes) + return 0; - for (i = 0; i < dev->wiphy.bands[band]->n_bitrates; i++) { - nl_rate = nla_nest_start(msg, i); - if (!nl_rate) - goto nla_put_failure; + nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES); + if (!nl_ifs) + return -ENOBUFS; - rate = &dev->wiphy.bands[band]->bitrates[i]; - if (nla_put_u32(msg, NL80211_BITRATE_ATTR_RATE, - rate->bitrate)) - goto nla_put_failure; - if ((rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) && - nla_put_flag(msg, - NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE)) - goto nla_put_failure; + for (ift = 0; ift < 
NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + return -ENOBUFS; + i = 0; + stypes = mgmt_stypes[ift].tx; + while (stypes) { + if ((stypes & 1) && + nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT)) + return -ENOBUFS; + stypes >>= 1; + i++; + } + nla_nest_end(msg, nl_ftypes); + } + + nla_nest_end(msg, nl_ifs); + + nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES); + if (!nl_ifs) + return -ENOBUFS; - nla_nest_end(msg, nl_rate); + for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + return -ENOBUFS; + i = 0; + stypes = mgmt_stypes[ift].rx; + while (stypes) { + if ((stypes & 1) && + nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT)) + return -ENOBUFS; + stypes >>= 1; + i++; } + nla_nest_end(msg, nl_ftypes); + } + nla_nest_end(msg, nl_ifs); + + return 0; +} + +static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, + struct sk_buff *msg, u32 portid, u32 seq, + int flags, bool split, long *split_start, + long *band_start, long *chan_start) +{ + void *hdr; + struct nlattr *nl_bands, *nl_band; + struct nlattr *nl_freqs, *nl_freq; + struct nlattr *nl_cmds; + enum ieee80211_band band; + struct ieee80211_channel *chan; + int i; + const struct ieee80211_txrx_stypes *mgmt_stypes = + dev->wiphy.mgmt_stypes; + long start = 0, start_chan = 0, start_band = 0; + u32 features; - nla_nest_end(msg, nl_rates); + hdr = nl80211hdr_put(msg, portid, seq, flags, NL80211_CMD_NEW_WIPHY); + if (!hdr) + return -ENOBUFS; - nla_nest_end(msg, nl_band); + /* allow always using the variables */ + if (!split) { + split_start = &start; + band_start = &start_band; + chan_start = &start_chan; } - nla_nest_end(msg, nl_bands); - nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS); - if (!nl_cmds) - goto nla_put_failure; + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx) || + nla_put_string(msg, NL80211_ATTR_WIPHY_NAME, + wiphy_name(&dev->wiphy)) || + nla_put_u32(msg, NL80211_ATTR_GENERATION, + cfg80211_rdev_list_generation)) + goto nla_put_failure; + + switch (*split_start) { + case 0: + if (nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT, + dev->wiphy.retry_short) || + nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_LONG, + dev->wiphy.retry_long) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD, + dev->wiphy.frag_threshold) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD, + dev->wiphy.rts_threshold) || + nla_put_u8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS, + dev->wiphy.coverage_class) || + nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, + dev->wiphy.max_scan_ssids) || + nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS, + dev->wiphy.max_sched_scan_ssids) || + nla_put_u16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, + dev->wiphy.max_scan_ie_len) || + nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN, + dev->wiphy.max_sched_scan_ie_len) || + nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS, + dev->wiphy.max_match_sets)) + goto nla_put_failure; - i = 0; -#define CMD(op, n) \ - do { \ - if (dev->ops->op) { \ - i++; \ - if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \ - goto nla_put_failure; \ - } \ - } while (0) - - CMD(add_virtual_intf, NEW_INTERFACE); - CMD(change_virtual_intf, SET_INTERFACE); - CMD(add_key, NEW_KEY); - CMD(start_ap, START_AP); - CMD(add_station, NEW_STATION); - CMD(add_mpath, NEW_MPATH); - CMD(update_mesh_config, SET_MESH_CONFIG); - CMD(change_bss, SET_BSS); - CMD(auth, AUTHENTICATE); - CMD(assoc, ASSOCIATE); - CMD(deauth, 
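nl80211_send_mgmt_stypes() above walks a 16-bit subtype mask one bit at a time: bit i stands for management subtype i, and the value advertised to userspace is the 802.11 frame-control encoding (i << 4) | IEEE80211_FTYPE_MGMT. A standalone version of that walk:

    #include <stdint.h>
    #include <stdio.h>

    #define FTYPE_MGMT 0x00    /* management frames: type bits 00 */

    int main(void)
    {
        /* example mask: subtypes 4 (probe req) and 13 (action) */
        uint16_t stypes = (1u << 4) | (1u << 13);
        unsigned i = 0;

        /* same shape as the hunk above: shift the mask, count bits */
        while (stypes) {
            if (stypes & 1)
                printf("frame type 0x%02x\n", (i << 4) | FTYPE_MGMT);
            stypes >>= 1;
            i++;
        }
        return 0;
    }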
DEAUTHENTICATE); - CMD(disassoc, DISASSOCIATE); - CMD(join_ibss, JOIN_IBSS); - CMD(join_mesh, JOIN_MESH); - CMD(set_pmksa, SET_PMKSA); - CMD(del_pmksa, DEL_PMKSA); - CMD(flush_pmksa, FLUSH_PMKSA); - if (dev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) - CMD(remain_on_channel, REMAIN_ON_CHANNEL); - CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); - CMD(mgmt_tx, FRAME); - CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL); - if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { - i++; - if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS)) + if ((dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) && + nla_put_flag(msg, NL80211_ATTR_SUPPORT_IBSS_RSN)) goto nla_put_failure; - } - if (dev->ops->set_monitor_channel || dev->ops->start_ap || - dev->ops->join_mesh) { - i++; - if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL)) + if ((dev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) && + nla_put_flag(msg, NL80211_ATTR_SUPPORT_MESH_AUTH)) goto nla_put_failure; - } - CMD(set_wds_peer, SET_WDS_PEER); - if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) { - CMD(tdls_mgmt, TDLS_MGMT); - CMD(tdls_oper, TDLS_OPER); - } - if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) - CMD(sched_scan_start, START_SCHED_SCAN); - CMD(probe_client, PROBE_CLIENT); - CMD(set_noack_map, SET_NOACK_MAP); - if (dev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) { - i++; - if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS)) + if ((dev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) && + nla_put_flag(msg, NL80211_ATTR_SUPPORT_AP_UAPSD)) + goto nla_put_failure; + if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && + nla_put_flag(msg, NL80211_ATTR_ROAM_SUPPORT)) + goto nla_put_failure; + if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) && + nla_put_flag(msg, NL80211_ATTR_TDLS_SUPPORT)) + goto nla_put_failure; + if ((dev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) && + nla_put_flag(msg, NL80211_ATTR_TDLS_EXTERNAL_SETUP)) goto nla_put_failure; - } - CMD(start_p2p_device, START_P2P_DEVICE); - CMD(set_mcast_rate, SET_MCAST_RATE); -#ifdef CONFIG_NL80211_TESTMODE - CMD(testmode_cmd, TESTMODE); -#endif + (*split_start)++; + if (split) + break; + case 1: + if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES, + sizeof(u32) * dev->wiphy.n_cipher_suites, + dev->wiphy.cipher_suites)) + goto nla_put_failure; -#undef CMD + if (nla_put_u8(msg, NL80211_ATTR_MAX_NUM_PMKIDS, + dev->wiphy.max_num_pmkids)) + goto nla_put_failure; - if (dev->ops->connect || dev->ops->auth) { - i++; - if (nla_put_u32(msg, i, NL80211_CMD_CONNECT)) + if ((dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) && + nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE)) goto nla_put_failure; - } - if (dev->ops->disconnect || dev->ops->deauth) { - i++; - if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT)) + if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX, + dev->wiphy.available_antennas_tx) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX, + dev->wiphy.available_antennas_rx)) goto nla_put_failure; - } - nla_nest_end(msg, nl_cmds); + if ((dev->wiphy.flags & WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD) && + nla_put_u32(msg, NL80211_ATTR_PROBE_RESP_OFFLOAD, + dev->wiphy.probe_resp_offload)) + goto nla_put_failure; - if (dev->ops->remain_on_channel && - (dev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) && - nla_put_u32(msg, NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION, - dev->wiphy.max_remain_on_channel_duration)) - goto nla_put_failure; + if ((dev->wiphy.available_antennas_tx || + dev->wiphy.available_antennas_rx) && + dev->ops->get_antenna) { + u32 tx_ant = 0, rx_ant = 0; + int res; + res = rdev_get_antenna(dev, &tx_ant, 
&rx_ant); + if (!res) { + if (nla_put_u32(msg, + NL80211_ATTR_WIPHY_ANTENNA_TX, + tx_ant) || + nla_put_u32(msg, + NL80211_ATTR_WIPHY_ANTENNA_RX, + rx_ant)) + goto nla_put_failure; + } + } - if ((dev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX) && - nla_put_flag(msg, NL80211_ATTR_OFFCHANNEL_TX_OK)) - goto nla_put_failure; + (*split_start)++; + if (split) + break; + case 2: + if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES, + dev->wiphy.interface_modes)) + goto nla_put_failure; + (*split_start)++; + if (split) + break; + case 3: + nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS); + if (!nl_bands) + goto nla_put_failure; - if (mgmt_stypes) { - u16 stypes; - struct nlattr *nl_ftypes, *nl_ifs; - enum nl80211_iftype ift; + for (band = *band_start; band < IEEE80211_NUM_BANDS; band++) { + struct ieee80211_supported_band *sband; - nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES); - if (!nl_ifs) - goto nla_put_failure; + sband = dev->wiphy.bands[band]; + + if (!sband) + continue; - for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { - nl_ftypes = nla_nest_start(msg, ift); - if (!nl_ftypes) + nl_band = nla_nest_start(msg, band); + if (!nl_band) goto nla_put_failure; - i = 0; - stypes = mgmt_stypes[ift].tx; - while (stypes) { - if ((stypes & 1) && - nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE, - (i << 4) | IEEE80211_FTYPE_MGMT)) + + switch (*chan_start) { + case 0: + if (nl80211_send_band_rateinfo(msg, sband)) goto nla_put_failure; - stypes >>= 1; - i++; + (*chan_start)++; + if (split) + break; + default: + /* add frequencies */ + nl_freqs = nla_nest_start( + msg, NL80211_BAND_ATTR_FREQS); + if (!nl_freqs) + goto nla_put_failure; + + for (i = *chan_start - 1; + i < sband->n_channels; + i++) { + nl_freq = nla_nest_start(msg, i); + if (!nl_freq) + goto nla_put_failure; + + chan = &sband->channels[i]; + + if (nl80211_msg_put_channel(msg, chan, + split)) + goto nla_put_failure; + + nla_nest_end(msg, nl_freq); + if (split) + break; + } + if (i < sband->n_channels) + *chan_start = i + 2; + else + *chan_start = 0; + nla_nest_end(msg, nl_freqs); + } + + nla_nest_end(msg, nl_band); + + if (split) { + /* start again here */ + if (*chan_start) + band--; + break; } - nla_nest_end(msg, nl_ftypes); } + nla_nest_end(msg, nl_bands); - nla_nest_end(msg, nl_ifs); + if (band < IEEE80211_NUM_BANDS) + *band_start = band + 1; + else + *band_start = 0; - nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES); - if (!nl_ifs) + /* if bands & channels are done, continue outside */ + if (*band_start == 0 && *chan_start == 0) + (*split_start)++; + if (split) + break; + case 4: + nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS); + if (!nl_cmds) goto nla_put_failure; - for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { - nl_ftypes = nla_nest_start(msg, ift); - if (!nl_ftypes) + i = 0; +#define CMD(op, n) \ + do { \ + if (dev->ops->op) { \ + i++; \ + if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \ + goto nla_put_failure; \ + } \ + } while (0) + + CMD(add_virtual_intf, NEW_INTERFACE); + CMD(change_virtual_intf, SET_INTERFACE); + CMD(add_key, NEW_KEY); + CMD(start_ap, START_AP); + CMD(add_station, NEW_STATION); + CMD(add_mpath, NEW_MPATH); + CMD(update_mesh_config, SET_MESH_CONFIG); + CMD(change_bss, SET_BSS); + CMD(auth, AUTHENTICATE); + CMD(assoc, ASSOCIATE); + CMD(deauth, DEAUTHENTICATE); + CMD(disassoc, DISASSOCIATE); + CMD(join_ibss, JOIN_IBSS); + CMD(join_mesh, JOIN_MESH); + CMD(set_pmksa, SET_PMKSA); + CMD(del_pmksa, DEL_PMKSA); + CMD(flush_pmksa, FLUSH_PMKSA); + if (dev->wiphy.flags & 
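The split dump turns nl80211_send_wiphy() into a re-entrant state machine: a switch on *split_start whose cases each emit one chunk, advance the cursor, and break only when splitting, while an unsplit caller falls through every case in a single pass. A toy rendition of that control flow (emit() and the chunk names are stand-ins):

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-in for a series of nla_put()s; a real builder can fail */
    static bool emit(const char *chunk)
    {
        return printf("emit %s\n", chunk) > 0;
    }

    /* one round: resumes at *state, leaves *state at 0 when done */
    static bool send_round(long *state, bool split)
    {
        switch (*state) {
        case 0:
            if (!emit("limits"))
                return false;    /* retry this chunk next round */
            (*state)++;
            if (split)
                break;
            /* fall through */
        case 1:
            if (!emit("bands"))
                return false;
            (*state)++;
            if (split)
                break;
            /* fall through */
        case 2:
            if (!emit("commands"))
                return false;
            *state = 0;    /* done */
            break;
        }
        return true;
    }

    int main(void)
    {
        long state = 0;

        do    /* split mode: one chunk per round, like one skb each */
            send_round(&state, true);
        while (state != 0);
        return 0;
    }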
WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) + CMD(remain_on_channel, REMAIN_ON_CHANNEL); + CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); + CMD(mgmt_tx, FRAME); + CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL); + if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { + i++; + if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS)) goto nla_put_failure; - i = 0; - stypes = mgmt_stypes[ift].rx; - while (stypes) { - if ((stypes & 1) && - nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE, - (i << 4) | IEEE80211_FTYPE_MGMT)) - goto nla_put_failure; - stypes >>= 1; - i++; - } - nla_nest_end(msg, nl_ftypes); } - nla_nest_end(msg, nl_ifs); - } + if (dev->ops->set_monitor_channel || dev->ops->start_ap || + dev->ops->join_mesh) { + i++; + if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL)) + goto nla_put_failure; + } + CMD(set_wds_peer, SET_WDS_PEER); + if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) { + CMD(tdls_mgmt, TDLS_MGMT); + CMD(tdls_oper, TDLS_OPER); + } + if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + CMD(sched_scan_start, START_SCHED_SCAN); + CMD(probe_client, PROBE_CLIENT); + CMD(set_noack_map, SET_NOACK_MAP); + if (dev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) { + i++; + if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS)) + goto nla_put_failure; + } + CMD(start_p2p_device, START_P2P_DEVICE); + CMD(set_mcast_rate, SET_MCAST_RATE); + if (split) { + CMD(crit_proto_start, CRIT_PROTOCOL_START); + CMD(crit_proto_stop, CRIT_PROTOCOL_STOP); + } -#ifdef CONFIG_PM - if (dev->wiphy.wowlan.flags || dev->wiphy.wowlan.n_patterns) { - struct nlattr *nl_wowlan; +#ifdef CONFIG_NL80211_TESTMODE + CMD(testmode_cmd, TESTMODE); +#endif - nl_wowlan = nla_nest_start(msg, - NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED); - if (!nl_wowlan) - goto nla_put_failure; +#undef CMD - if (((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_ANY) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_ANY)) || - ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_DISCONNECT) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT)) || - ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_MAGIC_PKT) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT)) || - ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED)) || - ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) || - ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) || - ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_4WAY_HANDSHAKE) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) || - ((dev->wiphy.wowlan.flags & WIPHY_WOWLAN_RFKILL_RELEASE) && - nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE))) - goto nla_put_failure; - if (dev->wiphy.wowlan.n_patterns) { - struct nl80211_wowlan_pattern_support pat = { - .max_patterns = dev->wiphy.wowlan.n_patterns, - .min_pattern_len = - dev->wiphy.wowlan.pattern_min_len, - .max_pattern_len = - dev->wiphy.wowlan.pattern_max_len, - }; - if (nla_put(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN, - sizeof(pat), &pat)) + if (dev->ops->connect || dev->ops->auth) { + i++; + if (nla_put_u32(msg, i, NL80211_CMD_CONNECT)) goto nla_put_failure; } - nla_nest_end(msg, nl_wowlan); - } + if (dev->ops->disconnect || dev->ops->deauth) { + i++; + if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT)) + goto nla_put_failure; + } + + nla_nest_end(msg, nl_cmds); + (*split_start)++; + if (split) + break; + case 5: + if (dev->ops->remain_on_channel && + (dev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) && + 
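The CMD() helper above is the classic do { } while (0) macro idiom: it keeps a running attribute index and jumps to the shared error label when a put fails, so advertising an implemented op costs one line. A compilable sketch of the pattern with an invented ops structure:

    #include <stdio.h>

    struct ops {
        int (*add_key)(void);
        int (*start_ap)(void);    /* NULL: not implemented */
    };

    static int dummy(void) { return 0; }

    static int put_u32(int idx, const char *name)
    {
        return printf("attr %d = CMD_%s\n", idx, name) < 0 ? -1 : 0;
    }

    /* advertise op as command n only if the driver implements it */
    #define CMD(op, n)                          \
        do {                                    \
            if (dev->op) {                      \
                i++;                            \
                if (put_u32(i, #n))             \
                    goto failure;               \
            }                                   \
        } while (0)

    int main(void)
    {
        struct ops o = { .add_key = dummy };    /* start_ap left NULL */
        struct ops *dev = &o;
        int i = 0;

        CMD(add_key, NEW_KEY);      /* advertised */
        CMD(start_ap, START_AP);    /* silently skipped */
        return 0;

    failure:
        return 1;
    }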
nla_put_u32(msg, + NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION, + dev->wiphy.max_remain_on_channel_duration)) + goto nla_put_failure; + + if ((dev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX) && + nla_put_flag(msg, NL80211_ATTR_OFFCHANNEL_TX_OK)) + goto nla_put_failure; + + if (nl80211_send_mgmt_stypes(msg, mgmt_stypes)) + goto nla_put_failure; + (*split_start)++; + if (split) + break; + case 6: +#ifdef CONFIG_PM + if (nl80211_send_wowlan(msg, dev, split)) + goto nla_put_failure; + (*split_start)++; + if (split) + break; +#else + (*split_start)++; #endif + case 7: + if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES, + dev->wiphy.software_iftypes)) + goto nla_put_failure; - if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES, - dev->wiphy.software_iftypes)) - goto nla_put_failure; + if (nl80211_put_iface_combinations(&dev->wiphy, msg, split)) + goto nla_put_failure; - if (nl80211_put_iface_combinations(&dev->wiphy, msg)) - goto nla_put_failure; + (*split_start)++; + if (split) + break; + case 8: + if ((dev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) && + nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME, + dev->wiphy.ap_sme_capa)) + goto nla_put_failure; - if ((dev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) && - nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME, - dev->wiphy.ap_sme_capa)) - goto nla_put_failure; + features = dev->wiphy.features; + /* + * We can only add the per-channel limit information if the + * dump is split, otherwise it makes it too big. Therefore + * only advertise it in that case. + */ + if (split) + features |= NL80211_FEATURE_ADVERTISE_CHAN_LIMITS; + if (nla_put_u32(msg, NL80211_ATTR_FEATURE_FLAGS, features)) + goto nla_put_failure; - if (nla_put_u32(msg, NL80211_ATTR_FEATURE_FLAGS, - dev->wiphy.features)) - goto nla_put_failure; + if (dev->wiphy.ht_capa_mod_mask && + nla_put(msg, NL80211_ATTR_HT_CAPABILITY_MASK, + sizeof(*dev->wiphy.ht_capa_mod_mask), + dev->wiphy.ht_capa_mod_mask)) + goto nla_put_failure; - if (dev->wiphy.ht_capa_mod_mask && - nla_put(msg, NL80211_ATTR_HT_CAPABILITY_MASK, - sizeof(*dev->wiphy.ht_capa_mod_mask), - dev->wiphy.ht_capa_mod_mask)) - goto nla_put_failure; + if (dev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME && + dev->wiphy.max_acl_mac_addrs && + nla_put_u32(msg, NL80211_ATTR_MAC_ACL_MAX, + dev->wiphy.max_acl_mac_addrs)) + goto nla_put_failure; + /* + * Any information below this point is only available to + * applications that can deal with it being split. This + * helps ensure that newly added capabilities don't break + * older tools by overrunning their buffers. + * + * We still increment split_start so that in the split + * case we'll continue with more data in the next round, + * but break unconditionally so unsplit data stops here. 
+ */ + (*split_start)++; + break; + case 9: + if (dev->wiphy.extended_capabilities && + (nla_put(msg, NL80211_ATTR_EXT_CAPA, + dev->wiphy.extended_capabilities_len, + dev->wiphy.extended_capabilities) || + nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK, + dev->wiphy.extended_capabilities_len, + dev->wiphy.extended_capabilities_mask))) + goto nla_put_failure; + + if (dev->wiphy.vht_capa_mod_mask && + nla_put(msg, NL80211_ATTR_VHT_CAPABILITY_MASK, + sizeof(*dev->wiphy.vht_capa_mod_mask), + dev->wiphy.vht_capa_mod_mask)) + goto nla_put_failure; + + /* done */ + *split_start = 0; + break; + } return genlmsg_end(msg, hdr); nla_put_failure: @@ -1274,22 +1559,83 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 portid, u32 seq, int flag static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0; + int idx = 0, ret; int start = cb->args[0]; struct cfg80211_registered_device *dev; + s64 filter_wiphy = -1; + bool split = false; + struct nlattr **tb = nl80211_fam.attrbuf; + int res; mutex_lock(&cfg80211_mutex); + res = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, + tb, nl80211_fam.maxattr, nl80211_policy); + if (res == 0) { + split = tb[NL80211_ATTR_SPLIT_WIPHY_DUMP]; + if (tb[NL80211_ATTR_WIPHY]) + filter_wiphy = nla_get_u32(tb[NL80211_ATTR_WIPHY]); + if (tb[NL80211_ATTR_WDEV]) + filter_wiphy = nla_get_u64(tb[NL80211_ATTR_WDEV]) >> 32; + if (tb[NL80211_ATTR_IFINDEX]) { + struct net_device *netdev; + int ifidx = nla_get_u32(tb[NL80211_ATTR_IFINDEX]); + + netdev = dev_get_by_index(sock_net(skb->sk), ifidx); + if (!netdev) { + mutex_unlock(&cfg80211_mutex); + return -ENODEV; + } + if (netdev->ieee80211_ptr) { + dev = wiphy_to_dev( + netdev->ieee80211_ptr->wiphy); + filter_wiphy = dev->wiphy_idx; + } + dev_put(netdev); + } + } + list_for_each_entry(dev, &cfg80211_rdev_list, list) { if (!net_eq(wiphy_net(&dev->wiphy), sock_net(skb->sk))) continue; if (++idx <= start) continue; - if (nl80211_send_wiphy(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - dev) < 0) { - idx--; - break; - } + if (filter_wiphy != -1 && dev->wiphy_idx != filter_wiphy) + continue; + /* attempt to fit multiple wiphy data chunks into the skb */ + do { + ret = nl80211_send_wiphy(dev, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + split, &cb->args[1], + &cb->args[2], + &cb->args[3]); + if (ret < 0) { + /* + * If sending the wiphy data didn't fit (ENOBUFS + * or EMSGSIZE returned), this SKB is still + * empty (so it's not too big because another + * wiphy dataset is already in the skb) and + * we've not tried to adjust the dump allocation + * yet ... then adjust the alloc size to be + * bigger, and return 1 but with the empty skb. + * This results in an empty message being RX'ed + * in userspace, but that is ignored. + * + * We can then retry with the larger buffer. 
+ */ + if ((ret == -ENOBUFS || ret == -EMSGSIZE) && + !skb->len && + cb->min_dump_alloc < 4096) { + cb->min_dump_alloc = 4096; + mutex_unlock(&cfg80211_mutex); + return 1; + } + idx--; + break; + } + } while (cb->args[1] > 0); + break; } mutex_unlock(&cfg80211_mutex); @@ -1303,11 +1649,12 @@ static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; struct cfg80211_registered_device *dev = info->user_ptr[0]; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + msg = nlmsg_new(4096, GFP_KERNEL); if (!msg) return -ENOMEM; - if (nl80211_send_wiphy(msg, info->snd_portid, info->snd_seq, 0, dev) < 0) { + if (nl80211_send_wiphy(dev, msg, info->snd_portid, info->snd_seq, 0, + false, NULL, NULL, NULL) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -2079,6 +2426,13 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) !(rdev->wiphy.interface_modes & (1 << type))) return -EOPNOTSUPP; + if (type == NL80211_IFTYPE_P2P_DEVICE && info->attrs[NL80211_ATTR_MAC]) { + nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC], + ETH_ALEN); + if (!is_valid_ether_addr(params.macaddr)) + return -EADDRNOTAVAIL; + } + if (info->attrs[NL80211_ATTR_4ADDR]) { params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]); err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type); @@ -2481,6 +2835,97 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) return err; } +/* This function returns an error or the number of nested attributes */ +static int validate_acl_mac_addrs(struct nlattr *nl_attr) +{ + struct nlattr *attr; + int n_entries = 0, tmp; + + nla_for_each_nested(attr, nl_attr, tmp) { + if (nla_len(attr) != ETH_ALEN) + return -EINVAL; + + n_entries++; + } + + return n_entries; +} + +/* + * This function parses ACL information and allocates memory for ACL data. + * On successful return, the calling function is responsible to free the + * ACL buffer returned by this function. 
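When even a single split chunk does not fit, the dump code above returns an intentionally empty message and raises cb->min_dump_alloc to 4096, so the next dump round starts over with a larger skb. The same grow-and-retry shape in plain C (emit() and the sizes are stand-ins):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* pretend serializer: fails unless the buffer can hold the payload */
    static int emit(char *buf, size_t cap, const char *payload)
    {
        if (strlen(payload) + 1 > cap)
            return -1;    /* like -ENOBUFS or -EMSGSIZE */
        memcpy(buf, payload, strlen(payload) + 1);
        return 0;
    }

    int main(void)
    {
        size_t cap = 16;    /* default size, like NLMSG_DEFAULT_SIZE */
        const char *payload = "one oversized wiphy data chunk";
        char *buf = malloc(cap);

        if (buf && emit(buf, cap, payload) < 0) {
            /* nothing fit into the empty buffer: retry once, bigger */
            free(buf);
            cap = 4096;
            buf = malloc(cap);
            if (buf && emit(buf, cap, payload) == 0)
                printf("fit on retry: %s\n", buf);
        }
        free(buf);
        return 0;
    }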
+ */ +static struct cfg80211_acl_data *parse_acl_data(struct wiphy *wiphy, + struct genl_info *info) +{ + enum nl80211_acl_policy acl_policy; + struct nlattr *attr; + struct cfg80211_acl_data *acl; + int i = 0, n_entries, tmp; + + if (!wiphy->max_acl_mac_addrs) + return ERR_PTR(-EOPNOTSUPP); + + if (!info->attrs[NL80211_ATTR_ACL_POLICY]) + return ERR_PTR(-EINVAL); + + acl_policy = nla_get_u32(info->attrs[NL80211_ATTR_ACL_POLICY]); + if (acl_policy != NL80211_ACL_POLICY_ACCEPT_UNLESS_LISTED && + acl_policy != NL80211_ACL_POLICY_DENY_UNLESS_LISTED) + return ERR_PTR(-EINVAL); + + if (!info->attrs[NL80211_ATTR_MAC_ADDRS]) + return ERR_PTR(-EINVAL); + + n_entries = validate_acl_mac_addrs(info->attrs[NL80211_ATTR_MAC_ADDRS]); + if (n_entries < 0) + return ERR_PTR(n_entries); + + if (n_entries > wiphy->max_acl_mac_addrs) + return ERR_PTR(-ENOTSUPP); + + acl = kzalloc(sizeof(*acl) + (sizeof(struct mac_address) * n_entries), + GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_MAC_ADDRS], tmp) { + memcpy(acl->mac_addrs[i].addr, nla_data(attr), ETH_ALEN); + i++; + } + + acl->n_acl_entries = n_entries; + acl->acl_policy = acl_policy; + + return acl; +} + +static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_acl_data *acl; + int err; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + if (!dev->ieee80211_ptr->beacon_interval) + return -EINVAL; + + acl = parse_acl_data(&rdev->wiphy, info); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + err = rdev_set_mac_acl(rdev, dev, acl); + + kfree(acl); + + return err; +} + static int nl80211_parse_beacon(struct genl_info *info, struct cfg80211_beacon_data *bcn) { @@ -2598,6 +3043,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_ap_settings params; int err; + u8 radar_detect_width = 0; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -2716,14 +3162,30 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (!cfg80211_reg_can_beacon(&rdev->wiphy, ¶ms.chandef)) return -EINVAL; + err = cfg80211_chandef_dfs_required(wdev->wiphy, ¶ms.chandef); + if (err < 0) + return err; + if (err) { + radar_detect_width = BIT(params.chandef.width); + params.radar_required = true; + } + mutex_lock(&rdev->devlist_mtx); - err = cfg80211_can_use_chan(rdev, wdev, params.chandef.chan, - CHAN_MODE_SHARED); + err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, + params.chandef.chan, + CHAN_MODE_SHARED, + radar_detect_width); mutex_unlock(&rdev->devlist_mtx); if (err) return err; + if (info->attrs[NL80211_ATTR_ACL_POLICY]) { + params.acl = parse_acl_data(&rdev->wiphy, info); + if (IS_ERR(params.acl)) + return PTR_ERR(params.acl); + } + err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { wdev->preset_chandef = params.chandef; @@ -2732,6 +3194,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) wdev->ssid_len = params.ssid_len; memcpy(wdev->ssid, params.ssid, wdev->ssid_len); } + + kfree(params.acl); + return err; } @@ -2796,6 +3261,7 @@ static int parse_station_flags(struct genl_info *info, sta_flags = nla_data(nla); params->sta_flags_mask = sta_flags->mask; params->sta_flags_set = sta_flags->set; 
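parse_acl_data() above follows a two-pass shape: validate and count every entry first, then make a single allocation sized from the verified count and copy the entries in. A portable sketch of count-then-copy over raw 6-byte MAC records (the input framing is invented):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct acl {
        size_t n;
        unsigned char mac[][6];    /* flexible array, sized at alloc */
    };

    static struct acl *parse_acl(const unsigned char *data, size_t len)
    {
        /* pass 1: validate the framing and count the entries */
        if (len % 6)
            return NULL;    /* one malformed entry rejects the list */
        size_t n = len / 6;

        /* pass 2: one allocation sized from the verified count */
        struct acl *acl = malloc(sizeof(*acl) + n * 6);
        if (!acl)
            return NULL;
        acl->n = n;
        memcpy(acl->mac, data, len);
        return acl;
    }

    int main(void)
    {
        const unsigned char raw[12] = { 2, 0, 0, 0, 0, 1,
                                        2, 0, 0, 0, 0, 2 };
        struct acl *acl = parse_acl(raw, sizeof(raw));

        if (acl)
            printf("%zu entries, first ends in %02x\n",
                   acl->n, acl->mac[0][5]);
        free(acl);    /* the caller owns the buffer, as in the hunk */
        return 0;
    }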
+ params->sta_flags_set &= params->sta_flags_mask; if ((params->sta_flags_mask | params->sta_flags_set) & BIT(__NL80211_STA_FLAG_INVALID)) return -EINVAL; @@ -2939,12 +3405,22 @@ static int nl80211_send_station(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u32(msg, NL80211_STA_INFO_INACTIVE_TIME, sinfo->inactive_time)) goto nla_put_failure; - if ((sinfo->filled & STATION_INFO_RX_BYTES) && + if ((sinfo->filled & (STATION_INFO_RX_BYTES | + STATION_INFO_RX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES, - sinfo->rx_bytes) + (u32)sinfo->rx_bytes)) goto nla_put_failure; - if ((sinfo->filled & STATION_INFO_TX_BYTES) && + if ((sinfo->filled & (STATION_INFO_TX_BYTES | + STATION_INFO_TX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES, + (u32)sinfo->tx_bytes)) + goto nla_put_failure; + if ((sinfo->filled & STATION_INFO_RX_BYTES64) && + nla_put_u64(msg, NL80211_STA_INFO_RX_BYTES64, + sinfo->rx_bytes)) + goto nla_put_failure; + if ((sinfo->filled & STATION_INFO_TX_BYTES64) && + nla_put_u64(msg, NL80211_STA_INFO_TX_BYTES64, sinfo->tx_bytes)) goto nla_put_failure; if ((sinfo->filled & STATION_INFO_LLID) && @@ -3001,6 +3477,18 @@ static int nl80211_send_station(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u32(msg, NL80211_STA_INFO_BEACON_LOSS, sinfo->beacon_loss_count)) goto nla_put_failure; + if ((sinfo->filled & STATION_INFO_LOCAL_PM) && + nla_put_u32(msg, NL80211_STA_INFO_LOCAL_PM, + sinfo->local_pm)) + goto nla_put_failure; + if ((sinfo->filled & STATION_INFO_PEER_PM) && + nla_put_u32(msg, NL80211_STA_INFO_PEER_PM, + sinfo->peer_pm)) + goto nla_put_failure; + if ((sinfo->filled & STATION_INFO_NONPEER_PM) && + nla_put_u32(msg, NL80211_STA_INFO_NONPEER_PM, + sinfo->nonpeer_pm)) + goto nla_put_failure; if (sinfo->filled & STATION_INFO_BSS_PARAM) { bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); if (!bss_param) @@ -3048,15 +3536,20 @@ static int nl80211_dump_station(struct sk_buff *skb, { struct station_info sinfo; struct cfg80211_registered_device *dev; - struct net_device *netdev; + struct wireless_dev *wdev; u8 mac_addr[ETH_ALEN]; - int sta_idx = cb->args[1]; + int sta_idx = cb->args[2]; int err; - err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + err = nl80211_prepare_wdev_dump(skb, cb, &dev, &wdev); if (err) return err; + if (!wdev->netdev) { + err = -EINVAL; + goto out_err; + } + if (!dev->ops->dump_station) { err = -EOPNOTSUPP; goto out_err; @@ -3064,7 +3557,7 @@ static int nl80211_dump_station(struct sk_buff *skb, while (1) { memset(&sinfo, 0, sizeof(sinfo)); - err = rdev_dump_station(dev, netdev, sta_idx, + err = rdev_dump_station(dev, wdev->netdev, sta_idx, mac_addr, &sinfo); if (err == -ENOENT) break; @@ -3074,7 +3567,7 @@ static int nl80211_dump_station(struct sk_buff *skb, if (nl80211_send_station(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - dev, netdev, mac_addr, + dev, wdev->netdev, mac_addr, &sinfo) < 0) goto out; @@ -3083,10 +3576,10 @@ static int nl80211_dump_station(struct sk_buff *skb, out: - cb->args[1] = sta_idx; + cb->args[2] = sta_idx; err = skb->len; out_err: - nl80211_finish_netdev_dump(dev); + nl80211_finish_wdev_dump(dev); return err; } @@ -3127,6 +3620,136 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg, info); } +int cfg80211_check_station_change(struct wiphy *wiphy, + struct station_parameters *params, + enum cfg80211_station_type statype) +{ + if (params->listen_interval != -1) + return -EINVAL; + if (params->aid) + return -EINVAL; + + /*
When you run into this, adjust the code below for the new flag */ + BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); + + switch (statype) { + case CFG80211_STA_MESH_PEER_KERNEL: + case CFG80211_STA_MESH_PEER_USER: + /* + * No ignoring the TDLS flag here -- the userspace mesh + * code doesn't have the bug of including TDLS in the + * mask everywhere. + */ + if (params->sta_flags_mask & + ~(BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_MFP) | + BIT(NL80211_STA_FLAG_AUTHORIZED))) + return -EINVAL; + break; + case CFG80211_STA_TDLS_PEER_SETUP: + case CFG80211_STA_TDLS_PEER_ACTIVE: + if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) + return -EINVAL; + /* ignore since it can't change */ + params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER); + break; + default: + /* disallow mesh-specific things */ + if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION) + return -EINVAL; + if (params->local_pm) + return -EINVAL; + if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) + return -EINVAL; + } + + if (statype != CFG80211_STA_TDLS_PEER_SETUP && + statype != CFG80211_STA_TDLS_PEER_ACTIVE) { + /* TDLS can't be set, ... */ + if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) + return -EINVAL; + /* + * ... but don't bother the driver with it. This works around + * a hostapd/wpa_supplicant issue -- it always includes the + * TLDS_PEER flag in the mask even for AP mode. + */ + params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER); + } + + if (statype != CFG80211_STA_TDLS_PEER_SETUP) { + /* reject other things that can't change */ + if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) + return -EINVAL; + if (params->sta_modify_mask & STATION_PARAM_APPLY_CAPABILITY) + return -EINVAL; + if (params->supported_rates) + return -EINVAL; + if (params->ext_capab || params->ht_capa || params->vht_capa) + return -EINVAL; + } + + if (statype != CFG80211_STA_AP_CLIENT) { + if (params->vlan) + return -EINVAL; + } + + switch (statype) { + case CFG80211_STA_AP_MLME_CLIENT: + /* Use this only for authorizing/unauthorizing a station */ + if (!(params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED))) + return -EOPNOTSUPP; + break; + case CFG80211_STA_AP_CLIENT: + /* accept only the listed bits */ + if (params->sta_flags_mask & + ~(BIT(NL80211_STA_FLAG_AUTHORIZED) | + BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED) | + BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | + BIT(NL80211_STA_FLAG_WME) | + BIT(NL80211_STA_FLAG_MFP))) + return -EINVAL; + + /* but authenticated/associated only if driver handles it */ + if (!(wiphy->features & NL80211_FEATURE_FULL_AP_CLIENT_STATE) && + params->sta_flags_mask & + (BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED))) + return -EINVAL; + break; + case CFG80211_STA_IBSS: + case CFG80211_STA_AP_STA: + /* reject any changes other than AUTHORIZED */ + if (params->sta_flags_mask & ~BIT(NL80211_STA_FLAG_AUTHORIZED)) + return -EINVAL; + break; + case CFG80211_STA_TDLS_PEER_SETUP: + /* reject any changes other than AUTHORIZED or WME */ + if (params->sta_flags_mask & ~(BIT(NL80211_STA_FLAG_AUTHORIZED) | + BIT(NL80211_STA_FLAG_WME))) + return -EINVAL; + /* force (at least) rates when authorizing */ + if (params->sta_flags_set & BIT(NL80211_STA_FLAG_AUTHORIZED) && + !params->supported_rates) + return -EINVAL; + break; + case CFG80211_STA_TDLS_PEER_ACTIVE: + /* reject any changes */ + return -EINVAL; + case CFG80211_STA_MESH_PEER_KERNEL: + if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) 
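cfg80211_check_station_change() above boils down to an allow-mask per station state: any flag bit outside the mask for the current type is rejected with -EINVAL. A condensed table form of that check, using a small invented subset of the types and bits:

    #include <stdio.h>

    enum {
        FLAG_AUTHORIZED = 1u << 0,
        FLAG_WME        = 1u << 1,
        FLAG_MFP        = 1u << 2,
    };
    enum statype { STA_AP_STA, STA_IBSS, STA_TDLS_SETUP, STA_NUM };

    /* which flag bits userspace may touch, per station state */
    static const unsigned allowed[STA_NUM] = {
        [STA_AP_STA]     = FLAG_AUTHORIZED,
        [STA_IBSS]       = FLAG_AUTHORIZED,
        [STA_TDLS_SETUP] = FLAG_AUTHORIZED | FLAG_WME,
    };

    static int check_change(enum statype t, unsigned mask)
    {
        return (mask & ~allowed[t]) ? -22 /* -EINVAL */ : 0;
    }

    int main(void)
    {
        printf("%d\n", check_change(STA_AP_STA, FLAG_AUTHORIZED)); /* 0 */
        printf("%d\n", check_change(STA_IBSS, FLAG_MFP));        /* -22 */
        return 0;
    }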
+ return -EINVAL; + break; + case CFG80211_STA_MESH_PEER_USER: + if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION) + return -EINVAL; + break; + } + + return 0; +} +EXPORT_SYMBOL(cfg80211_check_station_change); + /* * Get vlan interface making sure it is running and on the right wiphy. */ @@ -3149,6 +3772,13 @@ static struct net_device *get_vlan(struct genl_info *info, goto error; } + if (v->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + v->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + v->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { + ret = -EINVAL; + goto error; + } + if (!netif_running(v)) { ret = -ENETDOWN; goto error; @@ -3160,18 +3790,74 @@ static struct net_device *get_vlan(struct genl_info *info, return ERR_PTR(ret); } +static struct nla_policy +nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] __read_mostly = { + [NL80211_STA_WME_UAPSD_QUEUES] = { .type = NLA_U8 }, + [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, +}; + +static int nl80211_parse_sta_wme(struct genl_info *info, + struct station_parameters *params) +{ + struct nlattr *tb[NL80211_STA_WME_MAX + 1]; + struct nlattr *nla; + int err; + + /* parse WME attributes if present */ + if (!info->attrs[NL80211_ATTR_STA_WME]) + return 0; + + nla = info->attrs[NL80211_ATTR_STA_WME]; + err = nla_parse_nested(tb, NL80211_STA_WME_MAX, nla, + nl80211_sta_wme_policy); + if (err) + return err; + + if (tb[NL80211_STA_WME_UAPSD_QUEUES]) + params->uapsd_queues = nla_get_u8( + tb[NL80211_STA_WME_UAPSD_QUEUES]); + if (params->uapsd_queues & ~IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK) + return -EINVAL; + + if (tb[NL80211_STA_WME_MAX_SP]) + params->max_sp = nla_get_u8(tb[NL80211_STA_WME_MAX_SP]); + + if (params->max_sp & ~IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK) + return -EINVAL; + + params->sta_modify_mask |= STATION_PARAM_APPLY_UAPSD; + + return 0; +} + +static int nl80211_set_station_tdls(struct genl_info *info, + struct station_parameters *params) +{ + /* Dummy STA entry gets updated once the peer capabilities are known */ + if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) + params->ht_capa = + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) + params->vht_capa = + nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + + return nl80211_parse_sta_wme(info, params); +} + static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - int err; struct net_device *dev = info->user_ptr[1]; struct station_parameters params; - u8 *mac_addr = NULL; + u8 *mac_addr; + int err; memset(¶ms, 0, sizeof(params)); params.listen_interval = -1; - params.plink_state = -1; + + if (!rdev->ops->change_station) + return -EOPNOTSUPP; if (info->attrs[NL80211_ATTR_STA_AID]) return -EINVAL; @@ -3188,119 +3874,84 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); } - if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) - params.listen_interval = - nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) { + params.capability = + nla_get_u16(info->attrs[NL80211_ATTR_STA_CAPABILITY]); + params.sta_modify_mask |= STATION_PARAM_APPLY_CAPABILITY; + } - if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) - params.ht_capa = - nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]) { + params.ext_capab = + nla_data(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]); + 
params.ext_capab_len = + nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]); + } - if (!rdev->ops->change_station) - return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) + return -EINVAL; if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; - if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) + if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) { params.plink_action = - nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); + nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); + if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS) + return -EINVAL; + } - if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) + if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) { params.plink_state = - nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); - - switch (dev->ieee80211_ptr->iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_GO: - /* disallow mesh-specific things */ - if (params.plink_action) + nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); + if (params.plink_state >= NUM_NL80211_PLINK_STATES) return -EINVAL; + params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE; + } - /* TDLS can't be set, ... */ - if (params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) - return -EINVAL; - /* - * ... but don't bother the driver with it. This works around - * a hostapd/wpa_supplicant issue -- it always includes the - * TLDS_PEER flag in the mask even for AP mode. - */ - params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER); + if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]) { + enum nl80211_mesh_power_mode pm = nla_get_u32( + info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]); - /* accept only the listed bits */ - if (params.sta_flags_mask & - ~(BIT(NL80211_STA_FLAG_AUTHORIZED) | - BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | - BIT(NL80211_STA_FLAG_WME) | - BIT(NL80211_STA_FLAG_MFP))) + if (pm <= NL80211_MESH_POWER_UNKNOWN || + pm > NL80211_MESH_POWER_MAX) return -EINVAL; - /* must be last in here for error handling */ - params.vlan = get_vlan(info, rdev); - if (IS_ERR(params.vlan)) - return PTR_ERR(params.vlan); - break; + params.local_pm = pm; + } + + /* Include parameters for TDLS peer (will check later) */ + err = nl80211_set_station_tdls(info, ¶ms); + if (err) + return err; + + params.vlan = get_vlan(info, rdev); + if (IS_ERR(params.vlan)) + return PTR_ERR(params.vlan); + + switch (dev->ieee80211_ptr->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: - /* - * Don't allow userspace to change the TDLS_PEER flag, - * but silently ignore attempts to change it since we - * don't have state here to verify that it doesn't try - * to change the flag. - */ - params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER); - /* fall through */ case NL80211_IFTYPE_ADHOC: - /* disallow things sta doesn't support */ - if (params.plink_action) - return -EINVAL; - if (params.ht_capa) - return -EINVAL; - if (params.listen_interval >= 0) - return -EINVAL; - /* reject any changes other than AUTHORIZED */ - if (params.sta_flags_mask & ~BIT(NL80211_STA_FLAG_AUTHORIZED)) - return -EINVAL; - break; case NL80211_IFTYPE_MESH_POINT: - /* disallow things mesh doesn't support */ - if (params.vlan) - return -EINVAL; - if (params.ht_capa) - return -EINVAL; - if (params.listen_interval >= 0) - return -EINVAL; - /* - * No special handling for TDLS here -- the userspace - * mesh code doesn't have this bug. 
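The set_station path now range-checks attributes at the moment they are read: plink_action must stay below NUM_NL80211_PLINK_ACTIONS, plink_state below NUM_NL80211_PLINK_STATES, and the mesh power mode strictly between UNKNOWN and MAX. A small sketch of that read-then-validate discipline with shortened names:

    #include <stdio.h>

    enum power_mode { PM_UNKNOWN, PM_ACTIVE, PM_LIGHT_SLEEP, PM_DEEP_SLEEP };
    #define PM_MAX PM_DEEP_SLEEP

    /* reject bad values at parse time, before any driver sees them */
    static int parse_power_mode(unsigned v, enum power_mode *out)
    {
        if (v == PM_UNKNOWN || v > PM_MAX)
            return -22;    /* -EINVAL */
        *out = (enum power_mode)v;
        return 0;
    }

    int main(void)
    {
        enum power_mode pm;

        printf("%d\n", parse_power_mode(2, &pm));    /* accepted: 0 */
        printf("%d\n", parse_power_mode(9, &pm));    /* rejected: -22 */
        return 0;
    }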
- */ - if (params.sta_flags_mask & - ~(BIT(NL80211_STA_FLAG_AUTHENTICATED) | - BIT(NL80211_STA_FLAG_MFP) | - BIT(NL80211_STA_FLAG_AUTHORIZED))) - return -EINVAL; break; default: - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto out_put_vlan; } - /* be aware of params.vlan when changing code here */ - + /* driver will call cfg80211_check_station_change() */ err = rdev_change_station(rdev, dev, mac_addr, ¶ms); + out_put_vlan: if (params.vlan) dev_put(params.vlan); return err; } -static struct nla_policy -nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] __read_mostly = { - [NL80211_STA_WME_UAPSD_QUEUES] = { .type = NLA_U8 }, - [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, -}; - static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -3311,6 +3962,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) memset(¶ms, 0, sizeof(params)); + if (!rdev->ops->add_station) + return -EOPNOTSUPP; + if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -3335,6 +3989,19 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (!params.aid || params.aid > IEEE80211_MAX_AID) return -EINVAL; + if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) { + params.capability = + nla_get_u16(info->attrs[NL80211_ATTR_STA_CAPABILITY]); + params.sta_modify_mask |= STATION_PARAM_APPLY_CAPABILITY; + } + + if (info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]) { + params.ext_capab = + nla_data(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]); + params.ext_capab_len = + nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) params.ht_capa = nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); @@ -3343,67 +4010,72 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); - if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) + if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) { params.plink_action = - nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); + nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); + if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS) + return -EINVAL; + } - if (!rdev->ops->add_station) - return -EOPNOTSUPP; + err = nl80211_parse_sta_wme(info, ¶ms); + if (err) + return err; if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; + /* When you run into this, adjust the code below for the new flag */ + BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); + switch (dev->ieee80211_ptr->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: - /* parse WME attributes if sta is WME capable */ - if ((rdev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) && - (params.sta_flags_set & BIT(NL80211_STA_FLAG_WME)) && - info->attrs[NL80211_ATTR_STA_WME]) { - struct nlattr *tb[NL80211_STA_WME_MAX + 1]; - struct nlattr *nla; - - nla = info->attrs[NL80211_ATTR_STA_WME]; - err = nla_parse_nested(tb, NL80211_STA_WME_MAX, nla, - nl80211_sta_wme_policy); - if (err) - return err; - - if (tb[NL80211_STA_WME_UAPSD_QUEUES]) - params.uapsd_queues = - nla_get_u8(tb[NL80211_STA_WME_UAPSD_QUEUES]); - if (params.uapsd_queues & - ~IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK) - return -EINVAL; - - if (tb[NL80211_STA_WME_MAX_SP]) - params.max_sp = - nla_get_u8(tb[NL80211_STA_WME_MAX_SP]); + /* ignore WME attributes if iface/sta is not capable */ + if (!(rdev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) || + !(params.sta_flags_set & 
BIT(NL80211_STA_FLAG_WME))) + params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD; - if (params.max_sp & - ~IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK) - return -EINVAL; - - params.sta_modify_mask |= STATION_PARAM_APPLY_UAPSD; - } /* TDLS peers cannot be added */ if (params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) return -EINVAL; /* but don't bother the driver with it */ params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER); + /* allow authenticated/associated only if driver handles it */ + if (!(rdev->wiphy.features & + NL80211_FEATURE_FULL_AP_CLIENT_STATE) && + params.sta_flags_mask & + (BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED))) + return -EINVAL; + /* must be last in here for error handling */ params.vlan = get_vlan(info, rdev); if (IS_ERR(params.vlan)) return PTR_ERR(params.vlan); break; case NL80211_IFTYPE_MESH_POINT: + /* ignore uAPSD data */ + params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD; + + /* associated is disallowed */ + if (params.sta_flags_mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) + return -EINVAL; /* TDLS peers cannot be added */ if (params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) return -EINVAL; break; case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + /* ignore uAPSD data */ + params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD; + + /* these are disallowed */ + if (params.sta_flags_mask & + (BIT(NL80211_STA_FLAG_ASSOCIATED) | + BIT(NL80211_STA_FLAG_AUTHENTICATED))) + return -EINVAL; /* Only TDLS peers can be added */ if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) return -EINVAL; @@ -3413,6 +4085,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) /* ... with external setup is supported */ if (!(rdev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP)) return -EOPNOTSUPP; + /* + * Older wpa_supplicant versions always mark the TDLS peer + * as authorized, but it shouldn't yet be. 
+ */ + params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_AUTHORIZED); break; default: return -EOPNOTSUPP; @@ -3506,13 +4183,13 @@ static int nl80211_dump_mpath(struct sk_buff *skb, { struct mpath_info pinfo; struct cfg80211_registered_device *dev; - struct net_device *netdev; + struct wireless_dev *wdev; u8 dst[ETH_ALEN]; u8 next_hop[ETH_ALEN]; - int path_idx = cb->args[1]; + int path_idx = cb->args[2]; int err; - err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + err = nl80211_prepare_wdev_dump(skb, cb, &dev, &wdev); if (err) return err; @@ -3521,14 +4198,14 @@ static int nl80211_dump_mpath(struct sk_buff *skb, goto out_err; } - if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { + if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) { err = -EOPNOTSUPP; goto out_err; } while (1) { - err = rdev_dump_mpath(dev, netdev, path_idx, dst, next_hop, - &pinfo); + err = rdev_dump_mpath(dev, wdev->netdev, path_idx, dst, + next_hop, &pinfo); if (err == -ENOENT) break; if (err) @@ -3536,7 +4213,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - netdev, dst, next_hop, + wdev->netdev, dst, next_hop, &pinfo) < 0) goto out; @@ -3545,10 +4222,10 @@ static int nl80211_dump_mpath(struct sk_buff *skb, out: - cb->args[1] = path_idx; + cb->args[2] = path_idx; err = skb->len; out_err: - nl80211_finish_netdev_dump(dev); + nl80211_finish_wdev_dump(dev); return err; } @@ -3787,12 +4464,8 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) * window between nl80211_init() and regulatory_init(), if that is * even possible. */ - mutex_lock(&cfg80211_mutex); - if (unlikely(!cfg80211_regdomain)) { - mutex_unlock(&cfg80211_mutex); + if (unlikely(!rcu_access_pointer(cfg80211_regdomain))) return -EINPROGRESS; - } - mutex_unlock(&cfg80211_mutex); if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) return -EINVAL; @@ -3908,7 +4581,11 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u16(msg, NL80211_MESHCONF_HWMP_ROOT_INTERVAL, cur_params.dot11MeshHWMProotInterval) || nla_put_u16(msg, NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, - cur_params.dot11MeshHWMPconfirmationInterval)) + cur_params.dot11MeshHWMPconfirmationInterval) || + nla_put_u32(msg, NL80211_MESHCONF_POWER_MODE, + cur_params.power_mode) || + nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW, + cur_params.dot11MeshAwakeWindowDuration)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -3947,6 +4624,8 @@ static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_A [NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] = { .type = NLA_U16 }, [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = { .type = NLA_U16 }, + [NL80211_MESHCONF_POWER_MODE] = { .type = NLA_U32 }, + [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, }; static const struct nla_policy @@ -3955,6 +4634,7 @@ static const struct nla_policy [NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL] = { .type = NLA_U8 }, [NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC] = { .type = NLA_U8 }, [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG }, + [NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG }, [NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG }, @@ -3967,13 +4647,15 @@ static int nl80211_parse_mesh_config(struct genl_info *info, struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1]; u32 
mask = 0; -#define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \ -do {\ - if (table[attr_num]) {\ - cfg->param = nla_fn(table[attr_num]); \ - mask |= (1 << (attr_num - 1)); \ - } \ -} while (0);\ +#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \ +do { \ + if (tb[attr]) { \ + if (fn(tb[attr]) < min || fn(tb[attr]) > max) \ + return -EINVAL; \ + cfg->param = fn(tb[attr]); \ + mask |= (1 << (attr - 1)); \ + } \ +} while (0) if (!info->attrs[NL80211_ATTR_MESH_CONFIG]) @@ -3988,83 +4670,98 @@ do {\ BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32); /* Fill in the params struct */ - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255, mask, NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255, mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255, mask, NL80211_MESHCONF_HOLDING_TIMEOUT, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255, mask, NL80211_MESHCONF_MAX_PEER_LINKS, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16, mask, NL80211_MESHCONF_MAX_RETRIES, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255, mask, NL80211_MESHCONF_TTL, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255, mask, NL80211_MESHCONF_ELEMENT_TTL, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1, mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor, mask, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor, + 1, 255, mask, NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, nla_get_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255, mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535, mask, NL80211_MESHCONF_PATH_REFRESH_TIME, nla_get_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535, mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout, mask, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout, + 1, 65535, mask, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, nla_get_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, - mask, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, + 1, 65535, mask, + NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, - mask, NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, + 1, 65535, mask, + NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, - dot11MeshHWMPnetDiameterTraversalTime, mask, + dot11MeshHWMPnetDiameterTraversalTime, + 1, 65535, mask, 
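The reworked FILL_IN_MESH_PARAM_IF_SET macro gains min and max parameters, so every mesh option is bounds-checked at the same spot it is copied, and the change mask still only records attributes that were actually present. A standalone rendition of the idea (the attribute table and field names are stand-ins):

    #include <stdio.h>

    struct cfg { unsigned retry_timeout; unsigned max_retries; };

    #define ATTR_RETRY_TIMEOUT 1
    #define ATTR_MAX_RETRIES   2

    /* toy attribute table: present[a] says whether val[a] was sent */
    static int present[8];
    static unsigned val[8];

    #define FILL_IF_SET(cfg, field, min, max, mask, attr)          \
        do {                                                       \
            if (present[attr]) {                                   \
                if (val[attr] < (min) || val[attr] > (max))        \
                    return -22;    /* -EINVAL */                   \
                (cfg)->field = val[attr];                          \
                (mask) |= 1u << ((attr) - 1);                      \
            }                                                      \
        } while (0)

    static int parse(struct cfg *cfg, unsigned *mask)
    {
        FILL_IF_SET(cfg, retry_timeout, 1, 255, *mask, ATTR_RETRY_TIMEOUT);
        FILL_IF_SET(cfg, max_retries, 0, 16, *mask, ATTR_MAX_RETRIES);
        return 0;
    }

    int main(void)
    {
        struct cfg cfg = {0};
        unsigned mask = 0;

        present[ATTR_MAX_RETRIES] = 1;
        val[ATTR_MAX_RETRIES] = 99;    /* above max: whole parse fails */
        printf("%d\n", parse(&cfg, &mask));    /* prints -22 */
        return 0;
    }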
NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, mask, - NL80211_MESHCONF_HWMP_ROOTMODE, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, mask, - NL80211_MESHCONF_HWMP_RANN_INTERVAL, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4, + mask, NL80211_MESHCONF_HWMP_ROOTMODE, + nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535, + mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, - dot11MeshGateAnnouncementProtocol, mask, - NL80211_MESHCONF_GATE_ANNOUNCEMENTS, + dot11MeshGateAnnouncementProtocol, 0, 1, + mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1, mask, NL80211_MESHCONF_FORWARDING, nla_get_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, 1, 255, mask, NL80211_MESHCONF_RSSI_THRESHOLD, nla_get_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, ht_opmode, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, ht_opmode, 0, 16, mask, NL80211_MESHCONF_HT_OPMODE, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout, - mask, + 1, 65535, mask, NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, nla_get_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, - dot11MeshHWMPconfirmationInterval, mask, + dot11MeshHWMPconfirmationInterval, + 1, 65535, mask, NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, + NL80211_MESH_POWER_ACTIVE, + NL80211_MESH_POWER_MAX, + mask, NL80211_MESHCONF_POWER_MODE, + nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, + 0, 65535, mask, + NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); if (mask_out) *mask_out = mask; @@ -4076,6 +4773,7 @@ do {\ static int nl80211_parse_mesh_setup(struct genl_info *info, struct mesh_setup *setup) { + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct nlattr *tb[NL80211_MESH_SETUP_ATTR_MAX + 1]; if (!info->attrs[NL80211_ATTR_MESH_SETUP]) @@ -4112,8 +4810,14 @@ static int nl80211_parse_mesh_setup(struct genl_info *info, setup->ie = nla_data(ieattr); setup->ie_len = nla_len(ieattr); } + if (tb[NL80211_MESH_SETUP_USERSPACE_MPM] && + !(rdev->wiphy.features & NL80211_FEATURE_USERSPACE_MPM)) + return -EINVAL; + setup->user_mpm = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_MPM]); setup->is_authenticated = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AUTH]); setup->is_secure = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AMPE]); + if (setup->is_secure) + setup->user_mpm = true; return 0; } @@ -4152,6 +4856,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb, static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) { + const struct ieee80211_regdomain *regdom; struct sk_buff *msg; void *hdr = NULL; struct nlattr *nl_reg_rules; @@ -4174,35 +4879,36 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) if (!hdr) goto put_failure; - if (nla_put_string(msg, NL80211_ATTR_REG_ALPHA2, - cfg80211_regdomain->alpha2) || - (cfg80211_regdomain->dfs_region && - nla_put_u8(msg, NL80211_ATTR_DFS_REGION, - cfg80211_regdomain->dfs_region))) - goto nla_put_failure; - if 
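
For orientation, a reader's sketch of what a single invocation of the reworked FILL_IN_MESH_PARAM_IF_SET macro above expands to. The locals (tb, cfg, mask) are those of nl80211_parse_mesh_config from the hunk; the expansion is written out by hand here for illustration and is not part of the patch:

	if (tb[NL80211_MESHCONF_RETRY_TIMEOUT]) {
		/* new in this revision of the macro: range-check the value */
		if (nla_get_u16(tb[NL80211_MESHCONF_RETRY_TIMEOUT]) < 1 ||
		    nla_get_u16(tb[NL80211_MESHCONF_RETRY_TIMEOUT]) > 255)
			return -EINVAL;
		cfg->dot11MeshRetryTimeout =
			nla_get_u16(tb[NL80211_MESHCONF_RETRY_TIMEOUT]);
		/* record which attribute was supplied by userspace */
		mask |= (1 << (NL80211_MESHCONF_RETRY_TIMEOUT - 1));
	}
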
(reg_last_request_cell_base() && nla_put_u32(msg, NL80211_ATTR_USER_REG_HINT_TYPE, NL80211_USER_REG_HINT_CELL_BASE)) goto nla_put_failure; + rcu_read_lock(); + regdom = rcu_dereference(cfg80211_regdomain); + + if (nla_put_string(msg, NL80211_ATTR_REG_ALPHA2, regdom->alpha2) || + (regdom->dfs_region && + nla_put_u8(msg, NL80211_ATTR_DFS_REGION, regdom->dfs_region))) + goto nla_put_failure_rcu; + nl_reg_rules = nla_nest_start(msg, NL80211_ATTR_REG_RULES); if (!nl_reg_rules) - goto nla_put_failure; + goto nla_put_failure_rcu; - for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) { + for (i = 0; i < regdom->n_reg_rules; i++) { struct nlattr *nl_reg_rule; const struct ieee80211_reg_rule *reg_rule; const struct ieee80211_freq_range *freq_range; const struct ieee80211_power_rule *power_rule; - reg_rule = &cfg80211_regdomain->reg_rules[i]; + reg_rule = &regdom->reg_rules[i]; freq_range = &reg_rule->freq_range; power_rule = &reg_rule->power_rule; nl_reg_rule = nla_nest_start(msg, i); if (!nl_reg_rule) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u32(msg, NL80211_ATTR_REG_RULE_FLAGS, reg_rule->flags) || @@ -4216,10 +4922,11 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) power_rule->max_antenna_gain) || nla_put_u32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP, power_rule->max_eirp)) - goto nla_put_failure; + goto nla_put_failure_rcu; nla_nest_end(msg, nl_reg_rule); } + rcu_read_unlock(); nla_nest_end(msg, nl_reg_rules); @@ -4227,6 +4934,8 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) err = genlmsg_reply(msg, info); goto out; +nla_put_failure_rcu: + rcu_read_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); put_failure: @@ -4259,27 +4968,18 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) dfs_region = nla_get_u8(info->attrs[NL80211_ATTR_DFS_REGION]); nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], - rem_reg_rules) { + rem_reg_rules) { num_rules++; if (num_rules > NL80211_MAX_SUPP_REG_RULES) return -EINVAL; } - mutex_lock(&cfg80211_mutex); - - if (!reg_is_valid_request(alpha2)) { - r = -EINVAL; - goto bad_reg; - } - size_of_regd = sizeof(struct ieee80211_regdomain) + - (num_rules * sizeof(struct ieee80211_reg_rule)); + num_rules * sizeof(struct ieee80211_reg_rule); rd = kzalloc(size_of_regd, GFP_KERNEL); - if (!rd) { - r = -ENOMEM; - goto bad_reg; - } + if (!rd) + return -ENOMEM; rd->n_reg_rules = num_rules; rd->alpha2[0] = alpha2[0]; @@ -4293,10 +4993,10 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) rd->dfs_region = dfs_region; nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], - rem_reg_rules) { + rem_reg_rules) { nla_parse(tb, NL80211_REG_RULE_ATTR_MAX, - nla_data(nl_reg_rule), nla_len(nl_reg_rule), - reg_rule_policy); + nla_data(nl_reg_rule), nla_len(nl_reg_rule), + reg_rule_policy); r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]); if (r) goto bad_reg; @@ -4309,16 +5009,14 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) } } - BUG_ON(rule_idx != num_rules); + mutex_lock(&cfg80211_mutex); r = set_regdom(rd); - + /* set_regdom took ownership */ + rd = NULL; mutex_unlock(&cfg80211_mutex); - return r; - bad_reg: - mutex_unlock(&cfg80211_mutex); kfree(rd); return r; } @@ -4366,14 +5064,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (rdev->scan_req) - return -EBUSY; + mutex_lock(&rdev->sched_scan_mtx); + if (rdev->scan_req) { + err =
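
The nl80211_get_reg hunk above is the RCU read side of the conversion; a hedged sketch of the matching update side, i.e. roughly what set_regdom is assumed to do now that cfg80211_regdomain is an __rcu pointer (illustrative only; serialization of writers by cfg80211_mutex is assumed, as in the surrounding code):

	static void example_publish_regdom(const struct ieee80211_regdomain *rd)
	{
		const struct ieee80211_regdomain *old;

		/* writers are serialized by cfg80211_mutex in this sketch */
		old = rcu_dereference_protected(cfg80211_regdomain,
						lockdep_is_held(&cfg80211_mutex));
		rcu_assign_pointer(cfg80211_regdomain, rd);
		/* readers inside rcu_read_lock() may still be using "old" */
		if (old)
			kfree_rcu((struct ieee80211_regdomain *)old, rcu_head);
	}
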
-EBUSY; + goto unlock; + } if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { n_channels = validate_scan_freqs( info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); - if (!n_channels) - return -EINVAL; + if (!n_channels) { + err = -EINVAL; + goto unlock; + } } else { enum ieee80211_band band; n_channels = 0; @@ -4387,23 +5090,29 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) n_ssids++; - if (n_ssids > wiphy->max_scan_ssids) - return -EINVAL; + if (n_ssids > wiphy->max_scan_ssids) { + err = -EINVAL; + goto unlock; + } if (info->attrs[NL80211_ATTR_IE]) ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); else ie_len = 0; - if (ie_len > wiphy->max_scan_ie_len) - return -EINVAL; + if (ie_len > wiphy->max_scan_ie_len) { + err = -EINVAL; + goto unlock; + } request = kzalloc(sizeof(*request) + sizeof(*request->ssids) * n_ssids + sizeof(*request->channels) * n_channels + ie_len, GFP_KERNEL); - if (!request) - return -ENOMEM; + if (!request) { + err = -ENOMEM; + goto unlock; + } if (n_ssids) request->ssids = (void *)&request->channels[n_channels]; @@ -4540,6 +5249,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) kfree(request); } + unlock: + mutex_unlock(&rdev->sched_scan_mtx); return err; } @@ -4801,6 +5512,54 @@ static int nl80211_stop_sched_scan(struct sk_buff *skb, return err; } +static int nl80211_start_radar_detection(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_chan_def chandef; + int err; + + err = nl80211_parse_chandef(rdev, info, &chandef); + if (err) + return err; + + if (wdev->cac_started) + return -EBUSY; + + err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef); + if (err < 0) + return err; + + if (err == 0) + return -EINVAL; + + if (chandef.chan->dfs_state != NL80211_DFS_USABLE) + return -EINVAL; + + if (!rdev->ops->start_radar_detection) + return -EOPNOTSUPP; + + mutex_lock(&rdev->devlist_mtx); + err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, + chandef.chan, CHAN_MODE_SHARED, + BIT(chandef.width)); + if (err) + goto err_locked; + + err = rdev->ops->start_radar_detection(&rdev->wiphy, dev, &chandef); + if (!err) { + wdev->channel = chandef.chan; + wdev->cac_started = true; + wdev->cac_start_time = jiffies; + } +err_locked: + mutex_unlock(&rdev->devlist_mtx); + + return err; +} + static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, u32 seq, int flags, struct cfg80211_registered_device *rdev, @@ -4811,6 +5570,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, const struct cfg80211_bss_ies *ies; void *hdr; struct nlattr *bss; + bool tsf = false; ASSERT_WDEV_LOCK(wdev); @@ -4821,9 +5581,13 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, genl_dump_check_consistent(cb, hdr, &nl80211_fam); - if (nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation) || + if (nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation)) + goto nla_put_failure; + if (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex)) goto nla_put_failure; + if (nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + goto nla_put_failure; bss = nla_nest_start(msg, NL80211_ATTR_BSS); if (!bss) @@ -4834,22 +5598,24 @@ static int nl80211_send_bss(struct sk_buff *msg, struct 
netlink_callback *cb, rcu_read_lock(); ies = rcu_dereference(res->ies); - if (ies && ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS, - ies->len, ies->data)) { - rcu_read_unlock(); - goto nla_put_failure; + if (ies) { + if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) + goto fail_unlock_rcu; + tsf = true; + if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS, + ies->len, ies->data)) + goto fail_unlock_rcu; } ies = rcu_dereference(res->beacon_ies); - if (ies && ies->len && nla_put(msg, NL80211_BSS_BEACON_IES, - ies->len, ies->data)) { - rcu_read_unlock(); - goto nla_put_failure; + if (ies) { + if (!tsf && nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) + goto fail_unlock_rcu; + if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES, + ies->len, ies->data)) + goto fail_unlock_rcu; } rcu_read_unlock(); - if (res->tsf && - nla_put_u64(msg, NL80211_BSS_TSF, res->tsf)) - goto nla_put_failure; if (res->beacon_interval && nla_put_u16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval)) goto nla_put_failure; @@ -4894,27 +5660,25 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, return genlmsg_end(msg, hdr); + fail_unlock_rcu: + rcu_read_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } -static int nl80211_dump_scan(struct sk_buff *skb, - struct netlink_callback *cb) +static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg80211_registered_device *rdev; - struct net_device *dev; struct cfg80211_internal_bss *scan; struct wireless_dev *wdev; - int start = cb->args[1], idx = 0; + int start = cb->args[2], idx = 0; int err; - err = nl80211_prepare_netdev_dump(skb, cb, &rdev, &dev); + err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) return err; - wdev = dev->ieee80211_ptr; - wdev_lock(wdev); spin_lock_bh(&rdev->bss_lock); cfg80211_bss_expire(rdev); @@ -4935,8 +5699,8 @@ static int nl80211_dump_scan(struct sk_buff *skb, spin_unlock_bh(&rdev->bss_lock); wdev_unlock(wdev); - cb->args[1] = idx; - nl80211_finish_netdev_dump(rdev); + cb->args[2] = idx; + nl80211_finish_wdev_dump(rdev); return skb->len; } @@ -5005,14 +5769,19 @@ static int nl80211_dump_survey(struct sk_buff *skb, { struct survey_info survey; struct cfg80211_registered_device *dev; - struct net_device *netdev; - int survey_idx = cb->args[1]; + struct wireless_dev *wdev; + int survey_idx = cb->args[2]; int res; - res = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + res = nl80211_prepare_wdev_dump(skb, cb, &dev, &wdev); if (res) return res; + if (!wdev->netdev) { + res = -EINVAL; + goto out_err; + } + if (!dev->ops->dump_survey) { res = -EOPNOTSUPP; goto out_err; @@ -5021,7 +5790,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, while (1) { struct ieee80211_channel *chan; - res = rdev_dump_survey(dev, netdev, survey_idx, &survey); + res = rdev_dump_survey(dev, wdev->netdev, survey_idx, &survey); if (res == -ENOENT) break; if (res) @@ -5043,17 +5812,16 @@ static int nl80211_dump_survey(struct sk_buff *skb, if (nl80211_send_survey(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - netdev, - &survey) < 0) + wdev->netdev, &survey) < 0) goto out; survey_idx++; } out: - cb->args[1] = survey_idx; + cb->args[2] = survey_idx; res = skb->len; out_err: - nl80211_finish_netdev_dump(dev); + nl80211_finish_wdev_dump(dev); return res; } @@ -5261,14 +6029,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct 
net_device *dev = info->user_ptr[1]; - struct cfg80211_crypto_settings crypto; struct ieee80211_channel *chan; - const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL; - int err, ssid_len, ie_len = 0; - bool use_mfp = false; - u32 flags = 0; - struct ieee80211_ht_cap *ht_capa = NULL; - struct ieee80211_ht_cap *ht_capa_mask = NULL; + struct cfg80211_assoc_request req = {}; + const u8 *bssid, *ssid; + int err, ssid_len = 0; if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) return -EINVAL; @@ -5296,41 +6060,58 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); if (info->attrs[NL80211_ATTR_IE]) { - ie = nla_data(info->attrs[NL80211_ATTR_IE]); - ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + req.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); } if (info->attrs[NL80211_ATTR_USE_MFP]) { enum nl80211_mfp mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); if (mfp == NL80211_MFP_REQUIRED) - use_mfp = true; + req.use_mfp = true; else if (mfp != NL80211_MFP_NO) return -EINVAL; } if (info->attrs[NL80211_ATTR_PREV_BSSID]) - prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); + req.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HT])) - flags |= ASSOC_REQ_DISABLE_HT; + req.flags |= ASSOC_REQ_DISABLE_HT; if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) - ht_capa_mask = - nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]); + memcpy(&req.ht_capa_mask, + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]), + sizeof(req.ht_capa_mask)); if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) { - if (!ht_capa_mask) + if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) return -EINVAL; - ht_capa = nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); + memcpy(&req.ht_capa, + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]), + sizeof(req.ht_capa)); } - err = nl80211_crypto_settings(rdev, info, &crypto, 1); + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT])) + req.flags |= ASSOC_REQ_DISABLE_VHT; + + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) + memcpy(&req.vht_capa_mask, + nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), + sizeof(req.vht_capa_mask)); + + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) { + if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) + return -EINVAL; + memcpy(&req.vht_capa, + nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]), + sizeof(req.vht_capa)); + } + + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) - err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid, - ssid, ssid_len, ie, ie_len, use_mfp, - &crypto, flags, ht_capa, - ht_capa_mask); + err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, + ssid, ssid_len, &req); return err; } @@ -5867,6 +6648,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); } + if (info->attrs[NL80211_ATTR_USE_MFP]) { + connect.mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); + if (connect.mfp != NL80211_MFP_REQUIRED && + connect.mfp != NL80211_MFP_NO) + return -EINVAL; + } else { + connect.mfp = NL80211_MFP_NO; + } + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { connect.channel = ieee80211_get_channel(wiphy, @@ -5901,6 +6691,24 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) sizeof(connect.ht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT])) + connect.flags |= ASSOC_REQ_DISABLE_VHT; + + 
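
Both the associate and connect paths now apply the same rule for HT and VHT overrides: a capability attribute is only accepted together with its mask, and both are copied into fixed-size struct fields instead of being kept as pointers into the netlink message. A small hypothetical helper capturing the pattern (attribute lengths are assumed to be enforced by the nla policy, as in the real code):

	static int example_copy_override(const struct nlattr *capa,
					 const struct nlattr *mask,
					 void *capa_out, void *mask_out,
					 size_t len)
	{
		if (mask)
			memcpy(mask_out, nla_data(mask), len);
		if (capa) {
			if (!mask)	/* override without a mask is rejected */
				return -EINVAL;
			memcpy(capa_out, nla_data(capa), len);
		}
		return 0;
	}
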
if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) + memcpy(&connect.vht_capa_mask, + nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), + sizeof(connect.vht_capa_mask)); + + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) { + if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) { + kfree(connkeys); + return -EINVAL; + } + memcpy(&connect.vht_capa, + nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]), + sizeof(connect.vht_capa)); + } + err = cfg80211_connect(rdev, dev, &connect, connkeys); if (err) kfree(connkeys); @@ -6652,6 +7460,21 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE]))) return -EINVAL; + if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { + setup.beacon_interval = + nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); + if (setup.beacon_interval < 10 || + setup.beacon_interval > 10000) + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) { + setup.dtim_period = + nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]); + if (setup.dtim_period < 1 || setup.dtim_period > 100) + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_MESH_SETUP]) { /* parse additional setup parameters if given */ err = nl80211_parse_mesh_setup(info, &setup); @@ -6659,6 +7482,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) return err; } + if (setup.user_mpm) + cfg.auto_open_plinks = false; + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { err = nl80211_parse_chandef(rdev, info, &setup.chandef); if (err) @@ -6680,16 +7506,100 @@ static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info) } #ifdef CONFIG_PM +static int nl80211_send_wowlan_patterns(struct sk_buff *msg, + struct cfg80211_registered_device *rdev) +{ + struct nlattr *nl_pats, *nl_pat; + int i, pat_len; + + if (!rdev->wowlan->n_patterns) + return 0; + + nl_pats = nla_nest_start(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN); + if (!nl_pats) + return -ENOBUFS; + + for (i = 0; i < rdev->wowlan->n_patterns; i++) { + nl_pat = nla_nest_start(msg, i + 1); + if (!nl_pat) + return -ENOBUFS; + pat_len = rdev->wowlan->patterns[i].pattern_len; + if (nla_put(msg, NL80211_WOWLAN_PKTPAT_MASK, + DIV_ROUND_UP(pat_len, 8), + rdev->wowlan->patterns[i].mask) || + nla_put(msg, NL80211_WOWLAN_PKTPAT_PATTERN, + pat_len, rdev->wowlan->patterns[i].pattern) || + nla_put_u32(msg, NL80211_WOWLAN_PKTPAT_OFFSET, + rdev->wowlan->patterns[i].pkt_offset)) + return -ENOBUFS; + nla_nest_end(msg, nl_pat); + } + nla_nest_end(msg, nl_pats); + + return 0; +} + +static int nl80211_send_wowlan_tcp(struct sk_buff *msg, + struct cfg80211_wowlan_tcp *tcp) +{ + struct nlattr *nl_tcp; + + if (!tcp) + return 0; + + nl_tcp = nla_nest_start(msg, NL80211_WOWLAN_TRIG_TCP_CONNECTION); + if (!nl_tcp) + return -ENOBUFS; + + if (nla_put_be32(msg, NL80211_WOWLAN_TCP_SRC_IPV4, tcp->src) || + nla_put_be32(msg, NL80211_WOWLAN_TCP_DST_IPV4, tcp->dst) || + nla_put(msg, NL80211_WOWLAN_TCP_DST_MAC, ETH_ALEN, tcp->dst_mac) || + nla_put_u16(msg, NL80211_WOWLAN_TCP_SRC_PORT, tcp->src_port) || + nla_put_u16(msg, NL80211_WOWLAN_TCP_DST_PORT, tcp->dst_port) || + nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD, + tcp->payload_len, tcp->payload) || + nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_INTERVAL, + tcp->data_interval) || + nla_put(msg, NL80211_WOWLAN_TCP_WAKE_PAYLOAD, + tcp->wake_len, tcp->wake_data) || + nla_put(msg, NL80211_WOWLAN_TCP_WAKE_MASK, + DIV_ROUND_UP(tcp->wake_len, 8), tcp->wake_mask)) + return -ENOBUFS; + + if (tcp->payload_seq.len && + nla_put(msg, 
NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ, + sizeof(tcp->payload_seq), &tcp->payload_seq)) + return -ENOBUFS; + + if (tcp->payload_tok.len && + nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN, + sizeof(tcp->payload_tok) + tcp->tokens_size, + &tcp->payload_tok)) + return -ENOBUFS; + + return 0; +} + static int nl80211_get_wowlan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct sk_buff *msg; void *hdr; + u32 size = NLMSG_DEFAULT_SIZE; - if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns) + if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns && + !rdev->wiphy.wowlan.tcp) return -EOPNOTSUPP; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (rdev->wowlan && rdev->wowlan->tcp) { + /* adjust size to have room for all the data */ + size += rdev->wowlan->tcp->tokens_size + + rdev->wowlan->tcp->payload_len + + rdev->wowlan->tcp->wake_len + + rdev->wowlan->tcp->wake_len / 8; + } + + msg = nlmsg_new(size, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -6720,31 +7630,12 @@ static int nl80211_get_wowlan(struct sk_buff *skb, struct genl_info *info) (rdev->wowlan->rfkill_release && nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE))) goto nla_put_failure; - if (rdev->wowlan->n_patterns) { - struct nlattr *nl_pats, *nl_pat; - int i, pat_len; - nl_pats = nla_nest_start(msg, - NL80211_WOWLAN_TRIG_PKT_PATTERN); - if (!nl_pats) - goto nla_put_failure; + if (nl80211_send_wowlan_patterns(msg, rdev)) + goto nla_put_failure; - for (i = 0; i < rdev->wowlan->n_patterns; i++) { - nl_pat = nla_nest_start(msg, i + 1); - if (!nl_pat) - goto nla_put_failure; - pat_len = rdev->wowlan->patterns[i].pattern_len; - if (nla_put(msg, NL80211_WOWLAN_PKTPAT_MASK, - DIV_ROUND_UP(pat_len, 8), - rdev->wowlan->patterns[i].mask) || - nla_put(msg, NL80211_WOWLAN_PKTPAT_PATTERN, - pat_len, - rdev->wowlan->patterns[i].pattern)) - goto nla_put_failure; - nla_nest_end(msg, nl_pat); - } - nla_nest_end(msg, nl_pats); - } + if (nl80211_send_wowlan_tcp(msg, rdev->wowlan->tcp)) + goto nla_put_failure; nla_nest_end(msg, nl_wowlan); } @@ -6757,6 +7648,151 @@ nla_put_failure: return -ENOBUFS; } +static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, + struct nlattr *attr, + struct cfg80211_wowlan *trig) +{ + struct nlattr *tb[NUM_NL80211_WOWLAN_TCP]; + struct cfg80211_wowlan_tcp *cfg; + struct nl80211_wowlan_tcp_data_token *tok = NULL; + struct nl80211_wowlan_tcp_data_seq *seq = NULL; + u32 size; + u32 data_size, wake_size, tokens_size = 0, wake_mask_size; + int err, port; + + if (!rdev->wiphy.wowlan.tcp) + return -EINVAL; + + err = nla_parse(tb, MAX_NL80211_WOWLAN_TCP, + nla_data(attr), nla_len(attr), + nl80211_wowlan_tcp_policy); + if (err) + return err; + + if (!tb[NL80211_WOWLAN_TCP_SRC_IPV4] || + !tb[NL80211_WOWLAN_TCP_DST_IPV4] || + !tb[NL80211_WOWLAN_TCP_DST_MAC] || + !tb[NL80211_WOWLAN_TCP_DST_PORT] || + !tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD] || + !tb[NL80211_WOWLAN_TCP_DATA_INTERVAL] || + !tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD] || + !tb[NL80211_WOWLAN_TCP_WAKE_MASK]) + return -EINVAL; + + data_size = nla_len(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD]); + if (data_size > rdev->wiphy.wowlan.tcp->data_payload_max) + return -EINVAL; + + if (nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]) > + rdev->wiphy.wowlan.tcp->data_interval_max || + nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]) == 0) + return -EINVAL; + + wake_size = nla_len(tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD]); + if (wake_size > rdev->wiphy.wowlan.tcp->wake_payload_max) + return 
-EINVAL; + + wake_mask_size = nla_len(tb[NL80211_WOWLAN_TCP_WAKE_MASK]); + if (wake_mask_size != DIV_ROUND_UP(wake_size, 8)) + return -EINVAL; + + if (tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]) { + u32 tokln = nla_len(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]); + + tok = nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]); + tokens_size = tokln - sizeof(*tok); + + if (!tok->len || tokens_size % tok->len) + return -EINVAL; + if (!rdev->wiphy.wowlan.tcp->tok) + return -EINVAL; + if (tok->len > rdev->wiphy.wowlan.tcp->tok->max_len) + return -EINVAL; + if (tok->len < rdev->wiphy.wowlan.tcp->tok->min_len) + return -EINVAL; + if (tokens_size > rdev->wiphy.wowlan.tcp->tok->bufsize) + return -EINVAL; + if (tok->offset + tok->len > data_size) + return -EINVAL; + } + + if (tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ]) { + seq = nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ]); + if (!rdev->wiphy.wowlan.tcp->seq) + return -EINVAL; + if (seq->len == 0 || seq->len > 4) + return -EINVAL; + if (seq->len + seq->offset > data_size) + return -EINVAL; + } + + size = sizeof(*cfg); + size += data_size; + size += wake_size + wake_mask_size; + size += tokens_size; + + cfg = kzalloc(size, GFP_KERNEL); + if (!cfg) + return -ENOMEM; + cfg->src = nla_get_be32(tb[NL80211_WOWLAN_TCP_SRC_IPV4]); + cfg->dst = nla_get_be32(tb[NL80211_WOWLAN_TCP_DST_IPV4]); + memcpy(cfg->dst_mac, nla_data(tb[NL80211_WOWLAN_TCP_DST_MAC]), + ETH_ALEN); + if (tb[NL80211_WOWLAN_TCP_SRC_PORT]) + port = nla_get_u16(tb[NL80211_WOWLAN_TCP_SRC_PORT]); + else + port = 0; +#ifdef CONFIG_INET + /* allocate a socket and port for it and use it */ + err = __sock_create(wiphy_net(&rdev->wiphy), PF_INET, SOCK_STREAM, + IPPROTO_TCP, &cfg->sock, 1); + if (err) { + kfree(cfg); + return err; + } + if (inet_csk_get_port(cfg->sock->sk, port)) { + sock_release(cfg->sock); + kfree(cfg); + return -EADDRINUSE; + } + cfg->src_port = inet_sk(cfg->sock->sk)->inet_num; +#else + if (!port) { + kfree(cfg); + return -EINVAL; + } + cfg->src_port = port; +#endif + + cfg->dst_port = nla_get_u16(tb[NL80211_WOWLAN_TCP_DST_PORT]); + cfg->payload_len = data_size; + cfg->payload = (u8 *)cfg + sizeof(*cfg) + tokens_size; + memcpy((void *)cfg->payload, + nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD]), + data_size); + if (seq) + cfg->payload_seq = *seq; + cfg->data_interval = nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]); + cfg->wake_len = wake_size; + cfg->wake_data = (u8 *)cfg + sizeof(*cfg) + tokens_size + data_size; + memcpy((void *)cfg->wake_data, + nla_data(tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD]), + wake_size); + cfg->wake_mask = (u8 *)cfg + sizeof(*cfg) + tokens_size + + data_size + wake_size; + memcpy((void *)cfg->wake_mask, + nla_data(tb[NL80211_WOWLAN_TCP_WAKE_MASK]), + wake_mask_size); + if (tok) { + cfg->tokens_size = tokens_size; + memcpy(&cfg->payload_tok, tok, sizeof(*tok) + tokens_size); + } + + trig->tcp = cfg; + + return 0; +} + static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6767,7 +7803,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) int err, i; bool prev_enabled = rdev->wowlan; - if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns) + if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns && + !rdev->wiphy.wowlan.tcp) return -EOPNOTSUPP; if (!info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]) { @@ -6831,7 +7868,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) 
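
The TCP wakeup configuration parsed above lives in one kzalloc'd buffer, with the variable-length pieces appended after the struct; the pointer arithmetic in the hunk corresponds to this layout (a reader's diagram, not text from the patch):

	/*
	 *  (u8 *)cfg -> +----------------------------+
	 *               | struct cfg80211_wowlan_tcp |  payload_tok is the
	 *               |                            |  last member
	 *               +----------------------------+  + sizeof(*cfg)
	 *               | token stream               |  tokens_size bytes,
	 *               |                            |  reached via payload_tok
	 *               +----------------------------+  -> cfg->payload
	 *               | data payload               |  data_size bytes
	 *               +----------------------------+  -> cfg->wake_data
	 *               | wake payload               |  wake_size bytes
	 *               +----------------------------+  -> cfg->wake_mask
	 *               | wake mask                  |  DIV_ROUND_UP(wake_size, 8)
	 *               +----------------------------+
	 */
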
{ struct nlattr *pat; int n_patterns = 0; - int rem, pat_len, mask_len; + int rem, pat_len, mask_len, pkt_offset; struct nlattr *pat_tb[NUM_NL80211_WOWLAN_PKTPAT]; nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], @@ -6866,6 +7903,15 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) pat_len < wowlan->pattern_min_len) goto error; + if (!pat_tb[NL80211_WOWLAN_PKTPAT_OFFSET]) + pkt_offset = 0; + else + pkt_offset = nla_get_u32( + pat_tb[NL80211_WOWLAN_PKTPAT_OFFSET]); + if (pkt_offset > wowlan->max_pkt_offset) + goto error; + new_triggers.patterns[i].pkt_offset = pkt_offset; + new_triggers.patterns[i].mask = kmalloc(mask_len + pat_len, GFP_KERNEL); if (!new_triggers.patterns[i].mask) { @@ -6885,6 +7931,14 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } } + if (tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION]) { + err = nl80211_parse_wowlan_tcp( + rdev, tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION], + &new_triggers); + if (err) + goto error; + } + ntrig = kmemdup(&new_triggers, sizeof(new_triggers), GFP_KERNEL); if (!ntrig) { err = -ENOMEM; @@ -6902,6 +7956,9 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < new_triggers.n_patterns; i++) kfree(new_triggers.patterns[i].mask); kfree(new_triggers.patterns); + if (new_triggers.tcp && new_triggers.tcp->sock) + sock_release(new_triggers.tcp->sock); + kfree(new_triggers.tcp); return err; } #endif @@ -7106,21 +8163,118 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->stop_p2p_device) return -EOPNOTSUPP; - if (!wdev->p2p_started) - return 0; - - rdev_stop_p2p_device(rdev, wdev); - wdev->p2p_started = false; - mutex_lock(&rdev->devlist_mtx); - rdev->opencount--; + mutex_lock(&rdev->sched_scan_mtx); + cfg80211_stop_p2p_device(rdev, wdev); + mutex_unlock(&rdev->sched_scan_mtx); mutex_unlock(&rdev->devlist_mtx); - if (WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev)) { - rdev->scan_req->aborted = true; - ___cfg80211_scan_done(rdev, true); - } + return 0; +} + +static int nl80211_get_protocol_features(struct sk_buff *skb, + struct genl_info *info) +{ + void *hdr; + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, + NL80211_CMD_GET_PROTOCOL_FEATURES); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_PROTOCOL_FEATURES, + NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + + nla_put_failure: + kfree_skb(msg); + return -ENOBUFS; +} + +static int nl80211_update_ft_ies(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_update_ft_ies_params ft_params; + struct net_device *dev = info->user_ptr[1]; + if (!rdev->ops->update_ft_ies) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_MDID] || + !is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + memset(&ft_params, 0, sizeof(ft_params)); + ft_params.md = nla_get_u16(info->attrs[NL80211_ATTR_MDID]); + ft_params.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + ft_params.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + + return rdev_update_ft_ies(rdev, dev, &ft_params); +} + +static int nl80211_crit_protocol_start(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev 
*wdev = info->user_ptr[1]; + enum nl80211_crit_proto_id proto = NL80211_CRIT_PROTO_UNSPEC; + u16 duration; + int ret; + + if (!rdev->ops->crit_proto_start) + return -EOPNOTSUPP; + + if (WARN_ON(!rdev->ops->crit_proto_stop)) + return -EINVAL; + + if (rdev->crit_proto_nlportid) + return -EBUSY; + + /* determine protocol if provided */ + if (info->attrs[NL80211_ATTR_CRIT_PROT_ID]) + proto = nla_get_u16(info->attrs[NL80211_ATTR_CRIT_PROT_ID]); + + if (proto >= NUM_NL80211_CRIT_PROTO) + return -EINVAL; + + /* timeout must be provided */ + if (!info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION]) + return -EINVAL; + + duration = + nla_get_u16(info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION]); + + if (duration > NL80211_CRIT_PROTO_MAX_DURATION) + return -ERANGE; + + ret = rdev_crit_proto_start(rdev, wdev, proto, duration); + if (!ret) + rdev->crit_proto_nlportid = info->snd_portid; + + return ret; +} + +static int nl80211_crit_protocol_stop(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + + if (!rdev->ops->crit_proto_stop) + return -EOPNOTSUPP; + + if (rdev->crit_proto_nlportid) { + rdev->crit_proto_nlportid = 0; + rdev_crit_proto_stop(rdev, wdev); + } return 0; } @@ -7784,6 +8938,51 @@ static struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_SET_MAC_ACL, + .doit = nl80211_set_mac_acl, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_RADAR_DETECT, + .doit = nl80211_start_radar_detection, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES, + .doit = nl80211_get_protocol_features, + .policy = nl80211_policy, + }, + { + .cmd = NL80211_CMD_UPDATE_FT_IES, + .doit = nl80211_update_ft_ies, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_CRIT_PROTOCOL_START, + .doit = nl80211_crit_protocol_start, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP, + .doit = nl80211_crit_protocol_stop, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + } }; static struct genl_multicast_group nl80211_mlme_mcgrp = { @@ -7811,7 +9010,8 @@ void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev) if (!msg) return; - if (nl80211_send_wiphy(msg, 0, 0, 0, rdev) < 0) { + if (nl80211_send_wiphy(rdev, msg, 0, 0, 0, + false, NULL, NULL, NULL) < 0) { nlmsg_free(msg); return; } @@ -7827,7 +9027,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg, struct nlattr *nest; int i; - ASSERT_RDEV_LOCK(rdev); + lockdep_assert_held(&rdev->sched_scan_mtx); if (WARN_ON(!req)) return 0; @@ -8051,7 +9251,7 @@ void nl80211_send_reg_change_event(struct regulatory_request *request) goto nla_put_failure; } - if (wiphy_idx_valid(request->wiphy_idx) && + if (request->wiphy_idx != WIPHY_IDX_INVALID && nla_put_u32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx)) goto nla_put_failure; @@ -8135,21 +9335,31 @@ void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, 
NL80211_CMD_DISASSOCIATE, gfp); } -void nl80211_send_unprot_deauth(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp) +void cfg80211_send_unprot_deauth(struct net_device *dev, const u8 *buf, + size_t len) { - nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_UNPROT_DEAUTHENTICATE, gfp); + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + trace_cfg80211_send_unprot_deauth(dev); + nl80211_send_mlme_event(rdev, dev, buf, len, + NL80211_CMD_UNPROT_DEAUTHENTICATE, GFP_ATOMIC); } +EXPORT_SYMBOL(cfg80211_send_unprot_deauth); -void nl80211_send_unprot_disassoc(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp) +void cfg80211_send_unprot_disassoc(struct net_device *dev, const u8 *buf, + size_t len) { - nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_UNPROT_DISASSOCIATE, gfp); + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + trace_cfg80211_send_unprot_disassoc(dev); + nl80211_send_mlme_event(rdev, dev, buf, len, + NL80211_CMD_UNPROT_DISASSOCIATE, GFP_ATOMIC); } +EXPORT_SYMBOL(cfg80211_send_unprot_disassoc); static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev, struct net_device *netdev, int cmd, @@ -8352,14 +9562,19 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } -void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev, - struct net_device *netdev, - const u8 *macaddr, const u8* ie, u8 ie_len, - gfp_t gfp) +void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, + const u8* ie, u8 ie_len, gfp_t gfp) { + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); struct sk_buff *msg; void *hdr; + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_MESH_POINT)) + return; + + trace_cfg80211_notify_new_peer_candidate(dev, addr); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; @@ -8371,8 +9586,8 @@ void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev, } if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, macaddr) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || (ie_len && ie && nla_put(msg, NL80211_ATTR_IE, ie_len , ie))) goto nla_put_failure; @@ -8387,6 +9602,7 @@ void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev, genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate); void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, @@ -8455,7 +9671,7 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE); if (!nl_freq) goto nla_put_failure; - if (nl80211_msg_put_channel(msg, channel_before)) + if (nl80211_msg_put_channel(msg, channel_before, false)) goto nla_put_failure; nla_nest_end(msg, nl_freq); @@ -8463,7 +9679,7 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER); if (!nl_freq) goto nla_put_failure; - if (nl80211_msg_put_channel(msg, 
channel_after)) + if (nl80211_msg_put_channel(msg, channel_after, false)) goto nla_put_failure; nla_nest_end(msg, nl_freq); @@ -8525,31 +9741,42 @@ static void nl80211_send_remain_on_chan_event( nlmsg_free(msg); } -void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u64 cookie, - struct ieee80211_channel *chan, - unsigned int duration, gfp_t gfp) +void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan, + unsigned int duration, gfp_t gfp) { + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + trace_cfg80211_ready_on_channel(wdev, cookie, chan, duration); nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL, rdev, wdev, cookie, chan, duration, gfp); } +EXPORT_SYMBOL(cfg80211_ready_on_channel); -void nl80211_send_remain_on_channel_cancel( - struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - u64 cookie, struct ieee80211_channel *chan, gfp_t gfp) +void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, + struct ieee80211_channel *chan, + gfp_t gfp) { + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + trace_cfg80211_ready_on_channel_expired(wdev, cookie, chan); nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, rdev, wdev, cookie, chan, 0, gfp); } +EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); -void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *mac_addr, - struct station_info *sinfo, gfp_t gfp) +void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, + struct station_info *sinfo, gfp_t gfp) { + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct sk_buff *msg; + trace_cfg80211_new_sta(dev, mac_addr, sinfo); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; @@ -8563,14 +9790,17 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, nl80211_mlme_mcgrp.id, gfp); } +EXPORT_SYMBOL(cfg80211_new_sta); -void nl80211_send_sta_del_event(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *mac_addr, - gfp_t gfp) +void cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp) { + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct sk_buff *msg; void *hdr; + trace_cfg80211_del_sta(dev, mac_addr); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; @@ -8595,12 +9825,14 @@ void nl80211_send_sta_del_event(struct cfg80211_registered_device *rdev, genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +EXPORT_SYMBOL(cfg80211_del_sta); -void nl80211_send_conn_failed_event(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *mac_addr, - enum nl80211_connect_failed_reason reason, - gfp_t gfp) +void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, + enum nl80211_connect_failed_reason reason, + gfp_t gfp) { + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct sk_buff *msg; void *hdr; @@ -8629,6 +9861,7 @@ void nl80211_send_conn_failed_event(struct cfg80211_registered_device *rdev, genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +EXPORT_SYMBOL(cfg80211_conn_failed); static bool 
__nl80211_unexpected_frame(struct net_device *dev, u8 cmd, const u8 *addr, gfp_t gfp) @@ -8673,19 +9906,47 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, return true; } -bool nl80211_unexpected_frame(struct net_device *dev, const u8 *addr, gfp_t gfp) +bool cfg80211_rx_spurious_frame(struct net_device *dev, + const u8 *addr, gfp_t gfp) { - return __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_FRAME, - addr, gfp); + struct wireless_dev *wdev = dev->ieee80211_ptr; + bool ret; + + trace_cfg80211_rx_spurious_frame(dev, addr); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && + wdev->iftype != NL80211_IFTYPE_P2P_GO)) { + trace_cfg80211_return_bool(false); + return false; + } + ret = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_FRAME, + addr, gfp); + trace_cfg80211_return_bool(ret); + return ret; } +EXPORT_SYMBOL(cfg80211_rx_spurious_frame); -bool nl80211_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp) +bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, + const u8 *addr, gfp_t gfp) { - return __nl80211_unexpected_frame(dev, - NL80211_CMD_UNEXPECTED_4ADDR_FRAME, - addr, gfp); + struct wireless_dev *wdev = dev->ieee80211_ptr; + bool ret; + + trace_cfg80211_rx_unexpected_4addr_frame(dev, addr); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && + wdev->iftype != NL80211_IFTYPE_P2P_GO && + wdev->iftype != NL80211_IFTYPE_AP_VLAN)) { + trace_cfg80211_return_bool(false); + return false; + } + ret = __nl80211_unexpected_frame(dev, + NL80211_CMD_UNEXPECTED_4ADDR_FRAME, + addr, gfp); + trace_cfg80211_return_bool(ret); + return ret; } +EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame); int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlportid, @@ -8725,15 +9986,17 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, - gfp_t gfp) +void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) { + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct net_device *netdev = wdev->netdev; struct sk_buff *msg; void *hdr; + trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; @@ -8761,17 +10024,21 @@ void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +EXPORT_SYMBOL(cfg80211_mgmt_tx_status); -void -nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, - enum nl80211_cqm_rssi_threshold_event rssi_event, - gfp_t gfp) +void cfg80211_cqm_rssi_notify(struct net_device *dev, + enum nl80211_cqm_rssi_threshold_event rssi_event, + gfp_t gfp) { + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct sk_buff *msg; struct nlattr *pinfoattr; void *hdr; + trace_cfg80211_cqm_rssi_notify(dev, rssi_event); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; @@ -8783,7 +10050,7 @@ nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, } if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto 
nla_put_failure; pinfoattr = nla_nest_start(msg, NL80211_ATTR_CQM); @@ -8806,10 +10073,11 @@ nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +EXPORT_SYMBOL(cfg80211_cqm_rssi_notify); -void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid, - const u8 *replay_ctr, gfp_t gfp) +static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + const u8 *replay_ctr, gfp_t gfp) { struct sk_buff *msg; struct nlattr *rekey_attr; @@ -8851,9 +10119,22 @@ void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } -void nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, int index, - const u8 *bssid, bool preauth, gfp_t gfp) +void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid, + const u8 *replay_ctr, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + trace_cfg80211_gtk_rekey_notify(dev, bssid); + nl80211_gtk_rekey_notify(rdev, dev, bssid, replay_ctr, gfp); +} +EXPORT_SYMBOL(cfg80211_gtk_rekey_notify); + +static void +nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev, + struct net_device *netdev, int index, + const u8 *bssid, bool preauth, gfp_t gfp) { struct sk_buff *msg; struct nlattr *attr; @@ -8896,9 +10177,22 @@ void nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } -void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, - struct cfg80211_chan_def *chandef, gfp_t gfp) +void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, + const u8 *bssid, bool preauth, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + trace_cfg80211_pmksa_candidate_notify(dev, index, bssid, preauth); + nl80211_pmksa_candidate_notify(rdev, dev, index, bssid, preauth, gfp); +} +EXPORT_SYMBOL(cfg80211_pmksa_candidate_notify); + +static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + struct cfg80211_chan_def *chandef, + gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -8930,11 +10224,36 @@ void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } -void -nl80211_send_cqm_txe_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *peer, - u32 num_packets, u32 rate, u32 intvl, gfp_t gfp) +void cfg80211_ch_switch_notify(struct net_device *dev, + struct cfg80211_chan_def *chandef) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + trace_cfg80211_ch_switch_notify(dev, chandef); + + wdev_lock(wdev); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && + wdev->iftype != NL80211_IFTYPE_P2P_GO)) + goto out; + + wdev->channel = chandef->chan; + nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL); +out: + wdev_unlock(wdev); + return; +} +EXPORT_SYMBOL(cfg80211_ch_switch_notify); + +void cfg80211_cqm_txe_notify(struct net_device *dev, + const u8 *peer, u32 num_packets, + u32 rate, u32 intvl, gfp_t gfp) { + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = 
wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct sk_buff *msg; struct nlattr *pinfoattr; void *hdr; @@ -8950,7 +10269,7 @@ nl80211_send_cqm_txe_notify(struct cfg80211_registered_device *rdev, } if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, peer)) goto nla_put_failure; @@ -8979,16 +10298,71 @@ nl80211_send_cqm_txe_notify(struct cfg80211_registered_device *rdev, genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +EXPORT_SYMBOL(cfg80211_cqm_txe_notify); void -nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *peer, - u32 num_packets, gfp_t gfp) +nl80211_radar_notify(struct cfg80211_registered_device *rdev, + struct cfg80211_chan_def *chandef, + enum nl80211_radar_event event, + struct net_device *netdev, gfp_t gfp) { struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_RADAR_DETECT); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx)) + goto nla_put_failure; + + /* NOP and radar events don't need a netdev parameter */ + if (netdev) { + struct wireless_dev *wdev = netdev->ieee80211_ptr; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + goto nla_put_failure; + } + + if (nla_put_u32(msg, NL80211_ATTR_RADAR_EVENT, event)) + goto nla_put_failure; + + if (nl80211_send_chandef(msg, chandef)) + goto nla_put_failure; + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void cfg80211_cqm_pktloss_notify(struct net_device *dev, + const u8 *peer, u32 num_packets, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct sk_buff *msg; struct nlattr *pinfoattr; void *hdr; + trace_cfg80211_cqm_pktloss_notify(dev, peer, num_packets); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; @@ -9000,7 +10374,7 @@ nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev, } if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, peer)) goto nla_put_failure; @@ -9023,6 +10397,7 @@ nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev, genlmsg_cancel(msg, hdr); nlmsg_free(msg); } +EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify); void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, gfp_t gfp) @@ -9115,6 +10490,114 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_report_obss_beacon); +#ifdef CONFIG_PM +void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev, + struct cfg80211_wowlan_wakeup *wakeup, + gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + int err, size = 200; + + trace_cfg80211_report_wowlan_wakeup(wdev->wiphy, wdev, wakeup); + + if (wakeup) + size += 
wakeup->packet_present_len; + + msg = nlmsg_new(size, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_SET_WOWLAN); + if (!hdr) + goto free_msg; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + goto free_msg; + + if (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, + wdev->netdev->ifindex)) + goto free_msg; + + if (wakeup) { + struct nlattr *reasons; + + reasons = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS); + + if (wakeup->disconnect && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT)) + goto free_msg; + if (wakeup->magic_pkt && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT)) + goto free_msg; + if (wakeup->gtk_rekey_failure && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) + goto free_msg; + if (wakeup->eap_identity_req && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) + goto free_msg; + if (wakeup->four_way_handshake && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) + goto free_msg; + if (wakeup->rfkill_release && + nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE)) + goto free_msg; + + if (wakeup->pattern_idx >= 0 && + nla_put_u32(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN, + wakeup->pattern_idx)) + goto free_msg; + + if (wakeup->tcp_match) + nla_put_flag(msg, NL80211_WOWLAN_TRIG_WAKEUP_TCP_MATCH); + + if (wakeup->tcp_connlost) + nla_put_flag(msg, + NL80211_WOWLAN_TRIG_WAKEUP_TCP_CONNLOST); + + if (wakeup->tcp_nomoretokens) + nla_put_flag(msg, + NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS); + + if (wakeup->packet) { + u32 pkt_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211; + u32 len_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN; + + if (!wakeup->packet_80211) { + pkt_attr = + NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023; + len_attr = + NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023_LEN; + } + + if (wakeup->packet_len && + nla_put_u32(msg, len_attr, wakeup->packet_len)) + goto free_msg; + + if (nla_put(msg, pkt_attr, wakeup->packet_present_len, + wakeup->packet)) + goto free_msg; + } + + nla_nest_end(msg, reasons); + } + + err = genlmsg_end(msg, hdr); + if (err < 0) + goto free_msg; + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + free_msg: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_report_wowlan_wakeup); +#endif + void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper, u16 reason_code, gfp_t gfp) @@ -9201,6 +10684,89 @@ static struct notifier_block nl80211_netlink_notifier = { .notifier_call = nl80211_netlink_notify, }; +void cfg80211_ft_event(struct net_device *netdev, + struct cfg80211_ft_event_params *ft_event) +{ + struct wiphy *wiphy = netdev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct sk_buff *msg; + void *hdr; + int err; + + trace_cfg80211_ft_event(wiphy, netdev, ft_event); + + if (!ft_event->target_ap) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FT_EVENT); + if (!hdr) { + nlmsg_free(msg); + return; + } + + nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, ft_event->target_ap); + if (ft_event->ies) + nla_put(msg, NL80211_ATTR_IE, ft_event->ies_len, ft_event->ies); + if (ft_event->ric_ies) + nla_put(msg, NL80211_ATTR_IE_RIC, ft_event->ric_ies_len, + ft_event->ric_ies); + + err = 
genlmsg_end(msg, hdr); + if (err < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, GFP_KERNEL); +} +EXPORT_SYMBOL(cfg80211_ft_event); + +void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) +{ + struct cfg80211_registered_device *rdev; + struct sk_buff *msg; + void *hdr; + u32 nlportid; + + rdev = wiphy_to_dev(wdev->wiphy); + if (!rdev->crit_proto_nlportid) + return; + + nlportid = rdev->crit_proto_nlportid; + rdev->crit_proto_nlportid = 0; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CRIT_PROTOCOL_STOP); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid); + return; + + nla_put_failure: + if (hdr) + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + +} +EXPORT_SYMBOL(cfg80211_crit_proto_stopped); + /* initialisation/exit functions */ int nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 2acba8477e9d..a4073e808c13 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -29,12 +29,6 @@ void nl80211_send_deauth(struct cfg80211_registered_device *rdev, void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp); -void nl80211_send_unprot_deauth(struct cfg80211_registered_device *rdev, - struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp); -void nl80211_send_unprot_disassoc(struct cfg80211_registered_device *rdev, - struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, gfp_t gfp); @@ -54,10 +48,6 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); -void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev, - struct net_device *netdev, - const u8 *macaddr, const u8* ie, u8 ie_len, - gfp_t gfp); void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, @@ -73,66 +63,15 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, gfp_t gfp); -void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u64 cookie, - struct ieee80211_channel *chan, - unsigned int duration, gfp_t gfp); -void nl80211_send_remain_on_channel_cancel( - struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - u64 cookie, struct ieee80211_channel *chan, gfp_t gfp); - -void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *mac_addr, - struct station_info *sinfo, gfp_t gfp); -void nl80211_send_sta_del_event(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *mac_addr, - gfp_t gfp); - -void nl80211_send_conn_failed_event(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *mac_addr, - enum nl80211_connect_failed_reason reason, - gfp_t gfp); - int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlpid, int freq, int sig_dbm, const u8 *buf, size_t len, gfp_t gfp); 
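
The prototype removals in this header are the other half of the conversion seen throughout nl80211.c above: each internal nl80211_send_*() event helper grew an exported cfg80211_*() entry point that derives the registered device from the net_device or wireless_dev, fires a tracepoint, and then does the netlink send, so drivers no longer need the rdev-based internals. The common shape, sketched with generic names that are not from the patch:

	void cfg80211_example_event(struct net_device *dev, gfp_t gfp)
	{
		struct wireless_dev *wdev = dev->ieee80211_ptr;
		struct cfg80211_registered_device *rdev =
			wiphy_to_dev(wdev->wiphy);

		trace_cfg80211_example_event(dev);	/* hypothetical tracepoint */
		nl80211_send_example_event(rdev, dev, gfp);	/* internal sender */
	}
	EXPORT_SYMBOL(cfg80211_example_event);
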
-void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, - gfp_t gfp); - -void -nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, - enum nl80211_cqm_rssi_threshold_event rssi_event, - gfp_t gfp); -void -nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *peer, - u32 num_packets, gfp_t gfp); void -nl80211_send_cqm_txe_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *peer, - u32 num_packets, u32 rate, u32 intvl, gfp_t gfp); - -void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid, - const u8 *replay_ctr, gfp_t gfp); - -void nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev, - struct net_device *netdev, int index, - const u8 *bssid, bool preauth, gfp_t gfp); - -void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_chan_def *chandef, gfp_t gfp); - -bool nl80211_unexpected_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); -bool nl80211_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +nl80211_radar_notify(struct cfg80211_registered_device *rdev, + struct cfg80211_chan_def *chandef, + enum nl80211_radar_event event, + struct net_device *netdev, gfp_t gfp); #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 6c0c8191f837..9f15f0ac824d 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -6,11 +6,12 @@ #include "core.h" #include "trace.h" -static inline int rdev_suspend(struct cfg80211_registered_device *rdev) +static inline int rdev_suspend(struct cfg80211_registered_device *rdev, + struct cfg80211_wowlan *wowlan) { int ret; - trace_rdev_suspend(&rdev->wiphy, rdev->wowlan); - ret = rdev->ops->suspend(&rdev->wiphy, rdev->wowlan); + trace_rdev_suspend(&rdev->wiphy, wowlan); + ret = rdev->ops->suspend(&rdev->wiphy, wowlan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -874,5 +875,52 @@ static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, trace_rdev_stop_p2p_device(&rdev->wiphy, wdev); rdev->ops->stop_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); -} +} + +static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_acl_data *params) +{ + int ret; + + trace_rdev_set_mac_acl(&rdev->wiphy, dev, params); + ret = rdev->ops->set_mac_acl(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_update_ft_ies_params *ftie) +{ + int ret; + + trace_rdev_update_ft_ies(&rdev->wiphy, dev, ftie); + ret = rdev->ops->update_ft_ies(&rdev->wiphy, dev, ftie); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + enum nl80211_crit_proto_id protocol, + u16 duration) +{ + int ret; + + trace_rdev_crit_proto_start(&rdev->wiphy, wdev, protocol, duration); + ret = rdev->ops->crit_proto_start(&rdev->wiphy, wdev, + protocol, duration); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline void 
rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + trace_rdev_crit_proto_stop(&rdev->wiphy, wdev); + rdev->ops->crit_proto_stop(&rdev->wiphy, wdev); + trace_rdev_return_void(&rdev->wiphy); +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 82c4fc7c994c..cc35fbaa4578 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -48,7 +48,6 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/list.h> -#include <linux/random.h> #include <linux/ctype.h> #include <linux/nl80211.h> #include <linux/platform_device.h> @@ -66,6 +65,13 @@ #define REG_DBG_PRINT(args...) #endif +enum reg_request_treatment { + REG_REQ_OK, + REG_REQ_IGNORE, + REG_REQ_INTERSECT, + REG_REQ_ALREADY_SET, +}; + static struct regulatory_request core_request_world = { .initiator = NL80211_REGDOM_SET_BY_CORE, .alpha2[0] = '0', @@ -76,7 +82,8 @@ static struct regulatory_request core_request_world = { }; /* Receipt of information from last regulatory request */ -static struct regulatory_request *last_request = &core_request_world; +static struct regulatory_request __rcu *last_request = + (void __rcu *)&core_request_world; /* To trigger userspace events */ static struct platform_device *reg_pdev; @@ -88,16 +95,16 @@ static struct device_type reg_device_type = { /* * Central wireless core regulatory domains, we only need two, * the current one and a world regulatory domain in case we have no - * information to give us an alpha2 + * information to give us an alpha2. */ -const struct ieee80211_regdomain *cfg80211_regdomain; +const struct ieee80211_regdomain __rcu *cfg80211_regdomain; /* * Protects static reg.c components: - * - cfg80211_world_regdom - * - cfg80211_regdom - * - last_request - * - reg_num_devs_support_basehint + * - cfg80211_regdomain (if not used with RCU) + * - cfg80211_world_regdom + * - last_request (if not used with RCU) + * - reg_num_devs_support_basehint */ static DEFINE_MUTEX(reg_mutex); @@ -112,6 +119,31 @@ static inline void assert_reg_lock(void) lockdep_assert_held(&reg_mutex); } +static const struct ieee80211_regdomain *get_cfg80211_regdom(void) +{ + return rcu_dereference_protected(cfg80211_regdomain, + lockdep_is_held(&reg_mutex)); +} + +static const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) +{ + return rcu_dereference_protected(wiphy->regd, + lockdep_is_held(&reg_mutex)); +} + +static void rcu_free_regdom(const struct ieee80211_regdomain *r) +{ + if (!r) + return; + kfree_rcu((struct ieee80211_regdomain *)r, rcu_head); +} + +static struct regulatory_request *get_last_request(void) +{ + return rcu_dereference_check(last_request, + lockdep_is_held(&reg_mutex)); +} + /* Used to queue up regulatory hints */ static LIST_HEAD(reg_requests_list); static spinlock_t reg_requests_lock; @@ -152,14 +184,14 @@ static const struct ieee80211_regdomain world_regdom = { NL80211_RRF_NO_IBSS | NL80211_RRF_NO_OFDM), /* IEEE 802.11a, channel 36..48 */ - REG_RULE(5180-10, 5240+10, 40, 6, 20, + REG_RULE(5180-10, 5240+10, 80, 6, 20, NL80211_RRF_PASSIVE_SCAN | NL80211_RRF_NO_IBSS), - /* NB: 5260 MHz - 5700 MHz requies DFS */ + /* NB: 5260 MHz - 5700 MHz requires DFS */ /* IEEE 802.11a, channel 149..165 */ - REG_RULE(5745-10, 5825+10, 40, 6, 20, + REG_RULE(5745-10, 5825+10, 80, 6, 20, NL80211_RRF_PASSIVE_SCAN | NL80211_RRF_NO_IBSS), @@ -177,28 +209,37 @@ static char user_alpha2[2]; module_param(ieee80211_regdom, charp, 0444); MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); -static void
reset_regdomains(bool full_reset) +static void reset_regdomains(bool full_reset, + const struct ieee80211_regdomain *new_regdom) { + const struct ieee80211_regdomain *r; + struct regulatory_request *lr; + + assert_reg_lock(); + + r = get_cfg80211_regdom(); + /* avoid freeing static information or freeing something twice */ - if (cfg80211_regdomain == cfg80211_world_regdom) - cfg80211_regdomain = NULL; + if (r == cfg80211_world_regdom) + r = NULL; if (cfg80211_world_regdom == &world_regdom) cfg80211_world_regdom = NULL; - if (cfg80211_regdomain == &world_regdom) - cfg80211_regdomain = NULL; + if (r == &world_regdom) + r = NULL; - kfree(cfg80211_regdomain); - kfree(cfg80211_world_regdom); + rcu_free_regdom(r); + rcu_free_regdom(cfg80211_world_regdom); cfg80211_world_regdom = &world_regdom; - cfg80211_regdomain = NULL; + rcu_assign_pointer(cfg80211_regdomain, new_regdom); if (!full_reset) return; - if (last_request != &core_request_world) - kfree(last_request); - last_request = &core_request_world; + lr = get_last_request(); + if (lr != &core_request_world && lr) + kfree_rcu(lr, rcu_head); + rcu_assign_pointer(last_request, &core_request_world); } /* @@ -207,30 +248,29 @@ static void reset_regdomains(bool full_reset) */ static void update_world_regdomain(const struct ieee80211_regdomain *rd) { - BUG_ON(!last_request); + struct regulatory_request *lr; - reset_regdomains(false); + lr = get_last_request(); + + WARN_ON(!lr); + + reset_regdomains(false, rd); cfg80211_world_regdom = rd; - cfg80211_regdomain = rd; } bool is_world_regdom(const char *alpha2) { if (!alpha2) return false; - if (alpha2[0] == '0' && alpha2[1] == '0') - return true; - return false; + return alpha2[0] == '0' && alpha2[1] == '0'; } static bool is_alpha2_set(const char *alpha2) { if (!alpha2) return false; - if (alpha2[0] != 0 && alpha2[1] != 0) - return true; - return false; + return alpha2[0] && alpha2[1]; } static bool is_unknown_alpha2(const char *alpha2) @@ -241,9 +281,7 @@ static bool is_unknown_alpha2(const char *alpha2) * Special case where regulatory domain was built by driver * but a specific alpha2 cannot be determined */ - if (alpha2[0] == '9' && alpha2[1] == '9') - return true; - return false; + return alpha2[0] == '9' && alpha2[1] == '9'; } static bool is_intersected_alpha2(const char *alpha2) @@ -255,39 +293,30 @@ static bool is_intersected_alpha2(const char *alpha2) * result of an intersection between two regulatory domain * structures */ - if (alpha2[0] == '9' && alpha2[1] == '8') - return true; - return false; + return alpha2[0] == '9' && alpha2[1] == '8'; } static bool is_an_alpha2(const char *alpha2) { if (!alpha2) return false; - if (isalpha(alpha2[0]) && isalpha(alpha2[1])) - return true; - return false; + return isalpha(alpha2[0]) && isalpha(alpha2[1]); } static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y) { if (!alpha2_x || !alpha2_y) return false; - if (alpha2_x[0] == alpha2_y[0] && - alpha2_x[1] == alpha2_y[1]) - return true; - return false; + return alpha2_x[0] == alpha2_y[0] && alpha2_x[1] == alpha2_y[1]; } static bool regdom_changes(const char *alpha2) { - assert_cfg80211_lock(); + const struct ieee80211_regdomain *r = get_cfg80211_regdom(); - if (!cfg80211_regdomain) + if (!r) return true; - if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) - return false; - return true; + return !alpha2_equal(r->alpha2, alpha2); } /* @@ -301,38 +330,36 @@ static bool is_user_regdom_saved(void) return false; /* This would indicate a mistake on the design */ - if 
(WARN((!is_world_regdom(user_alpha2) && - !is_an_alpha2(user_alpha2)), + if (WARN(!is_world_regdom(user_alpha2) && !is_an_alpha2(user_alpha2), "Unexpected user alpha2: %c%c\n", - user_alpha2[0], - user_alpha2[1])) + user_alpha2[0], user_alpha2[1])) return false; return true; } -static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd, - const struct ieee80211_regdomain *src_regd) +static const struct ieee80211_regdomain * +reg_copy_regd(const struct ieee80211_regdomain *src_regd) { struct ieee80211_regdomain *regd; - int size_of_regd = 0; + int size_of_regd; unsigned int i; - size_of_regd = sizeof(struct ieee80211_regdomain) + - ((src_regd->n_reg_rules + 1) * sizeof(struct ieee80211_reg_rule)); + size_of_regd = + sizeof(struct ieee80211_regdomain) + + src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule); regd = kzalloc(size_of_regd, GFP_KERNEL); if (!regd) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); for (i = 0; i < src_regd->n_reg_rules; i++) memcpy(&regd->reg_rules[i], &src_regd->reg_rules[i], - sizeof(struct ieee80211_reg_rule)); + sizeof(struct ieee80211_reg_rule)); - *dst_regd = regd; - return 0; + return regd; } #ifdef CONFIG_CFG80211_INTERNAL_REGDB @@ -347,9 +374,8 @@ static DEFINE_MUTEX(reg_regdb_search_mutex); static void reg_regdb_search(struct work_struct *work) { struct reg_regdb_search_request *request; - const struct ieee80211_regdomain *curdom, *regdom; - int i, r; - bool set_reg = false; + const struct ieee80211_regdomain *curdom, *regdom = NULL; + int i; mutex_lock(&cfg80211_mutex); @@ -360,14 +386,11 @@ static void reg_regdb_search(struct work_struct *work) list); list_del(&request->list); - for (i=0; i<reg_regdb_size; i++) { + for (i = 0; i < reg_regdb_size; i++) { curdom = reg_regdb[i]; - if (!memcmp(request->alpha2, curdom->alpha2, 2)) { - r = reg_copy_regd(&regdom, curdom); - if (r) - break; - set_reg = true; + if (alpha2_equal(request->alpha2, curdom->alpha2)) { + regdom = reg_copy_regd(curdom); break; } } @@ -376,7 +399,7 @@ static void reg_regdb_search(struct work_struct *work) } mutex_unlock(&reg_regdb_search_mutex); - if (set_reg) + if (!IS_ERR_OR_NULL(regdom)) set_regdom(regdom); mutex_unlock(&cfg80211_mutex); @@ -434,15 +457,14 @@ static int call_crda(const char *alpha2) return kobject_uevent(&reg_pdev->dev.kobj, KOBJ_CHANGE); } -/* Used by nl80211 before kmalloc'ing our regulatory domain */ -bool reg_is_valid_request(const char *alpha2) +static bool reg_is_valid_request(const char *alpha2) { - assert_cfg80211_lock(); + struct regulatory_request *lr = get_last_request(); - if (!last_request) + if (!lr || lr->processed) return false; - return alpha2_equal(last_request->alpha2, alpha2); + return alpha2_equal(lr->alpha2, alpha2); } /* Sanity check on a regulatory rule */ @@ -460,7 +482,7 @@ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; if (freq_range->end_freq_khz <= freq_range->start_freq_khz || - freq_range->max_bandwidth_khz > freq_diff) + freq_range->max_bandwidth_khz > freq_diff) return false; return true; @@ -487,8 +509,7 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd) } static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range, - u32 center_freq_khz, - u32 bw_khz) + u32 center_freq_khz, u32 bw_khz) { u32 start_freq_khz, end_freq_khz; @@ -518,7 +539,7 @@ static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range, * regulatory rule support for other "bands".
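reg_does_bw_fit() above reduces to plain interval containment; a distilled restatement under the same semantics (the helper name is invented for illustration):

/* a channel of width bw_khz centred on center_freq_khz fits a rule iff
 * the whole span lies inside the rule's frequency range */
static bool bw_fits_sketch(const struct ieee80211_freq_range *fr,
			   u32 center_freq_khz, u32 bw_khz)
{
	u32 start_freq_khz = center_freq_khz - bw_khz / 2;
	u32 end_freq_khz = center_freq_khz + bw_khz / 2;

	return start_freq_khz >= fr->start_freq_khz &&
	       end_freq_khz <= fr->end_freq_khz;
}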
**/ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, - u32 freq_khz) + u32 freq_khz) { #define ONE_GHZ_IN_KHZ 1000000 /* @@ -540,10 +561,9 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, * Helper for regdom_intersect(), this does the real * mathematical intersection fun */ -static int reg_rules_intersect( - const struct ieee80211_reg_rule *rule1, - const struct ieee80211_reg_rule *rule2, - struct ieee80211_reg_rule *intersected_rule) +static int reg_rules_intersect(const struct ieee80211_reg_rule *rule1, + const struct ieee80211_reg_rule *rule2, + struct ieee80211_reg_rule *intersected_rule) { const struct ieee80211_freq_range *freq_range1, *freq_range2; struct ieee80211_freq_range *freq_range; @@ -560,11 +580,11 @@ static int reg_rules_intersect( power_rule = &intersected_rule->power_rule; freq_range->start_freq_khz = max(freq_range1->start_freq_khz, - freq_range2->start_freq_khz); + freq_range2->start_freq_khz); freq_range->end_freq_khz = min(freq_range1->end_freq_khz, - freq_range2->end_freq_khz); + freq_range2->end_freq_khz); freq_range->max_bandwidth_khz = min(freq_range1->max_bandwidth_khz, - freq_range2->max_bandwidth_khz); + freq_range2->max_bandwidth_khz); freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; if (freq_range->max_bandwidth_khz > freq_diff) @@ -575,7 +595,7 @@ static int reg_rules_intersect( power_rule->max_antenna_gain = min(power_rule1->max_antenna_gain, power_rule2->max_antenna_gain); - intersected_rule->flags = (rule1->flags | rule2->flags); + intersected_rule->flags = rule1->flags | rule2->flags; if (!is_valid_reg_rule(intersected_rule)) return -EINVAL; @@ -596,9 +616,9 @@ static int reg_rules_intersect( * resulting intersection of rules between rd1 and rd2. We will * kzalloc() this structure for you. 
*/ -static struct ieee80211_regdomain *regdom_intersect( - const struct ieee80211_regdomain *rd1, - const struct ieee80211_regdomain *rd2) +static struct ieee80211_regdomain * +regdom_intersect(const struct ieee80211_regdomain *rd1, + const struct ieee80211_regdomain *rd2) { int r, size_of_regd; unsigned int x, y; @@ -607,12 +627,7 @@ static struct ieee80211_regdomain *regdom_intersect( struct ieee80211_reg_rule *intersected_rule; struct ieee80211_regdomain *rd; /* This is just a dummy holder to help us count */ - struct ieee80211_reg_rule irule; - - /* Uses the stack temporarily for counter arithmetic */ - intersected_rule = &irule; - - memset(intersected_rule, 0, sizeof(struct ieee80211_reg_rule)); + struct ieee80211_reg_rule dummy_rule; if (!rd1 || !rd2) return NULL; @@ -629,11 +644,8 @@ static struct ieee80211_regdomain *regdom_intersect( rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; - if (!reg_rules_intersect(rule1, rule2, - intersected_rule)) + if (!reg_rules_intersect(rule1, rule2, &dummy_rule)) num_rules++; - memset(intersected_rule, 0, - sizeof(struct ieee80211_reg_rule)); } } @@ -641,15 +653,15 @@ static struct ieee80211_regdomain *regdom_intersect( return NULL; size_of_regd = sizeof(struct ieee80211_regdomain) + - ((num_rules + 1) * sizeof(struct ieee80211_reg_rule)); + num_rules * sizeof(struct ieee80211_reg_rule); rd = kzalloc(size_of_regd, GFP_KERNEL); if (!rd) return NULL; - for (x = 0; x < rd1->n_reg_rules; x++) { + for (x = 0; x < rd1->n_reg_rules && rule_idx < num_rules; x++) { rule1 = &rd1->reg_rules[x]; - for (y = 0; y < rd2->n_reg_rules; y++) { + for (y = 0; y < rd2->n_reg_rules && rule_idx < num_rules; y++) { rule2 = &rd2->reg_rules[y]; /* * This time around instead of using the stack lets @@ -657,8 +669,7 @@ static struct ieee80211_regdomain *regdom_intersect( * a memcpy() */ intersected_rule = &rd->reg_rules[rule_idx]; - r = reg_rules_intersect(rule1, rule2, - intersected_rule); + r = reg_rules_intersect(rule1, rule2, intersected_rule); /* * No need to memset here the intersected rule here as * we're not using the stack anymore @@ -699,34 +710,16 @@ static u32 map_regdom_flags(u32 rd_flags) return channel_flags; } -static int freq_reg_info_regd(struct wiphy *wiphy, - u32 center_freq, - u32 desired_bw_khz, - const struct ieee80211_reg_rule **reg_rule, - const struct ieee80211_regdomain *custom_regd) +static const struct ieee80211_reg_rule * +freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, + const struct ieee80211_regdomain *regd) { int i; bool band_rule_found = false; - const struct ieee80211_regdomain *regd; bool bw_fits = false; - if (!desired_bw_khz) - desired_bw_khz = MHZ_TO_KHZ(20); - - regd = custom_regd ? 
custom_regd : cfg80211_regdomain; - - /* - * Follow the driver's regulatory domain, if present, unless a country - * IE has been processed or a user wants to help complaince further - */ - if (!custom_regd && - last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && - last_request->initiator != NL80211_REGDOM_SET_BY_USER && - wiphy->regd) - regd = wiphy->regd; - if (!regd) - return -EINVAL; + return ERR_PTR(-EINVAL); for (i = 0; i < regd->n_reg_rules; i++) { const struct ieee80211_reg_rule *rr; @@ -743,33 +736,36 @@ static int freq_reg_info_regd(struct wiphy *wiphy, if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); - bw_fits = reg_does_bw_fit(fr, - center_freq, - desired_bw_khz); + bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(20)); - if (band_rule_found && bw_fits) { - *reg_rule = rr; - return 0; - } + if (band_rule_found && bw_fits) + return rr; } if (!band_rule_found) - return -ERANGE; + return ERR_PTR(-ERANGE); - return -EINVAL; + return ERR_PTR(-EINVAL); } -int freq_reg_info(struct wiphy *wiphy, - u32 center_freq, - u32 desired_bw_khz, - const struct ieee80211_reg_rule **reg_rule) +const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, + u32 center_freq) { - assert_cfg80211_lock(); - return freq_reg_info_regd(wiphy, - center_freq, - desired_bw_khz, - reg_rule, - NULL); + const struct ieee80211_regdomain *regd; + struct regulatory_request *lr = get_last_request(); + + /* + * Follow the driver's regulatory domain, if present, unless a country + * IE has been processed or a user wants to help compliance further + */ + if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + lr->initiator != NL80211_REGDOM_SET_BY_USER && + wiphy->regd) + regd = get_wiphy_regdom(wiphy); + else + regd = get_cfg80211_regdom(); + + return freq_reg_info_regd(wiphy, center_freq, regd); } EXPORT_SYMBOL(freq_reg_info); @@ -792,7 +788,6 @@ static const char *reg_initiator_name(enum nl80211_reg_initiator initiator) } static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, - u32 desired_bw_khz, const struct ieee80211_reg_rule *reg_rule) { const struct ieee80211_power_rule *power_rule; @@ -807,21 +802,16 @@ static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, else snprintf(max_antenna_gain, 32, "%d", power_rule->max_antenna_gain); - REG_DBG_PRINT("Updating information on frequency %d MHz " - "for a %d MHz width channel with regulatory rule:\n", - chan->center_freq, - KHZ_TO_MHZ(desired_bw_khz)); + REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n", + chan->center_freq); REG_DBG_PRINT("%d KHz - %d KHz @ %d KHz), (%s mBi, %d mBm)\n", - freq_range->start_freq_khz, - freq_range->end_freq_khz, - freq_range->max_bandwidth_khz, - max_antenna_gain, + freq_range->start_freq_khz, freq_range->end_freq_khz, + freq_range->max_bandwidth_khz, max_antenna_gain, power_rule->max_eirp); } #else static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, - u32 desired_bw_khz, const struct ieee80211_reg_rule *reg_rule) { return; @@ -831,43 +821,25 @@ static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, /* * Note that right now we assume the desired channel bandwidth * is always 20 MHz for each individual channel (HT40 uses 20 MHz - * per channel, the primary and the extension channel). To support - * smaller custom bandwidths such as 5 MHz or 10 MHz we'll need a - * new ieee80211_channel.target_bw and re run the regulatory check - * on the wiphy with the target_bw specified.
Then we can simply use - that below for the desired_bw_khz below. + * per channel, the primary and the extension channel). */ static void handle_channel(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, - enum ieee80211_band band, - unsigned int chan_idx) + struct ieee80211_channel *chan) { - int r; u32 flags, bw_flags = 0; - u32 desired_bw_khz = MHZ_TO_KHZ(20); const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; - struct ieee80211_supported_band *sband; - struct ieee80211_channel *chan; struct wiphy *request_wiphy = NULL; + struct regulatory_request *lr = get_last_request(); - assert_cfg80211_lock(); - - request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); - - sband = wiphy->bands[band]; - BUG_ON(chan_idx >= sband->n_channels); - chan = &sband->channels[chan_idx]; + request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); flags = chan->orig_flags; - r = freq_reg_info(wiphy, - MHZ_TO_KHZ(chan->center_freq), - desired_bw_khz, - &reg_rule); - - if (r) { + reg_rule = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq)); + if (IS_ERR(reg_rule)) { /* * We will disable all channels that do not match our * received regulatory rule unless the hint is coming @@ -879,23 +851,27 @@ static void handle_channel(struct wiphy *wiphy, * while 5 GHz is still supported. */ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && - r == -ERANGE) + PTR_ERR(reg_rule) == -ERANGE) return; REG_DBG_PRINT("Disabling freq %d MHz\n", chan->center_freq); - chan->flags = IEEE80211_CHAN_DISABLED; + chan->flags |= IEEE80211_CHAN_DISABLED; return; } - chan_reg_rule_print_dbg(chan, desired_bw_khz, reg_rule); + chan_reg_rule_print_dbg(chan, reg_rule); power_rule = &reg_rule->power_rule; freq_range = &reg_rule->freq_range; if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(40)) bw_flags = IEEE80211_CHAN_NO_HT40; + if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) { /* @@ -912,10 +888,14 @@ static void handle_channel(struct wiphy *wiphy, return; } + chan->dfs_state = NL80211_DFS_USABLE; + chan->dfs_state_entered = jiffies; + chan->beacon_found = false; chan->flags = flags | bw_flags | map_regdom_flags(reg_rule->flags); - chan->max_antenna_gain = min(chan->orig_mag, - (int) MBI_TO_DBI(power_rule->max_antenna_gain)); + chan->max_antenna_gain = + min_t(int, chan->orig_mag, + MBI_TO_DBI(power_rule->max_antenna_gain)); chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp); if (chan->orig_mpwr) { /* @@ -935,68 +915,65 @@ static void handle_channel(struct wiphy *wiphy, } static void handle_band(struct wiphy *wiphy, - enum ieee80211_band band, - enum nl80211_reg_initiator initiator) + enum nl80211_reg_initiator initiator, + struct ieee80211_supported_band *sband) { unsigned int i; - struct ieee80211_supported_band *sband; - BUG_ON(!wiphy->bands[band]); - sband = wiphy->bands[band]; + if (!sband) + return; for (i = 0; i < sband->n_channels; i++) - handle_channel(wiphy, initiator, band, i); + handle_channel(wiphy, initiator, &sband->channels[i]); } static bool reg_request_cell_base(struct regulatory_request *request) { if (request->initiator != NL80211_REGDOM_SET_BY_USER) return false;
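The three cumulative bandwidth checks introduced above generalize to a small mapping from a rule's maximum bandwidth to forbidden channel types; a distilled sketch (the helper name is invented, the flag and macro names are the ones used in this hunk):

static u32 reg_rule_to_bw_flags(const struct ieee80211_freq_range *freq_range)
{
	u32 bw_flags = 0;

	/* a rule narrower than N MHz forbids every channel type of width >= N */
	if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(40))
		bw_flags |= IEEE80211_CHAN_NO_HT40;
	if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(80))
		bw_flags |= IEEE80211_CHAN_NO_80MHZ;
	if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(160))
		bw_flags |= IEEE80211_CHAN_NO_160MHZ;

	return bw_flags;	/* e.g. a 40 MHz rule yields NO_80MHZ | NO_160MHZ */
}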
- if (request->user_reg_hint_type != NL80211_USER_REG_HINT_CELL_BASE) - return false; - return true; + return request->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE; } bool reg_last_request_cell_base(void) { bool val; - assert_cfg80211_lock(); mutex_lock(&reg_mutex); - val = reg_request_cell_base(last_request); + val = reg_request_cell_base(get_last_request()); mutex_unlock(&reg_mutex); + return val; } #ifdef CONFIG_CFG80211_CERTIFICATION_ONUS - /* Core specific check */ -static int reg_ignore_cell_hint(struct regulatory_request *pending_request) +static enum reg_request_treatment +reg_ignore_cell_hint(struct regulatory_request *pending_request) { + struct regulatory_request *lr = get_last_request(); + if (!reg_num_devs_support_basehint) - return -EOPNOTSUPP; + return REG_REQ_IGNORE; - if (reg_request_cell_base(last_request)) { - if (!regdom_changes(pending_request->alpha2)) - return -EALREADY; - return 0; - } - return 0; + if (reg_request_cell_base(lr) && + !regdom_changes(pending_request->alpha2)) + return REG_REQ_ALREADY_SET; + + return REG_REQ_OK; } /* Device specific check */ static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy) { - if (!(wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS)) - return true; - return false; + return !(wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS); } #else static int reg_ignore_cell_hint(struct regulatory_request *pending_request) { - return -EOPNOTSUPP; + return REG_REQ_IGNORE; } -static int reg_dev_ignore_cell_hint(struct wiphy *wiphy) + +static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy) { return true; } @@ -1006,18 +983,17 @@ static int reg_dev_ignore_cell_hint(struct wiphy *wiphy) static bool ignore_reg_update(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { - if (!last_request) { - REG_DBG_PRINT("Ignoring regulatory request %s since " - "last_request is not set\n", + struct regulatory_request *lr = get_last_request(); + + if (!lr) { + REG_DBG_PRINT("Ignoring regulatory request %s since last_request is not set\n", reg_initiator_name(initiator)); return true; } if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) { - REG_DBG_PRINT("Ignoring regulatory request %s " - "since the driver uses its own custom " - "regulatory domain\n", + REG_DBG_PRINT("Ignoring regulatory request %s since the driver uses its own custom regulatory domain\n", reg_initiator_name(initiator)); return true; } @@ -1028,22 +1004,35 @@ static bool ignore_reg_update(struct wiphy *wiphy, */ if (wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY && !wiphy->regd && initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && - !is_world_regdom(last_request->alpha2)) { - REG_DBG_PRINT("Ignoring regulatory request %s " - "since the driver requires its own regulatory " - "domain to be set first\n", + !is_world_regdom(lr->alpha2)) { - REG_DBG_PRINT("Ignoring regulatory request %s since the driver requires its own regulatory domain to be set first\n", reg_initiator_name(initiator)); return true; } - if (reg_request_cell_base(last_request)) + if (reg_request_cell_base(lr)) return reg_dev_ignore_cell_hint(wiphy); return false; } -static void handle_reg_beacon(struct wiphy *wiphy, - unsigned int chan_idx, +static bool reg_is_world_roaming(struct wiphy *wiphy) +{ + const struct ieee80211_regdomain *cr = get_cfg80211_regdom(); + const struct ieee80211_regdomain *wr = get_wiphy_regdom(wiphy); + struct regulatory_request *lr = get_last_request(); + + if (is_world_regdom(cr->alpha2) || (wr && is_world_regdom(wr->alpha2))) + return true; + + if
(lr && lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) + return true; + + return false; +} + +static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, struct reg_beacon *reg_beacon) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; bool channel_changed = false; struct ieee80211_channel chan_before; - assert_cfg80211_lock(); - sband = wiphy->bands[reg_beacon->chan.band]; chan = &sband->channels[chan_idx]; @@ -1064,6 +1051,9 @@ static void handle_reg_beacon(struct wiphy *wiphy, chan->beacon_found = true; + if (!reg_is_world_roaming(wiphy)) + return; + if (wiphy->flags & WIPHY_FLAG_DISABLE_BEACON_HINTS) return; @@ -1094,8 +1084,6 @@ static void wiphy_update_new_beacon(struct wiphy *wiphy, unsigned int i; struct ieee80211_supported_band *sband; - assert_cfg80211_lock(); - if (!wiphy->bands[reg_beacon->chan.band]) return; @@ -1114,11 +1102,6 @@ static void wiphy_update_beacon_reg(struct wiphy *wiphy) struct ieee80211_supported_band *sband; struct reg_beacon *reg_beacon; - assert_cfg80211_lock(); - - if (list_empty(&reg_beacon_list)) - return; - list_for_each_entry(reg_beacon, &reg_beacon_list, list) { if (!wiphy->bands[reg_beacon->chan.band]) continue; @@ -1128,18 +1111,6 @@ static void wiphy_update_beacon_reg(struct wiphy *wiphy) } } -static bool reg_is_world_roaming(struct wiphy *wiphy) -{ - if (is_world_regdom(cfg80211_regdomain->alpha2) || - (wiphy->regd && is_world_regdom(wiphy->regd->alpha2))) - return true; - if (last_request && - last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && - wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) - return true; - return false; -} - /* Reap the advantages of previously found beacons */ static void reg_process_beacons(struct wiphy *wiphy) { @@ -1149,39 +1120,29 @@ static void reg_process_beacons(struct wiphy *wiphy) */ if (!last_request) return; - if (!reg_is_world_roaming(wiphy)) - return; wiphy_update_beacon_reg(wiphy); } -static bool is_ht40_not_allowed(struct ieee80211_channel *chan) +static bool is_ht40_allowed(struct ieee80211_channel *chan) { if (!chan) - return true; + return false; if (chan->flags & IEEE80211_CHAN_DISABLED) - return true; + return false; /* This would happen when regulatory rules disallow HT40 completely */ - if (IEEE80211_CHAN_NO_HT40 == (chan->flags & (IEEE80211_CHAN_NO_HT40))) - return true; - return false; + if ((chan->flags & IEEE80211_CHAN_NO_HT40) == IEEE80211_CHAN_NO_HT40) - return false; + return true; } static void reg_process_ht_flags_channel(struct wiphy *wiphy, - enum ieee80211_band band, - unsigned int chan_idx) + struct ieee80211_channel *channel) { - struct ieee80211_supported_band *sband; - struct ieee80211_channel *channel; + struct ieee80211_supported_band *sband = wiphy->bands[channel->band]; struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; unsigned int i; - assert_cfg80211_lock(); - - sband = wiphy->bands[band]; - BUG_ON(chan_idx >= sband->n_channels); - channel = &sband->channels[chan_idx]; - - if (is_ht40_not_allowed(channel)) { + if (!is_ht40_allowed(channel)) { channel->flags |= IEEE80211_CHAN_NO_HT40; return; } @@ -1192,6 +1153,7 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, */ for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *c = &sband->channels[i]; + if (c->center_freq == (channel->center_freq - 20)) channel_before = c; if (c->center_freq == (channel->center_freq + 20)) @@ -1203,28 +1165,27 @@ static void
reg_process_ht_flags_channel(struct wiphy *wiphy, * if that ever changes we also need to change the below logic * to include that as well. */ - if (is_ht40_not_allowed(channel_before)) + if (!is_ht40_allowed(channel_before)) channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; - if (is_ht40_not_allowed(channel_after)) + if (!is_ht40_allowed(channel_after)) channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; } static void reg_process_ht_flags_band(struct wiphy *wiphy, - enum ieee80211_band band) + struct ieee80211_supported_band *sband) { unsigned int i; - struct ieee80211_supported_band *sband; - BUG_ON(!wiphy->bands[band]); - sband = wiphy->bands[band]; + if (!sband) + return; for (i = 0; i < sband->n_channels; i++) - reg_process_ht_flags_channel(wiphy, band, i); + reg_process_ht_flags_channel(wiphy, &sband->channels[i]); } static void reg_process_ht_flags(struct wiphy *wiphy) @@ -1234,34 +1195,29 @@ static void reg_process_ht_flags(struct wiphy *wiphy) if (!wiphy) return; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { - if (wiphy->bands[band]) - reg_process_ht_flags_band(wiphy, band); - } - + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + reg_process_ht_flags_band(wiphy, wiphy->bands[band]); } static void wiphy_update_regulatory(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { enum ieee80211_band band; - - assert_reg_lock(); + struct regulatory_request *lr = get_last_request(); if (ignore_reg_update(wiphy, initiator)) return; - last_request->dfs_region = cfg80211_regdomain->dfs_region; + lr->dfs_region = get_cfg80211_regdom()->dfs_region; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { - if (wiphy->bands[band]) - handle_band(wiphy, band, initiator); - } + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + handle_band(wiphy, initiator, wiphy->bands[band]); reg_process_beacons(wiphy); reg_process_ht_flags(wiphy); + if (wiphy->reg_notifier) - wiphy->reg_notifier(wiphy, last_request); + wiphy->reg_notifier(wiphy, lr); } static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) @@ -1269,6 +1225,8 @@ static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) struct cfg80211_registered_device *rdev; struct wiphy *wiphy; + assert_cfg80211_lock(); + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { wiphy = &rdev->wiphy; wiphy_update_regulatory(wiphy, initiator); @@ -1280,53 +1238,40 @@ static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY && wiphy->reg_notifier) - wiphy->reg_notifier(wiphy, last_request); + wiphy->reg_notifier(wiphy, get_last_request()); } } static void handle_channel_custom(struct wiphy *wiphy, - enum ieee80211_band band, - unsigned int chan_idx, + struct ieee80211_channel *chan, const struct ieee80211_regdomain *regd) { - int r; - u32 desired_bw_khz = MHZ_TO_KHZ(20); u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; - struct ieee80211_supported_band *sband; - struct ieee80211_channel *chan; - - assert_reg_lock(); - - sband = wiphy->bands[band]; - BUG_ON(chan_idx >= sband->n_channels); - chan = &sband->channels[chan_idx]; - r = freq_reg_info_regd(wiphy, - MHZ_TO_KHZ(chan->center_freq), - desired_bw_khz, - &reg_rule, - regd); + reg_rule = freq_reg_info_regd(wiphy,
MHZ_TO_KHZ(chan->center_freq), + regd); - if (r) { - REG_DBG_PRINT("Disabling freq %d MHz as custom " - "regd has no rule that fits a %d MHz " - "wide channel\n", - chan->center_freq, - KHZ_TO_MHZ(desired_bw_khz)); + if (IS_ERR(reg_rule)) { + REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n", + chan->center_freq); chan->flags = IEEE80211_CHAN_DISABLED; return; } - chan_reg_rule_print_dbg(chan, desired_bw_khz, reg_rule); + chan_reg_rule_print_dbg(chan, reg_rule); power_rule = &reg_rule->power_rule; freq_range = &reg_rule->freq_range; if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(40)) bw_flags = IEEE80211_CHAN_NO_HT40; + if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; chan->flags |= map_regdom_flags(reg_rule->flags) | bw_flags; chan->max_antenna_gain = (int) MBI_TO_DBI(power_rule->max_antenna_gain); @@ -1334,17 +1279,17 @@ static void handle_channel_custom(struct wiphy *wiphy, (int) MBM_TO_DBM(power_rule->max_eirp); } -static void handle_band_custom(struct wiphy *wiphy, enum ieee80211_band band, +static void handle_band_custom(struct wiphy *wiphy, + struct ieee80211_supported_band *sband, const struct ieee80211_regdomain *regd) { unsigned int i; - struct ieee80211_supported_band *sband; - BUG_ON(!wiphy->bands[band]); - sband = wiphy->bands[band]; + if (!sband) + return; for (i = 0; i < sband->n_channels; i++) - handle_channel_custom(wiphy, band, i, regd); + handle_channel_custom(wiphy, &sband->channels[i], regd); } /* Used by drivers prior to wiphy registration */ @@ -1354,60 +1299,50 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, enum ieee80211_band band; unsigned int bands_set = 0; - mutex_lock(&reg_mutex); for (band = 0; band < IEEE80211_NUM_BANDS; band++) { if (!wiphy->bands[band]) continue; - handle_band_custom(wiphy, band, regd); + handle_band_custom(wiphy, wiphy->bands[band], regd); bands_set++; } - mutex_unlock(&reg_mutex); /* * no point in calling this if it won't have any effect - * on your device's supportd bands. + * on your device's supported bands. */ WARN_ON(!bands_set); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); -/* - * Return value which can be used by ignore_request() to indicate - * it has been determined we should intersect two regulatory domains - */ -#define REG_INTERSECT 1 - /* This has the logic which determines when a new request * should be ignored.
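For context, wiphy_apply_custom_regulatory() above is meant to be fed a static table built with REG_RULE(), before wiphy_register(). A minimal sketch of the driver side; the rule values are illustrative only:

static const struct ieee80211_regdomain sketch_regdom = {
	.n_reg_rules = 2,
	.alpha2 = "99",	/* regdomain built by the driver itself */
	.reg_rules = {
		/* IEEE 802.11b/g, channels 1..11 */
		REG_RULE(2412 - 10, 2462 + 10, 40, 6, 20, 0),
		/* IEEE 802.11a, channels 36..48, passive scan */
		REG_RULE(5180 - 10, 5240 + 10, 40, 6, 20,
			 NL80211_RRF_PASSIVE_SCAN),
	},
};

	/* in the driver, before wiphy_register() */
	wiphy->flags |= WIPHY_FLAG_CUSTOM_REGULATORY;
	wiphy_apply_custom_regulatory(wiphy, &sketch_regdom);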
*/ -static int ignore_request(struct wiphy *wiphy, +static enum reg_request_treatment +get_reg_request_treatment(struct wiphy *wiphy, struct regulatory_request *pending_request) { struct wiphy *last_wiphy = NULL; - - assert_cfg80211_lock(); + struct regulatory_request *lr = get_last_request(); /* All initial requests are respected */ - if (!last_request) - return 0; + if (!lr) + return REG_REQ_OK; switch (pending_request->initiator) { case NL80211_REGDOM_SET_BY_CORE: - return 0; + return REG_REQ_OK; case NL80211_REGDOM_SET_BY_COUNTRY_IE: - - if (reg_request_cell_base(last_request)) { + if (reg_request_cell_base(lr)) { /* Trust a Cell base station over the AP's country IE */ if (regdom_changes(pending_request->alpha2)) - return -EOPNOTSUPP; - return -EALREADY; + return REG_REQ_IGNORE; + return REG_REQ_ALREADY_SET; } - last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + last_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); if (unlikely(!is_an_alpha2(pending_request->alpha2))) return -EINVAL; - if (last_request->initiator == - NL80211_REGDOM_SET_BY_COUNTRY_IE) { + if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) { if (last_wiphy != wiphy) { /* * Two cards with two APs claiming different @@ -1416,23 +1351,23 @@ static int ignore_request(struct wiphy *wiphy, * to be correct. Reject second one for now. */ if (regdom_changes(pending_request->alpha2)) - return -EOPNOTSUPP; - return -EALREADY; + return REG_REQ_IGNORE; + return REG_REQ_ALREADY_SET; } /* * Two consecutive Country IE hints on the same wiphy. * This should be picked up early by the driver/stack */ if (WARN_ON(regdom_changes(pending_request->alpha2))) - return 0; - return -EALREADY; + return REG_REQ_OK; + return REG_REQ_ALREADY_SET; } return 0; case NL80211_REGDOM_SET_BY_DRIVER: - if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) { + if (lr->initiator == NL80211_REGDOM_SET_BY_CORE) { if (regdom_changes(pending_request->alpha2)) - return 0; - return -EALREADY; + return REG_REQ_OK; + return REG_REQ_ALREADY_SET; } /* @@ -1440,59 +1375,59 @@ static int ignore_request(struct wiphy *wiphy, * back in or if you add a new device for which the previously * loaded card also agrees on the regulatory domain. 
*/ - if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && !regdom_changes(pending_request->alpha2)) - return -EALREADY; + return REG_REQ_ALREADY_SET; - return REG_INTERSECT; + return REG_REQ_INTERSECT; case NL80211_REGDOM_SET_BY_USER: if (reg_request_cell_base(pending_request)) return reg_ignore_cell_hint(pending_request); - if (reg_request_cell_base(last_request)) - return -EOPNOTSUPP; + if (reg_request_cell_base(lr)) + return REG_REQ_IGNORE; - if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) - return REG_INTERSECT; + if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) + return REG_REQ_INTERSECT; /* * If the user knows better the user should set the regdom * to their country before the IE is picked up */ - if (last_request->initiator == NL80211_REGDOM_SET_BY_USER && - last_request->intersect) - return -EOPNOTSUPP; + if (lr->initiator == NL80211_REGDOM_SET_BY_USER && + lr->intersect) + return REG_REQ_IGNORE; /* * Process user requests only after previous user/driver/core * requests have been processed */ - if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE || - last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER || - last_request->initiator == NL80211_REGDOM_SET_BY_USER) { - if (regdom_changes(last_request->alpha2)) - return -EAGAIN; - } + if ((lr->initiator == NL80211_REGDOM_SET_BY_CORE || + lr->initiator == NL80211_REGDOM_SET_BY_DRIVER || + lr->initiator == NL80211_REGDOM_SET_BY_USER) && + regdom_changes(lr->alpha2)) + return REG_REQ_IGNORE; if (!regdom_changes(pending_request->alpha2)) - return -EALREADY; + return REG_REQ_ALREADY_SET; - return 0; + return REG_REQ_OK; } - return -EINVAL; + return REG_REQ_IGNORE; } static void reg_set_request_processed(void) { bool need_more_processing = false; + struct regulatory_request *lr = get_last_request(); - last_request->processed = true; + lr->processed = true; spin_lock(&reg_requests_lock); if (!list_empty(&reg_requests_list)) need_more_processing = true; spin_unlock(&reg_requests_lock); - if (last_request->initiator == NL80211_REGDOM_SET_BY_USER) + if (lr->initiator == NL80211_REGDOM_SET_BY_USER) cancel_delayed_work(&reg_timeout); if (need_more_processing) @@ -1508,116 +1443,122 @@ static void reg_set_request_processed(void) * The Wireless subsystem can use this function to hint to the wireless core * what it believes should be the current regulatory domain. * - * Returns zero if all went fine, %-EALREADY if a regulatory domain had - * already been set or other standard error codes. + * Returns one of the different reg request treatment values.
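Several helpers in this file (reg_copy_regd(), freq_reg_info(), freq_reg_info_regd()) now return either a valid pointer or an errno encoded with ERR_PTR() instead of an int plus an out-parameter. The consuming side is always the same shape, as in __regulatory_hint() just below; a minimal sketch:

	const struct ieee80211_regdomain *regd;

	regd = reg_copy_regd(get_cfg80211_regdom());
	if (IS_ERR(regd))
		return PTR_ERR(regd);	/* e.g. -ENOMEM from kzalloc() */
	rcu_assign_pointer(wiphy->regd, regd);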
* - * Caller must hold &cfg80211_mutex and &reg_mutex + * Caller must hold &reg_mutex */ -static int __regulatory_hint(struct wiphy *wiphy, - struct regulatory_request *pending_request) +static enum reg_request_treatment +__regulatory_hint(struct wiphy *wiphy, + struct regulatory_request *pending_request) { + const struct ieee80211_regdomain *regd; bool intersect = false; - int r = 0; - - assert_cfg80211_lock(); + enum reg_request_treatment treatment; + struct regulatory_request *lr; - r = ignore_request(wiphy, pending_request); + treatment = get_reg_request_treatment(wiphy, pending_request); - if (r == REG_INTERSECT) { + switch (treatment) { + case REG_REQ_INTERSECT: if (pending_request->initiator == NL80211_REGDOM_SET_BY_DRIVER) { - r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); - if (r) { + regd = reg_copy_regd(get_cfg80211_regdom()); + if (IS_ERR(regd)) { kfree(pending_request); - return r; + return PTR_ERR(regd); } + rcu_assign_pointer(wiphy->regd, regd); } intersect = true; - } else if (r) { + break; + case REG_REQ_OK: + break; + default: /* * If the regulatory domain being requested by the * driver has already been set just copy it to the * wiphy */ - if (r == -EALREADY && - pending_request->initiator == - NL80211_REGDOM_SET_BY_DRIVER) { - r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); - if (r) { + if (treatment == REG_REQ_ALREADY_SET && + pending_request->initiator == NL80211_REGDOM_SET_BY_DRIVER) { + regd = reg_copy_regd(get_cfg80211_regdom()); + if (IS_ERR(regd)) { kfree(pending_request); - return r; + return REG_REQ_IGNORE; } - r = -EALREADY; + treatment = REG_REQ_ALREADY_SET; + rcu_assign_pointer(wiphy->regd, regd); goto new_request; } kfree(pending_request); - return r; + return treatment; } new_request: - if (last_request != &core_request_world) - kfree(last_request); + lr = get_last_request(); + if (lr != &core_request_world && lr) + kfree_rcu(lr, rcu_head); - last_request = pending_request; - last_request->intersect = intersect; + pending_request->intersect = intersect; + pending_request->processed = false; + rcu_assign_pointer(last_request, pending_request); + lr = pending_request; pending_request = NULL; - if (last_request->initiator == NL80211_REGDOM_SET_BY_USER) { - user_alpha2[0] = last_request->alpha2[0]; - user_alpha2[1] = last_request->alpha2[1]; + if (lr->initiator == NL80211_REGDOM_SET_BY_USER) { + user_alpha2[0] = lr->alpha2[0]; + user_alpha2[1] = lr->alpha2[1]; } - /* When r == REG_INTERSECT we do need to call CRDA */ - if (r < 0) { + /* When treatment is REG_REQ_INTERSECT we do need to call CRDA */ + if (treatment != REG_REQ_OK && treatment != REG_REQ_INTERSECT) { /* * Since CRDA will not be called in this case as we already * have applied the requested regulatory domain before we just * inform userspace we have processed the request */ - if (r == -EALREADY) { - nl80211_send_reg_change_event(last_request); + if (treatment == REG_REQ_ALREADY_SET) { + nl80211_send_reg_change_event(lr); reg_set_request_processed(); } - return r; + return treatment; } - return call_crda(last_request->alpha2); + if (call_crda(lr->alpha2)) + return REG_REQ_IGNORE; + return REG_REQ_OK; } /* This processes *all* regulatory hints */ static void reg_process_hint(struct regulatory_request *reg_request, enum nl80211_reg_initiator reg_initiator) { - int r = 0; struct wiphy *wiphy = NULL; - BUG_ON(!reg_request->alpha2); + if (WARN_ON(!reg_request->alpha2)) + return; - if (wiphy_idx_valid(reg_request->wiphy_idx)) + if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy =
wiphy_idx_to_wiphy(reg_request->wiphy_idx); - if (reg_initiator == NL80211_REGDOM_SET_BY_DRIVER && - !wiphy) { + if (reg_initiator == NL80211_REGDOM_SET_BY_DRIVER && !wiphy) { kfree(reg_request); return; } - r = __regulatory_hint(wiphy, reg_request); - /* This is required so that the orig_* parameters are saved */ - if (r == -EALREADY && wiphy && - wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) { - wiphy_update_regulatory(wiphy, reg_initiator); - return; + switch (__regulatory_hint(wiphy, reg_request)) { + case REG_REQ_ALREADY_SET: + /* This is required so that the orig_* parameters are saved */ + if (wiphy && wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) + wiphy_update_regulatory(wiphy, reg_initiator); + break; + default: + if (reg_initiator == NL80211_REGDOM_SET_BY_USER) + schedule_delayed_work(&reg_timeout, + msecs_to_jiffies(3142)); + break; } - - /* - * We only time out user hints, given that they should be the only - * source of bogus requests. - */ - if (r != -EALREADY && - reg_initiator == NL80211_REGDOM_SET_BY_USER) - schedule_delayed_work(&reg_timeout, msecs_to_jiffies(3142)); } /* @@ -1627,15 +1568,15 @@ static void reg_process_hint(struct regulatory_request *reg_request, */ static void reg_process_pending_hints(void) { - struct regulatory_request *reg_request; + struct regulatory_request *reg_request, *lr; mutex_lock(&cfg80211_mutex); mutex_lock(&reg_mutex); + lr = get_last_request(); /* When last_request->processed becomes true this will be rescheduled */ - if (last_request && !last_request->processed) { - REG_DBG_PRINT("Pending regulatory request, waiting " - "for it to be processed...\n"); + if (lr && !lr->processed) { + REG_DBG_PRINT("Pending regulatory request, waiting for it to be processed...\n"); goto out; } @@ -1666,23 +1607,14 @@ static void reg_process_pending_beacon_hints(void) struct cfg80211_registered_device *rdev; struct reg_beacon *pending_beacon, *tmp; - /* - * No need to hold the reg_mutex here as we just touch wiphys - * and do not read or access regulatory variables.
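The last_request handoff in __regulatory_hint() above is the usual RCU publish/retire idiom: readers go through rcu_dereference()-style accessors (get_last_request()), while the single writer, holding reg_mutex, swaps the pointer and lets kfree_rcu() defer the free past a grace period. In isolation (new_request stands for the incoming hint):

	struct regulatory_request *old;

	/* writer side, reg_mutex held */
	old = get_last_request();
	rcu_assign_pointer(last_request, new_request);	/* publish */
	if (old && old != &core_request_world)
		kfree_rcu(old, rcu_head);	/* retire after a grace period */

(The code above retires before publishing; that ordering is also safe because kfree_rcu() only frees once all pre-existing readers are done.)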
- */ mutex_lock(&cfg80211_mutex); + mutex_lock(&reg_mutex); /* This goes through the _pending_ beacon list */ spin_lock_bh(&reg_pending_beacons_lock); - if (list_empty(&reg_pending_beacons)) { - spin_unlock_bh(&reg_pending_beacons_lock); - goto out; - } - list_for_each_entry_safe(pending_beacon, tmp, &reg_pending_beacons, list) { - list_del_init(&pending_beacon->list); /* Applies the beacon hint to current wiphys */ @@ -1694,7 +1626,7 @@ static void reg_process_pending_beacon_hints(void) } spin_unlock_bh(&reg_pending_beacons_lock); -out: + mutex_unlock(&reg_mutex); mutex_unlock(&cfg80211_mutex); } @@ -1706,10 +1638,8 @@ static void reg_todo(struct work_struct *work) static void queue_regulatory_request(struct regulatory_request *request) { - if (isalpha(request->alpha2[0])) - request->alpha2[0] = toupper(request->alpha2[0]); - if (isalpha(request->alpha2[1])) - request->alpha2[1] = toupper(request->alpha2[1]); + request->alpha2[0] = toupper(request->alpha2[0]); + request->alpha2[1] = toupper(request->alpha2[1]); spin_lock(&reg_requests_lock); list_add_tail(&request->list, &reg_requests_list); @@ -1726,8 +1656,7 @@ static int regulatory_hint_core(const char *alpha2) { struct regulatory_request *request; - request = kzalloc(sizeof(struct regulatory_request), - GFP_KERNEL); + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; @@ -1746,13 +1675,14 @@ int regulatory_hint_user(const char *alpha2, { struct regulatory_request *request; - BUG_ON(!alpha2); + if (WARN_ON(!alpha2)) + return -EINVAL; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; - request->wiphy_idx = WIPHY_IDX_STALE; + request->wiphy_idx = WIPHY_IDX_INVALID; request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_USER; @@ -1768,8 +1698,8 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) { struct regulatory_request *request; - BUG_ON(!alpha2); - BUG_ON(!wiphy); + if (WARN_ON(!alpha2 || !wiphy)) + return -EINVAL; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) @@ -1777,9 +1707,6 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) request->wiphy_idx = get_wiphy_idx(wiphy); - /* Must have registered wiphy first */ - BUG_ON(!wiphy_idx_valid(request->wiphy_idx)); - request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_DRIVER; @@ -1794,18 +1721,17 @@ EXPORT_SYMBOL(regulatory_hint); * We hold wdev_lock() here so we cannot hold cfg80211_mutex() and * therefore cannot iterate over the rdev list here. */ -void regulatory_hint_11d(struct wiphy *wiphy, - enum ieee80211_band band, - const u8 *country_ie, - u8 country_ie_len) +void regulatory_hint_11d(struct wiphy *wiphy, enum ieee80211_band band, + const u8 *country_ie, u8 country_ie_len) { char alpha2[2]; enum environment_cap env = ENVIRON_ANY; - struct regulatory_request *request; + struct regulatory_request *request, *lr; mutex_lock(&reg_mutex); + lr = get_last_request(); - if (unlikely(!last_request)) + if (unlikely(!lr)) goto out; /* IE len must be evenly divisible by 2 */ @@ -1828,9 +1754,8 @@ void regulatory_hint_11d(struct wiphy *wiphy, * We leave conflict resolution to the workqueue, where we can hold * cfg80211_mutex.
*/ - if (likely(last_request->initiator == - NL80211_REGDOM_SET_BY_COUNTRY_IE && - wiphy_idx_valid(last_request->wiphy_idx))) + if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && + lr->wiphy_idx != WIPHY_IDX_INVALID) goto out; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); @@ -1843,12 +1768,7 @@ void regulatory_hint_11d(struct wiphy *wiphy, request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; request->country_ie_env = env; - mutex_unlock(&reg_mutex); - queue_regulatory_request(request); - - return; - out: mutex_unlock(&reg_mutex); } @@ -1863,8 +1783,7 @@ static void restore_alpha2(char *alpha2, bool reset_user) if (is_user_regdom_saved()) { /* Unless we're asked to ignore it and reset it */ if (reset_user) { - REG_DBG_PRINT("Restoring regulatory settings " - "including user preference\n"); + REG_DBG_PRINT("Restoring regulatory settings including user preference\n"); user_alpha2[0] = '9'; user_alpha2[1] = '7'; @@ -1874,26 +1793,20 @@ static void restore_alpha2(char *alpha2, bool reset_user) * back as they were for a full restore. */ if (!is_world_regdom(ieee80211_regdom)) { - REG_DBG_PRINT("Keeping preference on " - "module parameter ieee80211_regdom: %c%c\n", - ieee80211_regdom[0], - ieee80211_regdom[1]); + REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } } else { - REG_DBG_PRINT("Restoring regulatory settings " - "while preserving user preference for: %c%c\n", - user_alpha2[0], - user_alpha2[1]); + REG_DBG_PRINT("Restoring regulatory settings while preserving user preference for: %c%c\n", + user_alpha2[0], user_alpha2[1]); alpha2[0] = user_alpha2[0]; alpha2[1] = user_alpha2[1]; } } else if (!is_world_regdom(ieee80211_regdom)) { - REG_DBG_PRINT("Keeping preference on " - "module parameter ieee80211_regdom: %c%c\n", - ieee80211_regdom[0], - ieee80211_regdom[1]); + REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } else @@ -1948,7 +1861,7 @@ static void restore_regulatory_settings(bool reset_user) mutex_lock(&cfg80211_mutex); mutex_lock(&reg_mutex); - reset_regdomains(true); + reset_regdomains(true, &world_regdom); restore_alpha2(alpha2, reset_user); /* @@ -1958,49 +1871,35 @@ static void restore_regulatory_settings(bool reset_user) * settings.
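regulatory_hint(), defined above, is the driver-facing entry point for country hints; typical use after reading a country code from firmware or EEPROM (the alpha2 value is illustrative):

	/* driver probe path, after wiphy_register() */
	char alpha2[2] = { 'U', 'S' };	/* e.g. read from EEPROM */
	int err = regulatory_hint(wiphy, alpha2);
	if (err)
		wiphy_dbg(wiphy, "regulatory hint failed: %d\n", err);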
*/ spin_lock(&reg_requests_lock); - if (!list_empty(&reg_requests_list)) { - list_for_each_entry_safe(reg_request, tmp, - &reg_requests_list, list) { - if (reg_request->initiator != - NL80211_REGDOM_SET_BY_USER) - continue; - list_move_tail(&reg_request->list, &tmp_reg_req_list); - } + list_for_each_entry_safe(reg_request, tmp, &reg_requests_list, list) { + if (reg_request->initiator != NL80211_REGDOM_SET_BY_USER) + continue; + list_move_tail(&reg_request->list, &tmp_reg_req_list); } spin_unlock(&reg_requests_lock); /* Clear beacon hints */ spin_lock_bh(&reg_pending_beacons_lock); - if (!list_empty(&reg_pending_beacons)) { - list_for_each_entry_safe(reg_beacon, btmp, - &reg_pending_beacons, list) { - list_del(&reg_beacon->list); - kfree(reg_beacon); - } + list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) { + list_del(&reg_beacon->list); + kfree(reg_beacon); } spin_unlock_bh(&reg_pending_beacons_lock); - if (!list_empty(&reg_beacon_list)) { - list_for_each_entry_safe(reg_beacon, btmp, - &reg_beacon_list, list) { - list_del(&reg_beacon->list); - kfree(reg_beacon); - } + list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) { + list_del(&reg_beacon->list); + kfree(reg_beacon); } /* First restore to the basic regulatory settings */ - cfg80211_regdomain = cfg80211_world_regdom; - world_alpha2[0] = cfg80211_regdomain->alpha2[0]; - world_alpha2[1] = cfg80211_regdomain->alpha2[1]; + world_alpha2[0] = cfg80211_world_regdom->alpha2[0]; + world_alpha2[1] = cfg80211_world_regdom->alpha2[1]; list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (rdev->wiphy.flags & WIPHY_FLAG_CUSTOM_REGULATORY) restore_custom_reg_settings(&rdev->wiphy); } - mutex_unlock(&reg_mutex); - mutex_unlock(&cfg80211_mutex); - regulatory_hint_core(world_alpha2); /* @@ -2011,20 +1910,8 @@ static void restore_regulatory_settings(bool reset_user) if (is_an_alpha2(alpha2)) regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER); - if (list_empty(&tmp_reg_req_list)) - return; - - mutex_lock(&cfg80211_mutex); - mutex_lock(&reg_mutex); - spin_lock(&reg_requests_lock); - list_for_each_entry_safe(reg_request, tmp, &tmp_reg_req_list, list) { - REG_DBG_PRINT("Adding request for country %c%c back " - "into the queue\n", - reg_request->alpha2[0], - reg_request->alpha2[1]); - list_move_tail(&reg_request->list, &reg_requests_list); - } + list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list); spin_unlock(&reg_requests_lock); mutex_unlock(&reg_mutex); @@ -2037,8 +1924,7 @@ static void restore_regulatory_settings(bool reset_user) void regulatory_hint_disconnect(void) { - REG_DBG_PRINT("All devices are disconnected, going to " - "restore regulatory settings\n"); + REG_DBG_PRINT("All devices are disconnected, going to restore regulatory settings\n"); restore_regulatory_settings(false); } @@ -2051,31 +1937,48 @@ static bool freq_is_chan_12_13_14(u16 freq) return false; } +static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan) +{ + struct reg_beacon *pending_beacon; + + list_for_each_entry(pending_beacon, &reg_pending_beacons, list) + if (beacon_chan->center_freq == + pending_beacon->chan.center_freq) + return true; + return false; +} + int regulatory_hint_found_beacon(struct wiphy *wiphy, struct ieee80211_channel *beacon_chan, gfp_t gfp) { struct reg_beacon *reg_beacon; + bool processing; - if (likely((beacon_chan->beacon_found || - (beacon_chan->flags & IEEE80211_CHAN_RADAR) || + if (beacon_chan->beacon_found || + beacon_chan->flags & IEEE80211_CHAN_RADAR || (beacon_chan->band == IEEE80211_BAND_2GHZ && -
!freq_is_chan_12_13_14(beacon_chan->center_freq))) + return 0; + + spin_lock_bh(&reg_pending_beacons_lock); + processing = pending_reg_beacon(beacon_chan); + spin_unlock_bh(&reg_pending_beacons_lock); + + if (processing) return 0; reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); if (!reg_beacon) return -ENOMEM; - REG_DBG_PRINT("Found new beacon on " - "frequency: %d MHz (Ch %d) on %s\n", + REG_DBG_PRINT("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", beacon_chan->center_freq, ieee80211_frequency_to_channel(beacon_chan->center_freq), wiphy_name(wiphy)); memcpy(&reg_beacon->chan, beacon_chan, - sizeof(struct ieee80211_channel)); - + sizeof(struct ieee80211_channel)); /* * Since we can be called from BH or non-BH context @@ -2155,21 +2058,19 @@ static void print_dfs_region(u8 dfs_region) pr_info(" DFS Master region JP"); break; default: - pr_info(" DFS Master region Uknown"); + pr_info(" DFS Master region Unknown"); break; } } static void print_regdomain(const struct ieee80211_regdomain *rd) { + struct regulatory_request *lr = get_last_request(); if (is_intersected_alpha2(rd->alpha2)) { - - if (last_request->initiator == - NL80211_REGDOM_SET_BY_COUNTRY_IE) { + if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) { struct cfg80211_registered_device *rdev; - rdev = cfg80211_rdev_by_wiphy_idx( - last_request->wiphy_idx); + rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx); if (rdev) { pr_info("Current regulatory domain updated by AP to: %c%c\n", rdev->country_ie_alpha2[0], @@ -2178,22 +2079,21 @@ static void print_regdomain(const struct ieee80211_regdomain *rd) pr_info("Current regulatory domain intersected:\n"); } else pr_info("Current regulatory domain intersected:\n"); - } else if (is_world_regdom(rd->alpha2)) + } else if (is_world_regdom(rd->alpha2)) { pr_info("World regulatory domain updated:\n"); - else { + } else { if (is_unknown_alpha2(rd->alpha2)) pr_info("Regulatory domain changed to driver built-in settings (unknown country)\n"); else { - if (reg_request_cell_base(last_request)) - pr_info("Regulatory domain changed " - "to country: %c%c by Cell Station\n", + if (reg_request_cell_base(lr)) + pr_info("Regulatory domain changed to country: %c%c by Cell Station\n", rd->alpha2[0], rd->alpha2[1]); else - pr_info("Regulatory domain changed " - "to country: %c%c\n", + pr_info("Regulatory domain changed to country: %c%c\n", rd->alpha2[0], rd->alpha2[1]); } } + print_dfs_region(rd->dfs_region); print_rd_rules(rd); } @@ -2207,22 +2107,23 @@ static void print_regdomain_info(const struct ieee80211_regdomain *rd) /* Takes ownership of rd only if it doesn't fail */ static int __set_regdom(const struct ieee80211_regdomain *rd) { + const struct ieee80211_regdomain *regd; const struct ieee80211_regdomain *intersected_rd = NULL; struct wiphy *request_wiphy; + struct regulatory_request *lr = get_last_request(); + /* Some basic sanity checks first */ + if (!reg_is_valid_request(rd->alpha2)) + return -EINVAL; + if (is_world_regdom(rd->alpha2)) { - if (WARN_ON(!reg_is_valid_request(rd->alpha2))) - return -EINVAL; update_world_regdomain(rd); return 0; } if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) && - !is_unknown_alpha2(rd->alpha2)) - return -EINVAL; - - if (!last_request) + !is_unknown_alpha2(rd->alpha2)) return -EINVAL; /* @@ -2230,7 +2131,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) * rd is non static (it means CRDA was present and was used last) * and the pending request came in from a country IE */ - if (last_request->initiator !=
NL80211_REGDOM_SET_BY_COUNTRY_IE) { + if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { /* * If someone else asked us to change the rd let's only bother * checking if the alpha2 changes if CRDA was already called @@ -2246,29 +2147,23 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) * internal EEPROM data */ - if (WARN_ON(!reg_is_valid_request(rd->alpha2))) - return -EINVAL; - if (!is_valid_rd(rd)) { pr_err("Invalid regulatory domain detected:\n"); print_regdomain_info(rd); return -EINVAL; } - request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); if (!request_wiphy && - (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER || - last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)) { + (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER || + lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)) { schedule_delayed_work(&reg_timeout, 0); return -ENODEV; } - if (!last_request->intersect) { - int r; - - if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) { - reset_regdomains(false); - cfg80211_regdomain = rd; + if (!lr->intersect) { + if (lr->initiator != NL80211_REGDOM_SET_BY_DRIVER) { + reset_regdomains(false, rd); return 0; } @@ -2284,20 +2179,19 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) if (request_wiphy->regd) return -EALREADY; - r = reg_copy_regd(&request_wiphy->regd, rd); - if (r) - return r; + regd = reg_copy_regd(rd); + if (IS_ERR(regd)) + return PTR_ERR(regd); - reset_regdomains(false); - cfg80211_regdomain = rd; + rcu_assign_pointer(request_wiphy->regd, regd); + reset_regdomains(false, rd); return 0; } /* Intersection requires a bit more work */ - if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { - - intersected_rd = regdom_intersect(rd, cfg80211_regdomain); + if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { + intersected_rd = regdom_intersect(rd, get_cfg80211_regdom()); if (!intersected_rd) return -EINVAL; @@ -2306,15 +2200,19 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) * However if a driver requested this specific regulatory * domain we keep it for its private use */ - if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER) - request_wiphy->regd = rd; - else + if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER) { + const struct ieee80211_regdomain *tmp; + + tmp = get_wiphy_regdom(request_wiphy); + rcu_assign_pointer(request_wiphy->regd, rd); + rcu_free_regdom(tmp); + } else { kfree(rd); + } rd = NULL; - reset_regdomains(false); - cfg80211_regdomain = intersected_rd; + reset_regdomains(false, intersected_rd); return 0; } @@ -2326,15 +2224,15 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) /* * Use this call to set the current regulatory domain. Conflicts with * multiple drivers can be ironed out later. Caller must've already - * kmalloc'd the rd structure. Caller must hold cfg80211_mutex
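The reg_copy_regd() change in the hunk above is a common kernel API cleanup: instead of returning an int and filling an out-parameter, the function now returns the new allocation directly and encodes failure in the pointer via ERR_PTR(), which callers unpack with IS_ERR()/PTR_ERR(). A minimal sketch of the convention (hypothetical regdom_example type, not the cfg80211 struct):

    #include <linux/err.h>
    #include <linux/slab.h>

    struct regdom_example { char alpha2[2]; };

    static struct regdom_example *
    regdom_example_copy(const struct regdom_example *src)
    {
        struct regdom_example *d = kmemdup(src, sizeof(*src), GFP_KERNEL);

        /* encode -ENOMEM in the returned pointer, no out-param needed */
        return d ? d : ERR_PTR(-ENOMEM);
    }

A caller then tests the result exactly once: d = regdom_example_copy(src); if (IS_ERR(d)) return PTR_ERR(d);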
*/ int set_regdom(const struct ieee80211_regdomain *rd) { + struct regulatory_request *lr; int r; - assert_cfg80211_lock(); - mutex_lock(&reg_mutex); + lr = get_last_request(); /* Note that this doesn't update the wiphys, this is done below */ r = __set_regdom(rd); @@ -2343,23 +2241,25 @@ int set_regdom(const struct ieee80211_regdomain *rd) reg_set_request_processed(); kfree(rd); - mutex_unlock(&reg_mutex); - return r; + goto out; } /* This would make this whole thing pointless */ - if (!last_request->intersect) - BUG_ON(rd != cfg80211_regdomain); + if (WARN_ON(!lr->intersect && rd != get_cfg80211_regdom())) { + r = -EINVAL; + goto out; + } /* update all wiphys now with the new established regulatory domain */ - update_all_wiphy_regulatory(last_request->initiator); + update_all_wiphy_regulatory(lr->initiator); - print_regdomain(cfg80211_regdomain); + print_regdomain(get_cfg80211_regdom()); - nl80211_send_reg_change_event(last_request); + nl80211_send_reg_change_event(lr); reg_set_request_processed(); + out: mutex_unlock(&reg_mutex); return r; @@ -2367,20 +2267,26 @@ int set_regdom(const struct ieee80211_regdomain *rd) int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env) { - if (last_request && !last_request->processed) { - if (add_uevent_var(env, "COUNTRY=%c%c", - last_request->alpha2[0], - last_request->alpha2[1])) - return -ENOMEM; + struct regulatory_request *lr; + u8 alpha2[2]; + bool add = false; + + rcu_read_lock(); + lr = get_last_request(); + if (lr && !lr->processed) { + memcpy(alpha2, lr->alpha2, 2); + add = true; } + rcu_read_unlock(); + if (add) + return add_uevent_var(env, "COUNTRY=%c%c", + alpha2[0], alpha2[1]); return 0; } void wiphy_regulatory_register(struct wiphy *wiphy) { - assert_cfg80211_lock(); - mutex_lock(&reg_mutex); if (!reg_dev_ignore_cell_hint(wiphy)) @@ -2395,32 +2301,32 @@ void wiphy_regulatory_register(struct wiphy *wiphy) void wiphy_regulatory_deregister(struct wiphy *wiphy) { struct wiphy *request_wiphy = NULL; - - assert_cfg80211_lock(); + struct regulatory_request *lr; mutex_lock(&reg_mutex); + lr = get_last_request(); if (!reg_dev_ignore_cell_hint(wiphy)) reg_num_devs_support_basehint--; - kfree(wiphy->regd); + rcu_free_regdom(get_wiphy_regdom(wiphy)); + rcu_assign_pointer(wiphy->regd, NULL); - if (last_request) - request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + if (lr) + request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); if (!request_wiphy || request_wiphy != wiphy) goto out; - last_request->wiphy_idx = WIPHY_IDX_STALE; - last_request->country_ie_env = ENVIRON_ANY; + lr->wiphy_idx = WIPHY_IDX_INVALID; + lr->country_ie_env = ENVIRON_ANY; out: mutex_unlock(&reg_mutex); } static void reg_timeout_work(struct work_struct *work) { - REG_DBG_PRINT("Timeout while waiting for CRDA to reply, " - "restoring regulatory settings\n"); + REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); restore_regulatory_settings(true); } @@ -2439,13 +2345,13 @@ int __init regulatory_init(void) reg_regdb_size_check(); - cfg80211_regdomain = cfg80211_world_regdom; + rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); user_alpha2[0] = '9'; user_alpha2[1] = '7'; /* We always try to get an update for the static regdomain */ - err = regulatory_hint_core(cfg80211_regdomain->alpha2); + err = regulatory_hint_core(cfg80211_world_regdom->alpha2); if (err) { if (err == -ENOMEM) return err; @@ -2457,10 +2363,6 @@ int __init regulatory_init(void) * errors as non-fatal.
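The reworked reg_device_uevent() above shows the read side of the new RCU scheme: enter rcu_read_lock(), copy out the two bytes actually needed, leave the section, and only then call add_uevent_var(), which may sleep. A standalone sketch of that snapshot pattern (illustrative names, not reg.c's symbols):

    #include <linux/rcupdate.h>
    #include <linux/string.h>
    #include <linux/types.h>

    struct country_hint {
        char alpha2[2];
        bool processed;
    };

    static struct country_hint __rcu *last_hint;

    static bool snapshot_alpha2(char out[2])
    {
        struct country_hint *h;
        bool valid = false;

        rcu_read_lock();
        h = rcu_dereference(last_hint);
        if (h && !h->processed) {
            memcpy(out, h->alpha2, 2);  /* copy the data, don't keep the pointer */
            valid = true;
        }
        rcu_read_unlock();
        return valid;  /* safe to sleep once the read-side section is over */
    }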
*/ pr_err("kobject_uevent_env() was unable to call CRDA during init\n"); -#ifdef CONFIG_CFG80211_REG_DEBUG - /* We want to find out exactly why when debugging */ - WARN_ON(err); -#endif } /* @@ -2474,7 +2376,7 @@ int __init regulatory_init(void) return 0; } -void /* __init_or_exit */ regulatory_exit(void) +void regulatory_exit(void) { struct regulatory_request *reg_request, *tmp; struct reg_beacon *reg_beacon, *btmp; @@ -2482,43 +2384,27 @@ void /* __init_or_exit */ regulatory_exit(void) cancel_work_sync(®_work); cancel_delayed_work_sync(®_timeout); - mutex_lock(&cfg80211_mutex); + /* Lock to suppress warnings */ mutex_lock(®_mutex); - - reset_regdomains(true); + reset_regdomains(true, NULL); + mutex_unlock(®_mutex); dev_set_uevent_suppress(®_pdev->dev, true); platform_device_unregister(reg_pdev); - spin_lock_bh(®_pending_beacons_lock); - if (!list_empty(®_pending_beacons)) { - list_for_each_entry_safe(reg_beacon, btmp, - ®_pending_beacons, list) { - list_del(®_beacon->list); - kfree(reg_beacon); - } + list_for_each_entry_safe(reg_beacon, btmp, ®_pending_beacons, list) { + list_del(®_beacon->list); + kfree(reg_beacon); } - spin_unlock_bh(®_pending_beacons_lock); - if (!list_empty(®_beacon_list)) { - list_for_each_entry_safe(reg_beacon, btmp, - ®_beacon_list, list) { - list_del(®_beacon->list); - kfree(reg_beacon); - } + list_for_each_entry_safe(reg_beacon, btmp, ®_beacon_list, list) { + list_del(®_beacon->list); + kfree(reg_beacon); } - spin_lock(®_requests_lock); - if (!list_empty(®_requests_list)) { - list_for_each_entry_safe(reg_request, tmp, - ®_requests_list, list) { - list_del(®_request->list); - kfree(reg_request); - } + list_for_each_entry_safe(reg_request, tmp, ®_requests_list, list) { + list_del(®_request->list); + kfree(reg_request); } - spin_unlock(®_requests_lock); - - mutex_unlock(®_mutex); - mutex_unlock(&cfg80211_mutex); } diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 4c0a32ffd530..af2d5f8a5d82 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -16,10 +16,9 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -extern const struct ieee80211_regdomain *cfg80211_regdomain; +extern const struct ieee80211_regdomain __rcu *cfg80211_regdomain; bool is_world_regdom(const char *alpha2); -bool reg_is_valid_request(const char *alpha2); bool reg_supported_dfs_region(u8 dfs_region); int regulatory_hint_user(const char *alpha2, @@ -55,8 +54,8 @@ bool reg_last_request_cell_base(void); * set the wiphy->disable_beacon_hints to true. */ int regulatory_hint_found_beacon(struct wiphy *wiphy, - struct ieee80211_channel *beacon_chan, - gfp_t gfp); + struct ieee80211_channel *beacon_chan, + gfp_t gfp); /** * regulatory_hint_11d - hints a country IE as a regulatory domain diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 45f1618c8e23..fd99ea495b7e 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -19,55 +19,142 @@ #include "wext-compat.h" #include "rdev-ops.h" +/** + * DOC: BSS tree/list structure + * + * At the top level, the BSS list is kept in both a list in each + * registered device (@bss_list) as well as an RB-tree for faster + * lookup. In the RB-tree, entries can be looked up using their + * channel, MESHID, MESHCONF (for MBSSes) or channel, BSSID, SSID + * for other BSSes. + * + * Due to the possibility of hidden SSIDs, there's a second level + * structure, the "hidden_list" and "hidden_beacon_bss" pointer. 
+ * The hidden_list connects all BSSes belonging to a single AP + * that has a hidden SSID, and connects beacon and probe response + * entries. For a probe response entry for a hidden SSID, the + * hidden_beacon_bss pointer points to the BSS struct holding the + * beacon's information. + * + * Reference counting is done for all these references except for + * the hidden_list, so that a beacon BSS struct that is otherwise + * not referenced has one reference for being on the bss_list and + * one for each probe response entry that points to it using the + * hidden_beacon_bss pointer. When a BSS struct that has such a + * pointer is get/put, the refcount update is also propagated to + * the referenced struct; this ensures that it cannot get removed + * while somebody is using the probe response version. + * + * Note that the hidden_beacon_bss pointer never changes, due to + * the reference counting. Therefore, no locking is needed for + * it. + * + * Also note that the hidden_beacon_bss pointer is only relevant + * if the driver uses something other than the IEs, e.g. private + * data stored in the BSS struct, since the beacon IEs are + * also linked into the probe response struct. + */ + #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) -static void bss_release(struct kref *ref) +static void bss_free(struct cfg80211_internal_bss *bss) { struct cfg80211_bss_ies *ies; - struct cfg80211_internal_bss *bss; - - bss = container_of(ref, struct cfg80211_internal_bss, ref); if (WARN_ON(atomic_read(&bss->hold))) return; - if (bss->pub.free_priv) - bss->pub.free_priv(&bss->pub); - ies = (void *)rcu_access_pointer(bss->pub.beacon_ies); - if (ies) + if (ies && !bss->pub.hidden_beacon_bss) kfree_rcu(ies, rcu_head); ies = (void *)rcu_access_pointer(bss->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); + /* + * This happens when the module is removed; it doesn't + * really matter any more save for completeness + */ + if (!list_empty(&bss->hidden_list)) + list_del(&bss->hidden_list); + kfree(bss); } -/* must hold dev->bss_lock!
*/ -static void __cfg80211_unlink_bss(struct cfg80211_registered_device *dev, +static inline void bss_ref_get(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *bss) +{ + lockdep_assert_held(&dev->bss_lock); + + bss->refcount++; + if (bss->pub.hidden_beacon_bss) { + bss = container_of(bss->pub.hidden_beacon_bss, + struct cfg80211_internal_bss, + pub); + bss->refcount++; + } +} + +static inline void bss_ref_put(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *bss) +{ + lockdep_assert_held(&dev->bss_lock); + + if (bss->pub.hidden_beacon_bss) { + struct cfg80211_internal_bss *hbss; + hbss = container_of(bss->pub.hidden_beacon_bss, + struct cfg80211_internal_bss, + pub); + hbss->refcount--; + if (hbss->refcount == 0) + bss_free(hbss); + } + bss->refcount--; + if (bss->refcount == 0) + bss_free(bss); +} + +static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *dev, struct cfg80211_internal_bss *bss) { + lockdep_assert_held(&dev->bss_lock); + + if (!list_empty(&bss->hidden_list)) { + /* + * don't remove the beacon entry if it has + * probe responses associated with it + */ + if (!bss->pub.hidden_beacon_bss) + return false; + /* + * if it's a probe response entry break its + * link to the other entries in the group + */ + list_del_init(&bss->hidden_list); + } + list_del_init(&bss->list); rb_erase(&bss->rbn, &dev->bss_tree); - kref_put(&bss->ref, bss_release); + bss_ref_put(dev, bss); + return true; } -/* must hold dev->bss_lock! */ static void __cfg80211_bss_expire(struct cfg80211_registered_device *dev, unsigned long expire_time) { struct cfg80211_internal_bss *bss, *tmp; bool expired = false; + lockdep_assert_held(&dev->bss_lock); + list_for_each_entry_safe(bss, tmp, &dev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!time_after(expire_time, bss->ts)) continue; - __cfg80211_unlink_bss(dev, bss); - expired = true; + if (__cfg80211_unlink_bss(dev, bss)) + expired = true; } if (expired) @@ -82,7 +169,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak) union iwreq_data wrqu; #endif - ASSERT_RDEV_LOCK(rdev); + lockdep_assert_held(&rdev->sched_scan_mtx); request = rdev->scan_req; @@ -143,9 +230,9 @@ void __cfg80211_scan_done(struct work_struct *wk) rdev = container_of(wk, struct cfg80211_registered_device, scan_done_wk); - cfg80211_lock_rdev(rdev); + mutex_lock(&rdev->sched_scan_mtx); ___cfg80211_scan_done(rdev, false); - cfg80211_unlock_rdev(rdev); + mutex_unlock(&rdev->sched_scan_mtx); } void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) @@ -234,15 +321,16 @@ int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, return 0; } -/* must hold dev->bss_lock! 
*/ void cfg80211_bss_age(struct cfg80211_registered_device *dev, unsigned long age_secs) { struct cfg80211_internal_bss *bss; unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); + spin_lock_bh(&dev->bss_lock); list_for_each_entry(bss, &dev->bss_list, list) bss->ts -= age_jiffies; + spin_unlock_bh(&dev->bss_lock); } void cfg80211_bss_expire(struct cfg80211_registered_device *dev) @@ -277,40 +365,24 @@ const u8 *cfg80211_find_vendor_ie(unsigned int oui, u8 oui_type, if (!pos) return NULL; - if (end - pos < sizeof(*ie)) - return NULL; - ie = (struct ieee80211_vendor_ie *)pos; + + /* make sure we can access ie->len */ + BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1); + + if (ie->len < sizeof(*ie)) + goto cont; + ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2]; if (ie_oui == oui && ie->oui_type == oui_type) return pos; - +cont: pos += 2 + ie->len; } return NULL; } EXPORT_SYMBOL(cfg80211_find_vendor_ie); -static int cmp_ies(u8 num, const u8 *ies1, int len1, const u8 *ies2, int len2) -{ - const u8 *ie1 = cfg80211_find_ie(num, ies1, len1); - const u8 *ie2 = cfg80211_find_ie(num, ies2, len2); - - /* equal if both missing */ - if (!ie1 && !ie2) - return 0; - /* sort missing IE before (left of) present IE */ - if (!ie1) - return -1; - if (!ie2) - return 1; - - /* sort by length first, then by contents */ - if (ie1[1] != ie2[1]) - return ie2[1] - ie1[1]; - return memcmp(ie1 + 2, ie2 + 2, ie1[1]); -} - static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, const u8 *ssid, size_t ssid_len) { @@ -334,109 +406,30 @@ static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, return memcmp(ssidie + 2, ssid, ssid_len) == 0; } -static bool is_mesh_bss(struct cfg80211_bss *a) -{ - const struct cfg80211_bss_ies *ies; - const u8 *ie; - - if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability)) - return false; - - ies = rcu_access_pointer(a->ies); - if (!ies) - return false; - - ie = cfg80211_find_ie(WLAN_EID_MESH_ID, ies->data, ies->len); - if (!ie) - return false; - - ie = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, ies->data, ies->len); - if (!ie) - return false; - - return true; -} - -static bool is_mesh(struct cfg80211_bss *a, - const u8 *meshid, size_t meshidlen, - const u8 *meshcfg) -{ - const struct cfg80211_bss_ies *ies; - const u8 *ie; - - if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability)) - return false; - - ies = rcu_access_pointer(a->ies); - if (!ies) - return false; - - ie = cfg80211_find_ie(WLAN_EID_MESH_ID, ies->data, ies->len); - if (!ie) - return false; - if (ie[1] != meshidlen) - return false; - if (memcmp(ie + 2, meshid, meshidlen)) - return false; - - ie = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, ies->data, ies->len); - if (!ie) - return false; - if (ie[1] != sizeof(struct ieee80211_meshconf_ie)) - return false; - - /* - * Ignore mesh capability (last two bytes of the IE) when - * comparing since that may differ between stations taking - * part in the same mesh. 
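Stepping back to the cfg80211_find_vendor_ie() change above: the rewritten loop checks each element's own length field (so a too-short vendor element is skipped with the cont: label rather than aborting the scan). The generic walk such helpers are built on is the usual { id, len, data[len] } TLV scan; a self-contained sketch of that walk (not cfg80211_find_ie() itself):

    #include <linux/types.h>

    /* Find element @id in an IEEE 802.11 IE buffer; NULL if absent or truncated. */
    static const u8 *find_element(u8 id, const u8 *buf, size_t len)
    {
        const u8 *pos = buf, *end = buf + len;

        while (end - pos >= 2) {
            u8 elen = pos[1];

            if (elen > end - pos - 2)
                return NULL;        /* claimed length overruns the buffer */
            if (pos[0] == id)
                return pos;
            pos += 2 + elen;        /* header (2 bytes) + payload */
        }
        return NULL;
    }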
- */ - return memcmp(ie + 2, meshcfg, - sizeof(struct ieee80211_meshconf_ie) - 2) == 0; -} +/** + * enum bss_compare_mode - BSS compare mode + * @BSS_CMP_REGULAR: regular compare mode (for insertion and normal find) + * @BSS_CMP_HIDE_ZLEN: find hidden SSID with zero-length mode + * @BSS_CMP_HIDE_NUL: find hidden SSID with NUL-ed out mode + */ +enum bss_compare_mode { + BSS_CMP_REGULAR, + BSS_CMP_HIDE_ZLEN, + BSS_CMP_HIDE_NUL, +}; -static int cmp_bss_core(struct cfg80211_bss *a, struct cfg80211_bss *b) +static int cmp_bss(struct cfg80211_bss *a, + struct cfg80211_bss *b, + enum bss_compare_mode mode) { const struct cfg80211_bss_ies *a_ies, *b_ies; - int r; + const u8 *ie1 = NULL; + const u8 *ie2 = NULL; + int i, r; if (a->channel != b->channel) return b->channel->center_freq - a->channel->center_freq; - if (is_mesh_bss(a) && is_mesh_bss(b)) { - a_ies = rcu_access_pointer(a->ies); - if (!a_ies) - return -1; - b_ies = rcu_access_pointer(b->ies); - if (!b_ies) - return 1; - - r = cmp_ies(WLAN_EID_MESH_ID, - a_ies->data, a_ies->len, - b_ies->data, b_ies->len); - if (r) - return r; - return cmp_ies(WLAN_EID_MESH_CONFIG, - a_ies->data, a_ies->len, - b_ies->data, b_ies->len); - } - - /* - * we can't use compare_ether_addr here since we need a < > operator. - * The binary return value of compare_ether_addr isn't enough - */ - return memcmp(a->bssid, b->bssid, sizeof(a->bssid)); -} - -static int cmp_bss(struct cfg80211_bss *a, - struct cfg80211_bss *b) -{ - const struct cfg80211_bss_ies *a_ies, *b_ies; - int r; - - r = cmp_bss_core(a, b); - if (r) - return r; - a_ies = rcu_access_pointer(a->ies); if (!a_ies) return -1; @@ -444,42 +437,51 @@ static int cmp_bss(struct cfg80211_bss *a, if (!b_ies) return 1; - return cmp_ies(WLAN_EID_SSID, - a_ies->data, a_ies->len, - b_ies->data, b_ies->len); -} - -static int cmp_hidden_bss(struct cfg80211_bss *a, struct cfg80211_bss *b) -{ - const struct cfg80211_bss_ies *a_ies, *b_ies; - const u8 *ie1; - const u8 *ie2; - int i; - int r; + if (WLAN_CAPABILITY_IS_STA_BSS(a->capability)) + ie1 = cfg80211_find_ie(WLAN_EID_MESH_ID, + a_ies->data, a_ies->len); + if (WLAN_CAPABILITY_IS_STA_BSS(b->capability)) + ie2 = cfg80211_find_ie(WLAN_EID_MESH_ID, + b_ies->data, b_ies->len); + if (ie1 && ie2) { + int mesh_id_cmp; + + if (ie1[1] == ie2[1]) + mesh_id_cmp = memcmp(ie1 + 2, ie2 + 2, ie1[1]); + else + mesh_id_cmp = ie2[1] - ie1[1]; + + ie1 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, + a_ies->data, a_ies->len); + ie2 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, + b_ies->data, b_ies->len); + if (ie1 && ie2) { + if (mesh_id_cmp) + return mesh_id_cmp; + if (ie1[1] != ie2[1]) + return ie2[1] - ie1[1]; + return memcmp(ie1 + 2, ie2 + 2, ie1[1]); + } + } - r = cmp_bss_core(a, b); + /* + * we can't use compare_ether_addr here since we need a < > operator. + * The binary return value of compare_ether_addr isn't enough + */ + r = memcmp(a->bssid, b->bssid, sizeof(a->bssid)); if (r) return r; - a_ies = rcu_access_pointer(a->ies); - if (!a_ies) - return -1; - b_ies = rcu_access_pointer(b->ies); - if (!b_ies) - return 1; - ie1 = cfg80211_find_ie(WLAN_EID_SSID, a_ies->data, a_ies->len); ie2 = cfg80211_find_ie(WLAN_EID_SSID, b_ies->data, b_ies->len); + if (!ie1 && !ie2) + return 0; + /* - * Key comparator must use same algorithm in any rb-tree - * search function (order is important), otherwise ordering - * of items in the tree is broken and search gives incorrect - * results. This code uses same order as cmp_ies() does. 
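The removed comment above states a general rbtree rule worth keeping: insertion and lookup must impose the same total order, or the tree silently degrades into one that can no longer find its own entries. That is why this patch collapses cmp_bss() and cmp_hidden_bss() into a single comparator parameterized by enum bss_compare_mode. In isolation, the shared-comparator pattern looks roughly like this (generic example, not scan.c's code):

    #include <linux/rbtree.h>

    struct example_node {
        struct rb_node rbn;
        int key;
    };

    /* The one ordering function, used by both insert and lookup. */
    static int example_cmp(int key, const struct example_node *n)
    {
        return (key > n->key) - (key < n->key);  /* -1, 0 or 1, no overflow */
    }

    static struct example_node *example_find(struct rb_root *root, int key)
    {
        struct rb_node *n = root->rb_node;

        while (n) {
            struct example_node *e =
                rb_entry(n, struct example_node, rbn);
            int r = example_cmp(key, e);

            if (r == 0)
                return e;
            n = r < 0 ? n->rb_left : n->rb_right;
        }
        return NULL;
    }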
- * - * Note that due to the differring behaviour with hidden SSIDs - * this function only works when "b" is the tree element and - * "a" is the key we're looking for. + * Note that with "hide_ssid", the function returns a match if + * the already-present BSS ("b") is a hidden SSID beacon for + * the new BSS ("a"). */ /* sort missing IE before (left of) present IE */ @@ -488,24 +490,36 @@ static int cmp_hidden_bss(struct cfg80211_bss *a, struct cfg80211_bss *b) if (!ie2) return 1; - /* zero-size SSID is used as an indication of the hidden bss */ - if (!ie2[1]) + switch (mode) { + case BSS_CMP_HIDE_ZLEN: + /* + * In ZLEN mode we assume the BSS entry we're + * looking for has a zero-length SSID. So if + * the one we're looking at right now has that, + * return 0. Otherwise, return the difference + * in length, but since we're looking for the + * 0-length it's really equivalent to returning + * the length of the one we're looking at. + * + * No content comparison is needed as we assume + * the content length is zero. + */ + return ie2[1]; + case BSS_CMP_REGULAR: + default: + /* sort by length first, then by contents */ + if (ie1[1] != ie2[1]) + return ie2[1] - ie1[1]; + return memcmp(ie1 + 2, ie2 + 2, ie1[1]); + case BSS_CMP_HIDE_NUL: + if (ie1[1] != ie2[1]) + return ie2[1] - ie1[1]; + /* this is equivalent to memcmp(zeroes, ie2 + 2, len) */ + for (i = 0; i < ie2[1]; i++) + if (ie2[i + 2]) + return -1; return 0; - - /* sort by length first, then by contents */ - if (ie1[1] != ie2[1]) - return ie2[1] - ie1[1]; - - /* - * zeroed SSID ie is another indication of a hidden bss; - * if it isn't zeroed just return the regular sort value - * to find the next candidate - */ - for (i = 0; i < ie2[1]; i++) - if (ie2[i + 2]) - return memcmp(ie1 + 2, ie2 + 2, ie1[1]); - - return 0; + } } struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, @@ -534,7 +548,7 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, continue; if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { res = bss; - kref_get(&res->ref); + bss_ref_get(dev, res); break; } } @@ -547,34 +561,6 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_get_bss); -struct cfg80211_bss *cfg80211_get_mesh(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *meshid, size_t meshidlen, - const u8 *meshcfg) -{ - struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); - struct cfg80211_internal_bss *bss, *res = NULL; - - spin_lock_bh(&dev->bss_lock); - - list_for_each_entry(bss, &dev->bss_list, list) { - if (channel && bss->pub.channel != channel) - continue; - if (is_mesh(&bss->pub, meshid, meshidlen, meshcfg)) { - res = bss; - kref_get(&res->ref); - break; - } - } - - spin_unlock_bh(&dev->bss_lock); - if (!res) - return NULL; - return &res->pub; -} -EXPORT_SYMBOL(cfg80211_get_mesh); - - static void rb_insert_bss(struct cfg80211_registered_device *dev, struct cfg80211_internal_bss *bss) { @@ -587,7 +573,7 @@ static void rb_insert_bss(struct cfg80211_registered_device *dev, parent = *p; tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn); - cmp = cmp_bss(&bss->pub, &tbss->pub); + cmp = cmp_bss(&bss->pub, &tbss->pub, BSS_CMP_REGULAR); if (WARN_ON(!cmp)) { /* will sort of leak this BSS */ @@ -606,7 +592,8 @@ static void rb_insert_bss(struct cfg80211_registered_device *dev, static struct cfg80211_internal_bss * rb_find_bss(struct cfg80211_registered_device *dev, - struct cfg80211_internal_bss *res) + struct cfg80211_internal_bss *res, + enum bss_compare_mode mode) { struct rb_node *n = 
dev->bss_tree.rb_node; struct cfg80211_internal_bss *bss; @@ -614,7 +601,7 @@ rb_find_bss(struct cfg80211_registered_device *dev, while (n) { bss = rb_entry(n, struct cfg80211_internal_bss, rbn); - r = cmp_bss(&res->pub, &bss->pub); + r = cmp_bss(&res->pub, &bss->pub, mode); if (r == 0) return bss; @@ -627,46 +614,67 @@ rb_find_bss(struct cfg80211_registered_device *dev, return NULL; } -static struct cfg80211_internal_bss * -rb_find_hidden_bss(struct cfg80211_registered_device *dev, - struct cfg80211_internal_bss *res) +static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *new) { - struct rb_node *n = dev->bss_tree.rb_node; + const struct cfg80211_bss_ies *ies; struct cfg80211_internal_bss *bss; - int r; + const u8 *ie; + int i, ssidlen; + u8 fold = 0; - while (n) { - bss = rb_entry(n, struct cfg80211_internal_bss, rbn); - r = cmp_hidden_bss(&res->pub, &bss->pub); + ies = rcu_access_pointer(new->pub.beacon_ies); + if (WARN_ON(!ies)) + return false; - if (r == 0) - return bss; - else if (r < 0) - n = n->rb_left; - else - n = n->rb_right; + ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); + if (!ie) { + /* nothing to do */ + return true; } - return NULL; -} + ssidlen = ie[1]; + for (i = 0; i < ssidlen; i++) + fold |= ie[2 + i]; -static void -copy_hidden_ies(struct cfg80211_internal_bss *res, - struct cfg80211_internal_bss *hidden) -{ - const struct cfg80211_bss_ies *ies; + if (fold) { + /* not a hidden SSID */ + return true; + } - if (rcu_access_pointer(res->pub.beacon_ies)) - return; + /* This is the bad part ... */ - ies = rcu_access_pointer(hidden->pub.beacon_ies); - if (WARN_ON(!ies)) - return; + list_for_each_entry(bss, &dev->bss_list, list) { + if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid)) + continue; + if (bss->pub.channel != new->pub.channel) + continue; + if (rcu_access_pointer(bss->pub.beacon_ies)) + continue; + ies = rcu_access_pointer(bss->pub.ies); + if (!ies) + continue; + ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); + if (!ie) + continue; + if (ssidlen && ie[1] != ssidlen) + continue; + /* that would be odd ... 
*/ + if (bss->pub.beacon_ies) + continue; + if (WARN_ON_ONCE(bss->pub.hidden_beacon_bss)) + continue; + if (WARN_ON_ONCE(!list_empty(&bss->hidden_list))) + list_del(&bss->hidden_list); + /* combine them */ + list_add(&bss->hidden_list, &new->hidden_list); + bss->pub.hidden_beacon_bss = &new->pub; + new->refcount += bss->refcount; + rcu_assign_pointer(bss->pub.beacon_ies, + new->pub.beacon_ies); + } - ies = kmemdup(ies, sizeof(*ies) + ies->len, GFP_ATOMIC); - if (unlikely(!ies)) - return; - rcu_assign_pointer(res->pub.beacon_ies, ies); + return true; } static struct cfg80211_internal_bss * @@ -687,15 +695,9 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, return NULL; } - found = rb_find_bss(dev, tmp); + found = rb_find_bss(dev, tmp, BSS_CMP_REGULAR); if (found) { - found->pub.beacon_interval = tmp->pub.beacon_interval; - found->pub.tsf = tmp->pub.tsf; - found->pub.signal = tmp->pub.signal; - found->pub.capability = tmp->pub.capability; - found->ts = tmp->ts; - /* Update IEs */ if (rcu_access_pointer(tmp->pub.proberesp_ies)) { const struct cfg80211_bss_ies *old; @@ -711,41 +713,65 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } else if (rcu_access_pointer(tmp->pub.beacon_ies)) { - const struct cfg80211_bss_ies *old, *ies; + const struct cfg80211_bss_ies *old; + struct cfg80211_internal_bss *bss; + + if (found->pub.hidden_beacon_bss && + !list_empty(&found->hidden_list)) { + const struct cfg80211_bss_ies *f; + + /* + * The found BSS struct is one of the probe + * response members of a group, but we're + * receiving a beacon (beacon_ies in the tmp + * bss is used). This can only mean that the + * AP changed its beacon from not having an + * SSID to showing it, which is confusing so + * drop this information. + */ + + f = rcu_access_pointer(tmp->pub.beacon_ies); + kfree_rcu((struct cfg80211_bss_ies *)f, + rcu_head); + goto drop; + } old = rcu_access_pointer(found->pub.beacon_ies); - ies = rcu_access_pointer(found->pub.ies); rcu_assign_pointer(found->pub.beacon_ies, tmp->pub.beacon_ies); /* Override IEs if they were from a beacon before */ - if (old == ies) + if (old == rcu_access_pointer(found->pub.ies)) rcu_assign_pointer(found->pub.ies, tmp->pub.beacon_ies); + /* Assign beacon IEs to all sub entries */ + list_for_each_entry(bss, &found->hidden_list, + hidden_list) { + const struct cfg80211_bss_ies *ies; + + ies = rcu_access_pointer(bss->pub.beacon_ies); + WARN_ON(ies != old); + + rcu_assign_pointer(bss->pub.beacon_ies, + tmp->pub.beacon_ies); + } + if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } + + found->pub.beacon_interval = tmp->pub.beacon_interval; + found->pub.signal = tmp->pub.signal; + found->pub.capability = tmp->pub.capability; + found->ts = tmp->ts; } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; struct cfg80211_bss_ies *ies; - /* First check if the beacon is a probe response from - * a hidden bss. If so, copy beacon ies (with nullified - * ssid) into the probe response bss entry (with real ssid). - * It is required basically for PSM implementation - * (probe responses do not contain tim ie) */ - - /* TODO: The code is not trying to update existing probe - * response bss entries when beacon ies are - * getting changed. 
*/ - hidden = rb_find_hidden_bss(dev, tmp); - if (hidden) - copy_hidden_ies(tmp, hidden); - /* * create a copy -- the "res" variable that is passed in * is allocated on the stack since it's not needed in the @@ -760,21 +786,51 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, ies = (void *)rcu_dereference(tmp->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); - spin_unlock_bh(&dev->bss_lock); - return NULL; + goto drop; } memcpy(new, tmp, sizeof(*new)); - kref_init(&new->ref); + new->refcount = 1; + INIT_LIST_HEAD(&new->hidden_list); + + if (rcu_access_pointer(tmp->pub.proberesp_ies)) { + hidden = rb_find_bss(dev, tmp, BSS_CMP_HIDE_ZLEN); + if (!hidden) + hidden = rb_find_bss(dev, tmp, + BSS_CMP_HIDE_NUL); + if (hidden) { + new->pub.hidden_beacon_bss = &hidden->pub; + list_add(&new->hidden_list, + &hidden->hidden_list); + hidden->refcount++; + rcu_assign_pointer(new->pub.beacon_ies, + hidden->pub.beacon_ies); + } + } else { + /* + * Ok so we found a beacon, and don't have an entry. If + * it's a beacon with hidden SSID, we might be in for an + * expensive search for any probe responses that should + * be grouped with this beacon for updates ... + */ + if (!cfg80211_combine_bsses(dev, new)) { + kfree(new); + goto drop; + } + } + list_add_tail(&new->list, &dev->bss_list); rb_insert_bss(dev, new); found = new; } dev->bss_generation++; + bss_ref_get(dev, found); spin_unlock_bh(&dev->bss_lock); - kref_get(&found->ref); return found; + drop: + spin_unlock_bh(&dev->bss_lock); + return NULL; } static struct ieee80211_channel * @@ -833,7 +889,6 @@ cfg80211_inform_bss(struct wiphy *wiphy, memcpy(tmp.pub.bssid, bssid, ETH_ALEN); tmp.pub.channel = channel; tmp.pub.signal = signal; - tmp.pub.tsf = tsf; tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; /* @@ -841,16 +896,14 @@ cfg80211_inform_bss(struct wiphy *wiphy, * Response frame, we need to pick one of the options and only use it * with the driver that does not provide the full Beacon/Probe Response * frame. Use Beacon frame pointer to avoid indicating that this should - * override the iies pointer should we have received an earlier + * override the IEs pointer should we have received an earlier * indication of Probe Response data. - * - * The initial buffer for the IEs is allocated with the BSS entry and - * is located after the private area. 
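The update paths above all follow the same discipline when they replace an IE buffer: publish the new buffer with rcu_assign_pointer() while holding bss_lock, then hand the old one to kfree_rcu() so that concurrent readers still inside rcu_read_lock() sections can finish with the old copy. Reduced to its core (illustrative struct, not cfg80211_bss_ies):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct ies_example {
        struct rcu_head rcu_head;
        size_t len;
        u8 data[];
    };

    static struct ies_example __rcu *current_ies;

    /* Writer side; callers serialize updates with a lock. */
    static void publish_ies(struct ies_example *new_ies)
    {
        struct ies_example *old;

        /* "1" stands in for a real lockdep_is_held() expression */
        old = rcu_dereference_protected(current_ies, 1);
        rcu_assign_pointer(current_ies, new_ies);
        if (old)
            kfree_rcu(old, rcu_head);  /* freed after a grace period */
    }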
*/ ies = kmalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; + ies->tsf = tsf; memcpy(ies->data, ie, ielen); rcu_assign_pointer(tmp.pub.beacon_ies, ies); @@ -907,6 +960,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, if (!ies) return NULL; ies->len = ielen; + ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); memcpy(ies->data, mgmt->u.probe_resp.variable, ielen); if (ieee80211_is_probe_resp(mgmt->frame_control)) @@ -918,7 +972,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); tmp.pub.channel = channel; tmp.pub.signal = signal; - tmp.pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); @@ -935,27 +988,35 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_inform_bss_frame); -void cfg80211_ref_bss(struct cfg80211_bss *pub) +void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); struct cfg80211_internal_bss *bss; if (!pub) return; bss = container_of(pub, struct cfg80211_internal_bss, pub); - kref_get(&bss->ref); + + spin_lock_bh(&dev->bss_lock); + bss_ref_get(dev, bss); + spin_unlock_bh(&dev->bss_lock); } EXPORT_SYMBOL(cfg80211_ref_bss); -void cfg80211_put_bss(struct cfg80211_bss *pub) +void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); struct cfg80211_internal_bss *bss; if (!pub) return; bss = container_of(pub, struct cfg80211_internal_bss, pub); - kref_put(&bss->ref, bss_release); + + spin_lock_bh(&dev->bss_lock); + bss_ref_put(dev, bss); + spin_unlock_bh(&dev->bss_lock); } EXPORT_SYMBOL(cfg80211_put_bss); @@ -971,8 +1032,8 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) spin_lock_bh(&dev->bss_lock); if (!list_empty(&bss->list)) { - __cfg80211_unlink_bss(dev, bss); - dev->bss_generation++; + if (__cfg80211_unlink_bss(dev, bss)) + dev->bss_generation++; } spin_unlock_bh(&dev->bss_lock); } @@ -1001,6 +1062,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); + mutex_lock(&rdev->sched_scan_mtx); if (rdev->scan_req) { err = -EBUSY; goto out; @@ -1107,6 +1169,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, dev_hold(dev); } out: + mutex_unlock(&rdev->sched_scan_mtx); kfree(creq); cfg80211_unlock_rdev(rdev); return err; @@ -1155,16 +1218,6 @@ static void ieee80211_scan_add_ies(struct iw_request_info *info, } } -static inline unsigned int elapsed_jiffies_msecs(unsigned long start) -{ - unsigned long end = jiffies; - - if (end >= start) - return jiffies_to_msecs(end - start); - - return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1); -} - static char * ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, struct cfg80211_internal_bss *bss, char *current_ev, @@ -1241,15 +1294,10 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, rcu_read_lock(); ies = rcu_dereference(bss->pub.ies); - if (ies) { - rem = ies->len; - ie = ies->data; - } else { - rem = 0; - ie = NULL; - } + rem = ies->len; + ie = ies->data; - while (ies && rem >= 2) { + while (rem >= 2) { /* invalid data */ if (ie[1] > rem - 2) break; @@ -1362,7 +1410,7 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, if (buf) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; - sprintf(buf, "tsf=%016llx", (unsigned long 
long)(bss->pub.tsf)); + sprintf(buf, "tsf=%016llx", (unsigned long long)(ies->tsf)); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point(info, current_ev, end_buf, &iwe, buf); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index f2431e41a373..a9dc5c736df0 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -85,6 +85,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) ASSERT_RTNL(); ASSERT_RDEV_LOCK(rdev); ASSERT_WDEV_LOCK(wdev); + lockdep_assert_held(&rdev->sched_scan_mtx); if (rdev->scan_req) return -EBUSY; @@ -159,7 +160,7 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); struct cfg80211_connect_params *params; - const u8 *prev_bssid = NULL; + struct cfg80211_assoc_request req = {}; int err; ASSERT_WDEV_LOCK(wdev); @@ -186,15 +187,20 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) BUG_ON(!rdev->ops->assoc); wdev->conn->state = CFG80211_CONN_ASSOCIATING; if (wdev->conn->prev_bssid_valid) - prev_bssid = wdev->conn->prev_bssid; - err = __cfg80211_mlme_assoc(rdev, wdev->netdev, - params->channel, params->bssid, - prev_bssid, - params->ssid, params->ssid_len, - params->ie, params->ie_len, - false, ¶ms->crypto, - params->flags, ¶ms->ht_capa, - ¶ms->ht_capa_mask); + req.prev_bssid = wdev->conn->prev_bssid; + req.ie = params->ie; + req.ie_len = params->ie_len; + req.use_mfp = params->mfp != NL80211_MFP_NO; + req.crypto = params->crypto; + req.flags = params->flags; + req.ht_capa = params->ht_capa; + req.ht_capa_mask = params->ht_capa_mask; + req.vht_capa = params->vht_capa; + req.vht_capa_mask = params->vht_capa_mask; + + err = __cfg80211_mlme_assoc(rdev, wdev->netdev, params->channel, + params->bssid, params->ssid, + params->ssid_len, &req); if (err) __cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, @@ -222,6 +228,7 @@ void cfg80211_conn_work(struct work_struct *work) rtnl_lock(); cfg80211_lock_rdev(rdev); mutex_lock(&rdev->devlist_mtx); + mutex_lock(&rdev->sched_scan_mtx); list_for_each_entry(wdev, &rdev->wdev_list, list) { wdev_lock(wdev); @@ -229,7 +236,7 @@ void cfg80211_conn_work(struct work_struct *work) wdev_unlock(wdev); continue; } - if (wdev->sme_state != CFG80211_SME_CONNECTING) { + if (wdev->sme_state != CFG80211_SME_CONNECTING || !wdev->conn) { wdev_unlock(wdev); continue; } @@ -246,6 +253,7 @@ void cfg80211_conn_work(struct work_struct *work) wdev_unlock(wdev); } + mutex_unlock(&rdev->sched_scan_mtx); mutex_unlock(&rdev->devlist_mtx); cfg80211_unlock_rdev(rdev); rtnl_unlock(); @@ -300,7 +308,7 @@ static void __cfg80211_sme_scan_done(struct net_device *dev) bss = cfg80211_get_conn_bss(wdev); if (bss) { - cfg80211_put_bss(bss); + cfg80211_put_bss(&rdev->wiphy, bss); } else { /* not found */ if (wdev->conn->state == CFG80211_CONN_SCAN_AGAIN) @@ -319,11 +327,9 @@ void cfg80211_sme_scan_done(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; - mutex_lock(&wiphy_to_dev(wdev->wiphy)->devlist_mtx); wdev_lock(wdev); __cfg80211_sme_scan_done(dev); wdev_unlock(wdev); - mutex_unlock(&wiphy_to_dev(wdev->wiphy)->devlist_mtx); } void cfg80211_sme_rx_auth(struct net_device *dev, @@ -463,7 +469,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; } @@ -479,7 +485,7 @@ void __cfg80211_connect_result(struct 
net_device *dev, const u8 *bssid, kfree(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; - cfg80211_put_bss(bss); + cfg80211_put_bss(wdev->wiphy, bss); return; } @@ -519,10 +525,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, * - country_ie + 2, the start of the country ie data, and * - and country_ie[1] which is the IE length */ - regulatory_hint_11d(wdev->wiphy, - bss->channel->band, - country_ie + 2, - country_ie[1]); + regulatory_hint_11d(wdev->wiphy, bss->channel->band, + country_ie + 2, country_ie[1]); kfree(country_ie); } @@ -587,7 +591,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, } cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; cfg80211_hold_bss(bss_from_pub(bss)); @@ -622,7 +626,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, return; out: - cfg80211_put_bss(bss); + cfg80211_put_bss(wdev->wiphy, bss); } void cfg80211_roamed(struct net_device *dev, @@ -664,7 +668,7 @@ void cfg80211_roamed_bss(struct net_device *dev, ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp); if (!ev) { - cfg80211_put_bss(bss); + cfg80211_put_bss(wdev->wiphy, bss); return; } @@ -705,7 +709,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(&wdev->current_bss->pub); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); } wdev->current_bss = NULL; @@ -876,7 +880,7 @@ int __cfg80211_connect(struct cfg80211_registered_device *rdev, if (bss) { wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; err = cfg80211_conn_do_work(wdev); - cfg80211_put_bss(bss); + cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ err = cfg80211_conn_scan(wdev); @@ -925,9 +929,12 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, int err; mutex_lock(&rdev->devlist_mtx); + /* might request scan - scan_mtx -> wdev_mtx dependency */ + mutex_lock(&rdev->sched_scan_mtx); wdev_lock(dev->ieee80211_ptr); err = __cfg80211_connect(rdev, dev, connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); + mutex_unlock(&rdev->sched_scan_mtx); mutex_unlock(&rdev->devlist_mtx); return err; diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 1f6f01e2dc4c..8f28b9f798d8 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -83,6 +83,14 @@ static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } +static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) +{ + struct wireless_dev *wdev; + + list_for_each_entry(wdev, &rdev->wdev_list, list) + cfg80211_leave(rdev, wdev); +} + static int wiphy_suspend(struct device *dev, pm_message_t state) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); @@ -90,12 +98,19 @@ static int wiphy_suspend(struct device *dev, pm_message_t state) rdev->suspend_at = get_seconds(); - if (rdev->ops->suspend) { - rtnl_lock(); - if (rdev->wiphy.registered) - ret = rdev_suspend(rdev); - rtnl_unlock(); + rtnl_lock(); + if (rdev->wiphy.registered) { + if (!rdev->wowlan) + cfg80211_leave_all(rdev); + if (rdev->ops->suspend) + ret = rdev_suspend(rdev, rdev->wowlan); + if (ret == 1) { + /* Driver refused to configure wowlan */ + cfg80211_leave_all(rdev); + ret = rdev_suspend(rdev, NULL); + } } + rtnl_unlock(); return ret; } @@ -106,9 +121,7 @@ static int wiphy_resume(struct device *dev) int ret = 0; /* Age scan results
with time spent in suspend */ - spin_lock_bh(&rdev->bss_lock); cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); - spin_unlock_bh(&rdev->bss_lock); if (rdev->ops->resume) { rtnl_lock(); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2134576f426e..ecd4fcec3c94 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -27,7 +27,8 @@ #define WIPHY_PR_ARG __entry->wiphy_name #define WDEV_ENTRY __field(u32, id) -#define WDEV_ASSIGN (__entry->id) = (wdev ? wdev->identifier : 0) +#define WDEV_ASSIGN (__entry->id) = (!IS_ERR_OR_NULL(wdev) \ + ? wdev->identifier : 0) #define WDEV_PR_FMT "wdev(%u)" #define WDEV_PR_ARG (__entry->id) @@ -1767,6 +1768,79 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device, TP_ARGS(wiphy, wdev) ); +TRACE_EVENT(rdev_set_mac_acl, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_acl_data *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u32, acl_policy) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->acl_policy = params->acl_policy; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", acl policy: %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->acl_policy) +); + +TRACE_EVENT(rdev_update_ft_ies, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_update_ft_ies_params *ftie), + TP_ARGS(wiphy, netdev, ftie), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u16, md) + __dynamic_array(u8, ie, ftie->ie_len) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->md = ftie->md; + memcpy(__get_dynamic_array(ie), ftie->ie, ftie->ie_len); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", md: 0x%x", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->md) +); + +TRACE_EVENT(rdev_crit_proto_start, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + enum nl80211_crit_proto_id protocol, u16 duration), + TP_ARGS(wiphy, wdev, protocol, duration), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u16, proto) + __field(u16, duration) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->proto = protocol; + __entry->duration = duration; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", proto=%x, duration=%u", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->proto, __entry->duration) +); + +TRACE_EVENT(rdev_crit_proto_stop, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, + WIPHY_PR_ARG, WDEV_PR_ARG) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -2033,6 +2107,21 @@ TRACE_EVENT(cfg80211_reg_can_beacon, WIPHY_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(cfg80211_chandef_dfs_required, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), + TP_ARGS(wiphy, chandef), + TP_STRUCT__entry( + WIPHY_ENTRY + CHAN_DEF_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + CHAN_DEF_ASSIGN(chandef); + ), + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, + WIPHY_PR_ARG, CHAN_DEF_PR_ARG) +); + TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef), @@ -2049,6 +2138,36 @@ TRACE_EVENT(cfg80211_ch_switch_notify, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(cfg80211_radar_event, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), + TP_ARGS(wiphy, chandef), + 
TP_STRUCT__entry( + WIPHY_ENTRY + CHAN_DEF_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + CHAN_DEF_ASSIGN(chandef); + ), + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, + WIPHY_PR_ARG, CHAN_DEF_PR_ARG) +); + +TRACE_EVENT(cfg80211_cac_event, + TP_PROTO(struct net_device *netdev, enum nl80211_radar_event evt), + TP_ARGS(netdev, evt), + TP_STRUCT__entry( + NETDEV_ENTRY + __field(enum nl80211_radar_event, evt) + ), + TP_fast_assign( + NETDEV_ASSIGN; + __entry->evt = evt; + ), + TP_printk(NETDEV_PR_FMT ", event: %d", + NETDEV_PR_ARG, __entry->evt) +); + DECLARE_EVENT_CLASS(cfg80211_rx_evt, TP_PROTO(struct net_device *netdev, const u8 *addr), TP_ARGS(netdev, addr), @@ -2315,6 +2434,67 @@ TRACE_EVENT(cfg80211_return_u32, TP_printk("ret: %u", __entry->ret) ); +TRACE_EVENT(cfg80211_report_wowlan_wakeup, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_wowlan_wakeup *wakeup), + TP_ARGS(wiphy, wdev, wakeup), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(bool, disconnect) + __field(bool, magic_pkt) + __field(bool, gtk_rekey_failure) + __field(bool, eap_identity_req) + __field(bool, four_way_handshake) + __field(bool, rfkill_release) + __field(s32, pattern_idx) + __field(u32, packet_len) + __dynamic_array(u8, packet, wakeup->packet_present_len) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->disconnect = wakeup->disconnect; + __entry->magic_pkt = wakeup->magic_pkt; + __entry->gtk_rekey_failure = wakeup->gtk_rekey_failure; + __entry->eap_identity_req = wakeup->eap_identity_req; + __entry->four_way_handshake = wakeup->four_way_handshake; + __entry->rfkill_release = wakeup->rfkill_release; + __entry->pattern_idx = wakeup->pattern_idx; + __entry->packet_len = wakeup->packet_len; + if (wakeup->packet && wakeup->packet_present_len) + memcpy(__get_dynamic_array(packet), wakeup->packet, + wakeup->packet_present_len); + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) +); + +TRACE_EVENT(cfg80211_ft_event, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_ft_event_params *ft_event), + TP_ARGS(wiphy, netdev, ft_event), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __dynamic_array(u8, ies, ft_event->ies_len) + MAC_ENTRY(target_ap) + __dynamic_array(u8, ric_ies, ft_event->ric_ies_len) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + if (ft_event->ies) + memcpy(__get_dynamic_array(ies), ft_event->ies, + ft_event->ies_len); + MAC_ASSIGN(target_ap, ft_event->target_ap); + if (ft_event->ric_ies) + memcpy(__get_dynamic_array(ric_ies), ft_event->ric_ies, + ft_event->ric_ies_len); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(target_ap)) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 16d76a807c2f..f5ad4d94ba88 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -511,7 +511,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, encaps_data = bridge_tunnel_header; encaps_len = sizeof(bridge_tunnel_header); skip_header_bytes -= 2; - } else if (ethertype > 0x600) { + } else if (ethertype >= ETH_P_802_3_MIN) { encaps_data = rfc1042_header; encaps_len = sizeof(rfc1042_header); skip_header_bytes -= 2; @@ -1155,6 +1155,26 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, } EXPORT_SYMBOL(cfg80211_get_p2p_attr); +bool ieee80211_operating_class_to_band(u8 operating_class, + enum ieee80211_band *band) +{ 
+ switch (operating_class) { + case 112: + case 115 ... 127: + *band = IEEE80211_BAND_5GHZ; + return true; + case 81: + case 82: + case 83: + case 84: + *band = IEEE80211_BAND_2GHZ; + return true; + } + + return false; +} +EXPORT_SYMBOL(ieee80211_operating_class_to_band); + int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, u32 beacon_int) { @@ -1184,7 +1204,8 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_iftype iftype, struct ieee80211_channel *chan, - enum cfg80211_chan_mode chanmode) + enum cfg80211_chan_mode chanmode, + u8 radar_detect) { struct wireless_dev *wdev_iter; u32 used_iftypes = BIT(iftype); @@ -1195,14 +1216,46 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, enum cfg80211_chan_mode chmode; int num_different_channels = 0; int total = 1; + bool radar_required; int i, j; ASSERT_RTNL(); lockdep_assert_held(&rdev->devlist_mtx); + if (WARN_ON(hweight32(radar_detect) > 1)) + return -EINVAL; + + switch (iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_WDS: + radar_required = !!(chan && + (chan->flags & IEEE80211_CHAN_RADAR)); + break; + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_MONITOR: + radar_required = false; + break; + case NUM_NL80211_IFTYPES: + case NL80211_IFTYPE_UNSPECIFIED: + default: + return -EINVAL; + } + + if (radar_required && !radar_detect) + return -EINVAL; + /* Always allow software iftypes */ - if (rdev->wiphy.software_iftypes & BIT(iftype)) + if (rdev->wiphy.software_iftypes & BIT(iftype)) { + if (radar_detect) + return -EINVAL; return 0; + } memset(num, 0, sizeof(num)); memset(used_channels, 0, sizeof(used_channels)); @@ -1225,12 +1278,12 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, list_for_each_entry(wdev_iter, &rdev->wdev_list, list) { if (wdev_iter == wdev) continue; - if (wdev_iter->netdev) { - if (!netif_running(wdev_iter->netdev)) - continue; - } else if (wdev_iter->iftype == NL80211_IFTYPE_P2P_DEVICE) { + if (wdev_iter->iftype == NL80211_IFTYPE_P2P_DEVICE) { if (!wdev_iter->p2p_started) continue; + } else if (wdev_iter->netdev) { + if (!netif_running(wdev_iter->netdev)) + continue; } else { WARN_ON(1); } @@ -1275,7 +1328,7 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, used_iftypes |= BIT(wdev_iter->iftype); } - if (total == 1) + if (total == 1 && !radar_detect) return 0; for (i = 0; i < rdev->wiphy.n_iface_combinations; i++) { @@ -1308,6 +1361,9 @@ int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, } } + if (radar_detect && !(c->radar_detect_widths & radar_detect)) + goto cont; + /* * Finally check that all iftypes that we're currently * using are actually part of this combination. 
If they diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c index 8bafa31fa9f8..e98a01c1034f 100644 --- a/net/wireless/wext-proc.c +++ b/net/wireless/wext-proc.c @@ -143,7 +143,8 @@ static const struct file_operations wireless_seq_fops = { int __net_init wext_proc_init(struct net *net) { /* Create /proc/net/wireless entry */ - if (!proc_net_fops_create(net, "wireless", S_IRUGO, &wireless_seq_fops)) + if (!proc_create("wireless", S_IRUGO, net->proc_net, + &wireless_seq_fops)) return -ENOMEM; return 0; @@ -151,5 +152,5 @@ int __net_init wext_proc_init(struct net *net) void __net_exit wext_proc_exit(struct net *net) { - proc_net_remove(net, "wireless"); + remove_proc_entry("wireless", net->proc_net); } diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index fb9622f6d99c..e79cb5c0655a 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -89,6 +89,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev, cfg80211_lock_rdev(rdev); mutex_lock(&rdev->devlist_mtx); + mutex_lock(&rdev->sched_scan_mtx); wdev_lock(wdev); if (wdev->sme_state != CFG80211_SME_IDLE) { @@ -135,6 +136,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev, err = cfg80211_mgd_wext_connect(rdev, wdev); out: wdev_unlock(wdev); + mutex_unlock(&rdev->sched_scan_mtx); mutex_unlock(&rdev->devlist_mtx); cfg80211_unlock_rdev(rdev); return err; @@ -190,6 +192,7 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev, cfg80211_lock_rdev(rdev); mutex_lock(&rdev->devlist_mtx); + mutex_lock(&rdev->sched_scan_mtx); wdev_lock(wdev); err = 0; @@ -223,6 +226,7 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev, err = cfg80211_mgd_wext_connect(rdev, wdev); out: wdev_unlock(wdev); + mutex_unlock(&rdev->sched_scan_mtx); mutex_unlock(&rdev->devlist_mtx); cfg80211_unlock_rdev(rdev); return err; @@ -285,6 +289,7 @@ int cfg80211_mgd_wext_siwap(struct net_device *dev, cfg80211_lock_rdev(rdev); mutex_lock(&rdev->devlist_mtx); + mutex_lock(&rdev->sched_scan_mtx); wdev_lock(wdev); if (wdev->sme_state != CFG80211_SME_IDLE) { @@ -313,6 +318,7 @@ int cfg80211_mgd_wext_siwap(struct net_device *dev, err = cfg80211_mgd_wext_connect(rdev, wdev); out: wdev_unlock(wdev); + mutex_unlock(&rdev->sched_scan_mtx); mutex_unlock(&rdev->devlist_mtx); cfg80211_unlock_rdev(rdev); return err; diff --git a/net/x25/Kconfig b/net/x25/Kconfig index e6759c9660bb..c959312c45e3 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -3,8 +3,7 @@ # config X25 - tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "CCITT X.25 Packet Layer" ---help--- X.25 is a set of standardized network protocols, similar in scope to frame relay; the one physical line from your box to the X.25 network diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index a306bc66000e..37ca9694aabe 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -208,11 +208,10 @@ static void x25_remove_socket(struct sock *sk) static void x25_kill_by_device(struct net_device *dev) { struct sock *s; - struct hlist_node *node; write_lock_bh(&x25_list_lock); - sk_for_each(s, node, &x25_list) + sk_for_each(s, &x25_list) if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev) x25_disconnect(s, ENETUNREACH, 0, 0); @@ -280,12 +279,11 @@ static struct sock *x25_find_listener(struct x25_address *addr, { struct sock *s; struct sock *next_best; - struct hlist_node *node; read_lock_bh(&x25_list_lock); next_best = NULL; - sk_for_each(s, node, &x25_list) + sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || 
!strcmp(addr->x25_addr, @@ -323,9 +321,8 @@ found: static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; - struct hlist_node *node; - sk_for_each(s, node, &x25_list) + sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; @@ -1782,11 +1779,10 @@ static struct notifier_block x25_dev_notifier = { void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; - struct hlist_node *node; write_lock_bh(&x25_list_lock); - sk_for_each(s, node, &x25_list) + sk_for_each(s, &x25_list) if (x25_sk(s)->neighbour == nb) x25_disconnect(s, ENETUNREACH, 0, 0); diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c index 2ffde4631ae2..0917f047f2cf 100644 --- a/net/x25/x25_proc.c +++ b/net/x25/x25_proc.c @@ -187,7 +187,6 @@ static int x25_seq_forward_open(struct inode *inode, struct file *file) } static const struct file_operations x25_seq_socket_fops = { - .owner = THIS_MODULE, .open = x25_seq_socket_open, .read = seq_read, .llseek = seq_lseek, @@ -195,7 +194,6 @@ static const struct file_operations x25_seq_socket_fops = { }; static const struct file_operations x25_seq_route_fops = { - .owner = THIS_MODULE, .open = x25_seq_route_open, .read = seq_read, .llseek = seq_lseek, @@ -203,55 +201,38 @@ static const struct file_operations x25_seq_route_fops = { }; static const struct file_operations x25_seq_forward_fops = { - .owner = THIS_MODULE, .open = x25_seq_forward_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; -static struct proc_dir_entry *x25_proc_dir; - int __init x25_proc_init(void) { - struct proc_dir_entry *p; - int rc = -ENOMEM; + if (!proc_mkdir("x25", init_net.proc_net)) + return -ENOMEM; - x25_proc_dir = proc_mkdir("x25", init_net.proc_net); - if (!x25_proc_dir) + if (!proc_create("x25/route", S_IRUGO, init_net.proc_net, + &x25_seq_route_fops)) goto out; - p = proc_create("route", S_IRUGO, x25_proc_dir, &x25_seq_route_fops); - if (!p) - goto out_route; - - p = proc_create("socket", S_IRUGO, x25_proc_dir, &x25_seq_socket_fops); - if (!p) - goto out_socket; + if (!proc_create("x25/socket", S_IRUGO, init_net.proc_net, + &x25_seq_socket_fops)) + goto out; - p = proc_create("forward", S_IRUGO, x25_proc_dir, - &x25_seq_forward_fops); - if (!p) - goto out_forward; - rc = 0; + if (!proc_create("x25/forward", S_IRUGO, init_net.proc_net, + &x25_seq_forward_fops)) + goto out; + return 0; out: - return rc; -out_forward: - remove_proc_entry("socket", x25_proc_dir); -out_socket: - remove_proc_entry("route", x25_proc_dir); -out_route: - remove_proc_entry("x25", init_net.proc_net); - goto out; + remove_proc_subtree("x25", init_net.proc_net); + return -ENOMEM; } void __exit x25_proc_exit(void) { - remove_proc_entry("forward", x25_proc_dir); - remove_proc_entry("route", x25_proc_dir); - remove_proc_entry("socket", x25_proc_dir); - remove_proc_entry("x25", init_net.proc_net); + remove_proc_subtree("x25", init_net.proc_net); } #else /* CONFIG_PROC_FS */ diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index ce90b8d92365..bda1a13628a8 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -21,8 +21,8 @@ config XFRM_USER If unsure, say Y. config XFRM_SUB_POLICY - bool "Transformation sub policy support (EXPERIMENTAL)" - depends on XFRM && EXPERIMENTAL + bool "Transformation sub policy support" + depends on XFRM ---help--- Support sub policy for developers. By using sub policy with main one, two policies can be applied to the same packet at once. 
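
The x25_proc_init() rewrite above leans on two procfs helpers: proc_create() resolves slash-separated names such as "x25/route" relative to the given parent, and remove_proc_subtree() removes a directory together with everything inside it, so the old ladder of unwind labels collapses into a single error path. A minimal sketch of the same pattern for a hypothetical "foo" directory (foo_seq_fops is assumed to exist elsewhere):

#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <net/net_namespace.h>

extern const struct file_operations foo_seq_fops;

static int __init foo_proc_init(void)
{
	if (!proc_mkdir("foo", init_net.proc_net))
		return -ENOMEM;

	/* A name containing '/' is created inside the subdirectory. */
	if (!proc_create("foo/stats", S_IRUGO, init_net.proc_net,
			 &foo_seq_fops))
		goto out;
	return 0;

out:
	/* One call tears down the directory and all of its entries. */
	remove_proc_subtree("foo", init_net.proc_net);
	return -ENOMEM;
}
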
@@ -31,8 +31,8 @@ config XFRM_SUB_POLICY If unsure, say N. config XFRM_MIGRATE - bool "Transformation migrate database (EXPERIMENTAL)" - depends on XFRM && EXPERIMENTAL + bool "Transformation migrate database" + depends on XFRM ---help--- A feature to update locator(s) of a given IPsec security association dynamically. This feature is required, for @@ -42,8 +42,8 @@ config XFRM_MIGRATE If unsure, say N. config XFRM_STATISTICS - bool "Transformation statistics (EXPERIMENTAL)" - depends on INET && XFRM && PROC_FS && EXPERIMENTAL + bool "Transformation statistics" + depends on INET && XFRM && PROC_FS ---help--- This statistics is not a SNMP/MIB specification but shows statistics about transformation error (or almost error) factor @@ -68,8 +68,8 @@ config NET_KEY Say Y unless you know what you are doing. config NET_KEY_MIGRATE - bool "PF_KEY MIGRATE (EXPERIMENTAL)" - depends on NET_KEY && EXPERIMENTAL + bool "PF_KEY MIGRATE" + depends on NET_KEY select XFRM_MIGRATE ---help--- Add a PF_KEY MIGRATE message to PF_KEYv2 socket family. diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 4ce2d93162c1..ab4ef72f0b1d 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -35,6 +35,8 @@ static struct xfrm_algo_desc aead_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV8, .sadb_alg_ivlen = 8, @@ -51,6 +53,8 @@ static struct xfrm_algo_desc aead_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV12, .sadb_alg_ivlen = 8, @@ -67,6 +71,8 @@ static struct xfrm_algo_desc aead_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV16, .sadb_alg_ivlen = 8, @@ -83,6 +89,8 @@ static struct xfrm_algo_desc aead_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV8, .sadb_alg_ivlen = 8, @@ -99,6 +107,8 @@ static struct xfrm_algo_desc aead_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV12, .sadb_alg_ivlen = 8, @@ -115,6 +125,8 @@ static struct xfrm_algo_desc aead_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV16, .sadb_alg_ivlen = 8, @@ -131,6 +143,8 @@ static struct xfrm_algo_desc aead_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC, .sadb_alg_ivlen = 8, @@ -151,6 +165,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_AALG_NULL, .sadb_alg_ivlen = 0, @@ -169,6 +185,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_AALG_MD5HMAC, .sadb_alg_ivlen = 0, @@ -187,6 +205,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_AALG_SHA1HMAC, .sadb_alg_ivlen = 0, @@ -205,6 +225,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC, .sadb_alg_ivlen = 0, @@ -222,6 +244,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_AALG_SHA2_384HMAC, .sadb_alg_ivlen = 0, @@ -239,6 +263,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_AALG_SHA2_512HMAC, .sadb_alg_ivlen = 0, @@ -257,6 +283,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC, .sadb_alg_ivlen = 0, @@ -274,6 +302,8 @@ static 
struct xfrm_algo_desc aalg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_AALG_AES_XCBC_MAC, .sadb_alg_ivlen = 0, @@ -281,6 +311,19 @@ static struct xfrm_algo_desc aalg_list[] = { .sadb_alg_maxbits = 128 } }, +{ + /* rfc4494 */ + .name = "cmac(aes)", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 128, + } + }, + + .pfkey_supported = 0, +}, }; static struct xfrm_algo_desc ealg_list[] = { @@ -295,6 +338,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_EALG_NULL, .sadb_alg_ivlen = 0, @@ -313,6 +358,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_EALG_DESCBC, .sadb_alg_ivlen = 8, @@ -331,6 +378,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_EALG_3DESCBC, .sadb_alg_ivlen = 8, @@ -349,6 +398,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_CASTCBC, .sadb_alg_ivlen = 8, @@ -367,6 +418,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC, .sadb_alg_ivlen = 8, @@ -385,6 +438,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AESCBC, .sadb_alg_ivlen = 8, @@ -403,6 +458,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_SERPENTCBC, .sadb_alg_ivlen = 8, @@ -421,6 +478,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_CAMELLIACBC, .sadb_alg_ivlen = 8, @@ -439,6 +498,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_TWOFISHCBC, .sadb_alg_ivlen = 8, @@ -456,6 +517,8 @@ static struct xfrm_algo_desc ealg_list[] = { } }, + .pfkey_supported = 1, + .desc = { .sadb_alg_id = SADB_X_EALG_AESCTR, .sadb_alg_ivlen = 8, @@ -473,6 +536,7 @@ static struct xfrm_algo_desc calg_list[] = { .threshold = 90, } }, + .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE } }, { @@ -482,6 +546,7 @@ static struct xfrm_algo_desc calg_list[] = { .threshold = 90, } }, + .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_CALG_LZS } }, { @@ -491,6 +556,7 @@ static struct xfrm_algo_desc calg_list[] = { .threshold = 50, } }, + .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_CALG_LZJH } }, }; @@ -700,8 +766,7 @@ void xfrm_probe_algs(void) } for (i = 0; i < ealg_entries(); i++) { - status = crypto_has_blkcipher(ealg_list[i].name, 0, - CRYPTO_ALG_ASYNC); + status = crypto_has_ablkcipher(ealg_list[i].name, 0, 0); if (ealg_list[i].available != status) ealg_list[i].available = status; } @@ -715,27 +780,27 @@ void xfrm_probe_algs(void) } EXPORT_SYMBOL_GPL(xfrm_probe_algs); -int xfrm_count_auth_supported(void) +int xfrm_count_pfkey_auth_supported(void) { int i, n; for (i = 0, n = 0; i < aalg_entries(); i++) - if (aalg_list[i].available) + if (aalg_list[i].available && aalg_list[i].pfkey_supported) n++; return n; } -EXPORT_SYMBOL_GPL(xfrm_count_auth_supported); +EXPORT_SYMBOL_GPL(xfrm_count_pfkey_auth_supported); -int xfrm_count_enc_supported(void) +int xfrm_count_pfkey_enc_supported(void) { int i, n; for (i = 0, n = 0; i < ealg_entries(); i++) - if (ealg_list[i].available) + if (ealg_list[i].available && ealg_list[i].pfkey_supported) n++; return n; } 
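
The new pfkey_supported field decouples the xfrm algorithm tables from the PF_KEYv2 interface: an entry such as cmac(aes) above has no SADB identifier, so it is flagged 0 and stays reachable only through netlink, and the renamed xfrm_count_pfkey_*_supported() helpers count just the entries PF_KEY is allowed to report. A toy model of that filter (simplified stand-in structs, arbitrary sample data):

#include <stdio.h>

struct algo {
	const char *name;
	int available;
	int pfkey_supported;
};

static const struct algo algos[] = {
	{ "hmac(sha1)", 1, 1 },
	{ "cmac(aes)",  1, 0 },	/* netlink-only, invisible to PF_KEY */
};

int main(void)
{
	unsigned int i, n = 0;

	for (i = 0; i < sizeof(algos) / sizeof(algos[0]); i++)
		if (algos[i].available && algos[i].pfkey_supported)
			n++;
	printf("PF_KEY-visible algorithms: %u\n", n);	/* prints 1 */
	return 0;
}
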
-EXPORT_SYMBOL_GPL(xfrm_count_enc_supported); +EXPORT_SYMBOL_GPL(xfrm_count_pfkey_enc_supported); #if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 95a338c89f99..bcfda8921b5b 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -61,6 +61,12 @@ static int xfrm_output_one(struct sk_buff *skb, int err) } spin_lock_bh(&x->lock); + + if (unlikely(x->km.state != XFRM_STATE_VALID)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID); + goto error; + } + err = xfrm_state_check_expire(x); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEEXPIRED); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 07c585756d2a..23cea0f74336 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -35,6 +35,10 @@ #include "xfrm_hash.h" +#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10)) +#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) +#define XFRM_MAX_QUEUE_LEN 100 + DEFINE_MUTEX(xfrm_cfg_mutex); EXPORT_SYMBOL(xfrm_cfg_mutex); @@ -51,7 +55,7 @@ static struct kmem_cache *xfrm_dst_cache __read_mostly; static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); - +static void xfrm_policy_queue_process(unsigned long arg); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); @@ -287,8 +291,11 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); atomic_set(&policy->refcnt, 1); + skb_queue_head_init(&policy->polq.hold_queue); setup_timer(&policy->timer, xfrm_policy_timer, (unsigned long)policy); + setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process, + (unsigned long)policy); policy->flo.ops = &xfrm_policy_fc_ops; } return policy; @@ -309,6 +316,16 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) } EXPORT_SYMBOL(xfrm_policy_destroy); +static void xfrm_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(list)) != NULL) { + dev_put(skb->dev); + kfree_skb(skb); + } +} + /* Rule must be locked. Release descentant resources, announce * entry dead. The rule must be unlinked from lists to the moment. 
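
The three XFRM_QUEUE_* constants introduced above bound the new policy hold queue: while no SA has been resolved yet, packets wait on polq.hold_queue (at most XFRM_MAX_QUEUE_LEN of them), and the hold timer starts at HZ/10 and doubles after every unsuccessful retry until it reaches 60*HZ, at which point the queue is purged (see xfrm_policy_queue_process() further down). A small demonstration of that backoff schedule (userspace C, HZ assumed to be 100 for illustration):

#include <stdio.h>

#define HZ 100
#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ / 10))
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60 * HZ))

int main(void)
{
	unsigned int timeout = XFRM_QUEUE_TMO_MIN;

	/* Doubling, as in "pq->timeout = pq->timeout << 1". */
	while (timeout < XFRM_QUEUE_TMO_MAX) {
		printf("retry in %u jiffies\n", timeout);
		timeout <<= 1;
	}
	printf("%u >= %u: purge queue\n", timeout, XFRM_QUEUE_TMO_MAX);
	return 0;
}
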
*/ @@ -319,6 +336,9 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) atomic_inc(&policy->genid); + del_timer(&policy->polq.hold_timer); + xfrm_queue_purge(&policy->polq.hold_queue); + if (del_timer(&policy->timer)) xfrm_pol_put(policy); @@ -359,27 +379,27 @@ static void xfrm_dst_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, unsigned int nhashmask) { - struct hlist_node *entry, *tmp, *entry0 = NULL; + struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; redo: - hlist_for_each_entry_safe(pol, entry, tmp, list, bydst) { + hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask); if (!entry0) { - hlist_del(entry); + hlist_del(&pol->bydst); hlist_add_head(&pol->bydst, ndsttable+h); h0 = h; } else { if (h != h0) continue; - hlist_del(entry); + hlist_del(&pol->bydst); hlist_add_after(entry0, &pol->bydst); } - entry0 = entry; + entry0 = &pol->bydst; } if (!hlist_empty(list)) { entry0 = NULL; @@ -391,10 +411,10 @@ static void xfrm_idx_hash_transfer(struct hlist_head *list, struct hlist_head *nidxtable, unsigned int nhashmask) { - struct hlist_node *entry, *tmp; + struct hlist_node *tmp; struct xfrm_policy *pol; - hlist_for_each_entry_safe(pol, entry, tmp, list, byidx) { + hlist_for_each_entry_safe(pol, tmp, list, byidx) { unsigned int h; h = __idx_hash(pol->index, nhashmask); @@ -524,7 +544,6 @@ static u32 xfrm_gen_index(struct net *net, int dir) static u32 idx_generator; for (;;) { - struct hlist_node *entry; struct hlist_head *list; struct xfrm_policy *p; u32 idx; @@ -536,7 +555,7 @@ static u32 xfrm_gen_index(struct net *net, int dir) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); found = 0; - hlist_for_each_entry(p, entry, list, byidx) { + hlist_for_each_entry(p, list, byidx) { if (p->index == idx) { found = 1; break; @@ -562,23 +581,62 @@ static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s return 0; } +static void xfrm_policy_requeue(struct xfrm_policy *old, + struct xfrm_policy *new) +{ + struct xfrm_policy_queue *pq = &old->polq; + struct sk_buff_head list; + + __skb_queue_head_init(&list); + + spin_lock_bh(&pq->hold_queue.lock); + skb_queue_splice_init(&pq->hold_queue, &list); + del_timer(&pq->hold_timer); + spin_unlock_bh(&pq->hold_queue.lock); + + if (skb_queue_empty(&list)) + return; + + pq = &new->polq; + + spin_lock_bh(&pq->hold_queue.lock); + skb_queue_splice(&list, &pq->hold_queue); + pq->timeout = XFRM_QUEUE_TMO_MIN; + mod_timer(&pq->hold_timer, jiffies); + spin_unlock_bh(&pq->hold_queue.lock); +} + +static bool xfrm_policy_mark_match(struct xfrm_policy *policy, + struct xfrm_policy *pol) +{ + u32 mark = policy->mark.v & policy->mark.m; + + if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m) + return true; + + if ((mark & pol->mark.m) == pol->mark.v && + policy->priority == pol->priority) + return true; + + return false; +} + int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) { struct net *net = xp_net(policy); struct xfrm_policy *pol; struct xfrm_policy *delpol; struct hlist_head *chain; - struct hlist_node *entry, *newpos; - u32 mark = policy->mark.v & policy->mark.m; + struct hlist_node *newpos; write_lock_bh(&xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); delpol = NULL; newpos = NULL; - hlist_for_each_entry(pol, entry, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst) { if (pol->type == 
policy->type && !selector_cmp(&pol->selector, &policy->selector) && - (mark & pol->mark.m) == pol->mark.v && + xfrm_policy_mark_match(policy, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) { @@ -603,8 +661,10 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) net->xfrm.policy_count[dir]++; atomic_inc(&flow_cache_genid); rt_genid_bump(net); - if (delpol) + if (delpol) { + xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); + } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = get_seconds(); @@ -630,13 +690,12 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, { struct xfrm_policy *pol, *ret; struct hlist_head *chain; - struct hlist_node *entry; *err = 0; write_lock_bh(&xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); ret = NULL; - hlist_for_each_entry(pol, entry, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && (mark & pol->mark.m) == pol->mark.v && !selector_cmp(sel, &pol->selector) && @@ -668,7 +727,6 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, { struct xfrm_policy *pol, *ret; struct hlist_head *chain; - struct hlist_node *entry; *err = -ENOENT; if (xfrm_policy_id2dir(id) != dir) @@ -678,7 +736,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, write_lock_bh(&xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; - hlist_for_each_entry(pol, entry, chain, byidx) { + hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && (mark & pol->mark.m) == pol->mark.v) { xfrm_pol_hold(pol); @@ -711,10 +769,9 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audi for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy *pol; - struct hlist_node *entry; int i; - hlist_for_each_entry(pol, entry, + hlist_for_each_entry(pol, &net->xfrm.policy_inexact[dir], bydst) { if (pol->type != type) continue; @@ -728,7 +785,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audi } } for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - hlist_for_each_entry(pol, entry, + hlist_for_each_entry(pol, net->xfrm.policy_bydst[dir].table + i, bydst) { if (pol->type != type) @@ -767,11 +824,10 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy *pol; - struct hlist_node *entry; int i; again1: - hlist_for_each_entry(pol, entry, + hlist_for_each_entry(pol, &net->xfrm.policy_inexact[dir], bydst) { if (pol->type != type) continue; @@ -791,7 +847,7 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { again2: - hlist_for_each_entry(pol, entry, + hlist_for_each_entry(pol, net->xfrm.policy_bydst[dir].table + i, bydst) { if (pol->type != type) @@ -919,7 +975,6 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, int err; struct xfrm_policy *pol, *ret; const xfrm_address_t *daddr, *saddr; - struct hlist_node *entry; struct hlist_head *chain; u32 priority = ~0U; @@ -931,7 +986,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, read_lock_bh(&xfrm_policy_lock); chain = policy_hash_direct(net, daddr, saddr, 
family, dir); ret = NULL; - hlist_for_each_entry(pol, entry, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { if (err == -ESRCH) @@ -947,7 +1002,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, entry, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { if (err == -ESRCH) @@ -982,6 +1037,24 @@ __xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); } +static int flow_to_policy_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + + switch (dir) { + default: + case FLOW_DIR_IN: + return XFRM_POLICY_IN; + case FLOW_DIR_OUT: + return XFRM_POLICY_OUT; + case FLOW_DIR_FWD: + return XFRM_POLICY_FWD; + } +} + static struct flow_cache_object * xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct flow_cache_object *old_obj, void *ctx) @@ -991,7 +1064,7 @@ xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, if (old_obj) xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo)); - pol = __xfrm_policy_lookup(net, fl, family, dir); + pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir)); if (IS_ERR_OR_NULL(pol)) return ERR_CAST(pol); @@ -1115,11 +1188,15 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir); __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); } - if (old_pol) + if (old_pol) { + if (pol) + xfrm_policy_requeue(old_pol, pol); + /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. */ __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir); + } write_unlock_bh(&xfrm_policy_lock); if (old_pol) { @@ -1310,6 +1387,8 @@ static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *f * It means we need to try again resolving. 
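
flow_to_policy_dir(), added above, guards against the FLOW_DIR_* flow-cache constants and the XFRM_POLICY_* direction constants ever drifting apart: when they are numerically identical the mapping is the identity and the compiler can fold the whole function into a plain return, otherwise each direction is translated explicitly. A reduced, runnable model of the idiom (both enums here are hypothetical stand-ins):

#include <stdio.h>

enum flow_dir   { FLOW_DIR_IN, FLOW_DIR_OUT, FLOW_DIR_FWD };
enum policy_dir { XFRM_POLICY_IN, XFRM_POLICY_OUT, XFRM_POLICY_FWD };

static int flow_to_policy_dir(int dir)
{
	/* All compile-time constants: when the enums line up, the
	 * branch below is taken unconditionally. */
	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
		return dir;

	switch (dir) {
	default:
	case FLOW_DIR_IN:
		return XFRM_POLICY_IN;
	case FLOW_DIR_OUT:
		return XFRM_POLICY_OUT;
	case FLOW_DIR_FWD:
		return XFRM_POLICY_FWD;
	}
}

int main(void)
{
	printf("%d\n", flow_to_policy_dir(FLOW_DIR_FWD));	/* prints 2 */
	return 0;
}
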
*/ if (xdst->num_xfrms > 0) return NULL; + } else if (dst->flags & DST_XFRM_QUEUE) { + return NULL; } else { /* Real bundle */ if (stale_bundle(dst)) @@ -1673,6 +1752,171 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, return xdst; } +static void xfrm_policy_queue_process(unsigned long arg) +{ + int err = 0; + struct sk_buff *skb; + struct sock *sk; + struct dst_entry *dst; + struct net_device *dev; + struct xfrm_policy *pol = (struct xfrm_policy *)arg; + struct xfrm_policy_queue *pq = &pol->polq; + struct flowi fl; + struct sk_buff_head list; + + spin_lock(&pq->hold_queue.lock); + skb = skb_peek(&pq->hold_queue); + dst = skb_dst(skb); + sk = skb->sk; + xfrm_decode_session(skb, &fl, dst->ops->family); + spin_unlock(&pq->hold_queue.lock); + + dst_hold(dst->path); + dst = xfrm_lookup(xp_net(pol), dst->path, &fl, + sk, 0); + if (IS_ERR(dst)) + goto purge_queue; + + if (dst->flags & DST_XFRM_QUEUE) { + dst_release(dst); + + if (pq->timeout >= XFRM_QUEUE_TMO_MAX) + goto purge_queue; + + pq->timeout = pq->timeout << 1; + mod_timer(&pq->hold_timer, jiffies + pq->timeout); + return; + } + + dst_release(dst); + + __skb_queue_head_init(&list); + + spin_lock(&pq->hold_queue.lock); + pq->timeout = 0; + skb_queue_splice_init(&pq->hold_queue, &list); + spin_unlock(&pq->hold_queue.lock); + + while (!skb_queue_empty(&list)) { + skb = __skb_dequeue(&list); + + xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); + dst_hold(skb_dst(skb)->path); + dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path, + &fl, skb->sk, 0); + if (IS_ERR(dst)) { + dev_put(skb->dev); + kfree_skb(skb); + continue; + } + + nf_reset(skb); + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + dev = skb->dev; + err = dst_output(skb); + dev_put(dev); + } + + return; + +purge_queue: + pq->timeout = 0; + xfrm_queue_purge(&pq->hold_queue); +} + +static int xdst_queue_output(struct sk_buff *skb) +{ + unsigned long sched_next; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + struct xfrm_policy_queue *pq = &xdst->pols[0]->polq; + + if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) { + kfree_skb(skb); + return -EAGAIN; + } + + skb_dst_force(skb); + dev_hold(skb->dev); + + spin_lock_bh(&pq->hold_queue.lock); + + if (!pq->timeout) + pq->timeout = XFRM_QUEUE_TMO_MIN; + + sched_next = jiffies + pq->timeout; + + if (del_timer(&pq->hold_timer)) { + if (time_before(pq->hold_timer.expires, sched_next)) + sched_next = pq->hold_timer.expires; + } + + __skb_queue_tail(&pq->hold_queue, skb); + mod_timer(&pq->hold_timer, sched_next); + + spin_unlock_bh(&pq->hold_queue.lock); + + return 0; +} + +static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, + struct dst_entry *dst, + const struct flowi *fl, + int num_xfrms, + u16 family) +{ + int err; + struct net_device *dev; + struct dst_entry *dst1; + struct xfrm_dst *xdst; + + xdst = xfrm_alloc_dst(net, family); + if (IS_ERR(xdst)) + return xdst; + + if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0 || + (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP)) + return xdst; + + dst1 = &xdst->u.dst; + dst_hold(dst); + xdst->route = dst; + + dst_copy_metrics(dst1, dst); + + dst1->obsolete = DST_OBSOLETE_FORCE_CHK; + dst1->flags |= DST_HOST | DST_XFRM_QUEUE; + dst1->lastuse = jiffies; + + dst1->input = dst_discard; + dst1->output = xdst_queue_output; + + dst_hold(dst); + dst1->child = dst; + dst1->path = dst; + + xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); + + err = -ENODEV; + dev = dst->dev; + if (!dev) + goto free_dst; + + err = 
xfrm_fill_dst(xdst, dev, fl); + if (err) + goto free_dst; + +out: + return xdst; + +free_dst: + dst_release(dst1); + xdst = ERR_PTR(err); + goto out; +} + static struct flow_cache_object * xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct flow_cache_object *oldflo, void *ctx) @@ -1706,7 +1950,8 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, * previous cache entry */ if (xdst == NULL) { num_pols = 1; - pols[0] = __xfrm_policy_lookup(net, fl, family, dir); + pols[0] = __xfrm_policy_lookup(net, fl, family, + flow_to_policy_dir(dir)); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -1751,7 +1996,7 @@ make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ - xdst = xfrm_alloc_dst(net, family); + xdst = xfrm_create_dummy_bundle(net, dst_orig, fl, num_xfrms, family); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); @@ -2359,6 +2604,9 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) (dst->dev && !netif_running(dst->dev))) return 0; + if (dst->flags & DST_XFRM_QUEUE) + return 1; + last = NULL; do { @@ -2786,10 +3034,10 @@ static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, { if (sel_cmp->proto == IPSEC_ULPROTO_ANY) { if (sel_tgt->family == sel_cmp->family && - xfrm_addr_cmp(&sel_tgt->daddr, &sel_cmp->daddr, - sel_cmp->family) == 0 && - xfrm_addr_cmp(&sel_tgt->saddr, &sel_cmp->saddr, - sel_cmp->family) == 0 && + xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr, + sel_cmp->family) && + xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr, + sel_cmp->family) && sel_tgt->prefixlen_d == sel_cmp->prefixlen_d && sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) { return true; @@ -2806,13 +3054,12 @@ static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector u8 dir, u8 type) { struct xfrm_policy *pol, *ret = NULL; - struct hlist_node *entry; struct hlist_head *chain; u32 priority = ~0U; read_lock_bh(&xfrm_policy_lock); chain = policy_hash_direct(&init_net, &sel->daddr, &sel->saddr, sel->family, dir); - hlist_for_each_entry(pol, entry, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; @@ -2821,7 +3068,7 @@ static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector } } chain = &init_net.xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, entry, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type && pol->priority < priority) { @@ -2847,10 +3094,10 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: - if (xfrm_addr_cmp(&t->id.daddr, &m->old_daddr, - m->old_family) == 0 && - xfrm_addr_cmp(&t->saddr, &m->old_saddr, - m->old_family) == 0) { + if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, + m->old_family) && + xfrm_addr_equal(&t->saddr, &m->old_saddr, + m->old_family)) { match = 1; } break; @@ -2916,10 +3163,10 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) return -EINVAL; for (i = 0; i < num_migrate; i++) { - if ((xfrm_addr_cmp(&m[i].old_daddr, &m[i].new_daddr, - m[i].old_family) == 0) && - (xfrm_addr_cmp(&m[i].old_saddr, &m[i].new_saddr, - m[i].old_family) == 0)) + if 
(xfrm_addr_equal(&m[i].old_daddr, &m[i].new_daddr, + m[i].old_family) && + xfrm_addr_equal(&m[i].old_saddr, &m[i].new_saddr, + m[i].old_family)) return -EINVAL; if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index d0a1af8ed584..c721b0d9ab8b 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -43,6 +43,7 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmOutPolDead", LINUX_MIB_XFRMOUTPOLDEAD), SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR), SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR), + SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID), SNMP_MIB_SENTINEL }; @@ -73,13 +74,13 @@ static const struct file_operations xfrm_statistics_seq_fops = { int __net_init xfrm_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "xfrm_stat", S_IRUGO, - &xfrm_statistics_seq_fops)) + if (!proc_create("xfrm_stat", S_IRUGO, net->proc_net, + &xfrm_statistics_seq_fops)) return -ENOMEM; return 0; } void xfrm_proc_fini(struct net *net) { - proc_net_remove(net, "xfrm_stat"); + remove_proc_entry("xfrm_stat", net->proc_net); } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 35754cc8a9e5..8dafe6d3c6e4 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -334,6 +334,70 @@ static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event) x->xflags &= ~XFRM_TIME_DEFER; } +static void xfrm_replay_notify_esn(struct xfrm_state *x, int event) +{ + u32 seq_diff, oseq_diff; + struct km_event c; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + struct xfrm_replay_state_esn *preplay_esn = x->preplay_esn; + + /* we send notify messages in case + * 1. we updated one of the sequence numbers, and the seqno difference + * is at least x->replay_maxdiff, in this case we also update the + * timeout of our timer function + * 2. if x->replay_maxage has elapsed since last update, + * and there were changes + * + * The state structure must be locked!
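
The seq_diff/oseq_diff computation in the function body below has to cope with 32-bit wraparound of extended sequence numbers: when the stored seq_hi halves match, plain subtraction is enough; when seq_hi has advanced, ~old + new + 1 gives the distance across the 2^32 boundary (arithmetically the same as new - old modulo 2^32, written so the carry is explicit). A quick check of that identity (arbitrary test values):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t old_seq = 0xfffffff0u;	/* just before the wrap */
	uint32_t new_seq = 0x00000010u;	/* just after the wrap */

	uint32_t diff = ~old_seq + new_seq + 1;

	printf("distance across wrap: %u\n", diff);		/* prints 32 */
	printf("new - old          : %u\n", new_seq - old_seq);	/* also 32 */
	return 0;
}
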
+ */ + + switch (event) { + case XFRM_REPLAY_UPDATE: + if (!x->replay_maxdiff) + break; + + if (replay_esn->seq_hi == preplay_esn->seq_hi) + seq_diff = replay_esn->seq - preplay_esn->seq; + else + seq_diff = ~preplay_esn->seq + replay_esn->seq + 1; + + if (replay_esn->oseq_hi == preplay_esn->oseq_hi) + oseq_diff = replay_esn->oseq - preplay_esn->oseq; + else + oseq_diff = ~preplay_esn->oseq + replay_esn->oseq + 1; + + if (seq_diff < x->replay_maxdiff && + oseq_diff < x->replay_maxdiff) { + + if (x->xflags & XFRM_TIME_DEFER) + event = XFRM_REPLAY_TIMEOUT; + else + return; + } + + break; + + case XFRM_REPLAY_TIMEOUT: + if (memcmp(x->replay_esn, x->preplay_esn, + xfrm_replay_state_esn_len(replay_esn)) == 0) { + x->xflags |= XFRM_TIME_DEFER; + return; + } + + break; + } + + memcpy(x->preplay_esn, x->replay_esn, + xfrm_replay_state_esn_len(replay_esn)); + c.event = XFRM_MSG_NEWAE; + c.data.aevent = event; + km_state_notify(x, &c); + + if (x->replay_maxage && + !mod_timer(&x->rtimer, jiffies + x->replay_maxage)) + x->xflags &= ~XFRM_TIME_DEFER; +} + static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb) { int err = 0; @@ -510,7 +574,7 @@ static struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, - .notify = xfrm_replay_notify_bmp, + .notify = xfrm_replay_notify_esn, .overflow = xfrm_replay_overflow_esn, }; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 3459692092ec..78f66fa92449 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -72,10 +72,10 @@ static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *nspitable, unsigned int nhashmask) { - struct hlist_node *entry, *tmp; + struct hlist_node *tmp; struct xfrm_state *x; - hlist_for_each_entry_safe(x, entry, tmp, list, bydst) { + hlist_for_each_entry_safe(x, tmp, list, bydst) { unsigned int h; h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, @@ -158,8 +158,8 @@ out_unlock: mutex_unlock(&hash_resize_mutex); } -static DEFINE_RWLOCK(xfrm_state_afinfo_lock); -static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO]; +static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); +static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO]; static DEFINE_SPINLOCK(xfrm_state_gc_lock); @@ -168,58 +168,45 @@ int __xfrm_state_delete(struct xfrm_state *x); int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); -static struct xfrm_state_afinfo *xfrm_state_lock_afinfo(unsigned int family) -{ - struct xfrm_state_afinfo *afinfo; - if (unlikely(family >= NPROTO)) - return NULL; - write_lock_bh(&xfrm_state_afinfo_lock); - afinfo = xfrm_state_afinfo[family]; - if (unlikely(!afinfo)) - write_unlock_bh(&xfrm_state_afinfo_lock); - return afinfo; -} - -static void xfrm_state_unlock_afinfo(struct xfrm_state_afinfo *afinfo) - __releases(xfrm_state_afinfo_lock) -{ - write_unlock_bh(&xfrm_state_afinfo_lock); -} - +static DEFINE_SPINLOCK(xfrm_type_lock); int xfrm_register_type(const struct xfrm_type *type, unsigned short family) { - struct xfrm_state_afinfo *afinfo = xfrm_state_lock_afinfo(family); + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); const struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; + spin_lock_bh(&xfrm_type_lock); if (likely(typemap[type->proto] == NULL)) typemap[type->proto] = type; else err = -EEXIST; - 
xfrm_state_unlock_afinfo(afinfo); + spin_unlock_bh(&xfrm_type_lock); + xfrm_state_put_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_register_type); int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) { - struct xfrm_state_afinfo *afinfo = xfrm_state_lock_afinfo(family); + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); const struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; + spin_lock_bh(&xfrm_type_lock); if (unlikely(typemap[type->proto] != type)) err = -ENOENT; else typemap[type->proto] = NULL; - xfrm_state_unlock_afinfo(afinfo); + spin_unlock_bh(&xfrm_type_lock); + xfrm_state_put_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_unregister_type); @@ -256,6 +243,7 @@ static void xfrm_put_type(const struct xfrm_type *type) module_put(type->owner); } +static DEFINE_SPINLOCK(xfrm_mode_lock); int xfrm_register_mode(struct xfrm_mode *mode, int family) { struct xfrm_state_afinfo *afinfo; @@ -265,12 +253,13 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family) if (unlikely(mode->encap >= XFRM_MODE_MAX)) return -EINVAL; - afinfo = xfrm_state_lock_afinfo(family); + afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; err = -EEXIST; modemap = afinfo->mode_map; + spin_lock_bh(&xfrm_mode_lock); if (modemap[mode->encap]) goto out; @@ -283,7 +272,8 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family) err = 0; out: - xfrm_state_unlock_afinfo(afinfo); + spin_unlock_bh(&xfrm_mode_lock); + xfrm_state_put_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_register_mode); @@ -297,19 +287,21 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family) if (unlikely(mode->encap >= XFRM_MODE_MAX)) return -EINVAL; - afinfo = xfrm_state_lock_afinfo(family); + afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; err = -ENOENT; modemap = afinfo->mode_map; + spin_lock_bh(&xfrm_mode_lock); if (likely(modemap[mode->encap] == mode)) { modemap[mode->encap] = NULL; module_put(mode->afinfo->owner); err = 0; } - xfrm_state_unlock_afinfo(afinfo); + spin_unlock_bh(&xfrm_mode_lock); + xfrm_state_put_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_unregister_mode); @@ -376,14 +368,14 @@ static void xfrm_state_gc_task(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_gc_work); struct xfrm_state *x; - struct hlist_node *entry, *tmp; + struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); hlist_move_list(&net->xfrm.state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); - hlist_for_each_entry_safe(x, entry, tmp, &gc_list, gclist) + hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) xfrm_state_gc_destroy(x); wake_up(&net->xfrm.km_waitq); @@ -585,10 +577,9 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audi int i, err = 0; for (i = 0; i <= net->xfrm.state_hmask; i++) { - struct hlist_node *entry; struct xfrm_state *x; - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (xfrm_id_proto_match(x->id.proto, proto) && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, @@ -621,10 +612,9 @@ int xfrm_state_flush(struct net *net, u8 proto, struct xfrm_audit *audit_info) err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { - struct hlist_node *entry; struct xfrm_state *x; restart: - hlist_for_each_entry(x, entry, 
net->xfrm.state_bydst+i, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (!xfrm_state_kern(x) && xfrm_id_proto_match(x->id.proto, proto)) { xfrm_state_hold(x); @@ -693,13 +683,12 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, { unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); struct xfrm_state *x; - struct hlist_node *entry; - hlist_for_each_entry(x, entry, net->xfrm.state_byspi+h, byspi) { + hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || - xfrm_addr_cmp(&x->id.daddr, daddr, family)) + !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) @@ -718,13 +707,12 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, { unsigned int h = xfrm_src_hash(net, daddr, saddr, family); struct xfrm_state *x; - struct hlist_node *entry; - hlist_for_each_entry(x, entry, net->xfrm.state_bysrc+h, bysrc) { + hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { if (x->props.family != family || x->id.proto != proto || - xfrm_addr_cmp(&x->id.daddr, daddr, family) || - xfrm_addr_cmp(&x->props.saddr, saddr, family)) + !xfrm_addr_equal(&x->id.daddr, daddr, family) || + !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) @@ -806,7 +794,6 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); unsigned int h, h_wildcard; - struct hlist_node *entry; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; int error = 0; @@ -818,7 +805,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, spin_lock_bh(&xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -834,7 +821,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, goto found; h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -914,11 +901,10 @@ xfrm_stateonly_find(struct net *net, u32 mark, { unsigned int h; struct xfrm_state *rx = NULL, *x = NULL; - struct hlist_node *entry; spin_lock(&xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, reqid, family); - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && @@ -980,17 +966,16 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) unsigned short family = xnew->props.family; u32 reqid = xnew->props.reqid; struct xfrm_state *x; - struct hlist_node *entry; unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && (mark & 
x->mark.m) == x->mark.v && - !xfrm_addr_cmp(&x->id.daddr, &xnew->id.daddr, family) && - !xfrm_addr_cmp(&x->props.saddr, &xnew->props.saddr, family)) + xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && + xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) x->genid++; } } @@ -1012,11 +997,10 @@ static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m, const xfrm_address_t *saddr, int create) { unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family); - struct hlist_node *entry; struct xfrm_state *x; u32 mark = m->v & m->m; - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.reqid != reqid || x->props.mode != mode || x->props.family != family || @@ -1024,8 +1008,8 @@ static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m, x->id.spi != 0 || x->id.proto != proto || (mark & x->mark.m) != x->mark.v || - xfrm_addr_cmp(&x->id.daddr, daddr, family) || - xfrm_addr_cmp(&x->props.saddr, saddr, family)) + !xfrm_addr_equal(&x->id.daddr, daddr, family) || + !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; xfrm_state_hold(x); @@ -1108,7 +1092,7 @@ int xfrm_state_add(struct xfrm_state *x) if (use_spi && x->km.seq) { x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq); if (x1 && ((x1->id.proto != x->id.proto) || - xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family))) { + !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { to_put = x1; x1 = NULL; } @@ -1203,6 +1187,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) goto error; x->props.flags = orig->props.flags; + x->props.extra_flags = orig->props.extra_flags; x->curlft.add_time = orig->curlft.add_time; x->km.state = orig->km.state; @@ -1223,21 +1208,20 @@ struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m) { unsigned int h; struct xfrm_state *x; - struct hlist_node *entry; if (m->reqid) { h = xfrm_dst_hash(&init_net, &m->old_daddr, &m->old_saddr, m->reqid, m->old_family); - hlist_for_each_entry(x, entry, init_net.xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, init_net.xfrm.state_bydst+h, bydst) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; if (m->reqid && x->props.reqid != m->reqid) continue; - if (xfrm_addr_cmp(&x->id.daddr, &m->old_daddr, - m->old_family) || - xfrm_addr_cmp(&x->props.saddr, &m->old_saddr, - m->old_family)) + if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, + m->old_family) || + !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, + m->old_family)) continue; xfrm_state_hold(x); return x; @@ -1245,14 +1229,14 @@ struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m) } else { h = xfrm_src_hash(&init_net, &m->old_daddr, &m->old_saddr, m->old_family); - hlist_for_each_entry(x, entry, init_net.xfrm.state_bysrc+h, bysrc) { + hlist_for_each_entry(x, init_net.xfrm.state_bysrc+h, bysrc) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; - if (xfrm_addr_cmp(&x->id.daddr, &m->old_daddr, - m->old_family) || - xfrm_addr_cmp(&x->props.saddr, &m->old_saddr, - m->old_family)) + if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, + m->old_family) || + !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, + m->old_family)) continue; xfrm_state_hold(x); return x; @@ -1277,7 +1261,7 @@ struct xfrm_state * xfrm_state_migrate(struct xfrm_state *x, memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); /* add state */ - if (!xfrm_addr_cmp(&x->id.daddr, &m->new_daddr, m->new_family)) 
{ + if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) { /* a care is needed when the destination address of the state is to be updated as it is a part of triplet */ xfrm_state_insert(xc); @@ -1370,9 +1354,6 @@ int xfrm_state_check_expire(struct xfrm_state *x) if (!x->curlft.use_time) x->curlft.use_time = get_seconds(); - if (x->km.state != XFRM_STATE_VALID) - return -EINVAL; - if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { x->km.state = XFRM_STATE_EXPIRED; @@ -1477,10 +1458,9 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s int i; for (i = 0; i <= net->xfrm.state_hmask; i++) { - struct hlist_node *entry; struct xfrm_state *x; - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) { + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && x->km.state == XFRM_STATE_ACQ) { @@ -1648,27 +1628,26 @@ static void xfrm_replay_timer_handler(unsigned long data) } static LIST_HEAD(xfrm_km_list); -static DEFINE_RWLOCK(xfrm_km_lock); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct xfrm_mgr *km; - read_lock(&xfrm_km_lock); - list_for_each_entry(km, &xfrm_km_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify_policy) km->notify_policy(xp, dir, c); - read_unlock(&xfrm_km_lock); + rcu_read_unlock(); } void km_state_notify(struct xfrm_state *x, const struct km_event *c) { struct xfrm_mgr *km; - read_lock(&xfrm_km_lock); - list_for_each_entry(km, &xfrm_km_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify) km->notify(x, c); - read_unlock(&xfrm_km_lock); + rcu_read_unlock(); } EXPORT_SYMBOL(km_policy_notify); @@ -1698,13 +1677,13 @@ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) int err = -EINVAL, acqret; struct xfrm_mgr *km; - read_lock(&xfrm_km_lock); - list_for_each_entry(km, &xfrm_km_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) { acqret = km->acquire(x, t, pol); if (!acqret) err = acqret; } - read_unlock(&xfrm_km_lock); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_query); @@ -1714,14 +1693,14 @@ int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) int err = -EINVAL; struct xfrm_mgr *km; - read_lock(&xfrm_km_lock); - list_for_each_entry(km, &xfrm_km_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->new_mapping) err = km->new_mapping(x, ipaddr, sport); if (!err) break; } - read_unlock(&xfrm_km_lock); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_new_mapping); @@ -1750,15 +1729,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, int ret; struct xfrm_mgr *km; - read_lock(&xfrm_km_lock); - list_for_each_entry(km, &xfrm_km_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->migrate) { ret = km->migrate(sel, dir, type, m, num_migrate, k); if (!ret) err = ret; } } - read_unlock(&xfrm_km_lock); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_migrate); @@ -1770,15 +1749,15 @@ int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address int ret; struct xfrm_mgr *km; - read_lock(&xfrm_km_lock); - list_for_each_entry(km, &xfrm_km_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->report) { ret = km->report(net, proto, sel, addr); if (!ret) 
err = ret; } } - read_unlock(&xfrm_km_lock); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_report); @@ -1802,14 +1781,14 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen goto out; err = -EINVAL; - read_lock(&xfrm_km_lock); - list_for_each_entry(km, &xfrm_km_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(km, &xfrm_km_list, list) { pol = km->compile_policy(sk, optname, data, optlen, &err); if (err >= 0) break; } - read_unlock(&xfrm_km_lock); + rcu_read_unlock(); if (err >= 0) { xfrm_sk_policy_insert(sk, err, pol); @@ -1823,20 +1802,23 @@ out: } EXPORT_SYMBOL(xfrm_user_policy); +static DEFINE_SPINLOCK(xfrm_km_lock); + int xfrm_register_km(struct xfrm_mgr *km) { - write_lock_bh(&xfrm_km_lock); - list_add_tail(&km->list, &xfrm_km_list); - write_unlock_bh(&xfrm_km_lock); + spin_lock_bh(&xfrm_km_lock); + list_add_tail_rcu(&km->list, &xfrm_km_list); + spin_unlock_bh(&xfrm_km_lock); return 0; } EXPORT_SYMBOL(xfrm_register_km); int xfrm_unregister_km(struct xfrm_mgr *km) { - write_lock_bh(&xfrm_km_lock); - list_del(&km->list); - write_unlock_bh(&xfrm_km_lock); + spin_lock_bh(&xfrm_km_lock); + list_del_rcu(&km->list); + spin_unlock_bh(&xfrm_km_lock); + synchronize_rcu(); return 0; } EXPORT_SYMBOL(xfrm_unregister_km); @@ -1848,12 +1830,12 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - write_lock_bh(&xfrm_state_afinfo_lock); + spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) err = -ENOBUFS; else - xfrm_state_afinfo[afinfo->family] = afinfo; - write_unlock_bh(&xfrm_state_afinfo_lock); + rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo); + spin_unlock_bh(&xfrm_state_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_state_register_afinfo); @@ -1865,14 +1847,15 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - write_lock_bh(&xfrm_state_afinfo_lock); + spin_lock_bh(&xfrm_state_afinfo_lock); if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) err = -EINVAL; else - xfrm_state_afinfo[afinfo->family] = NULL; + RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL); } - write_unlock_bh(&xfrm_state_afinfo_lock); + spin_unlock_bh(&xfrm_state_afinfo_lock); + synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); @@ -1882,17 +1865,16 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) struct xfrm_state_afinfo *afinfo; if (unlikely(family >= NPROTO)) return NULL; - read_lock(&xfrm_state_afinfo_lock); - afinfo = xfrm_state_afinfo[family]; + rcu_read_lock(); + afinfo = rcu_dereference(xfrm_state_afinfo[family]); if (unlikely(!afinfo)) - read_unlock(&xfrm_state_afinfo_lock); + rcu_read_unlock(); return afinfo; } static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) - __releases(xfrm_state_afinfo_lock) { - read_unlock(&xfrm_state_afinfo_lock); + rcu_read_unlock(); } /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index eb872b2e366e..aa778748c565 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -515,6 +515,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, copy_from_user_state(x, p); + if (attrs[XFRMA_SA_EXTRA_FLAGS]) + x->props.extra_flags = 
nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); + if ((err = attach_aead(&x->aead, &x->props.ealgo, attrs[XFRMA_ALG_AEAD]))) goto error; @@ -779,6 +782,13 @@ static int copy_to_user_state_extra(struct xfrm_state *x, copy_to_user_state(x, p); + if (x->props.extra_flags) { + ret = nla_put_u32(skb, XFRMA_SA_EXTRA_FLAGS, + x->props.extra_flags); + if (ret) + goto out; + } + if (x->coaddr) { ret = nla_put(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr); if (ret) @@ -1112,7 +1122,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, mark = xfrm_mark_get(attrs, &m); if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); - if (x && xfrm_addr_cmp(&x->id.daddr, daddr, family)) { + if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { xfrm_state_put(x); x = NULL; } @@ -2302,9 +2312,10 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_MARK] = { .len = sizeof(struct xfrm_mark) }, [XFRMA_TFCPAD] = { .type = NLA_U32 }, [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, + [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, }; -static struct xfrm_link { +static const struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); @@ -2338,7 +2349,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[XFRMA_MAX+1]; - struct xfrm_link *link; + const struct xfrm_link *link; int type, err; type = nlh->nlmsg_type; @@ -2495,6 +2506,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) x->security->ctx_len); if (x->coaddr) l += nla_total_size(sizeof(*x->coaddr)); + if (x->props.extra_flags) + l += nla_total_size(sizeof(x->props.extra_flags)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size(sizeof(u64));
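
Taken together, the xfrm_user.c hunks wire the new optional XFRMA_SA_EXTRA_FLAGS u32 attribute end to end: the nla_policy entry validates the length of incoming attributes, xfrm_state_construct() reads the value with nla_get_u32(), copy_to_user_state_extra() emits it with nla_put_u32() only when it is nonzero, and xfrm_sa_len() reserves matching space so dump buffers are sized correctly. A condensed sketch of that round-trip pattern for a hypothetical attribute MYA_FLAGS (kernel netlink API, surrounding message handling omitted):

#include <net/netlink.h>

enum { MYA_UNSPEC, MYA_FLAGS, __MYA_MAX };
#define MYA_MAX (__MYA_MAX - 1)

static const struct nla_policy my_policy[MYA_MAX + 1] = {
	[MYA_FLAGS] = { .type = NLA_U32 },	/* length checked on receive */
};

struct my_state {
	u32 extra_flags;
};

static void my_construct(struct my_state *x, struct nlattr **attrs)
{
	if (attrs[MYA_FLAGS])
		x->extra_flags = nla_get_u32(attrs[MYA_FLAGS]);
}

static int my_fill(struct sk_buff *skb, const struct my_state *x)
{
	/* Optional attribute: only emitted when it carries information. */
	if (x->extra_flags)
		return nla_put_u32(skb, MYA_FLAGS, x->extra_flags);
	return 0;
}

static size_t my_msg_size(const struct my_state *x)
{
	/* The size estimate must mirror the emit condition above. */
	return x->extra_flags ? nla_total_size(sizeof(x->extra_flags)) : 0;
}
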