diff options
Diffstat (limited to 'net')
129 files changed, 2213 insertions, 1255 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 4d39d802be2c..912613c566cb 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -5,7 +5,7 @@ #include <linux/export.h> #include "vlan.h" -bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) +bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; @@ -13,14 +13,8 @@ bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) struct vlan_pcpu_stats *rx_stats; vlan_dev = vlan_find_dev(skb->dev, vlan_id); - if (!vlan_dev) { - /* Only the last call to vlan_do_receive() should change - * pkt_type to PACKET_OTHERHOST - */ - if (vlan_id && last_handler) - skb->pkt_type = PACKET_OTHERHOST; + if (!vlan_dev) return false; - } skb = *skbp = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) @@ -106,7 +100,6 @@ static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) return NULL; memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; - skb_reset_mac_len(skb); return skb; } @@ -140,6 +133,8 @@ struct sk_buff *vlan_untag(struct sk_buff *skb) skb_reset_network_header(skb); skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + return skb; err_free: diff --git a/net/atm/common.c b/net/atm/common.c index b4b44dbed645..0c0ad930a632 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -812,6 +812,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) return -ENOTCONN; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = vcc->dev->number; pvc.sap_addr.vpi = vcc->vpi; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 3a734919c36c..ae0324021407 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -95,6 +95,7 @@ static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, return -ENOTCONN; *sockaddr_len = sizeof(struct sockaddr_atmpvc); addr = (struct sockaddr_atmpvc *)sockaddr; + memset(addr, 0, sizeof(*addr)); addr->sap_family = AF_ATMPVC; addr->sap_addr.itf = vcc->dev->number; addr->sap_addr.vpi = vcc->vpi; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a6d5d63fb6ad..fa701b6f7948 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -72,7 +72,7 @@ static unsigned long bat_iv_ogm_emit_send_time(const struct bat_priv *bat_priv) { return jiffies + msecs_to_jiffies( atomic_read(&bat_priv->orig_interval) - - JITTER + (random32() % 2*JITTER)); + JITTER + (random32() % (2*JITTER))); } /* when do we schedule a ogm packet to be sent */ diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ed0b87e33b2f..7a201173329e 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -42,6 +42,7 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/smp.h> static void hci_le_connect(struct hci_conn *conn) { @@ -672,6 +673,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("conn %p", conn); + if (conn->type == LE_LINK) + return smp_conn_security(conn, sec_level); + /* For sdp we don't need the link key. */ if (sec_level == BT_SECURITY_SDP) return 1; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 748cdf553328..a1365e065799 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -766,6 +766,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) cancel_work_sync(&hdev->le_scan); + cancel_delayed_work(&hdev->power_off); + hci_req_cancel(hdev, ENODEV); hci_req_lock(hdev); @@ -1880,6 +1882,8 @@ void hci_unregister_dev(struct hci_dev *hdev) for (i = 0; i < NUM_REASSEMBLY; i++) kfree_skb(hdev->reassembly[i]); + cancel_work_sync(&hdev->power_on); + if (!test_bit(HCI_INIT, &hdev->flags) && !test_bit(HCI_SETUP, &hdev->dev_flags)) { hci_dev_lock(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 626318c122f4..aba4fccd768a 100755 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1357,6 +1357,9 @@ static bool hci_resolve_next_name(struct hci_dev *hdev) return false; e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED); + if (!e) + return false; + if (hci_resolve_name(hdev, e) == 0) { e->name_state = NAME_PENDING; return true; @@ -1385,12 +1388,20 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, return; e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING); - if (e) { + /* If the device was not found in a list of found devices names of which + * are pending. there is no need to continue resolving a next name as it + * will be done upon receiving another Remote Name Request Complete + * Event */ + if (!e) + return; + + list_del(&e->list); + if (name) { e->name_state = NAME_KNOWN; - list_del(&e->list); - if (name) - mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, - e->data.rssi, name, name_len); + mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, + e->data.rssi, name, name_len); + } else { + e->name_state = NAME_NOT_KNOWN; } if (hci_resolve_next_name(hdev)) @@ -1749,7 +1760,12 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; hci_conn_hold(conn); - conn->disc_timeout = HCI_DISCONN_TIMEOUT; + + if (!conn->out && !hci_conn_ssp_enabled(conn) && + !hci_find_link_key(hdev, &ev->bdaddr)) + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + else + conn->disc_timeout = HCI_DISCONN_TIMEOUT; } else conn->state = BT_CONNECTED; @@ -2361,7 +2377,7 @@ static inline void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *sk if (ev->opcode != HCI_OP_NOP) del_timer(&hdev->cmd_timer); - if (ev->ncmd) { + if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) { atomic_set(&hdev->cmd_cnt, 1); if (!skb_queue_empty(&hdev->cmd_q)) queue_work(hdev->workqueue, &hdev->cmd_work); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 5914623f426a..bedc768c8cdf 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -706,6 +706,7 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int *add *addr_len = sizeof(*haddr); haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; + haddr->hci_channel= 0; release_sock(sk); return 0; @@ -1016,6 +1017,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char { struct hci_filter *f = &hci_pi(sk)->filter; + memset(&uf, 0, sizeof(uf)); uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index d478be11d562..7a0d9840866d 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -949,7 +949,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->version = req->version; hid->country = req->country; - strncpy(hid->name, req->name, 128); + strncpy(hid->name, req->name, sizeof(req->name) - 1); strncpy(hid->phys, batostr(&bt_sk(session->ctrl_sock->sk)->src), 64); strncpy(hid->uniq, batostr(&bt_sk(session->ctrl_sock->sk)->dst), 64); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 0939c7295a64..e49f200c68e5 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -893,6 +893,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) sk = chan->sk; hci_conn_hold(conn->hcon); + conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT; bacpy(&bt_sk(sk)->src, conn->src); bacpy(&bt_sk(sk)->dst, conn->dst); @@ -936,14 +937,15 @@ static void l2cap_chan_ready(struct l2cap_chan *chan) static void l2cap_conn_ready(struct l2cap_conn *conn) { struct l2cap_chan *chan; + struct hci_conn *hcon = conn->hcon; BT_DBG("conn %p", conn); - if (!conn->hcon->out && conn->hcon->type == LE_LINK) + if (!hcon->out && hcon->type == LE_LINK) l2cap_le_conn_ready(conn); - if (conn->hcon->out && conn->hcon->type == LE_LINK) - smp_conn_security(conn, conn->hcon->pending_sec_level); + if (hcon->out && hcon->type == LE_LINK) + smp_conn_security(hcon, hcon->pending_sec_level); mutex_lock(&conn->chan_lock); @@ -951,8 +953,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_lock(chan); - if (conn->hcon->type == LE_LINK) { - if (smp_conn_security(conn, chan->sec_level)) + if (hcon->type == LE_LINK) { + if (smp_conn_security(hcon, chan->sec_level)) l2cap_chan_ready(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { @@ -2583,12 +2585,14 @@ static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len) while (len >= L2CAP_CONF_OPT_SIZE) { len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val); - switch (type) { - case L2CAP_CONF_RFC: - if (olen == sizeof(rfc)) - memcpy(&rfc, (void *)val, olen); - goto done; - } + if (type != L2CAP_CONF_RFC) + continue; + + if (olen != sizeof(rfc)) + break; + + memcpy(&rfc, (void *)val, olen); + goto done; } /* Use sane default values in case a misbehaving remote device diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 7ce5ab54ba5b..882c29679810 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -242,6 +242,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l BT_DBG("sock %p, sk %p", sock, sk); + memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; *len = sizeof(struct sockaddr_l2); @@ -587,7 +588,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch break; } - if (smp_conn_security(conn, sec.level)) + if (smp_conn_security(conn->hcon, sec.level)) break; sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c05c3a69c372..00879b8215e7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2801,6 +2801,22 @@ int mgmt_powered(struct hci_dev *hdev, u8 powered) if (scan) hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + u8 ssp = 1; + + hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, 1, &ssp); + } + + if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { + struct hci_cp_write_le_host_supported cp; + + cp.le = 1; + cp.simul = !!(hdev->features[6] & LMP_SIMUL_LE_BR); + + hci_send_cmd(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, + sizeof(cp), &cp); + } + update_class(hdev); update_name(hdev, hdev->dev_name); update_eir(hdev); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index a55a43e9f70e..8d1edd7207df 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -485,7 +485,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f long timeo; int err = 0; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_type != SOCK_STREAM) { err = -EINVAL; @@ -522,7 +522,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f release_sock(sk); timeo = schedule_timeout(timeo); - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); @@ -546,6 +546,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * BT_DBG("sock %p, sk %p", sock, sk); + memset(sa, 0, sizeof(*sa)); sa->rc_family = AF_BLUETOOTH; sa->rc_channel = rfcomm_pi(sk)->channel; if (peer) @@ -836,6 +837,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c } sec.level = rfcomm_pi(sk)->sec_level; + sec.key_size = 0; len = min_t(unsigned int, len, sizeof(sec)); if (copy_to_user(optval, (char *) &sec, len)) diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 4bf54b377255..95a0f60fc6e0 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -467,7 +467,7 @@ static int rfcomm_get_dev_list(void __user *arg) size = sizeof(*dl) + dev_num * sizeof(*di); - dl = kmalloc(size, GFP_KERNEL); + dl = kzalloc(size, GFP_KERNEL); if (!dl) return -ENOMEM; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index deb119875fd9..605156f13899 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -31,6 +31,8 @@ #define SMP_TIMEOUT msecs_to_jiffies(30000) +#define AUTH_REQ_MASK 0x07 + static inline void swap128(u8 src[16], u8 dst[16]) { int i; @@ -229,7 +231,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, req->max_key_size = SMP_MAX_ENC_KEY_SIZE; req->init_key_dist = 0; req->resp_key_dist = dist_keys; - req->auth_req = authreq; + req->auth_req = (authreq & AUTH_REQ_MASK); return; } @@ -238,7 +240,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; rsp->init_key_dist = 0; rsp->resp_key_dist = req->resp_key_dist & dist_keys; - rsp->auth_req = authreq; + rsp->auth_req = (authreq & AUTH_REQ_MASK); } static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) @@ -266,10 +268,10 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason, u8 send) mgmt_auth_failed(conn->hcon->hdev, conn->dst, hcon->type, hcon->dst_type, reason); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - cancel_delayed_work_sync(&conn->security_timer); + cancel_delayed_work_sync(&conn->security_timer); + + if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) smp_chan_destroy(conn); - } } #define JUST_WORKS 0x00 @@ -753,9 +755,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level) +int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { - struct hci_conn *hcon = conn->hcon; + struct l2cap_conn *conn = hcon->l2cap_data; struct smp_chan *smp = conn->smp_chan; __u8 authreq; @@ -850,6 +852,19 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) skb_pull(skb, sizeof(code)); + /* + * The SMP context must be initialized for all other PDUs except + * pairing and security requests. If we get any other PDU when + * not initialized simply disconnect (done if this function + * returns an error). + */ + if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && + !conn->smp_chan) { + BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); + kfree_skb(skb); + return -ENOTSUPP; + } + switch (code) { case SMP_CMD_PAIRING_REQ: reason = smp_cmd_pairing_req(conn, skb); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index d7f49b63ab0f..e54ef82fdad7 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -254,6 +254,9 @@ static int br_parse_ip_options(struct sk_buff *skb) struct net_device *dev = skb->dev; u32 len; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index e16aade51ae0..718cbe8ad42b 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -16,6 +16,7 @@ #include <linux/etherdevice.h> #include <linux/llc.h> #include <linux/slab.h> +#include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/llc.h> #include <net/llc_pdu.h> @@ -40,6 +41,7 @@ static void br_send_bpdu(struct net_bridge_port *p, skb->dev = p->dev; skb->protocol = htons(ETH_P_802_2); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, LLC_RESERVE); memcpy(__skb_put(skb, length), data, length); diff --git a/net/can/bcm.c b/net/can/bcm.c index 151b7730c12c..3910c1fefd04 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1084,6 +1084,9 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->sk = sk; op->ifindex = ifindex; + /* ifindex for timeout events w/o previous frame reception */ + op->rx_ifindex = ifindex; + /* initialize uninitialized (kzalloc) structure */ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); op->timer.function = bcm_rx_timeout_handler; diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 214c2bb43d62..925ca583c09c 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -59,9 +59,7 @@ static int handle_reply(struct ceph_auth_client *ac, int result, */ static int ceph_auth_none_create_authorizer( struct ceph_auth_client *ac, int peer_type, - struct ceph_authorizer **a, - void **buf, size_t *len, - void **reply_buf, size_t *reply_len) + struct ceph_auth_handshake *auth) { struct ceph_auth_none_info *ai = ac->private; struct ceph_none_authorizer *au = &ai->au; @@ -82,11 +80,12 @@ static int ceph_auth_none_create_authorizer( dout("built authorizer len %d\n", au->buf_len); } - *a = (struct ceph_authorizer *)au; - *buf = au->buf; - *len = au->buf_len; - *reply_buf = au->reply_buf; - *reply_len = sizeof(au->reply_buf); + auth->authorizer = (struct ceph_authorizer *) au; + auth->authorizer_buf = au->buf; + auth->authorizer_buf_len = au->buf_len; + auth->authorizer_reply_buf = au->reply_buf; + auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + return 0; bad2: diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 1587dc6010c6..a16bf14eb027 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -526,9 +526,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, static int ceph_x_create_authorizer( struct ceph_auth_client *ac, int peer_type, - struct ceph_authorizer **a, - void **buf, size_t *len, - void **reply_buf, size_t *reply_len) + struct ceph_auth_handshake *auth) { struct ceph_x_authorizer *au; struct ceph_x_ticket_handler *th; @@ -548,11 +546,12 @@ static int ceph_x_create_authorizer( return ret; } - *a = (struct ceph_authorizer *)au; - *buf = au->buf->vec.iov_base; - *len = au->buf->vec.iov_len; - *reply_buf = au->reply_buf; - *reply_len = sizeof(au->reply_buf); + auth->authorizer = (struct ceph_authorizer *) au; + auth->authorizer_buf = au->buf->vec.iov_base; + auth->authorizer_buf_len = au->buf->vec.iov_len; + auth->authorizer_reply_buf = au->reply_buf; + auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + return 0; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index cc913193d992..b11448f052f4 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -83,7 +83,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) return -1; } } else { - pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); } return 0; @@ -305,7 +304,6 @@ ceph_parse_options(char *options, const char *dev_name, /* start with defaults */ opt->flags = CEPH_OPT_DEFAULT; - opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ @@ -391,7 +389,7 @@ ceph_parse_options(char *options, const char *dev_name, /* misc */ case Opt_osdtimeout: - opt->osd_timeout = intval; + pr_warning("ignoring deprecated osdtimeout option\n"); break; case Opt_osdkeepalivetimeout: opt->osd_keepalive_timeout = intval; @@ -468,19 +466,15 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, /* msgr */ if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; - client->msgr = ceph_messenger_create(myaddr, - client->supported_features, - client->required_features); - if (IS_ERR(client->msgr)) { - err = PTR_ERR(client->msgr); - goto fail; - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); + ceph_messenger_init(&client->msgr, myaddr, + client->supported_features, + client->required_features, + ceph_test_opt(client, NOCRC)); /* subsystems */ err = ceph_monc_init(&client->monc, client); if (err < 0) - goto fail_msgr; + goto fail; err = ceph_osdc_init(&client->osdc, client); if (err < 0) goto fail_monc; @@ -489,8 +483,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, fail_monc: ceph_monc_stop(&client->monc); -fail_msgr: - ceph_messenger_destroy(client->msgr); fail: kfree(client); return ERR_PTR(err); @@ -501,22 +493,15 @@ void ceph_destroy_client(struct ceph_client *client) { dout("destroy_client %p\n", client); + atomic_set(&client->msgr.stopping, 1); + /* unmount */ ceph_osdc_stop(&client->osdc); - /* - * make sure osd connections close out before destroying the - * auth module, which is needed to free those connections' - * ceph_authorizers. - */ - ceph_msgr_flush(); - ceph_monc_stop(&client->monc); ceph_debugfs_client_cleanup(client); - ceph_messenger_destroy(client->msgr); - ceph_destroy_options(client->options); kfree(client); diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index d6ebb13a18a4..fbda0521a006 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -26,9 +26,9 @@ const char *crush_bucket_alg_name(int alg) * @b: bucket pointer * @p: item index in bucket */ -int crush_get_bucket_item_weight(struct crush_bucket *b, int p) +int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) { - if (p >= b->size) + if ((__u32)p >= b->size) return 0; switch (b->alg) { @@ -37,9 +37,7 @@ int crush_get_bucket_item_weight(struct crush_bucket *b, int p) case CRUSH_BUCKET_LIST: return ((struct crush_bucket_list *)b)->item_weights[p]; case CRUSH_BUCKET_TREE: - if (p & 1) - return ((struct crush_bucket_tree *)b)->node_weights[p]; - return 0; + return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; case CRUSH_BUCKET_STRAW: return ((struct crush_bucket_straw *)b)->item_weights[p]; } @@ -87,6 +85,8 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { + kfree(b->h.perm); + kfree(b->h.items); kfree(b->node_weights); kfree(b); } @@ -124,10 +124,9 @@ void crush_destroy_bucket(struct crush_bucket *b) */ void crush_destroy(struct crush_map *map) { - int b; - /* buckets */ if (map->buckets) { + __s32 b; for (b = 0; b < map->max_buckets; b++) { if (map->buckets[b] == NULL) continue; @@ -138,6 +137,7 @@ void crush_destroy(struct crush_map *map) /* rules */ if (map->rules) { + __u32 b; for (b = 0; b < map->max_rules; b++) kfree(map->rules[b]); kfree(map->rules); diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b79747c4b645..00baad5d3bde 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -32,9 +32,9 @@ * @type: storage ruleset type (user defined) * @size: output set size */ -int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) +int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) { - int i; + __u32 i; for (i = 0; i < map->max_rules; i++) { if (map->rules[i] && @@ -72,7 +72,7 @@ static int bucket_perm_choose(struct crush_bucket *bucket, unsigned i, s; /* start a new permutation if @x has changed */ - if (bucket->perm_x != x || bucket->perm_n == 0) { + if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { dprintk("bucket %d new x=%d\n", bucket->id, x); bucket->perm_x = x; @@ -152,8 +152,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, return bucket->h.items[i]; } - BUG_ON(1); - return 0; + dprintk("bad list sums for bucket %d\n", bucket->h.id); + return bucket->h.items[0]; } @@ -219,7 +219,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, static int bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) { - int i; + __u32 i; int high = 0; __u64 high_draw = 0; __u64 draw; @@ -239,6 +239,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); + BUG_ON(in->size == 0); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: return bucket_uniform_choose((struct crush_bucket_uniform *)in, @@ -253,7 +254,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) return bucket_straw_choose((struct crush_bucket_straw *)in, x, r); default: - BUG_ON(1); + dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } @@ -262,7 +263,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) * true if device is marked "out" (failed, fully offloaded) * of the cluster */ -static int is_out(struct crush_map *map, __u32 *weight, int item, int x) +static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) { if (weight[item] >= 0x10000) return 0; @@ -287,16 +288,16 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x) * @recurse_to_leaf: true if we want one device under each item of given type * @out2: second output vector for leaf items (if @recurse_to_leaf) */ -static int crush_choose(struct crush_map *map, +static int crush_choose(const struct crush_map *map, struct crush_bucket *bucket, - __u32 *weight, + const __u32 *weight, int x, int numrep, int type, int *out, int outpos, int firstn, int recurse_to_leaf, int *out2) { int rep; - int ftotal, flocal; + unsigned int ftotal, flocal; int retry_descent, retry_bucket, skip_rep; struct crush_bucket *in = bucket; int r; @@ -304,7 +305,7 @@ static int crush_choose(struct crush_map *map, int item = 0; int itemtype; int collide, reject; - const int orig_tries = 5; /* attempts before we fall back to search */ + const unsigned int orig_tries = 5; /* attempts before we fall back to search */ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep); @@ -325,7 +326,7 @@ static int crush_choose(struct crush_map *map, r = rep; if (in->alg == CRUSH_BUCKET_UNIFORM) { /* be careful */ - if (firstn || numrep >= in->size) + if (firstn || (__u32)numrep >= in->size) /* r' = r + f_total */ r += ftotal; else if (in->size % numrep == 0) @@ -354,7 +355,11 @@ static int crush_choose(struct crush_map *map, item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); - BUG_ON(item >= map->max_devices); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + skip_rep = 1; + break; + } /* desired type? */ if (item < 0) @@ -365,8 +370,12 @@ static int crush_choose(struct crush_map *map, /* keep going? */ if (itemtype != type) { - BUG_ON(item >= 0 || - (-1-item) >= map->max_buckets); + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + skip_rep = 1; + break; + } in = map->buckets[-1-item]; retry_bucket = 1; continue; @@ -415,7 +424,7 @@ reject: if (collide && flocal < 3) /* retry locally a few times */ retry_bucket = 1; - else if (flocal < in->size + orig_tries) + else if (flocal <= in->size + orig_tries) /* exhaustive bucket search */ retry_bucket = 1; else if (ftotal < 20) @@ -425,7 +434,7 @@ reject: /* else give up */ skip_rep = 1; dprintk(" reject %d collide %d " - "ftotal %d flocal %d\n", + "ftotal %u flocal %u\n", reject, collide, ftotal, flocal); } @@ -456,9 +465,9 @@ reject: * @result_max: maximum result size * @force: force initial replica choice; -1 for none */ -int crush_do_rule(struct crush_map *map, +int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - int force, __u32 *weight) + int force, const __u32 *weight) { int result_len; int force_context[CRUSH_MAX_DEPTH]; @@ -473,12 +482,15 @@ int crush_do_rule(struct crush_map *map, int osize; int *tmp; struct crush_rule *rule; - int step; + __u32 step; int i, j; int numrep; int firstn; - BUG_ON(ruleno >= map->max_rules); + if ((__u32)ruleno >= map->max_rules) { + dprintk(" bad ruleno %d\n", ruleno); + return 0; + } rule = map->rules[ruleno]; result_len = 0; @@ -488,7 +500,8 @@ int crush_do_rule(struct crush_map *map, /* * determine hierarchical context of force, if any. note * that this may or may not correspond to the specific types - * referenced by the crush rule. + * referenced by the crush rule. it will also only affect + * the first descent (TAKE). */ if (force >= 0 && force < map->max_devices && @@ -527,7 +540,8 @@ int crush_do_rule(struct crush_map *map, firstn = 1; case CRUSH_RULE_CHOOSE_LEAF_INDEP: case CRUSH_RULE_CHOOSE_INDEP: - BUG_ON(wsize == 0); + if (wsize == 0) + break; recurse_to_leaf = rule->steps[step].op == @@ -596,7 +610,9 @@ int crush_do_rule(struct crush_map *map, break; default: - BUG_ON(1); + dprintk(" unknown op %d at step %d\n", + curstep->op, step); + break; } } return result_len; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index b780cb7947dd..9da7fdd3cd8a 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -466,6 +466,7 @@ void ceph_key_destroy(struct key *key) { struct ceph_crypto_key *ckey = key->payload.data; ceph_crypto_key_destroy(ckey); + kfree(ckey); } struct key_type key_type_ceph = { diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 1919d1550d75..3572dc518bc9 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -16,7 +16,8 @@ struct ceph_crypto_key { static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { - kfree(key->key); + if (key) + kfree(key->key); } extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst, diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 27d4ea315d12..680978d00446 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -189,6 +189,9 @@ int ceph_debugfs_client_init(struct ceph_client *client) snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, client->monc.auth->global_id); + dout("ceph_debugfs_client_init %p %s\n", client, name); + + BUG_ON(client->debugfs_dir); client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); if (!client->debugfs_dir) goto out; @@ -234,6 +237,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { + dout("ceph_debugfs_client_cleanup %p\n", client); debugfs_remove(client->debugfs_osdmap); debugfs_remove(client->debugfs_monmap); debugfs_remove(client->osdc.debugfs_file); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f0993af2ae4d..ba1037ceb496 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -29,6 +29,74 @@ * the sender. */ +/* + * We track the state of the socket on a given connection using + * values defined below. The transition to a new socket state is + * handled by a function which verifies we aren't coming from an + * unexpected state. + * + * -------- + * | NEW* | transient initial state + * -------- + * | con_sock_state_init() + * v + * ---------- + * | CLOSED | initialized, but no socket (and no + * ---------- TCP connection) + * ^ \ + * | \ con_sock_state_connecting() + * | ---------------------- + * | \ + * + con_sock_state_closed() \ + * |+--------------------------- \ + * | \ \ \ + * | ----------- \ \ + * | | CLOSING | socket event; \ \ + * | ----------- await close \ \ + * | ^ \ | + * | | \ | + * | + con_sock_state_closing() \ | + * | / \ | | + * | / --------------- | | + * | / \ v v + * | / -------------- + * | / -----------------| CONNECTING | socket created, TCP + * | | / -------------- connect initiated + * | | | con_sock_state_connected() + * | | v + * ------------- + * | CONNECTED | TCP connection established + * ------------- + * + * State values for ceph_connection->sock_state; NEW is assumed to be 0. + */ + +#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ +#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ +#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ +#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ +#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ + +/* + * connection states + */ +#define CON_STATE_CLOSED 1 /* -> PREOPEN */ +#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ +#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ +#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ +#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ +#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ + +/* + * ceph_connection flag bits + */ +#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop + * messages on errors */ +#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ +#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -147,72 +215,130 @@ void ceph_msgr_flush(void) } EXPORT_SYMBOL(ceph_msgr_flush); +/* Connection socket state transition functions */ + +static void con_sock_state_init(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} + +static void con_sock_state_connecting(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); + if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTING); +} + +static void con_sock_state_connected(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTED); +} + +static void con_sock_state_closing(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSING); +} + +static void con_sock_state_closed(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING && + old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} /* * socket callback functions */ /* data available on socket, or listen socket received a connect */ -static void ceph_data_ready(struct sock *sk, int count_unused) +static void ceph_sock_data_ready(struct sock *sk, int count_unused) { struct ceph_connection *con = sk->sk_user_data; + if (atomic_read(&con->msgr->stopping)) { + return; + } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("ceph_data_ready on %p state = %lu, queueing work\n", + dout("%s on %p state = %lu, queueing work\n", __func__, con, con->state); queue_con(con); } } /* socket has buffer space for writing */ -static void ceph_write_space(struct sock *sk) +static void ceph_sock_write_space(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept - * more data. clear SOCK_NOSPACE so that ceph_write_space() + * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * doesn't get called again until try_write() fills the socket * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ - if (test_bit(WRITE_PENDING, &con->state)) { + if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { - dout("ceph_write_space %p queueing write work\n", con); + dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); } } else { - dout("ceph_write_space %p nothing to write\n", con); + dout("%s %p nothing to write\n", __func__, con); } } /* socket's state has changed */ -static void ceph_state_change(struct sock *sk) +static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("ceph_state_change %p state = %lu sk_state = %u\n", + dout("%s %p state = %lu sk_state = %u\n", __func__, con, con->state, sk->sk_state); - if (test_bit(CLOSED, &con->state)) - return; - switch (sk->sk_state) { case TCP_CLOSE: - dout("ceph_state_change TCP_CLOSE\n"); + dout("%s TCP_CLOSE\n", __func__); case TCP_CLOSE_WAIT: - dout("ceph_state_change TCP_CLOSE_WAIT\n"); - if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { - if (test_bit(CONNECTING, &con->state)) - con->error_msg = "connection failed"; - else - con->error_msg = "socket closed"; - queue_con(con); - } + dout("%s TCP_CLOSE_WAIT\n", __func__); + con_sock_state_closing(con); + set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + queue_con(con); break; case TCP_ESTABLISHED: - dout("ceph_state_change TCP_ESTABLISHED\n"); + dout("%s TCP_ESTABLISHED\n", __func__); + con_sock_state_connected(con); queue_con(con); break; default: /* Everything else is uninteresting */ @@ -228,9 +354,9 @@ static void set_sock_callbacks(struct socket *sock, { struct sock *sk = sock->sk; sk->sk_user_data = con; - sk->sk_data_ready = ceph_data_ready; - sk->sk_write_space = ceph_write_space; - sk->sk_state_change = ceph_state_change; + sk->sk_data_ready = ceph_sock_data_ready; + sk->sk_write_space = ceph_sock_write_space; + sk->sk_state_change = ceph_sock_state_change; } @@ -262,6 +388,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); + con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), O_NONBLOCK); if (ret == -EINPROGRESS) { @@ -277,7 +404,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } con->sock = sock; - return 0; } @@ -333,16 +459,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, */ static int con_close_socket(struct ceph_connection *con) { - int rc; + int rc = 0; dout("con_close_socket on %p sock %p\n", con, con->sock); - if (!con->sock) - return 0; - set_bit(SOCK_CLOSED, &con->state); - rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); - sock_release(con->sock); - con->sock = NULL; - clear_bit(SOCK_CLOSED, &con->state); + if (con->sock) { + rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); + sock_release(con->sock); + con->sock = NULL; + } + + /* + * Forcibly clear the SOCK_CLOSED flag. It gets set + * independent of the connection mutex, and we could have + * received a socket close event before we had the chance to + * shut the socket down. + */ + clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + + con_sock_state_closed(con); return rc; } @@ -353,6 +487,10 @@ static int con_close_socket(struct ceph_connection *con) static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); + BUG_ON(msg->con == NULL); + msg->con->ops->put(msg->con); + msg->con = NULL; + ceph_msg_put(msg); } static void ceph_msg_remove_list(struct list_head *head) @@ -368,12 +506,16 @@ static void reset_connection(struct ceph_connection *con) { /* reset connection, out_queue, msg_ and connect_seq */ /* discard existing out_queue and msg_seq */ + dout("reset_connection %p\n", con); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); if (con->in_msg) { + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; + con->ops->put(con); } con->connect_seq = 0; @@ -391,32 +533,44 @@ static void reset_connection(struct ceph_connection *con) */ void ceph_con_close(struct ceph_connection *con) { + mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr.in_addr)); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ - clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ - clear_bit(KEEPALIVE_PENDING, &con->state); - clear_bit(WRITE_PENDING, &con->state); - mutex_lock(&con->mutex); + con->state = CON_STATE_CLOSED; + + clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ + clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); + clear_bit(CON_FLAG_BACKOFF, &con->flags); + reset_connection(con); con->peer_global_seq = 0; cancel_delayed_work(&con->work); + con_close_socket(con); mutex_unlock(&con->mutex); - queue_con(con); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ -void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +void ceph_con_open(struct ceph_connection *con, + __u8 entity_type, __u64 entity_num, + struct ceph_entity_addr *addr) { + mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); - set_bit(OPENING, &con->state); - clear_bit(CLOSED, &con->state); + + WARN_ON(con->state != CON_STATE_CLOSED); + con->state = CON_STATE_PREOPEN; + + con->peer_name.type = (__u8) entity_type; + con->peer_name.num = cpu_to_le64(entity_num); + memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ + mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); @@ -430,42 +584,26 @@ bool ceph_con_opened(struct ceph_connection *con) } /* - * generic get/put - */ -struct ceph_connection *ceph_con_get(struct ceph_connection *con) -{ - int nref = __atomic_add_unless(&con->nref, 1, 0); - - dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1); - - return nref ? con : NULL; -} - -void ceph_con_put(struct ceph_connection *con) -{ - int nref = atomic_dec_return(&con->nref); - - BUG_ON(nref < 0); - if (nref == 0) { - BUG_ON(con->sock); - kfree(con); - } - dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref); -} - -/* * initialize a new connection. */ -void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) +void ceph_con_init(struct ceph_connection *con, void *private, + const struct ceph_connection_operations *ops, + struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); - atomic_set(&con->nref, 1); + con->private = private; + con->ops = ops; con->msgr = msgr; + + con_sock_state_init(con); + mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); + + con->state = CON_STATE_CLOSED; } EXPORT_SYMBOL(ceph_con_init); @@ -486,14 +624,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } -static void ceph_con_out_kvec_reset(struct ceph_connection *con) +static void con_out_kvec_reset(struct ceph_connection *con) { con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; } -static void ceph_con_out_kvec_add(struct ceph_connection *con, +static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { int index; @@ -507,6 +645,53 @@ static void ceph_con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + +static void prepare_write_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + BUG_ON(!msg); + BUG_ON(!msg->hdr.data_len); + + /* initialize page iterator */ + con->out_msg_pos.page = 0; + if (msg->pages) + con->out_msg_pos.page_pos = msg->page_alignment; + else + con->out_msg_pos.page_pos = 0; +#ifdef CONFIG_BLOCK + if (msg->bio) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + con->out_msg_pos.data_pos = 0; + con->out_msg_pos.did_page_crc = false; + con->out_more = 1; /* data + footer will follow */ +} + /* * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. @@ -516,6 +701,8 @@ static void prepare_write_message_footer(struct ceph_connection *con) struct ceph_msg *m = con->out_msg; int v = con->out_kvec_left; + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + dout("prepare_write_message_footer %p\n", con); con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; @@ -534,7 +721,7 @@ static void prepare_write_message(struct ceph_connection *con) struct ceph_msg *m; u32 crc; - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); con->out_kvec_is_msg = true; con->out_msg_done = false; @@ -542,14 +729,16 @@ static void prepare_write_message(struct ceph_connection *con) * TCP packet that's a good thing. */ if (con->in_seq > con->in_seq_acked) { con->in_seq_acked = con->in_seq; - ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); } + BUG_ON(list_empty(&con->out_queue)); m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); con->out_msg = m; + BUG_ON(m->con != con); /* put message on sent list */ ceph_msg_get(m); @@ -572,18 +761,18 @@ static void prepare_write_message(struct ceph_connection *con) BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ - ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); - ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) - ceph_con_out_kvec_add(con, m->middle->vec.iov_len, + con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); /* fill in crc (except data pages), footer */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; + con->out_msg->footer.flags = 0; crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); @@ -593,28 +782,19 @@ static void prepare_write_message(struct ceph_connection *con) con->out_msg->footer.middle_crc = cpu_to_le32(crc); } else con->out_msg->footer.middle_crc = 0; - con->out_msg->footer.data_crc = 0; - dout("prepare_write_message front_crc %u data_crc %u\n", + dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); /* is there a data payload? */ - if (le32_to_cpu(m->hdr.data_len) > 0) { - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (m->pages) - con->out_msg_pos.page_pos = m->page_alignment; - else - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ - } else { + con->out_msg->footer.data_crc = 0; + if (m->hdr.data_len) + prepare_write_message_data(con); + else /* no, queue up footer too and be done */ prepare_write_message_footer(con); - } - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -626,16 +806,16 @@ static void prepare_write_ack(struct ceph_connection *con) con->in_seq_acked, con->in_seq); con->in_seq_acked = con->in_seq; - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -644,63 +824,60 @@ static void prepare_write_ack(struct ceph_connection *con) static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); - ceph_con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); - set_bit(WRITE_PENDING, &con->state); + con_out_kvec_reset(con); + con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* * Connection negotiation. */ -static int prepare_connect_authorizer(struct ceph_connection *con) +static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, + int *auth_proto) { - void *auth_buf; - int auth_len = 0; - int auth_protocol = 0; + struct ceph_auth_handshake *auth; + if (!con->ops->get_authorizer) { + con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->out_connect.authorizer_len = 0; + return NULL; + } + + /* Can't hold the mutex while getting authorizer */ mutex_unlock(&con->mutex); - if (con->ops->get_authorizer) - con->ops->get_authorizer(con, &auth_buf, &auth_len, - &auth_protocol, &con->auth_reply_buf, - &con->auth_reply_buf_len, - con->auth_retry); + auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) - return -EAGAIN; - - con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); - con->out_connect.authorizer_len = cpu_to_le32(auth_len); + if (IS_ERR(auth)) + return auth; + if (con->state != CON_STATE_NEGOTIATING) + return ERR_PTR(-EAGAIN); - if (auth_len) - ceph_con_out_kvec_add(con, auth_len, auth_buf); - - return 0; + con->auth_reply_buf = auth->authorizer_reply_buf; + con->auth_reply_buf_len = auth->authorizer_reply_buf_len; + return auth; } /* * We connected to a peer and are saying hello. */ -static void prepare_write_banner(struct ceph_messenger *msgr, - struct ceph_connection *con) +static void prepare_write_banner(struct ceph_connection *con) { - ceph_con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr), - &msgr->my_enc_addr); + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } -static int prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int include_banner) +static int prepare_write_connect(struct ceph_connection *con) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; + int auth_proto; + struct ceph_auth_handshake *auth; switch (con->peer_name.type) { case CEPH_ENTITY_TYPE_MON: @@ -719,23 +896,32 @@ static int prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(msgr->supported_features); + con->out_connect.features = cpu_to_le64(con->msgr->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - if (include_banner) - prepare_write_banner(msgr, con); - else - ceph_con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); + auth_proto = CEPH_AUTH_UNKNOWN; + auth = get_connect_authorizer(con, &auth_proto); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->out_connect.authorizer_len = auth ? + cpu_to_le32(auth->authorizer_buf_len) : 0; + + con_out_kvec_add(con, sizeof (con->out_connect), + &con->out_connect); + if (auth && auth->authorizer_buf_len) + con_out_kvec_add(con, auth->authorizer_buf_len, + auth->authorizer_buf); con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); - return prepare_connect_authorizer(con); + return 0; } /* @@ -781,30 +967,34 @@ out: return ret; /* done! */ } -#ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +static void out_msg_pos_next(struct ceph_connection *con, struct page *page, + size_t len, size_t sent, bool in_trail) { - if (!bio) { - *iter = NULL; - *seg = 0; - return; - } - *iter = bio; - *seg = bio->bi_idx; -} + struct ceph_msg *msg = con->out_msg; -static void iter_bio_next(struct bio **bio_iter, int *seg) -{ - if (*bio_iter == NULL) - return; + BUG_ON(!msg); + BUG_ON(!sent); - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + con->out_msg_pos.data_pos += sent; + con->out_msg_pos.page_pos += sent; + if (sent < len) + return; - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); -} + BUG_ON(sent != len); + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.page++; + con->out_msg_pos.did_page_crc = false; + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) + list_move_tail(&page->lru, + &msg->pagelist->head); +#ifdef CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); #endif +} /* * Write as much message data payload as we can. If we finish, queue @@ -821,41 +1011,36 @@ static int write_partial_msg_pages(struct ceph_connection *con) bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; - int in_trail = 0; - size_t trail_len = (msg->trail ? msg->trail->length : 0); + bool in_trail = false; + const size_t trail_len = (msg->trail ? msg->trail->length : 0); + const size_t trail_off = data_len - trail_len; dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, + con, msg, con->out_msg_pos.page, msg->nr_pages, con->out_msg_pos.page_pos); -#ifdef CONFIG_BLOCK - if (msg->bio && !msg->bio_iter) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; int max_write = PAGE_SIZE; int bio_offset = 0; - total_max_write = data_len - trail_len - - con->out_msg_pos.data_pos; - - /* - * if we are calculating the data crc (the default), we need - * to map the page. if our pages[] has been revoked, use the - * zero page. - */ - - /* have we reached the trail part of the data? */ - if (con->out_msg_pos.data_pos >= data_len - trail_len) { - in_trail = 1; + in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; + if (!in_trail) + total_max_write = trail_off - con->out_msg_pos.data_pos; + if (in_trail) { total_max_write = data_len - con->out_msg_pos.data_pos; page = list_first_entry(&msg->trail->head, struct page, lru); - max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; } else if (msg->pagelist) { @@ -878,52 +1063,32 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (do_datacrc && !con->out_msg_pos.did_page_crc) { void *base; - u32 crc; - u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + u32 crc = le32_to_cpu(msg->footer.data_crc); char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); base = kaddr + con->out_msg_pos.page_pos + bio_offset; - crc = crc32c(tmpcrc, base, len); - con->out_msg->footer.data_crc = cpu_to_le32(crc); + crc = crc32c(crc, base, len); + kunmap(page); + msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, con->out_msg_pos.page_pos + bio_offset, len, 1); - - if (do_datacrc) - kunmap(page); - if (ret <= 0) goto out; - con->out_msg_pos.data_pos += ret; - con->out_msg_pos.page_pos += ret; - if (ret == len) { - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif - } + out_msg_pos_next(con, page, len, (size_t) ret, in_trail); } dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ if (!do_datacrc) - con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - ceph_con_out_kvec_reset(con); + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); prepare_write_message_footer(con); ret = 1; out: @@ -992,11 +1157,10 @@ static int prepare_read_message(struct ceph_connection *con) static int read_partial(struct ceph_connection *con, - int *to, int size, void *object) + int end, int size, void *object) { - *to += size; - while (con->in_base_pos < *to) { - int left = *to - con->in_base_pos; + while (con->in_base_pos < end) { + int left = end - con->in_base_pos; int have = size - left; int ret = ceph_tcp_recvmsg(con->sock, object + have, left); if (ret <= 0) @@ -1012,37 +1176,52 @@ static int read_partial(struct ceph_connection *con, */ static int read_partial_banner(struct ceph_connection *con) { - int ret, to = 0; + int size; + int end; + int ret; dout("read_partial_banner %p at %d\n", con, con->in_base_pos); /* peer's banner */ - ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->in_banner); if (ret <= 0) goto out; - ret = read_partial(con, &to, sizeof(con->actual_peer_addr), - &con->actual_peer_addr); + + size = sizeof (con->actual_peer_addr); + end += size; + ret = read_partial(con, end, size, &con->actual_peer_addr); if (ret <= 0) goto out; - ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), - &con->peer_addr_for_me); + + size = sizeof (con->peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->peer_addr_for_me); if (ret <= 0) goto out; + out: return ret; } static int read_partial_connect(struct ceph_connection *con) { - int ret, to = 0; + int size; + int end; + int ret; dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); + size = sizeof (con->in_reply); + end = size; + ret = read_partial(con, end, size, &con->in_reply); if (ret <= 0) goto out; - ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), - con->auth_reply_buf); + + size = le32_to_cpu(con->in_reply.authorizer_len); + end += size; + ret = read_partial(con, end, size, con->auth_reply_buf); if (ret <= 0) goto out; @@ -1321,22 +1500,9 @@ static int process_banner(struct ceph_connection *con) ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } - set_bit(NEGOTIATING, &con->state); - prepare_read_connect(con); return 0; } -static void fail_protocol(struct ceph_connection *con) -{ - reset_connection(con); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - - mutex_unlock(&con->mutex); - if (con->ops->bad_proto) - con->ops->bad_proto(con); - mutex_lock(&con->mutex); -} - static int process_connect(struct ceph_connection *con) { u64 sup_feat = con->msgr->supported_features; @@ -1354,7 +1520,7 @@ static int process_connect(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; - fail_protocol(con); + reset_connection(con); return -1; case CEPH_MSGR_TAG_BADPROTOVER: @@ -1365,7 +1531,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; - fail_protocol(con); + reset_connection(con); return -1; case CEPH_MSGR_TAG_BADAUTHORIZER: @@ -1377,7 +1543,8 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - ret = prepare_write_connect(con->msgr, con, 0); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); if (ret < 0) return ret; prepare_read_connect(con); @@ -1392,12 +1559,15 @@ static int process_connect(struct ceph_connection *con) * dropped messages. */ dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_connect.connect_seq)); + le32_to_cpu(con->in_reply.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); - prepare_write_connect(con->msgr, con, 0); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; prepare_read_connect(con); /* Tell ceph about it. */ @@ -1406,8 +1576,7 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) + if (con->state != CON_STATE_NEGOTIATING) return -EAGAIN; break; @@ -1416,11 +1585,14 @@ static int process_connect(struct ceph_connection *con) * If we sent a smaller connect_seq than the peer has, try * again with a larger value. */ - dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_connect.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); - prepare_write_connect(con->msgr, con, 0); + le32_to_cpu(con->in_reply.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1431,10 +1603,13 @@ static int process_connect(struct ceph_connection *con) */ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, - le32_to_cpu(con->in_connect.global_seq)); + le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, - le32_to_cpu(con->in_connect.global_seq)); - prepare_write_connect(con->msgr, con, 0); + le32_to_cpu(con->in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1446,10 +1621,13 @@ static int process_connect(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; - fail_protocol(con); + reset_connection(con); return -1; } - clear_bit(CONNECTING, &con->state); + + WARN_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_OPEN; + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; @@ -1461,7 +1639,9 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - set_bit(LOSSYTX, &con->state); + set_bit(CON_FLAG_LOSSYTX, &con->flags); + + con->delay = 0; /* reset backoff memory */ prepare_read_tag(con); break; @@ -1491,10 +1671,10 @@ static int process_connect(struct ceph_connection *con) */ static int read_partial_ack(struct ceph_connection *con) { - int to = 0; + int size = sizeof (con->in_temp_ack); + int end = size; - return read_partial(con, &to, sizeof(con->in_temp_ack), - &con->in_temp_ack); + return read_partial(con, end, size, &con->in_temp_ack); } @@ -1547,10 +1727,7 @@ static int read_partial_message_section(struct ceph_connection *con, return 1; } -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip); - +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); static int read_partial_message_pages(struct ceph_connection *con, struct page **pages, @@ -1593,9 +1770,6 @@ static int read_partial_message_bio(struct ceph_connection *con, void *p; int ret, left; - if (IS_ERR(bv)) - return PTR_ERR(bv); - left = min((int)(data_len - con->in_msg_pos.data_pos), (int)(bv->bv_len - con->in_msg_pos.page_pos)); @@ -1627,26 +1801,22 @@ static int read_partial_message_bio(struct ceph_connection *con, static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; + int size; + int end; int ret; - int to, left; unsigned front_len, middle_len, data_len; bool do_datacrc = !con->msgr->nocrc; - int skip; u64 seq; u32 crc; dout("read_partial_message con %p msg %p\n", con, m); /* header */ - while (con->in_base_pos < sizeof(con->in_hdr)) { - left = sizeof(con->in_hdr) - con->in_base_pos; - ret = ceph_tcp_recvmsg(con->sock, - (char *)&con->in_hdr + con->in_base_pos, - left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } + size = sizeof (con->in_hdr); + end = size; + ret = read_partial(con, end, size, &con->in_hdr); + if (ret <= 0) + return ret; crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); if (cpu_to_le32(crc) != con->in_hdr.crc) { @@ -1686,10 +1856,13 @@ static int read_partial_message(struct ceph_connection *con) /* allocate message? */ if (!con->in_msg) { + int skip = 0; + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, con->in_hdr.front_len, con->in_hdr.data_len); - skip = 0; - con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); + ret = ceph_con_in_msg_alloc(con, &skip); + if (ret < 0) + return ret; if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); @@ -1700,11 +1873,9 @@ static int read_partial_message(struct ceph_connection *con) con->in_seq++; return 0; } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ if (m->middle) @@ -1716,6 +1887,11 @@ static int read_partial_message(struct ceph_connection *con) else con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; + +#ifdef CONFIG_BLOCK + if (m->bio) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif } /* front */ @@ -1732,10 +1908,6 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; } -#ifdef CONFIG_BLOCK - if (m->bio && !m->bio_iter) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif /* (page) data */ while (con->in_msg_pos.data_pos < data_len) { @@ -1746,7 +1918,7 @@ static int read_partial_message(struct ceph_connection *con) return ret; #ifdef CONFIG_BLOCK } else if (m->bio) { - + BUG_ON(!m->bio_iter); ret = read_partial_message_bio(con, &m->bio_iter, &m->bio_seg, data_len, do_datacrc); @@ -1759,16 +1931,12 @@ static int read_partial_message(struct ceph_connection *con) } /* footer */ - to = sizeof(m->hdr) + sizeof(m->footer); - while (con->in_base_pos < to) { - left = to - con->in_base_pos; - ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + - (con->in_base_pos - sizeof(m->hdr)), - left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } + size = sizeof (m->footer); + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", m, front_len, m->footer.front_crc, middle_len, m->footer.middle_crc, data_len, m->footer.data_crc); @@ -1804,8 +1972,11 @@ static void process_message(struct ceph_connection *con) { struct ceph_msg *msg; + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; msg = con->in_msg; con->in_msg = NULL; + con->ops->put(con); /* if first message, set peer_name */ if (con->peer_name.type == 0) @@ -1825,7 +1996,6 @@ static void process_message(struct ceph_connection *con) con->ops->dispatch(con, msg); mutex_lock(&con->mutex); - prepare_read_tag(con); } @@ -1835,21 +2005,21 @@ static void process_message(struct ceph_connection *con) */ static int try_write(struct ceph_connection *con) { - struct ceph_messenger *msgr = con->msgr; int ret = 1; - dout("try_write start %p state %lu nref %d\n", con, con->state, - atomic_read(&con->nref)); + dout("try_write start %p state %lu\n", con, con->state); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); /* open the socket first? */ - if (con->sock == NULL) { - prepare_write_connect(msgr, con, 1); + if (con->state == CON_STATE_PREOPEN) { + BUG_ON(con->sock); + con->state = CON_STATE_CONNECTING; + + con_out_kvec_reset(con); + prepare_write_banner(con); prepare_read_banner(con); - set_bit(CONNECTING, &con->state); - clear_bit(NEGOTIATING, &con->state); BUG_ON(con->in_msg); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1896,7 +2066,7 @@ more_kvec: } do_next: - if (!test_bit(CONNECTING, &con->state)) { + if (con->state == CON_STATE_OPEN) { /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); @@ -1906,14 +2076,15 @@ do_next: prepare_write_ack(con); goto more; } - if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { + if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, + &con->flags)) { prepare_write_keepalive(con); goto more; } } /* Nothing to do! */ - clear_bit(WRITE_PENDING, &con->state); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -1930,38 +2101,45 @@ static int try_read(struct ceph_connection *con) { int ret = -1; - if (!con->sock) - return 0; - - if (test_bit(STANDBY, &con->state)) +more: + dout("try_read start on %p state %lu\n", con, con->state); + if (con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN) return 0; - dout("try_read start on %p\n", con); + BUG_ON(!con->sock); -more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); - /* - * process_connect and process_message drop and re-take - * con->mutex. make sure we handle a racing close or reopen. - */ - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) { - ret = -EAGAIN; + if (con->state == CON_STATE_CONNECTING) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CON_STATE_NEGOTIATING; + + /* + * Received banner is good, exchange connection info. + * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ goto out; } - if (test_bit(CONNECTING, &con->state)) { - if (!test_bit(NEGOTIATING, &con->state)) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - } + if (con->state == CON_STATE_NEGOTIATING) { + dout("try_read negotiating\n"); ret = read_partial_connect(con); if (ret <= 0) goto out; @@ -1971,6 +2149,8 @@ more: goto more; } + WARN_ON(con->state != CON_STATE_OPEN); + if (con->in_base_pos < 0) { /* * skipping + discarding content. @@ -2004,7 +2184,8 @@ more: prepare_read_ack(con); break; case CEPH_MSGR_TAG_CLOSE: - set_bit(CLOSED, &con->state); /* fixme */ + con_close_socket(con); + con->state = CON_STATE_CLOSED; goto out; default: goto bad_tag; @@ -2027,6 +2208,8 @@ more: if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; process_message(con); + if (con->state == CON_STATE_OPEN) + prepare_read_tag(con); goto more; } if (con->in_tag == CEPH_MSGR_TAG_ACK) { @@ -2055,12 +2238,6 @@ bad_tag: */ static void queue_con(struct ceph_connection *con) { - if (test_bit(DEAD, &con->state)) { - dout("queue_con %p ignoring: DEAD\n", - con); - return; - } - if (!con->ops->get(con)) { dout("queue_con %p ref count 0\n", con); return; @@ -2074,6 +2251,35 @@ static void queue_con(struct ceph_connection *con) } } +static bool con_sock_closed(struct ceph_connection *con) +{ + if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) + return false; + +#define CASE(x) \ + case CON_STATE_ ## x: \ + con->error_msg = "socket closed (con state " #x ")"; \ + break; + + switch (con->state) { + CASE(CLOSED); + CASE(PREOPEN); + CASE(CONNECTING); + CASE(NEGOTIATING); + CASE(OPEN); + CASE(STANDBY); + default: + pr_warning("%s con %p unrecognized state %lu\n", + __func__, con, con->state); + con->error_msg = "unrecognized con state"; + BUG(); + break; + } +#undef CASE + + return true; +} + /* * Do some work on a connection. Drop a connection ref when we're done. */ @@ -2085,7 +2291,10 @@ static void con_work(struct work_struct *work) mutex_lock(&con->mutex); restart: - if (test_and_clear_bit(BACKOFF, &con->state)) { + if (con_sock_closed(con)) + goto fault; + + if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, round_jiffies_relative(con->delay))) { @@ -2093,41 +2302,42 @@ restart: mutex_unlock(&con->mutex); return; } else { - con->ops->put(con); dout("con_work %p FAILED to back off %lu\n", con, con->delay); + set_bit(CON_FLAG_BACKOFF, &con->flags); } + goto done; } - if (test_bit(STANDBY, &con->state)) { + if (con->state == CON_STATE_STANDBY) { dout("con_work %p STANDBY\n", con); goto done; } - if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ - dout("con_work CLOSED\n"); - con_close_socket(con); + if (con->state == CON_STATE_CLOSED) { + dout("con_work %p CLOSED\n", con); + BUG_ON(con->sock); goto done; } - if (test_and_clear_bit(OPENING, &con->state)) { - /* reopen w/ new peer */ + if (con->state == CON_STATE_PREOPEN) { dout("con_work OPENING\n"); - con_close_socket(con); + BUG_ON(con->sock); } - if (test_and_clear_bit(SOCK_CLOSED, &con->state)) - goto fault; - ret = try_read(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on read"; goto fault; + } ret = try_write(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on write"; goto fault; + } done: mutex_unlock(&con->mutex); @@ -2136,7 +2346,6 @@ done_unlocked: return; fault: - mutex_unlock(&con->mutex); ceph_fault(con); /* error/fault path */ goto done_unlocked; } @@ -2147,26 +2356,31 @@ fault: * exponential backoff */ static void ceph_fault(struct ceph_connection *con) + __releases(con->mutex) { - pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); - if (test_bit(LOSSYTX, &con->state)) { - dout("fault on LOSSYTX channel\n"); - goto out; - } - - mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state)) - goto out_unlock; + WARN_ON(con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN); con_close_socket(con); + if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { + dout("fault on LOSSYTX channel, marking CLOSED\n"); + con->state = CON_STATE_CLOSED; + goto out_unlock; + } + if (con->in_msg) { + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; + con->ops->put(con); } /* Requeue anything that hasn't been acked */ @@ -2175,12 +2389,13 @@ static void ceph_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !test_bit(KEEPALIVE_PENDING, &con->state)) { + !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - clear_bit(WRITE_PENDING, &con->state); - set_bit(STANDBY, &con->state); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con->state = CON_STATE_STANDBY; } else { /* retry after a delay. */ + con->state = CON_STATE_PREOPEN; if (con->delay == 0) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) @@ -2201,13 +2416,12 @@ static void ceph_fault(struct ceph_connection *con) * that when con_work restarts we schedule the * delay then. */ - set_bit(BACKOFF, &con->state); + set_bit(CON_FLAG_BACKOFF, &con->flags); } } out_unlock: mutex_unlock(&con->mutex); -out: /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. @@ -2224,18 +2438,14 @@ out: /* - * create a new messenger instance + * initialize a new messenger instance */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features) +void ceph_messenger_init(struct ceph_messenger *msgr, + struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features, + bool nocrc) { - struct ceph_messenger *msgr; - - msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); - if (msgr == NULL) - return ERR_PTR(-ENOMEM); - msgr->supported_features = supported_features; msgr->required_features = required_features; @@ -2248,30 +2458,23 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, msgr->inst.addr.type = 0; get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); + msgr->nocrc = nocrc; - dout("messenger_create %p\n", msgr); - return msgr; -} -EXPORT_SYMBOL(ceph_messenger_create); + atomic_set(&msgr->stopping, 0); -void ceph_messenger_destroy(struct ceph_messenger *msgr) -{ - dout("destroy %p\n", msgr); - kfree(msgr); - dout("destroyed messenger %p\n", msgr); + dout("%s %p\n", __func__, msgr); } -EXPORT_SYMBOL(ceph_messenger_destroy); +EXPORT_SYMBOL(ceph_messenger_init); static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (test_and_clear_bit(STANDBY, &con->state)) { - mutex_lock(&con->mutex); + if (con->state == CON_STATE_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); + con->state = CON_STATE_PREOPEN; con->connect_seq++; - WARN_ON(test_bit(WRITE_PENDING, &con->state)); - WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); - mutex_unlock(&con->mutex); + WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); + WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); } } @@ -2280,21 +2483,24 @@ static void clear_standby(struct ceph_connection *con) */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { - if (test_bit(CLOSED, &con->state)) { - dout("con_send %p closed, dropping %p\n", con, msg); - ceph_msg_put(msg); - return; - } - /* set src+dst */ msg->hdr.src = con->msgr->inst.name; - BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); - msg->needs_out_seq = true; - /* queue */ mutex_lock(&con->mutex); + + if (con->state == CON_STATE_CLOSED) { + dout("con_send %p closed, dropping %p\n", con, msg); + ceph_msg_put(msg); + mutex_unlock(&con->mutex); + return; + } + + BUG_ON(msg->con != NULL); + msg->con = con->ops->get(con); + BUG_ON(msg->con == NULL); + BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, @@ -2303,12 +2509,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); + + clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ - clear_standby(con); - if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) + if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -2316,24 +2523,34 @@ EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send */ -void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke(struct ceph_msg *msg) { + struct ceph_connection *con = msg->con; + + if (!con) + return; /* Message not in our possession */ + mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p - was on queue\n", con, msg); + dout("%s %p msg %p - was on queue\n", __func__, con, msg); list_del_init(&msg->list_head); - ceph_msg_put(msg); + BUG_ON(msg->con == NULL); + msg->con->ops->put(msg->con); + msg->con = NULL; msg->hdr.seq = 0; + + ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("con_revoke %p msg %p - was sending\n", con, msg); + dout("%s %p msg %p - was sending\n", __func__, con, msg); con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - ceph_msg_put(msg); msg->hdr.seq = 0; + + ceph_msg_put(msg); } mutex_unlock(&con->mutex); } @@ -2341,17 +2558,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) /* * Revoke a message that we may be reading data into */ -void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke_incoming(struct ceph_msg *msg) { + struct ceph_connection *con; + + BUG_ON(msg == NULL); + if (!msg->con) { + dout("%s msg %p null con\n", __func__, msg); + + return; /* Message not in our possession */ + } + + con = msg->con; mutex_lock(&con->mutex); - if (con->in_msg && con->in_msg == msg) { + if (con->in_msg == msg) { unsigned front_len = le32_to_cpu(con->in_hdr.front_len); unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); unsigned data_len = le32_to_cpu(con->in_hdr.data_len); /* skip rest of message */ - dout("con_revoke_pages %p msg %p revoked\n", con, msg); - con->in_base_pos = con->in_base_pos - + dout("%s %p msg %p revoked\n", __func__, con, msg); + con->in_base_pos = con->in_base_pos - sizeof(struct ceph_msg_header) - front_len - middle_len - @@ -2362,8 +2589,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; } else { - dout("con_revoke_pages %p msg %p pages %p no-op\n", - con, con->in_msg, msg); + dout("%s %p in_msg %p msg %p no-op\n", + __func__, con, con->in_msg, msg); } mutex_unlock(&con->mutex); } @@ -2374,9 +2601,11 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); + mutex_lock(&con->mutex); clear_standby(con); - if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && - test_and_set_bit(WRITE_PENDING, &con->state) == 0) + mutex_unlock(&con->mutex); + if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && + test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -2395,6 +2624,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, if (m == NULL) goto out; kref_init(&m->kref); + + m->con = NULL; INIT_LIST_HEAD(&m->list_head); m->hdr.tid = 0; @@ -2490,46 +2721,78 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) } /* - * Generic message allocator, for incoming messages. + * Allocate a message for receiving an incoming message on a + * connection, and save the result in con->in_msg. Uses the + * connection's private alloc_msg op if available. + * + * Returns 0 on success, or a negative error code. + * + * On success, if we set *skip = 1: + * - the next message should be skipped and ignored. + * - con->in_msg == NULL + * or if we set *skip = 0: + * - con->in_msg is non-null. + * On error (ENOMEM, EAGAIN, ...), + * - con->in_msg == NULL */ -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { + struct ceph_msg_header *hdr = &con->in_hdr; int type = le16_to_cpu(hdr->type); int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); - struct ceph_msg *msg = NULL; - int ret; + int ret = 0; + + BUG_ON(con->in_msg != NULL); if (con->ops->alloc_msg) { + struct ceph_msg *msg; + mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (!msg || *skip) - return NULL; + if (con->state != CON_STATE_OPEN) { + if (msg) + ceph_msg_put(msg); + return -EAGAIN; + } + con->in_msg = msg; + if (con->in_msg) { + con->in_msg->con = con->ops->get(con); + BUG_ON(con->in_msg->con == NULL); + } + if (*skip) { + con->in_msg = NULL; + return 0; + } + if (!con->in_msg) { + con->error_msg = + "error allocating memory for incoming message"; + return -ENOMEM; + } } - if (!msg) { - *skip = 0; - msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - if (!msg) { + if (!con->in_msg) { + con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!con->in_msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); - return NULL; + return -ENOMEM; } - msg->page_alignment = le16_to_cpu(hdr->data_off); + con->in_msg->con = con->ops->get(con); + BUG_ON(con->in_msg->con == NULL); + con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); } - memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - if (middle_len && !msg->middle) { - ret = ceph_alloc_middle(con, msg); + if (middle_len && !con->in_msg->middle) { + ret = ceph_alloc_middle(con, con->in_msg); if (ret < 0) { - ceph_msg_put(msg); - return NULL; + ceph_msg_put(con->in_msg); + con->in_msg = NULL; } } - return msg; + return ret; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 1845cde26227..89a6409b4e1d 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); - ceph_con_revoke(monc->con, monc->m_auth); + ceph_msg_revoke(monc->m_auth); ceph_msg_get(monc->m_auth); /* keep our ref */ - ceph_con_send(monc->con, monc->m_auth); + ceph_con_send(&monc->con, monc->m_auth); } /* @@ -117,8 +117,11 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) static void __close_session(struct ceph_mon_client *monc) { dout("__close_session closing mon%d\n", monc->cur_mon); - ceph_con_revoke(monc->con, monc->m_auth); - ceph_con_close(monc->con); + ceph_msg_revoke(monc->m_auth); + ceph_msg_revoke_incoming(monc->m_auth_reply); + ceph_msg_revoke(monc->m_subscribe); + ceph_msg_revoke_incoming(monc->m_subscribe_ack); + ceph_con_close(&monc->con); monc->cur_mon = -1; monc->pending_auth = 0; ceph_auth_reset(monc->auth); @@ -142,9 +145,8 @@ static int __open_session(struct ceph_mon_client *monc) monc->want_next_osdmap = !!monc->want_next_osdmap; dout("open_session mon%d opening\n", monc->cur_mon); - monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; - monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); - ceph_con_open(monc->con, + ceph_con_open(&monc->con, + CEPH_ENTITY_TYPE_MON, monc->cur_mon, &monc->monmap->mon_inst[monc->cur_mon].addr); /* initiatiate authentication handshake */ @@ -226,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_revoke(monc->con, msg); - ceph_con_send(monc->con, ceph_msg_get(msg)); + ceph_msg_revoke(msg); + ceph_con_send(&monc->con, ceph_msg_get(msg)); monc->sub_sent = jiffies | 1; /* never 0 */ } @@ -247,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, if (monc->hunting) { pr_info("mon%d %s session established\n", monc->cur_mon, - ceph_pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr.in_addr)); monc->hunting = false; } dout("handle_subscribe_ack after %d seconds\n", seconds); @@ -309,6 +311,17 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) EXPORT_SYMBOL(ceph_monc_open_session); /* + * We require the fsid and global_id in order to initialize our + * debugfs dir. + */ +static bool have_debugfs_info(struct ceph_mon_client *monc) +{ + dout("have_debugfs_info fsid %d globalid %lld\n", + (int)monc->client->have_fsid, monc->auth->global_id); + return monc->client->have_fsid && monc->auth->global_id > 0; +} + +/* * The monitor responds with mount ack indicate mount success. The * included client ticket allows the client to talk to MDSs and OSDs. */ @@ -318,9 +331,12 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_client *client = monc->client; struct ceph_monmap *monmap = NULL, *old = monc->monmap; void *p, *end; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); + dout("handle_monmap\n"); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -342,12 +358,22 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, if (!client->have_fsid) { client->have_fsid = true; + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(client); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } + goto out_unlocked; } out: @@ -439,6 +465,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, m = NULL; } else { dout("get_generic_reply %lld got %p\n", tid, req->reply); + *skip = 0; m = ceph_msg_get(req->reply); /* * we don't need to track the connection reading into @@ -461,7 +488,7 @@ static int do_generic_request(struct ceph_mon_client *monc, req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); monc->num_generic_requests++; - ceph_con_send(monc->con, ceph_msg_get(req->request)); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); err = wait_for_completion_interruptible(&req->completion); @@ -684,8 +711,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc) for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { req = rb_entry(p, struct ceph_mon_generic_request, node); - ceph_con_revoke(monc->con, req->request); - ceph_con_send(monc->con, ceph_msg_get(req->request)); + ceph_msg_revoke(req->request); + ceph_msg_revoke_incoming(req->reply); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); } } @@ -705,7 +733,7 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - ceph_con_keepalive(monc->con); + ceph_con_keepalive(&monc->con); __validate_auth(monc); @@ -760,19 +788,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) goto out; /* connection */ - monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); - if (!monc->con) - goto out_monmap; - ceph_con_init(monc->client->msgr, monc->con); - monc->con->private = monc; - monc->con->ops = &mon_con_ops; - /* authentication */ monc->auth = ceph_auth_init(cl->options->name, cl->options->key); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); - goto out_con; + goto out_monmap; } monc->auth->want_keys = CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | @@ -801,6 +822,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) if (!monc->m_auth) goto out_auth_reply; + ceph_con_init(&monc->con, monc, &mon_con_ops, + &monc->client->msgr); + monc->cur_mon = -1; monc->hunting = true; monc->sub_renew_after = jiffies; @@ -824,8 +848,6 @@ out_subscribe_ack: ceph_msg_put(monc->m_subscribe_ack); out_auth: ceph_auth_destroy(monc->auth); -out_con: - monc->con->ops->put(monc->con); out_monmap: kfree(monc->monmap); out: @@ -841,12 +863,16 @@ void ceph_monc_stop(struct ceph_mon_client *monc) mutex_lock(&monc->mutex); __close_session(monc); - monc->con->private = NULL; - monc->con->ops->put(monc->con); - monc->con = NULL; - mutex_unlock(&monc->mutex); + /* + * flush msgr queue before we destroy ourselves to ensure that: + * - any work that references our embedded con is finished. + * - any osd_client or other work that may reference an authorizer + * finishes before we shut down the auth subsystem. + */ + ceph_msgr_flush(); + ceph_auth_destroy(monc->auth); ceph_msg_put(monc->m_auth); @@ -863,8 +889,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc, { int ret; int was_auth = 0; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); if (monc->auth->ops) was_auth = monc->auth->ops->is_authenticated(monc->auth); monc->pending_auth = 0; @@ -880,14 +908,29 @@ static void handle_auth_reply(struct ceph_mon_client *monc, } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); - monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - monc->client->msgr->inst.name.num = + monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; + monc->client->msgr.inst.name.num = cpu_to_le64(monc->auth->global_id); __send_subscribe(monc); __resend_generic_request(monc); } + + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } } static int __validate_auth(struct ceph_mon_client *monc) @@ -992,6 +1035,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: m = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!m) + return NULL; /* ENOMEM--return skip == 0 */ break; } @@ -1021,7 +1066,7 @@ static void mon_fault(struct ceph_connection *con) if (!monc->hunting) pr_info("mon%d %s session lost, " "hunting for new mon\n", monc->cur_mon, - ceph_pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr.in_addr)); __close_session(monc); if (!monc->hunting) { @@ -1036,9 +1081,23 @@ out: mutex_unlock(&monc->mutex); } +/* + * We can ignore refcounting on the connection struct, as all references + * will come from the messenger workqueue, which is drained prior to + * mon_client destruction. + */ +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + return con; +} + +static void con_put(struct ceph_connection *con) +{ +} + static const struct ceph_connection_operations mon_con_ops = { - .get = ceph_con_get, - .put = ceph_con_put, + .get = con_get, + .put = con_put, .dispatch = dispatch, .fault = mon_fault, .alloc_msg = mon_alloc_msg, diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 11d5f4196a73..ddec1c10ac80 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) struct ceph_msgpool *pool = arg; struct ceph_msg *msg; - msg = ceph_msg_new(0, pool->front_len, gfp_mask, true); + msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); if (!msg) { dout("msgpool_alloc %s failed\n", pool->name); } else { @@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg) ceph_msg_put(msg); } -int ceph_msgpool_init(struct ceph_msgpool *pool, +int ceph_msgpool_init(struct ceph_msgpool *pool, int type, int front_len, int size, bool blocking, const char *name) { dout("msgpool %s init\n", name); + pool->type = type; pool->front_len = front_len; pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); if (!pool->pool) @@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, WARN_ON(1); /* try to alloc a fresh message */ - return ceph_msg_new(0, front_len, GFP_NOFS, false); + return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); } msg = mempool_alloc(pool->pool, GFP_NOFS); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5e254055c910..b16dfa25e750 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -52,7 +52,7 @@ static int op_has_extent(int op) op == CEPH_OSD_OP_WRITE); } -void ceph_calc_raw_layout(struct ceph_osd_client *osdc, +int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, u64 snapid, u64 off, u64 *plen, u64 *bno, @@ -62,12 +62,15 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; u64 orig_len = *plen; u64 objoff, objlen; /* extent in object */ + int r; reqhead->snapid = cpu_to_le64(snapid); /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, bno, - &objoff, &objlen); + r = ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (r < 0) + return r; if (*plen < orig_len) dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); @@ -83,7 +86,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", *bno, objoff, objlen, req->r_num_pages); - + return 0; } EXPORT_SYMBOL(ceph_calc_raw_layout); @@ -112,20 +115,25 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. */ -static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) +static int calc_layout(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) { u64 bno; + int r; - ceph_calc_raw_layout(osdc, layout, vino.snap, off, - plen, &bno, req, op); + r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); + if (r < 0) + return r; snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); + + return r; } /* @@ -139,15 +147,14 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) - ceph_msg_put(req->r_reply); if (req->r_con_filling_msg) { - dout("release_request revoking pages %p from con %p\n", + dout("%s revoking pages %p from con %p\n", __func__, req->r_pages, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, - req->r_reply); - ceph_con_put(req->r_con_filling_msg); + ceph_msg_revoke_incoming(req->r_reply); + req->r_con_filling_msg->ops->put(req->r_con_filling_msg); } + if (req->r_reply) + ceph_msg_put(req->r_reply); if (req->r_own_pages) ceph_release_page_vector(req->r_pages, req->r_num_pages); @@ -214,6 +221,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); @@ -243,6 +251,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } ceph_pagelist_init(req->r_trail); } + /* create request message; allow space for oid */ msg_size += MAX_OBJ_NAME_SIZE; if (snapc) @@ -256,7 +265,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, return NULL; } - msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; @@ -278,7 +286,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, { dst->op = cpu_to_le16(src->op); - switch (dst->op) { + switch (src->op) { case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: dst->extent.offset = @@ -454,6 +462,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; + int r; ops[0].op = opcode; ops[0].extent.truncate_seq = truncate_seq; @@ -472,10 +481,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, use_mempool, GFP_NOFS, NULL, NULL); if (!req) - return NULL; + return ERR_PTR(-ENOMEM); /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req, ops); + r = calc_layout(osdc, vino, layout, off, plen, req, ops); + if (r < 0) + return ERR_PTR(r); req->r_file_layout = *layout; /* keep a copy */ /* in case it differs from natural (file) alignment that @@ -568,7 +579,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, dout("__kick_osd_requests osd%d\n", osd->o_osd); err = __reset_osd(osdc, osd); - if (err == -EAGAIN) + if (err) return; list_for_each_entry(req, &osd->o_requests, r_osd_item) { @@ -595,14 +606,6 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, } } -static void kick_osd_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) -{ - mutex_lock(&osdc->request_mutex); - __kick_osd_requests(osdc, kickosd); - mutex_unlock(&osdc->request_mutex); -} - /* * If the osd connection drops, we need to resubmit all requests. */ @@ -616,7 +619,9 @@ static void osd_reset(struct ceph_connection *con) dout("osd_reset osd%d\n", osd->o_osd); osdc = osd->o_osdc; down_read(&osdc->map_sem); - kick_osd_requests(osdc, osd); + mutex_lock(&osdc->request_mutex); + __kick_osd_requests(osdc, osd); + mutex_unlock(&osdc->request_mutex); send_queued(osdc); up_read(&osdc->map_sem); } @@ -624,7 +629,7 @@ static void osd_reset(struct ceph_connection *con) /* * Track open sessions with osds. */ -static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) +static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) { struct ceph_osd *osd; @@ -634,15 +639,14 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) atomic_set(&osd->o_ref, 1); osd->o_osdc = osdc; + osd->o_osd = onum; + RB_CLEAR_NODE(&osd->o_node); INIT_LIST_HEAD(&osd->o_requests); INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); osd->o_incarnation = 1; - ceph_con_init(osdc->client->msgr, &osd->o_con); - osd->o_con.private = osd; - osd->o_con.ops = &osd_con_ops; - osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; + ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); INIT_LIST_HEAD(&osd->o_keepalive_item); return osd; @@ -664,11 +668,11 @@ static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) { + if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (osd->o_authorizer) - ac->ops->destroy_authorizer(ac, osd->o_authorizer); + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -740,6 +744,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { __remove_osd(osdc, osd); + ret = -ENODEV; } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], &osd->o_con.peer_addr, sizeof(osd->o_con.peer_addr)) == 0 && @@ -752,7 +757,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) ret = -EAGAIN; } else { ceph_con_close(&osd->o_con); - ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, + &osdc->osdmap->osd_addr[osd->o_osd]); osd->o_incarnation++; } return ret; @@ -841,13 +847,19 @@ static void register_request(struct ceph_osd_client *osdc, static void __unregister_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { + if (RB_EMPTY_NODE(&req->r_node)) { + dout("__unregister_request %p tid %lld not registered\n", + req, req->r_tid); + return; + } + dout("__unregister_request %p tid %lld\n", req, req->r_tid); rb_erase(&req->r_node, &osdc->requests); osdc->num_requests--; if (req->r_osd) { /* make sure the original request isn't in flight. */ - ceph_con_revoke(&req->r_osd->o_con, req->r_request); + ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); if (list_empty(&req->r_osd->o_requests) && @@ -859,9 +871,9 @@ static void __unregister_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } + list_del_init(&req->r_req_lru_item); ceph_osdc_put_request(req); - list_del_init(&req->r_req_lru_item); if (osdc->num_requests == 0) { dout(" no requests, canceling timeout\n"); __cancel_osd_timeout(osdc); @@ -874,7 +886,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, static void __cancel_request(struct ceph_osd_request *req) { if (req->r_sent && req->r_osd) { - ceph_con_revoke(&req->r_osd->o_con, req->r_request); + ceph_msg_revoke(req->r_request); req->r_sent = 0; } } @@ -884,15 +896,17 @@ static void __register_linger_request(struct ceph_osd_client *osdc, { dout("__register_linger_request %p\n", req); list_add_tail(&req->r_linger_item, &osdc->req_linger); - list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); + if (req->r_osd) + list_add_tail(&req->r_linger_osd, + &req->r_osd->o_linger_requests); } static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { dout("__unregister_linger_request %p\n", req); + list_del_init(&req->r_linger_item); if (req->r_osd) { - list_del_init(&req->r_linger_item); list_del_init(&req->r_linger_osd); if (list_empty(&req->r_osd->o_requests) && @@ -992,18 +1006,18 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = __lookup_osd(osdc, o); if (!req->r_osd && o >= 0) { err = -ENOMEM; - req->r_osd = create_osd(osdc); + req->r_osd = create_osd(osdc, o); if (!req->r_osd) { list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; } dout("map_request osd %p is osd%d\n", req->r_osd, o); - req->r_osd->o_osd = o; - req->r_osd->o_con.peer_name.num = cpu_to_le64(o); __insert_osd(osdc, req->r_osd); - ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); + ceph_con_open(&req->r_osd->o_con, + CEPH_ENTITY_TYPE_OSD, o, + &osdc->osdmap->osd_addr[o]); } if (req->r_osd) { @@ -1071,12 +1085,10 @@ static void handle_timeout(struct work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); - struct ceph_osd_request *req, *last_req = NULL; + struct ceph_osd_request *req; struct ceph_osd *osd; - unsigned long timeout = osdc->client->options->osd_timeout * HZ; unsigned long keepalive = osdc->client->options->osd_keepalive_timeout * HZ; - unsigned long last_stamp = 0; struct list_head slow_osds; dout("timeout\n"); down_read(&osdc->map_sem); @@ -1086,37 +1098,6 @@ static void handle_timeout(struct work_struct *work) mutex_lock(&osdc->request_mutex); /* - * reset osds that appear to be _really_ unresponsive. this - * is a failsafe measure.. we really shouldn't be getting to - * this point if the system is working properly. the monitors - * should mark the osd as failed and we should find out about - * it from an updated osd map. - */ - while (timeout && !list_empty(&osdc->req_lru)) { - req = list_entry(osdc->req_lru.next, struct ceph_osd_request, - r_req_lru_item); - - /* hasn't been long enough since we sent it? */ - if (time_before(jiffies, req->r_stamp + timeout)) - break; - - /* hasn't been long enough since it was acked? */ - if (req->r_request->ack_stamp == 0 || - time_before(jiffies, req->r_request->ack_stamp + timeout)) - break; - - BUG_ON(req == last_req && req->r_stamp == last_stamp); - last_req = req; - last_stamp = req->r_stamp; - - osd = req->r_osd; - BUG_ON(!osd); - pr_warning(" tid %llu timed out on osd%d, will reset osd\n", - req->r_tid, osd->o_osd); - __kick_osd_requests(osdc, osd); - } - - /* * ping osds that are a bit slow. this ensures that if there * is a break in the TCP connection we will notice, and reopen * a connection with that osd (from the fault callback). @@ -1210,7 +1191,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, if (req->r_con_filling_msg == con && req->r_reply == msg) { dout(" dropping con_filling_msg ref %p\n", con); req->r_con_filling_msg = NULL; - ceph_con_put(con); + con->ops->put(con); } if (!req->r_got_reply) { @@ -1287,7 +1268,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) * Requeue requests whose mapping to an OSD has changed. If requests map to * no osd, request a new map. * - * Caller should hold map_sem for read and request_mutex. + * Caller should hold map_sem for read. */ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) { @@ -1298,8 +1279,27 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + for (p = rb_first(&osdc->requests); p; ) { req = rb_entry(p, struct ceph_osd_request, r_node); + p = rb_next(p); + + /* + * For linger requests that have not yet been + * registered, move them to the linger list; they'll + * be sent to the osd in the loop below. Unregister + * the request before re-registering it as a linger + * request to ensure the __map_request() below + * will decide it needs to be sent. + */ + if (req->r_linger && list_empty(&req->r_linger_item)) { + dout("%p tid %llu restart on osd%d\n", + req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + __unregister_request(osdc, req); + __register_linger_request(osdc, req); + continue; + } + err = __map_request(osdc, req, force_resend); if (err < 0) continue; /* error */ @@ -1307,10 +1307,12 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("%p tid %llu maps to no osd\n", req, req->r_tid); needmap++; /* request a newer map */ } else if (err > 0) { - dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - if (!req->r_linger) + if (!req->r_linger) { + dout("%p tid %llu requeued on osd%d\n", req, + req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); req->r_flags |= CEPH_OSD_FLAG_RETRY; + } } } @@ -1319,6 +1321,7 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); err = __map_request(osdc, req, force_resend); + dout("__map_request returned %d\n", err); if (err == 0) continue; /* no change and no osd was specified */ if (err < 0) @@ -1331,8 +1334,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); - __unregister_linger_request(osdc, req); __register_request(osdc, req); + __unregister_linger_request(osdc, req); } mutex_unlock(&osdc->request_mutex); @@ -1340,6 +1343,7 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("%d requests for down osds, need new map\n", needmap); ceph_monc_request_next_osdmap(&osdc->client->monc); } + reset_changed_osds(osdc); } @@ -1385,7 +1389,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) epoch, maplen); newmap = osdmap_apply_incremental(&p, next, osdc->osdmap, - osdc->client->msgr); + &osdc->client->msgr); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -1396,7 +1400,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap = newmap; } kick_requests(osdc, 0); - reset_changed_osds(osdc); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); @@ -1566,6 +1569,7 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc, event->data = data; event->osdc = osdc; INIT_LIST_HEAD(&event->osd_node); + RB_CLEAR_NODE(&event->node); kref_init(&event->kref); /* one ref for us */ kref_get(&event->kref); /* one ref for the caller */ init_completion(&event->completion); @@ -1833,11 +1837,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->req_mempool) goto out; - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, + OSD_OP_FRONT_LEN, 10, true, "osd_op"); if (err < 0) goto out_mempool; - err = ceph_msgpool_init(&osdc->msgpool_op_reply, + err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, OSD_OPREPLY_FRONT_LEN, 10, true, "osd_op_reply"); if (err < 0) @@ -1896,8 +1901,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1939,8 +1944,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short write due to an object boundary */ req->r_pages = pages; @@ -2019,10 +2024,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } if (req->r_con_filling_msg) { - dout("get_reply revoking msg %p from old con %p\n", + dout("%s revoking msg %p from old con %p\n", __func__, req->r_reply, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); - ceph_con_put(req->r_con_filling_msg); + ceph_msg_revoke_incoming(req->r_reply); + req->r_con_filling_msg->ops->put(req->r_con_filling_msg); req->r_con_filling_msg = NULL; } @@ -2057,7 +2062,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, #endif } *skip = 0; - req->r_con_filling_msg = ceph_con_get(con); + req->r_con_filling_msg = con->ops->get(con); dout("get_reply tid %lld %p\n", tid, m); out: @@ -2074,6 +2079,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, int type = le16_to_cpu(hdr->type); int front = le32_to_cpu(hdr->front_len); + *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: @@ -2108,37 +2114,32 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ -static int get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new) +/* + * Note: returned pointer is the address of a structure that's + * managed separately. Caller must *not* attempt to free it. + */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - int ret = 0; + struct ceph_auth_handshake *auth = &o->o_auth; - if (force_new && o->o_authorizer) { - ac->ops->destroy_authorizer(ac, o->o_authorizer); - o->o_authorizer = NULL; - } - if (o->o_authorizer == NULL) { - ret = ac->ops->create_authorizer( - ac, CEPH_ENTITY_TYPE_OSD, - &o->o_authorizer, - &o->o_authorizer_buf, - &o->o_authorizer_buf_len, - &o->o_authorizer_reply_buf, - &o->o_authorizer_reply_buf_len); + if (force_new && auth->authorizer) { + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { + int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); if (ret) - return ret; + return ERR_PTR(ret); } - *proto = ac->protocol; - *buf = o->o_authorizer_buf; - *len = o->o_authorizer_buf_len; - *reply_buf = o->o_authorizer_reply_buf; - *reply_len = o->o_authorizer_reply_buf_len; - return 0; + + return auth; } @@ -2148,7 +2149,11 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); + /* + * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, + * XXX which do we do: succeed or fail? + */ + return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -2157,7 +2162,7 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - if (ac->ops->invalidate_authorizer) + if (ac->ops && ac->ops->invalidate_authorizer) ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 29ad46ec9dcf..7fbe21030f54 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -495,15 +495,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, len, bad); dout(" pool %d len %d\n", pool, len); + ceph_decode_need(p, end, len, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) { + char *name = kstrndup(*p, len, GFP_NOFS); + + if (!name) + return -ENOMEM; kfree(pi->name); - pi->name = kmalloc(len + 1, GFP_NOFS); - if (pi->name) { - memcpy(pi->name, *p, len); - pi->name[len] = '\0'; - dout(" name is %s\n", pi->name); - } + pi->name = name; + dout(" name is %s\n", pi->name); } *p += len; } @@ -612,10 +613,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, max, bad); while (max--) { ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + err = -ENOMEM; pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) goto bad; pi->id = ceph_decode_32(p); + err = -EINVAL; ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", @@ -631,8 +634,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) __insert_pg_pool(&map->pg_pools, pi); } - if (version >= 5 && __decode_pool_names(p, end, map) < 0) - goto bad; + if (version >= 5) { + err = __decode_pool_names(p, end, map); + if (err < 0) { + dout("fail to decode pool names"); + goto bad; + } + } ceph_decode_32_safe(p, end, map->pool_max, bad); @@ -673,6 +681,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); ceph_decode_copy(p, &pgid, sizeof(pgid)); n = ceph_decode_32(p); + err = -EINVAL; + if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + goto bad; ceph_decode_need(p, end, n * sizeof(u32), bad); err = -ENOMEM; pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); @@ -709,7 +720,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) return map; bad: - dout("osdmap_decode fail\n"); + dout("osdmap_decode fail err %d\n", err); ceph_osdmap_destroy(map); return ERR_PTR(err); } @@ -803,6 +814,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + err = -EINVAL; goto bad; } pi = __lookup_pg_pool(&map->pg_pools, pool); @@ -819,8 +831,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (err < 0) goto bad; } - if (version >= 5 && __decode_pool_names(p, end, map) < 0) - goto bad; + if (version >= 5) { + err = __decode_pool_names(p, end, map); + if (err < 0) + goto bad; + } /* old_pool */ ceph_decode_32_safe(p, end, len, bad); @@ -890,13 +905,19 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pglen = ceph_decode_32(p); if (pglen) { - /* insert */ ceph_decode_need(p, end, pglen*sizeof(u32), bad); + + /* removing existing (if any) */ + (void) __remove_pg_mapping(&map->pg_temp, pgid); + + /* insert */ + err = -EINVAL; + if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + goto bad; + err = -ENOMEM; pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) { - err = -ENOMEM; + if (!pg) goto bad; - } pg->pgid = pgid; pg->len = pglen; for (j = 0; j < pglen; j++) @@ -940,7 +961,7 @@ bad: * for now, we write only a single su, until we can * pass a stride back to the caller. */ -void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, +int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *ono, u64 *oxoff, u64 *oxlen) @@ -954,11 +975,17 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, osize, su); + if (su == 0 || sc == 0) + goto invalid; su_per_object = osize / su; + if (su_per_object == 0) + goto invalid; dout("osize %u / su %u = su_per_object %u\n", osize, su, su_per_object); - BUG_ON((su & ~PAGE_MASK) != 0); + if ((su & ~PAGE_MASK) != 0) + goto invalid; + /* bl = *off / su; */ t = off; do_div(t, su); @@ -986,6 +1013,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, *plen = *oxlen; dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); + return 0; + +invalid: + dout(" invalid layout\n"); + *ono = 0; + *oxoff = 0; + *oxlen = 0; + return -EINVAL; } EXPORT_SYMBOL(ceph_calc_file_object_mapping); diff --git a/net/core/datagram.c b/net/core/datagram.c index e4fbfd6e2bd4..da7e0c867cc0 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -187,7 +187,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, skb_queue_walk(queue, skb) { *peeked = skb->peeked; if (flags & MSG_PEEK) { - if (*off >= skb->len) { + if (*off >= skb->len && skb->len) { *off -= skb->len; continue; } diff --git a/net/core/dev.c b/net/core/dev.c index c299416d0e89..eb858dc6ab86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1056,6 +1056,8 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { + char *new_ifalias; + ASSERT_RTNL(); if (len >= IFALIASZ) @@ -1069,9 +1071,10 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return 0; } - dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!dev->ifalias) + new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); + if (!new_ifalias) return -ENOMEM; + dev->ifalias = new_ifalias; strlcpy(dev->ifalias, alias, len+1); return len; @@ -1638,6 +1641,19 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (!ptype->af_packet_priv || !skb->sk) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1655,8 +1671,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { + (!skb_loop_sk(ptype, skb))) { if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; @@ -2106,7 +2121,8 @@ static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) static netdev_features_t harmonize_features(struct sk_buff *skb, __be16 protocol, netdev_features_t features) { - if (!can_checksum_protocol(features, protocol)) { + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; features &= ~NETIF_F_SG; } else if (illegal_highdma(skb->dev, skb)) { @@ -2121,6 +2137,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) __be16 protocol = skb->protocol; netdev_features_t features = skb->dev->features; + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + features &= ~NETIF_F_GSO_MASK; + if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; @@ -2599,15 +2618,16 @@ void __skb_get_rxhash(struct sk_buff *skb) if (!skb_flow_dissect(skb, &keys)) return; - if (keys.ports) { - if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) - swap(keys.port16[0], keys.port16[1]); + if (keys.ports) skb->l4_rxhash = 1; - } /* get a consistent hash (same value on both flow directions) */ - if ((__force u32)keys.dst < (__force u32)keys.src) + if (((__force u32)keys.dst < (__force u32)keys.src) || + (((__force u32)keys.dst == (__force u32)keys.src) && + ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { swap(keys.dst, keys.src); + swap(keys.port16[0], keys.port16[1]); + } hash = jhash_3words((__force u32)keys.dst, (__force u32)keys.src, @@ -2743,8 +2763,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) + rflow->last_qtail)) >= 0)) { + tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + } if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { *rflowp = rflow; @@ -3189,18 +3211,18 @@ another_round: ncls: #endif - rx_handler = rcu_dereference(skb->dev->rx_handler); if (vlan_tx_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - if (vlan_do_receive(&skb, !rx_handler)) + if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } + rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); @@ -3220,6 +3242,9 @@ ncls: } } + if (vlan_tx_nonzero_tag_present(skb)) + skb->pkt_type = PACKET_OTHERHOST; + /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; @@ -5909,6 +5934,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_segs = GSO_MAX_SEGS; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); @@ -6284,7 +6310,8 @@ static struct hlist_head *netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { - INIT_LIST_HEAD(&net->dev_base_head); + if (net != &init_net) + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 626698f0db8b..76f6d0b02f28 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -308,7 +308,8 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr, */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); - if (ha->addr == dev->dev_addr && ha->refcount == 1) + if (!memcmp(ha->addr, addr, dev->addr_len) && + ha->type == addr_type && ha->refcount == 1) return -ENOENT; err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 73b90351df5c..ac88107d1bc9 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1285,8 +1285,6 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) if (!dst) goto discard; - __skb_pull(skb, skb_network_offset(skb)); - if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; @@ -1296,6 +1294,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) neigh_hh_init(neigh, dst); do { + __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); @@ -1326,9 +1325,8 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) unsigned int seq; int err; - __skb_pull(skb, skb_network_offset(skb)); - do { + __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 31a5ae51a45c..dd00b71dd092 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -25,7 +25,9 @@ static DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); -struct net init_net; +struct net init_net = { + .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), +}; EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b81369b6ddc0..114d8a9e8570 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1802,10 +1802,13 @@ static ssize_t pktgen_thread_write(struct file *file, return -EFAULT; i += len; mutex_lock(&pktgen_thread_lock); - pktgen_add_device(t, f); + ret = pktgen_add_device(t, f); mutex_unlock(&pktgen_thread_lock); - ret = count; - sprintf(pg_result, "OK: add_device=%s", f); + if (!ret) { + ret = count; + sprintf(pg_result, "OK: add_device=%s", f); + } else + sprintf(pg_result, "ERROR: can not add device %s", f); goto out; } @@ -2932,7 +2935,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, sizeof(struct ipv6hdr) - sizeof(struct udphdr) - pkt_dev->pkt_overhead; - if (datalen < sizeof(struct pktgen_hdr)) { + if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) { datalen = sizeof(struct pktgen_hdr); if (net_ratelimit()) pr_info("increased datalen to %d\n", datalen); diff --git a/net/core/sock.c b/net/core/sock.c index 0f8402ea434b..4b469e367923 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -644,7 +644,8 @@ set_rcvbuf: case SO_KEEPALIVE: #ifdef CONFIG_INET - if (sk->sk_protocol == IPPROTO_TCP) + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) tcp_set_keepalive(sk, valbool); #endif sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); @@ -1411,6 +1412,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; + sk->sk_gso_max_segs = dst->dev->gso_max_segs; } } } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b9868e1fd62c..aa74be442dfb 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -126,6 +126,9 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(*req)) return -EINVAL; + if (req->sdiag_family >= AF_MAX) + return -EINVAL; + hndl = sock_diag_lock_handler(req->sdiag_family); if (hndl == NULL) err = -ENOENT; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 75c3582a7678..fb85d371a8de 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -246,7 +246,7 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, optval, optlen); return rc; @@ -257,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, optval, optlen); return rc; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 70bfaf2d1965..b658f3b8a23c 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -531,6 +531,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) return -EINVAL; + memset(&tfrc, 0, sizeof(tfrc)); tfrc.tfrctx_x = hc->tx_x; tfrc.tfrctx_x_recv = hc->tx_x_recv; tfrc.tfrctx_x_calc = hc->tx_x_calc; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index caf6e1734b62..c6f6e425b2e4 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -435,8 +435,8 @@ exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: - bh_unlock_sock(newsk); - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + dccp_done(newsk); goto exit; } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4dc588f520e0..aaa8f8bee9e1 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -611,7 +611,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newinet->inet_rcv_saddr = LOOPBACK4_IPV6; if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + dccp_done(newsk); goto out; } __inet6_hash(newsk, NULL); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0b711659ac74..de5ec03036f1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -240,8 +240,12 @@ EXPORT_SYMBOL(inet_listen); u32 inet_ehash_secret __read_mostly; EXPORT_SYMBOL(inet_ehash_secret); +u32 ipv6_hash_secret __read_mostly; +EXPORT_SYMBOL(ipv6_hash_secret); + /* - * inet_ehash_secret must be set exactly once + * inet_ehash_secret must be set exactly once, and to a non nul value + * ipv6_hash_secret must be set exactly once. */ void build_ehash_secret(void) { @@ -251,7 +255,8 @@ void build_ehash_secret(void) get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - cmpxchg(&inet_ehash_secret, 0, rnd); + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) + get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); } EXPORT_SYMBOL(build_ehash_secret); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 19d66cefd7d3..3f4043258542 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -659,6 +659,22 @@ void inet_csk_destroy_sock(struct sock *sk) } EXPORT_SYMBOL(inet_csk_destroy_sock); +/* This function allows to force a closure of a socket after the call to + * tcp/dccp_create_openreq_child(). + */ +void inet_csk_prepare_forced_close(struct sock *sk) +{ + /* sk_clone_lock locked the socket and set refcnt to 2 */ + bh_unlock_sock(sk); + sock_put(sk); + + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + percpu_counter_inc(sk->sk_prot->orphan_count); + inet_sk(sk)->inet_num = 0; +} +EXPORT_SYMBOL(inet_csk_prepare_forced_close); + int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8f8db724bfaf..d7b862ad4be4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,10 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */ + struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */ +#endif }; #define INET_DIAG_PUT(skb, attrtype, attrlen) \ @@ -419,25 +423,31 @@ static int inet_diag_bc_run(const struct nlattr *_bc, break; } - if (cond->prefix_len == 0) - break; - if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; + if (cond->family != AF_UNSPEC && + cond->family != entry->family) { + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr + 3, + cond->addr, + cond->prefix_len)) + break; + } + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; if (bitstring_match(addr, cond->addr, cond->prefix_len)) break; - if (entry->family == AF_INET6 && - cond->family == AF_INET) { - if (addr[0] == 0 && addr[1] == 0 && - addr[2] == htonl(0xffff) && - bitstring_match(addr + 3, cond->addr, - cond->prefix_len)) - break; - } yes = 0; break; } @@ -500,6 +510,55 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* Validate an inet_diag_hostcond. */ +static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + int addr_len; + struct inet_diag_hostcond *cond; + + /* Check hostcond space. */ + *min_len += sizeof(struct inet_diag_hostcond); + if (len < *min_len) + return false; + cond = (struct inet_diag_hostcond *)(op + 1); + + /* Check address family and address length. */ + switch (cond->family) { + case AF_UNSPEC: + addr_len = 0; + break; + case AF_INET: + addr_len = sizeof(struct in_addr); + break; + case AF_INET6: + addr_len = sizeof(struct in6_addr); + break; + default: + return false; + } + *min_len += addr_len; + if (len < *min_len) + return false; + + /* Check prefix length (in bits) vs address length (in bytes). */ + if (cond->prefix_len > 8 * addr_len) + return false; + + return true; +} + +/* Validate a port comparison operator. */ +static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, + int len, int *min_len) +{ + /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ + *min_len += sizeof(struct inet_diag_bc_op); + if (len < *min_len) + return false; + return true; +} + static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { const void *bc = bytecode; @@ -507,29 +566,39 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) while (len > 0) { const struct inet_diag_bc_op *op = bc; + int min_len = sizeof(struct inet_diag_bc_op); //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { - case INET_DIAG_BC_AUTO: case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: + if (!valid_hostcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: - case INET_DIAG_BC_JMP: - if (op->no < 4 || op->no > len + 4 || op->no & 3) - return -EINVAL; - if (op->no < len && - !valid_cc(bytecode, bytecode_len, len - op->no)) + if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; default: return -EINVAL; } - if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) + + if (op->code != INET_DIAG_BC_NOP) { + if (op->no < min_len || op->no > len + 4 || op->no & 3) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len - op->no)) + return -EINVAL; + } + + if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) return -EINVAL; bc += op->yes; len -= op->yes; @@ -586,6 +655,36 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } +/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses + * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. + */ +static inline void inet_diag_req_addrs(const struct sock *sk, + const struct request_sock *req, + struct inet_diag_entry *entry) +{ + struct inet_request_sock *ireq = inet_rsk(req); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + if (req->rsk_ops->family == AF_INET6) { + entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32; + entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32; + } else if (req->rsk_ops->family == AF_INET) { + ipv6_addr_set_v4mapped(ireq->loc_addr, + &entry->saddr_storage); + ipv6_addr_set_v4mapped(ireq->rmt_addr, + &entry->daddr_storage); + entry->saddr = entry->saddr_storage.s6_addr32; + entry->daddr = entry->daddr_storage.s6_addr32; + } + } else +#endif + { + entry->saddr = &ireq->loc_addr; + entry->daddr = &ireq->rmt_addr; + } +} + static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, struct request_sock *req, u32 pid, u32 seq, const struct nlmsghdr *unlh) @@ -624,8 +723,10 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, r->idiag_inode = 0; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = inet6_rsk(req)->loc_addr; - *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; + struct inet_diag_entry entry; + inet_diag_req_addrs(sk, req, &entry); + memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); + memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); } #endif nlh->nlmsg_len = skb_tail_pointer(skb) - b; @@ -683,18 +784,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, continue; if (bc) { - entry.saddr = -#if IS_ENABLED(CONFIG_IPV6) - (entry.family == AF_INET6) ? - inet6_rsk(req)->loc_addr.s6_addr32 : -#endif - &ireq->loc_addr; - entry.daddr = -#if IS_ENABLED(CONFIG_IPV6) - (entry.family == AF_INET6) ? - inet6_rsk(req)->rmt_addr.s6_addr32 : -#endif - &ireq->rmt_addr; + inet_diag_req_addrs(sk, req, &entry); entry.dport = ntohs(ireq->rmt_port); if (!inet_diag_bc_run(bc, &entry)) @@ -875,13 +965,16 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc) { const struct inet_diag_handler *handler; + int err = 0; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r, bc); + else + err = PTR_ERR(handler); inet_diag_unlock_handler(handler); - return skb->len; + return err ? : skb->len; } static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 3727e234c884..b7bf6e30adbc 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -685,28 +685,27 @@ EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) { - const struct iphdr *iph; + struct iphdr iph; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) + if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) return skb; - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) + if (iph.ihl < 5 || iph.version != 4) return skb; - if (!pskb_may_pull(skb, iph->ihl*4)) - return skb; - iph = ip_hdr(skb); - len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl * 4)) + + len = ntohs(iph.tot_len); + if (skb->len < len || len < (iph.ihl * 4)) return skb; - if (ip_is_fragment(ip_hdr(skb))) { + if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { + if (!pskb_may_pull(skb, iph.ihl*4)) + return skb; if (pskb_trim_rcsum(skb, len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2fd0fba77124..374828487003 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -456,19 +456,28 @@ static int do_ip_setsockopt(struct sock *sk, int level, struct inet_sock *inet = inet_sk(sk); int val = 0, err; - if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | - (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | - (1<<IP_RETOPTS) | (1<<IP_TOS) | - (1<<IP_TTL) | (1<<IP_HDRINCL) | - (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | - (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | - (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | - (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) || - optname == IP_UNICAST_IF || - optname == IP_MULTICAST_TTL || - optname == IP_MULTICAST_ALL || - optname == IP_MULTICAST_LOOP || - optname == IP_RECVORIGDSTADDR) { + switch (optname) { + case IP_PKTINFO: + case IP_RECVTTL: + case IP_RECVOPTS: + case IP_RECVTOS: + case IP_RETOPTS: + case IP_TOS: + case IP_TTL: + case IP_HDRINCL: + case IP_MTU_DISCOVER: + case IP_RECVERR: + case IP_ROUTER_ALERT: + case IP_FREEBIND: + case IP_PASSSEC: + case IP_TRANSPARENT: + case IP_MINTTL: + case IP_NODEFRAG: + case IP_UNICAST_IF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_ALL: + case IP_MULTICAST_LOOP: + case IP_RECVORIGDSTADDR: if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -580,7 +589,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TTL: if (optlen < 1) goto e_inval; - if (val != -1 && (val < 0 || val > 255)) + if (val != -1 && (val < 1 || val > 255)) goto e_inval; inet->uc_ttl = val; break; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 960fbfc3e976..8626b645ec62 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ipmr_new_table(struct net *net, u32 id); +static void ipmr_free_table(struct mr_table *mrt); + static int ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local); @@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); +static void mroute_clean_tables(struct mr_table *mrt); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net) list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); - kfree(mrt); + ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); } @@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { - kfree(net->ipv4.mrt); + ipmr_free_table(net->ipv4.mrt); } #endif @@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) return mrt; } +static void ipmr_free_table(struct mr_table *mrt) +{ + del_timer_sync(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} + /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 57932c43960e..566be2dd73f4 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -148,7 +148,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, hdr, NULL, &matchoff, &matchlen, &addr, &port) > 0) { - unsigned int matchend, poff, plen, buflen, n; + unsigned int olen, matchend, poff, plen, buflen, n; char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; /* We're only interested in headers related to this @@ -163,11 +163,12 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, goto next; } + olen = *datalen; if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; - matchend = matchoff + matchlen; + matchend = matchoff + matchlen + *datalen - olen; /* The maddr= parameter (RFC 2361) specifies where to send * the reply. */ @@ -501,7 +502,10 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); port = 0; break; diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 3828a4229822..da4098f08784 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -194,7 +194,8 @@ nf_nat_out(unsigned int hooknum, if ((ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) || - (ct->tuplehash[dir].tuple.src.u.all != + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all) ) return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; @@ -230,7 +231,8 @@ nf_nat_local_fn(unsigned int hooknum, ret = NF_DROP; } #ifdef CONFIG_XFRM - else if (ct->tuplehash[dir].tuple.dst.u.all != + else if (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) if (ip_xfrm_me_harder(skb)) ret = NF_DROP; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 50009c787bcd..c234bda5b801 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -321,8 +321,8 @@ void ping_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); struct inet_sock *inet_sock; - int type = icmph->type; - int code = icmph->code; + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8af0d44e4e22..212897517b89 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), - SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), @@ -258,6 +257,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), + SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), + SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bbd604c68e68..2fe0dc28ea98 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,18 +131,20 @@ found: * 0 - deliver * 1 - block */ -static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) { - int type; + struct icmphdr _hdr; + const struct icmphdr *hdr; - if (!pskb_may_pull(skb, sizeof(struct icmphdr))) + hdr = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_hdr), &_hdr); + if (!hdr) return 1; - type = icmp_hdr(skb)->type; - if (type < 32) { + if (hdr->type < 32) { __u32 data = raw_sk(sk)->filter.data; - return ((1 << type) & data) != 0; + return ((1U << hdr->type) & data) != 0; } /* Do not block unknown ICMP types */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7a7724da9bff..bf7a604c695c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -590,6 +590,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_challenge_ack_limit", + .data = &sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 38d6e4374561..8429ac5eb914 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -485,14 +485,12 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { - struct sk_buff *skb; answ = tp->rcv_nxt - tp->copied_seq; - /* Subtract 1, if FIN is in queue. */ - skb = skb_peek_tail(&sk->sk_receive_queue); - if (answ && skb) - answ -= tcp_hdr(skb)->fin; + /* Subtract 1, if FIN was received */ + if (answ && sock_flag(sk, SOCK_DONE)) + answ--; } else answ = tp->urg_seq - tp->copied_seq; release_sock(sk); @@ -744,7 +742,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, old_size_goal + mss_now > xmit_size_goal)) { xmit_size_goal = old_size_goal; } else { - tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + tp->xmit_size_goal_segs = + min_t(u16, xmit_size_goal / mss_now, + sk->sk_gso_max_segs); xmit_size_goal = tp->xmit_size_goal_segs * mss_now; } } @@ -1602,8 +1602,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } #ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + if (tp->ucopy.dma_chan) { + if (tp->rcv_wnd == 0 && + !skb_queue_empty(&sk->sk_async_wait_queue)) { + tcp_service_net_dma(sk, true); + tcp_cleanup_rbuf(sk, copied); + } else + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + } #endif if (copied >= target) { /* Do not sleep, just process backlog. */ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 272a84593c85..69251dde7501 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,7 +291,8 @@ int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) left = tp->snd_cwnd - in_flight; if (sk_can_gso(sk) && left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size) + left * tp->mss_cache < sk->sk_gso_max_size && + left < sk->sk_gso_max_segs) return 1; return left <= tcp_max_tso_deferred_mss(tp); } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 813b43a76fec..834857f3c871 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -313,11 +313,13 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, .tcpv_rttcnt = ca->cnt_rtt, .tcpv_minrtt = ca->base_rtt, }; - u64 t = ca->sum_rtt; - do_div(t, ca->cnt_rtt); - info.tcpv_rtt = t; + if (info.tcpv_rttcnt > 0) { + u64 t = ca->sum_rtt; + do_div(t, info.tcpv_rttcnt); + info.tcpv_rtt = t; + } nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6b017383119e..b40e05b9c451 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -88,6 +88,9 @@ int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +/* rfc5961 challenge ack rate limiting */ +int sysctl_tcp_challenge_ack_limit = 100; + int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -3037,13 +3040,14 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked, * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int newly_acked_sacked, bool is_dupack, + int prior_sacked, bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); + int newly_acked_sacked = 0; int fast_rexmit = 0, mib_idx; if (WARN_ON(!tp->packets_out && tp->sacked_out)) @@ -3103,6 +3107,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_add_reno_sack(sk); } else do_lost = tcp_try_undo_partial(sk, pkts_acked); + newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; break; case TCP_CA_Loss: if (flag & FLAG_DATA_ACKED) @@ -3124,6 +3129,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (is_dupack) tcp_add_reno_sack(sk); } + newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); @@ -3633,6 +3639,11 @@ static int tcp_process_frto(struct sock *sk, int flag) } } else { if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + if (!tcp_packets_in_flight(tp)) { + tcp_enter_frto_loss(sk, 2, flag); + return true; + } + /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); @@ -3681,6 +3692,24 @@ static int tcp_process_frto(struct sock *sk, int flag) return 0; } +/* RFC 5961 7 [ACK Throttling] */ +static void tcp_send_challenge_ack(struct sock *sk) +{ + /* unprotected vars, we dont care of overwrites */ + static u32 challenge_timestamp; + static unsigned int challenge_count; + u32 now = jiffies / HZ; + + if (now != challenge_timestamp) { + challenge_timestamp = now; + challenge_count = 0; + } + if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + tcp_send_ack(sk); + } +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3695,14 +3724,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets; int prior_sacked = tp->sacked_out; int pkts_acked = 0; - int newly_acked_sacked = 0; int frto_cwnd = 0; /* If the ack is older than previous acks * then we can probably ignore it. */ - if (before(ack, prior_snd_una)) + if (before(ack, prior_snd_una)) { + /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ + if (before(ack, prior_snd_una - tp->max_window)) { + tcp_send_challenge_ack(sk); + return -1; + } goto old_ack; + } /* If the ack includes data we haven't sent yet, discard * this segment (RFC793 Section 3.9). @@ -3768,8 +3802,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); pkts_acked = prior_packets - tp->packets_out; - newly_acked_sacked = (prior_packets - prior_sacked) - - (tp->packets_out - tp->sacked_out); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); @@ -3783,7 +3815,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) @@ -3798,7 +3830,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3818,8 +3850,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - newly_acked_sacked = tp->sacked_out - prior_sacked; - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } @@ -5271,8 +5302,8 @@ out: /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ -static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, int syn_inerr) +static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) { const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); @@ -5297,38 +5328,48 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ - if (!th->rst) + if (!th->rst) { + if (th->syn) + goto syn_challenge; tcp_send_dupack(sk, skb); + } goto discard; } /* Step 2: check RST bit */ if (th->rst) { - tcp_reset(sk); + /* RFC 5961 3.2 : + * If sequence number exactly matches RCV.NXT, then + * RESET the connection + * else + * Send a challenge ACK + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + tcp_reset(sk); + else + tcp_send_challenge_ack(sk); goto discard; } - /* ts_recent update must be made after we are sure that the packet - * is in window. - */ - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - /* step 3: check security and precedence [ignored] */ - /* step 4: Check for a SYN in window. */ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* step 4: Check for a SYN + * RFC 5691 4.2 : Send a challenge ack + */ + if (th->syn) { +syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + tcp_send_challenge_ack(sk); + goto discard; } - return 1; + return true; discard: __kfree_skb(skb); - return 0; + return false; } /* @@ -5358,7 +5399,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); - int res; /* * Header prediction. @@ -5538,14 +5578,18 @@ slow_path: * Standard slow path. */ - res = tcp_validate_incoming(sk, skb, th, 1); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 1)) + return 0; step5: if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) goto discard; + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. */ @@ -5850,7 +5894,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; - int res; tp->rx_opt.saw_tstamp = 0; @@ -5905,9 +5948,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - res = tcp_validate_incoming(sk, skb, th, 0); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 0)) + return 0; /* step 5: check the ACK field */ if (th->ack) { @@ -6018,6 +6060,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } else goto discard; + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + /* step 6: check the URG bit */ tcp_urg(sk, skb, th); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0cb86ceb652f..76f50e1b53af 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -678,10 +678,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; /* When socket is gone, all binding information is lost. - * routing might fail in this case. using iif for oif to - * make sure we can deliver it + * routing might fail in this case. No choice here, if we choose to force + * input interface, we will misroute in case of asymmetric route. */ - arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); + if (sk) + arg.bound_dev_if = sk->sk_bound_dev_if; net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; @@ -1523,10 +1524,8 @@ exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: - tcp_clear_xmit_timers(newsk); - tcp_cleanup_congestion_control(newsk); - bh_unlock_sock(newsk); - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); goto exit; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7ac6423117ad..2d27e1af9303 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1318,21 +1318,21 @@ static void tcp_cwnd_validate(struct sock *sk) * when we would be allowed to send the split-due-to-Nagle skb fully. */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int cwnd) + unsigned int mss_now, unsigned int max_segs) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, cwnd_len; + u32 needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; - cwnd_len = mss_now * cwnd; + max_len = mss_now * max_segs; - if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) - return cwnd_len; + if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + return max_len; needed = min(skb->len, window); - if (cwnd_len <= needed) - return cwnd_len; + if (max_len <= needed) + return max_len; return needed - needed % mss_now; } @@ -1560,7 +1560,8 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= sk->sk_gso_max_size) + if (limit >= min_t(unsigned int, sk->sk_gso_max_size, + sk->sk_gso_max_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1786,7 +1787,9 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - cwnd_quota); + min_t(unsigned int, + cwnd_quota, + sk->sk_gso_max_segs)); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7d5cb975cc6f..81e0ad2442c7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -493,8 +493,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -503,7 +502,6 @@ static void addrconf_forward_change(struct net *net, __s32 newf) dev_forward_change(idev); } } - rcu_read_unlock(); } static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) @@ -795,10 +793,16 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) struct in6_addr prefix; struct rt6_info *rt; struct net *net = dev_net(ifp->idev->dev); + struct flowi6 fl6 = {}; + ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); - rt = rt6_lookup(net, &prefix, NULL, ifp->idev->dev->ifindex, 1); + fl6.flowi6_oif = ifp->idev->dev->ifindex; + fl6.daddr = prefix; + rt = (struct rt6_info *)ip6_route_lookup(net, &fl6, + RT6_LOOKUP_F_IFACE); - if (rt && addrconf_is_prefix_route(rt)) { + if (rt != net->ipv6.ip6_null_entry && + addrconf_is_prefix_route(rt)) { if (onlink == 0) { ip6_del_rt(rt); rt = NULL; @@ -1732,7 +1736,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & flags) != flags) continue; - if ((noflags != 0) && ((rt->rt6i_flags & flags) != 0)) + if ((rt->rt6i_flags & noflags) != 0) continue; dst_hold(&rt->dst); break; @@ -3091,14 +3095,15 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) struct hlist_node *n; hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; /* sync with offset */ if (p < state->offset) { p++; continue; } state->offset++; - if (net_eq(dev_net(ifa->idev->dev), net)) - return ifa; + return ifa; } /* prepare for next bucket */ @@ -3116,18 +3121,20 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct hlist_node *n = &ifa->addr_lst; hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; state->offset++; - if (net_eq(dev_net(ifa->idev->dev), net)) - return ifa; + return ifa; } while (++state->bucket < IN6_ADDR_HSIZE) { state->offset = 0; hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; state->offset++; - if (net_eq(dev_net(ifa->idev->dev), net)) - return ifa; + return ifa; } } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 92bb9cba5c39..c3a007dc37cd 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -818,6 +818,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) offsetof(struct rt6_info, rt6i_src), allow_create, replace_required); + if (IS_ERR(sn)) { + err = PTR_ERR(sn); + sn = NULL; + } if (!sn) { /* If it is failed, discard just allocated root, and then (in st_failure) stale node diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 13e5399b1cd9..ce661baa60cb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1287,10 +1287,10 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, cork->length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len; + exthdrlen = (opt ? opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; - dst_exthdrlen = rt->dst.header_len; + dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } else { rt = (struct rt6_info *)cork->dst; fl6 = &inet->cork.fl.u.ip6; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 63dd1f89ed7d..34c1109d3468 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -828,6 +828,7 @@ pref_skip_coa: if (val < 0 || val > 255) goto e_inval; np->min_hopcount = val; + retv = 0; break; case IPV6_DONTFRAG: np->dontfrag = valbool; diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 7e1e0fbfef21..740c919e0b6c 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -84,28 +84,30 @@ static int mip6_mh_len(int type) static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) { - struct ip6_mh *mh; + struct ip6_mh _hdr; + const struct ip6_mh *mh; - if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) || - !pskb_may_pull(skb, (skb_transport_offset(skb) + - ((skb_transport_header(skb)[1] + 1) << 3)))) + mh = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_hdr), &_hdr); + if (!mh) return -1; - mh = (struct ip6_mh *)skb_transport_header(skb); + if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len) + return -1; if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n", mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); - mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) - - skb_network_header(skb))); + mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + + skb_network_header_len(skb)); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n", mh->ip6mh_proto); - mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) - - skb_network_header(skb))); + mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + + skb_network_header_len(skb)); return -1; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 176b469322ac..843d6ebc525e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -593,7 +593,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; - struct in6_addr mcaddr; + struct in6_addr mcaddr = IN6ADDR_LINKLOCAL_ALLNODES_INIT; idev = in6_dev_get(dev); if (!idev) @@ -601,7 +601,6 @@ static void ndisc_send_unsol_na(struct net_device *dev) read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { - addrconf_addr_solict_mult(&ifa->addr, &mcaddr); ndisc_send_na(dev, NULL, &mcaddr, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5bddea778840..3ee28700de4c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -107,21 +107,20 @@ found: * 0 - deliver * 1 - block */ -static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) +static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) { - struct icmp6hdr *icmph; - struct raw6_sock *rp = raw6_sk(sk); - - if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) { - __u32 *data = &rp->filter.data[0]; - int bit_nr; + struct icmp6hdr *_hdr; + const struct icmp6hdr *hdr; - icmph = (struct icmp6hdr *) skb->data; - bit_nr = icmph->icmp6_type; + hdr = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_hdr), &_hdr); + if (hdr) { + const __u32 *data = &raw6_sk(sk)->filter.data[0]; + unsigned int type = hdr->icmp6_type; - return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0; + return (data[type >> 5] & (1U << (type & 31))) != 0; } - return 0; + return 1; } #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c4920ca83f5f..493490f052a0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -203,7 +203,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { }; static const u32 ip6_template_metrics[RTAX_MAX] = { - [RTAX_HOPLIMIT - 1] = 255, + [RTAX_HOPLIMIT - 1] = 0, }; static struct rt6_info ip6_null_entry_template = { @@ -846,7 +846,8 @@ restart: dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) + if (!dst_get_neighbour_noref_raw(&rt->dst) && + !(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_LOCAL))) nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); else if (!(rt->dst.flags & DST_HOST)) nrt = rt6_alloc_clone(rt, &fl6->daddr); @@ -1135,7 +1136,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); spin_lock_bh(&icmp6_dst_lock); rt->dst.next = icmp6_dst_gc_list; @@ -1485,17 +1486,18 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) - return -ENOENT; + if (rt == net->ipv6.ip6_null_entry) { + err = -ENOENT; + goto out; + } table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_del(rt, info); - dst_release(&rt->dst); - write_unlock_bh(&table->tb6_lock); +out: + dst_release(&rt->dst); return err; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 98256cf72f9d..3889e0204183 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -896,7 +896,8 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; - fl6.flowi6_oif = inet6_iif(skb); + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = inet6_iif(skb); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); @@ -1410,7 +1411,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #endif if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); goto out; } __inet6_hash(newsk, NULL); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 89ff8c67943e..7501b22b9c59 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1253,11 +1253,10 @@ static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) /* Remove from tunnel list */ spin_lock_bh(&pn->l2tp_tunnel_list_lock); list_del_rcu(&tunnel->list); + kfree_rcu(tunnel, rcu); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - synchronize_rcu(); atomic_dec(&l2tp_tunnel_count); - kfree(tunnel); } /* Create a socket for the tunnel, if one isn't set up by diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a16a48e79fab..439379484bfc 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -157,6 +157,7 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; /* hashed list of sessions, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 7446038e6b42..ab9a293ad034 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -132,7 +132,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, printk("\n"); } - if (!pskb_may_pull(skb, sizeof(ETH_HLEN))) + if (!pskb_may_pull(skb, ETH_HLEN)) goto error; secpath_reset(skb); @@ -269,6 +269,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p out_del_dev: free_netdev(dev); + spriv->dev = NULL; out_del_session: l2tp_session_delete(session); out: diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index b9bef2c75026..df08d7779e1d 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -971,14 +971,13 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - int rc = 0; + int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; *uaddrlen = sizeof(sllc); - memset(uaddr, 0, *uaddrlen); if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 677d65929780..944334869596 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -151,7 +151,17 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, sta = sta_info_get(sdata, mac_addr); else sta = sta_info_get_bss(sdata, mac_addr); - if (!sta) { + /* + * The ASSOC test makes sure the driver is ready to + * receive the key. When wpa_supplicant has roamed + * using FT, it attempts to set the key before + * association has completed, this rejects that attempt + * so it will set the key again after assocation. + * + * TODO: accept the key if we have a station entry and + * add it to the device after the station. + */ + if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) { ieee80211_key_free(sdata->local, key); err = -ENOENT; goto out_unlock; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index cef7c29214a8..50191a30207c 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -664,8 +664,8 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other " "IBSS networks with same SSID (merge)\n", sdata->name); - ieee80211_request_internal_scan(sdata, - ifibss->ssid, ifibss->ssid_len, NULL); + ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, + NULL); } static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) @@ -772,9 +772,8 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to " "join\n", sdata->name); - ieee80211_request_internal_scan(sdata, - ifibss->ssid, ifibss->ssid_len, - ifibss->fixed_channel ? ifibss->channel : NULL); + ieee80211_request_ibss_scan(sdata, ifibss->ssid, + ifibss->ssid_len, chan); } else { int interval = IEEE80211_SCAN_INTERVAL; @@ -1110,7 +1109,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; sdata->u.ibss.ibss_join_req = jiffies; - memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN); + memcpy(sdata->u.ibss.ssid, params->ssid, params->ssid_len); sdata->u.ibss.ssid_len = params->ssid_len; mutex_unlock(&sdata->u.ibss.mtx); @@ -1153,10 +1152,6 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) mutex_lock(&sdata->u.ibss.mtx); - sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; - memset(sdata->u.ibss.bssid, 0, ETH_ALEN); - sdata->u.ibss.ssid_len = 0; - active_ibss = ieee80211_sta_active_ibss(sdata); if (!active_ibss && !is_zero_ether_addr(ifibss->bssid)) { @@ -1177,6 +1172,10 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) } } + ifibss->state = IEEE80211_IBSS_MLME_SEARCH; + memset(ifibss->bssid, 0, ETH_ALEN); + ifibss->ssid_len = 0; + sta_info_flush(sdata->local, sdata); spin_lock_bh(&ifibss->incomplete_lock); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a68ef6219833..e4b6910d745d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1239,9 +1239,9 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, /* scan/BSS handling */ void ieee80211_scan_work(struct work_struct *work); -int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, - const u8 *ssid, u8 ssid_len, - struct ieee80211_channel *chan); +int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, + const u8 *ssid, u8 ssid_len, + struct ieee80211_channel *chan); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req); void ieee80211_scan_cancel(struct ieee80211_local *local); @@ -1270,10 +1270,8 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata); void ieee80211_sched_scan_stopped_work(struct work_struct *work); /* off-channel helpers */ -void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local, - bool offchannel_ps_enable); -void ieee80211_offchannel_return(struct ieee80211_local *local, - bool offchannel_ps_disable); +void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local); +void ieee80211_offchannel_return(struct ieee80211_local *local); void ieee80211_hw_roc_setup(struct ieee80211_local *local); /* interface handling */ @@ -1303,6 +1301,8 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); +void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, + struct sk_buff_head *skbs); /* HT */ bool ieee80111_cfg_override_disables_ht40(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 25be6831ff09..abc31d7bd2a5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3232,6 +3232,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, goto out_unlock; err_clear: + memset(ifmgd->bssid, 0, ETH_ALEN); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->auth_data = NULL; err_free: kfree(auth_data); @@ -3410,6 +3412,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, err = 0; goto out; err_clear: + memset(ifmgd->bssid, 0, ETH_ALEN); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->assoc_data = NULL; err_free: kfree(assoc_data); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 935aa4b6deee..c22f0748f4f6 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -103,8 +103,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) ieee80211_sta_reset_conn_monitor(sdata); } -void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local, - bool offchannel_ps_enable) +void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; @@ -129,8 +128,7 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local, if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { netif_tx_stop_all_queues(sdata->dev); - if (offchannel_ps_enable && - (sdata->vif.type == NL80211_IFTYPE_STATION) && + if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_enable(sdata, true); } @@ -138,8 +136,7 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local, mutex_unlock(&local->iflist_mtx); } -void ieee80211_offchannel_return(struct ieee80211_local *local, - bool offchannel_ps_disable) +void ieee80211_offchannel_return(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; @@ -152,11 +149,9 @@ void ieee80211_offchannel_return(struct ieee80211_local *local, continue; /* Tell AP we're back */ - if (offchannel_ps_disable && - sdata->vif.type == NL80211_IFTYPE_STATION) { - if (sdata->u.mgd.associated) - ieee80211_offchannel_ps_disable(sdata); - } + if (sdata->vif.type == NL80211_IFTYPE_STATION && + sdata->u.mgd.associated) + ieee80211_offchannel_ps_disable(sdata); if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { /* diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c9b508ea9d6b..8ce9feb13010 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -513,6 +513,11 @@ ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) if (ieee80211_is_action(hdr->frame_control)) { u8 category; + + /* make sure category field is present */ + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) + return RX_DROP_MONITOR; + mgmt = (struct ieee80211_mgmt *)hdr; category = mgmt->u.action.category; if (category != WLAN_CATEGORY_MESH_ACTION && @@ -869,14 +874,16 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) */ if (rx->sta && rx->sdata->vif.type == NL80211_IFTYPE_STATION && ieee80211_is_data_present(hdr->frame_control)) { - u16 ethertype; - u8 *payload; - - payload = rx->skb->data + - ieee80211_hdrlen(hdr->frame_control); - ethertype = (payload[6] << 8) | payload[7]; - if (cpu_to_be16(ethertype) == - rx->sdata->control_port_protocol) + unsigned int hdrlen; + __be16 ethertype; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + + if (rx->skb->len < hdrlen + 8) + return RX_DROP_MONITOR; + + skb_copy_bits(rx->skb, hdrlen + 6, ðertype, 2); + if (ethertype == rx->sdata->control_port_protocol) return RX_CONTINUE; } @@ -1465,11 +1472,14 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) hdr = (struct ieee80211_hdr *)rx->skb->data; fc = hdr->frame_control; + + if (ieee80211_is_ctl(fc)) + return RX_CONTINUE; + sc = le16_to_cpu(hdr->seq_ctrl); frag = sc & IEEE80211_SCTL_FRAG; if (likely((!ieee80211_has_morefrags(fc) && frag == 0) || - (rx->skb)->len < 24 || is_multicast_ether_addr(hdr->addr1))) { /* not fragmented */ goto out; @@ -1892,6 +1902,20 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) hdr = (struct ieee80211_hdr *) skb->data; hdrlen = ieee80211_hdrlen(hdr->frame_control); + + /* make sure fixed part of mesh header is there, also checks skb len */ + if (!pskb_may_pull(rx->skb, hdrlen + 6)) + return RX_DROP_MONITOR; + + mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); + + /* make sure full mesh header is there, also checks skb len */ + if (!pskb_may_pull(rx->skb, + hdrlen + ieee80211_get_mesh_hdrlen(mesh_hdr))) + return RX_DROP_MONITOR; + + /* reload pointers */ + hdr = (struct ieee80211_hdr *) skb->data; mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); /* frame is in RMC, don't forward */ @@ -1900,7 +1924,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) mesh_rmc_check(hdr->addr3, mesh_hdr, rx->sdata)) return RX_DROP_MONITOR; - if (!ieee80211_is_data(hdr->frame_control)) + if (!ieee80211_is_data(hdr->frame_control) || + !(status->rx_flags & IEEE80211_RX_RA_MATCH)) return RX_CONTINUE; if (!mesh_hdr->ttl) @@ -1914,9 +1939,12 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { mpp_addr = hdr->addr3; proxied_addr = mesh_hdr->eaddr1; - } else { + } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) { + /* has_a4 already checked in ieee80211_rx_mesh_check */ mpp_addr = hdr->addr4; proxied_addr = mesh_hdr->eaddr2; + } else { + return RX_DROP_MONITOR; } rcu_read_lock(); @@ -1944,9 +1972,6 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) } skb_set_queue_mapping(skb, q); - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - goto out; - if (!--mesh_hdr->ttl) { IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl); return RX_DROP_MONITOR; @@ -2361,6 +2386,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_SELF_PROTECTED: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.self_prot.action_code))) + break; + switch (mgmt->u.action.u.self_prot.action_code) { case WLAN_SP_MESH_PEERING_OPEN: case WLAN_SP_MESH_PEERING_CLOSE: @@ -2379,6 +2408,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_MESH_ACTION: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.mesh_action.action_code))) + break; + if (!ieee80211_vif_is_mesh(&sdata->vif)) break; if (mesh_action_is_path_sel(mgmt) && @@ -2927,10 +2960,15 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, test_bit(SCAN_SW_SCANNING, &local->scanning))) status->rx_flags |= IEEE80211_RX_IN_SCAN; - if (ieee80211_is_mgmt(fc)) - err = skb_linearize(skb); - else + if (ieee80211_is_mgmt(fc)) { + /* drop frame if too short for header */ + if (skb->len < ieee80211_hdrlen(fc)) + err = -ENOBUFS; + else + err = skb_linearize(skb); + } else { err = !pskb_may_pull(skb, ieee80211_hdrlen(fc)); + } if (err) { dev_kfree_skb(skb); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 9f8412c48960..e7e0e368cc34 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -329,7 +329,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, if (!was_hw_scan) { ieee80211_configure_filter(local); drv_sw_scan_complete(local); - ieee80211_offchannel_return(local, true); + ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); @@ -374,7 +374,7 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local) local->next_scan_state = SCAN_DECISION; local->scan_channel_idx = 0; - ieee80211_offchannel_stop_vifs(local, true); + ieee80211_offchannel_stop_vifs(local); ieee80211_configure_filter(local); @@ -653,12 +653,8 @@ static void ieee80211_scan_state_suspend(struct ieee80211_local *local, local->scan_channel = NULL; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); - /* - * Re-enable vifs and beaconing. Leave PS - * in off-channel state..will put that back - * on-channel at the end of scanning. - */ - ieee80211_offchannel_return(local, false); + /* disable PS */ + ieee80211_offchannel_return(local); #ifndef CONFIG_MAC80211_SCAN_ABORT *next_delay = HZ / 5; @@ -679,8 +675,7 @@ static void ieee80211_scan_state_suspend(struct ieee80211_local *local, static void ieee80211_scan_state_resume(struct ieee80211_local *local, unsigned long *next_delay) { - /* PS already is in off-channel mode */ - ieee80211_offchannel_stop_vifs(local, false); + ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { drv_flush(local, false); @@ -804,9 +799,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, return res; } -int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, - const u8 *ssid, u8 ssid_len, - struct ieee80211_channel *chan) +int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, + const u8 *ssid, u8 ssid_len, + struct ieee80211_channel *chan) { struct ieee80211_local *local = sdata->local; int ret = -EBUSY; @@ -820,22 +815,36 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, /* fill internal scan request */ if (!chan) { - int i, nchan = 0; + int i, max_n; + int n_ch = 0; for (band = 0; band < IEEE80211_NUM_BANDS; band++) { if (!local->hw.wiphy->bands[band]) continue; - for (i = 0; - i < local->hw.wiphy->bands[band]->n_channels; - i++) { - local->int_scan_req->channels[nchan] = + + max_n = local->hw.wiphy->bands[band]->n_channels; + for (i = 0; i < max_n; i++) { + struct ieee80211_channel *tmp_ch = &local->hw.wiphy->bands[band]->channels[i]; - nchan++; + + if (tmp_ch->flags & (IEEE80211_CHAN_NO_IBSS | + IEEE80211_CHAN_DISABLED)) + continue; + + local->int_scan_req->channels[n_ch] = tmp_ch; + n_ch++; } } - local->int_scan_req->n_channels = nchan; + if (WARN_ON_ONCE(n_ch == 0)) + goto unlock; + + local->int_scan_req->n_channels = n_ch; } else { + if (WARN_ON_ONCE(chan->flags & (IEEE80211_CHAN_NO_IBSS | + IEEE80211_CHAN_DISABLED))) + goto unlock; + local->int_scan_req->channels[0] = chan; local->int_scan_req->n_channels = 1; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index d93d39b84344..e2e0e0bc6622 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -738,8 +738,8 @@ int __must_check __sta_info_destroy(struct sta_info *sta) for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); - __skb_queue_purge(&sta->ps_tx_buf[ac]); - __skb_queue_purge(&sta->tx_filtered[ac]); + ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); + ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } #ifdef CONFIG_MAC80211_MESH @@ -774,7 +774,7 @@ int __must_check __sta_info_destroy(struct sta_info *sta) tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]); if (!tid_tx) continue; - __skb_queue_purge(&tid_tx->pending); + ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } @@ -844,7 +844,7 @@ void sta_info_init(struct ieee80211_local *local) void sta_info_stop(struct ieee80211_local *local) { - del_timer(&local->sta_cleanup); + del_timer_sync(&local->sta_cleanup); sta_info_flush(local, NULL); } @@ -959,6 +959,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; int filtered = 0, buffered = 0, ac; + unsigned long flags; clear_sta_flag(sta, WLAN_STA_SP); @@ -974,12 +975,16 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; + spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending); + spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); tmp = skb_queue_len(&pending); filtered += tmp - count; count = tmp; + spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending); + spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); tmp = skb_queue_len(&pending); buffered += tmp - count; } diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 5f8f89e89d6b..47b117f3f567 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -660,3 +660,12 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb) dev_kfree_skb_any(skb); } EXPORT_SYMBOL(ieee80211_free_txskb); + +void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, + struct sk_buff_head *skbs) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(skbs))) + ieee80211_free_txskb(hw, skb); +} diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e76facc69e95..eace7664c805 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1357,7 +1357,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) if (tx->skb) dev_kfree_skb(tx->skb); else - __skb_queue_purge(&tx->skbs); + ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); return -1; } else if (unlikely(res == TX_QUEUED)) { I802_DEBUG_INC(tx->local->tx_handlers_queued); @@ -2126,10 +2126,13 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, */ void ieee80211_clear_tx_pending(struct ieee80211_local *local) { + struct sk_buff *skb; int i; - for (i = 0; i < local->hw.queues; i++) - skb_queue_purge(&local->pending[i]); + for (i = 0; i < local->hw.queues; i++) { + while ((skb = skb_dequeue(&local->pending[i])) != NULL) + ieee80211_free_txskb(&local->hw, skb); + } } /* diff --git a/net/mac80211/util.c b/net/mac80211/util.c index eb9d7c0529b6..73ef163a0391 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -592,13 +592,38 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, break; } - if (id != WLAN_EID_VENDOR_SPECIFIC && - id != WLAN_EID_QUIET && - test_bit(id, seen_elems)) { - elems->parse_error = true; - left -= elen; - pos += elen; - continue; + switch (id) { + case WLAN_EID_SSID: + case WLAN_EID_SUPP_RATES: + case WLAN_EID_FH_PARAMS: + case WLAN_EID_DS_PARAMS: + case WLAN_EID_CF_PARAMS: + case WLAN_EID_TIM: + case WLAN_EID_IBSS_PARAMS: + case WLAN_EID_CHALLENGE: + case WLAN_EID_RSN: + case WLAN_EID_ERP_INFO: + case WLAN_EID_EXT_SUPP_RATES: + case WLAN_EID_HT_CAPABILITY: + case WLAN_EID_MESH_ID: + case WLAN_EID_MESH_CONFIG: + case WLAN_EID_PEER_MGMT: + case WLAN_EID_PREQ: + case WLAN_EID_PREP: + case WLAN_EID_PERR: + case WLAN_EID_RANN: + case WLAN_EID_CHANNEL_SWITCH: + case WLAN_EID_EXT_CHANSWITCH_ANN: + case WLAN_EID_COUNTRY: + case WLAN_EID_PWR_CONSTRAINT: + case WLAN_EID_TIMEOUT_INTERVAL: + if (test_bit(id, seen_elems)) { + elems->parse_error = true; + left -= elen; + pos += elen; + continue; + } + break; } if (calc_crc && id < 64 && (filter & (1ULL << id))) @@ -1316,6 +1341,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; + if (!sdata->u.mgd.associated) + continue; ieee80211_send_nullfunc(local, sdata, 0); } diff --git a/net/mac80211/work.c b/net/mac80211/work.c index c6e230efa049..a74f53894d69 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -148,7 +148,7 @@ static void ieee80211_work_work(struct work_struct *work) } if (!started && !local->tmp_channel) { - ieee80211_offchannel_stop_vifs(local, true); + ieee80211_offchannel_stop_vifs(local); local->tmp_channel = wk->chan; local->tmp_channel_type = wk->chan_type; @@ -220,7 +220,7 @@ static void ieee80211_work_work(struct work_struct *work) local->tmp_channel = NULL; ieee80211_hw_config(local, 0); - ieee80211_offchannel_return(local, true); + ieee80211_offchannel_return(local); /* give connection some time to breathe */ run_again(local, jiffies + HZ/2); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 0ae23c60968c..ea6d03bd5d5f 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -106,7 +106,8 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) if (status->flag & RX_FLAG_MMIC_ERROR) goto mic_fail; - if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key) + if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key && + rx->key->conf.cipher == WLAN_CIPHER_SUITE_TKIP) goto update_iv; return RX_CONTINUE; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f5589987fc80..cbc5bfd8c8e4 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1521,11 +1521,12 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = ptr; struct net *net = dev_net(dev); + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; struct ip_vs_dest *dest; unsigned int idx; - if (event != NETDEV_UNREGISTER) + if (event != NETDEV_UNREGISTER || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); EnterFunction(2); @@ -1551,7 +1552,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, } } - list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { + list_for_each_entry(dest, &ipvs->dest_trash, n_list) { __ip_vs_dev_reset(dest, dev); } mutex_unlock(&__ip_vs_mutex); @@ -2713,6 +2714,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; + memset(&t, 0, sizeof(t)); __ip_vs_get_timeouts(net, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 729f157a0efa..9a171b2445b1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -249,12 +249,15 @@ static void death_by_event(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { /* bad luck, let's retry again */ - ct->timeout.expires = jiffies + + ecache->timeout.expires = jiffies + (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + add_timer(&ecache->timeout); return; } /* we've got the event delivered, now it's dying */ @@ -268,6 +271,9 @@ static void death_by_event(unsigned long ul_conntrack) void nf_ct_insert_dying_list(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); /* add this conntrack to the dying list */ spin_lock_bh(&nf_conntrack_lock); @@ -275,10 +281,10 @@ void nf_ct_insert_dying_list(struct nf_conn *ct) &net->ct.dying); spin_unlock_bh(&nf_conntrack_lock); /* set a new timer to retry event delivery */ - setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); - ct->timeout.expires = jiffies + + setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); + ecache->timeout.expires = jiffies + (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + add_timer(&ecache->timeout); } EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 4147ba3f653c..e41ec849120a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -361,23 +361,6 @@ static void evict_oldest_expect(struct nf_conn *master, } } -static inline int refresh_timer(struct nf_conntrack_expect *i) -{ - struct nf_conn_help *master_help = nfct_help(i->master); - const struct nf_conntrack_expect_policy *p; - - if (!del_timer(&i->timeout)) - return 0; - - p = &rcu_dereference_protected( - master_help->helper, - lockdep_is_held(&nf_conntrack_lock) - )->expect_policy[i->class]; - i->timeout.expires = jiffies + p->timeout * HZ; - add_timer(&i->timeout); - return 1; -} - static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) { const struct nf_conntrack_expect_policy *p; @@ -386,7 +369,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); - struct hlist_node *n; + struct hlist_node *n, *next; unsigned int h; int ret = 1; @@ -395,12 +378,12 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_safe(i, n, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. */ - if (refresh_timer(i)) { - ret = 0; - goto out; + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_ct_expect_put(i); + break; } } else if (expect_clash(i, expect)) { ret = -EBUSY; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 0d07a1dcf605..e0221238e749 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -158,21 +158,18 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ -/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open - * sSR -> sIG - * sES -> sIG Error: SYNs in window outside the SYN_SENT state - * are errors. Receiver will reply with RST - * and close the connection. - * Or we are not in sync and hold a dead connection. - * sFW -> sIG - * sCW -> sIG - * sLA -> sIG - * sTW -> sIG - * sCL -> sIG + * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open + * sES -> sIV Invalid SYN/ACK packets sent by the client + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV + * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, @@ -627,15 +624,9 @@ static bool tcp_in_window(const struct nf_conn *ct, ack = sack = receiver->td_end; } - if (seq == end - && (!tcph->rst - || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* - * Packets contains no data: we assume it is valid - * and check the ack value only. - * However RST segments are always validated by their - * SEQ number, except when seq == 0 (reset sent answering - * SYN. + * RST sent answering SYN. */ seq = end = sender->td_end; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index d95f9c963cde..2195eb0727a3 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -389,8 +389,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) /* Precision saver. */ -static inline u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user) { /* If multiplying would overflow... */ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -400,7 +399,7 @@ user2credits(u_int32_t user) return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; } -static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) { dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY; if (dh->rateinfo.credit > dh->rateinfo.credit_cap) @@ -535,8 +534,7 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) dh->rateinfo.prev = jiffies; dh->rateinfo.credit = user2credits(hinfo->cfg.avg * hinfo->cfg.burst); - dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); + dh->rateinfo.credit_cap = dh->rateinfo.credit; dh->rateinfo.cost = user2credits(hinfo->cfg.avg); } else { /* update expiration timeout */ diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 32b7a579a032..a4c1e4528cac 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -88,8 +88,7 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) } /* Precision saver. */ -static u_int32_t -user2credits(u_int32_t user) +static u32 user2credits(u32 user) { /* If multiplying would overflow... */ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) @@ -118,12 +117,12 @@ static int limit_mt_check(const struct xt_mtchk_param *par) /* For SMP, we only want to use one set of state. */ r->master = priv; + /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * + 128. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ if (r->cost == 0) { - /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * - 128. */ - priv->prev = jiffies; - priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ - r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } return 0; diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 0ec8138aa470..c6f7db720d84 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -16,6 +16,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -44,6 +45,14 @@ const struct ip_set_adt_opt n = { \ .cmdflags = cfs, \ .timeout = t, \ } +#define ADT_MOPT(n, f, d, fs, cfs, t) \ +struct ip_set_adt_opt n = { \ + .family = f, \ + .dim = d, \ + .flags = fs, \ + .cmdflags = cfs, \ + .timeout = t, \ +} /* Revision 0 interface: backward compatible with netfilter/iptables */ @@ -296,11 +305,15 @@ static unsigned int set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v2 *info = par->targinfo; - ADT_OPT(add_opt, par->family, info->add_set.dim, - info->add_set.flags, info->flags, info->timeout); + ADT_MOPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, info->del_set.flags, 0, UINT_MAX); + /* Normalize to fit into jiffies */ + if (add_opt.timeout != IPSET_NO_TIMEOUT && + add_opt.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.timeout = UINT_MAX/MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index faa48f70b7c9..9017e3ef8fee 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -137,6 +137,8 @@ static void netlink_destroy_callback(struct netlink_callback *cb); static DEFINE_RWLOCK(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); +#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); + static ATOMIC_NOTIFIER_HEAD(netlink_chain); static inline u32 netlink_group_mask(u32 group) @@ -156,6 +158,8 @@ static void netlink_sock_destruct(struct sock *sk) if (nlk->cb) { if (nlk->cb->done) nlk->cb->done(nlk->cb); + + module_put(nlk->cb->module); netlink_destroy_callback(nlk->cb); } @@ -330,6 +334,11 @@ netlink_update_listeners(struct sock *sk) struct hlist_node *node; unsigned long mask; unsigned int i; + struct listeners *listeners; + + listeners = nl_deref_protected(tbl->listeners); + if (!listeners) + return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; @@ -337,7 +346,7 @@ netlink_update_listeners(struct sock *sk) if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } - tbl->listeners->masks[i] = mask; + listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ @@ -518,7 +527,11 @@ static int netlink_release(struct socket *sock) if (netlink_is_kernel(sk)) { BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { - kfree(nl_table[sk->sk_protocol].listeners); + struct listeners *old; + + old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); + RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); + kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].registered = 0; } @@ -948,7 +961,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); - if (group - 1 < nl_table[sk->sk_protocol].groups) + if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); @@ -1329,7 +1342,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; @@ -1340,7 +1353,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, dst_pid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; - if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) + if ((dst_group || dst_pid) && + !netlink_capable(sock, NL_NONROOT_SEND)) goto out; } else { dst_pid = nlk->dst_pid; @@ -1579,7 +1593,7 @@ int __netlink_change_ngroups(struct sock *sk, unsigned int groups) new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; - old = rcu_dereference_protected(tbl->listeners, 1); + old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); @@ -1727,6 +1741,7 @@ static int netlink_dump(struct sock *sk) nlk->cb = NULL; mutex_unlock(nlk->cb_mutex); + module_put(cb->module); netlink_destroy_callback(cb); return 0; @@ -1736,9 +1751,9 @@ errout_skb: return err; } -int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, - const struct nlmsghdr *nlh, - struct netlink_dump_control *control) +int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) { struct netlink_callback *cb; struct sock *sk; @@ -1753,6 +1768,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->done = control->done; cb->nlh = nlh; cb->data = control->data; + cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; atomic_inc(&skb->users); cb->skb = skb; @@ -1763,19 +1779,28 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, return -ECONNREFUSED; } nlk = nlk_sk(sk); - /* A dump is in progress... */ + mutex_lock(nlk->cb_mutex); + /* A dump is in progress... */ if (nlk->cb) { mutex_unlock(nlk->cb_mutex); netlink_destroy_callback(cb); - sock_put(sk); - return -EBUSY; + ret = -EBUSY; + goto out; } + /* add reference of module which cb->dump belongs to */ + if (!try_module_get(cb->module)) { + mutex_unlock(nlk->cb_mutex); + netlink_destroy_callback(cb); + ret = -EPROTONOSUPPORT; + goto out; + } + nlk->cb = cb; mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); - +out: sock_put(sk); if (ret) @@ -1786,7 +1811,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, */ return -EINTR; } -EXPORT_SYMBOL(netlink_dump_start); +EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) { @@ -2115,6 +2140,7 @@ static void __init netlink_add_usersock_entry(void) rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; + nl_table[NETLINK_USERSOCK].nl_nonroot = NL_NONROOT_SEND; netlink_table_ungrab(); } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 06592d8b4a2b..1b9024ee963c 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1169,7 +1169,12 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (er < 0) { + skb_free_datagram(sk, skb); + release_sock(sk); + return er; + } if (sax != NULL) { sax->sax25_family = AF_NETROM; diff --git a/net/nfc/llcp/llcp.c b/net/nfc/llcp/llcp.c index 17a578f641f1..c40112c39e15 100644 --- a/net/nfc/llcp/llcp.c +++ b/net/nfc/llcp/llcp.c @@ -966,7 +966,7 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) local->remote_lto = LLCP_DEFAULT_LTO; local->remote_rw = LLCP_DEFAULT_RW; - list_add(&llcp_devices, &local->list); + list_add(&local->list, &llcp_devices); return 0; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index b6b1d7daa3cb..ce5348f5f601 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -24,6 +24,9 @@ #include <linux/ethtool.h> #include <linux/skbuff.h> +#include <net/dst.h> +#include <net/xfrm.h> + #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -209,6 +212,11 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) int len; len = skb->len; + + skb_dst_drop(skb); + nf_reset(skb); + secpath_reset(skb); + skb->dev = netdev; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 4f2c0df79563..38ca5e07d520 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1280,6 +1280,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } +bool match_fanout_group(struct packet_type *ptype, struct sock * sk) +{ + if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) + return true; + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1332,6 +1340,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.id_match = match_fanout_group; dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } @@ -1943,7 +1952,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb) if (likely(po->tx_ring.pg_vec)) { ph = skb_shinfo(skb)->destructor_arg; - BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); __packet_set_status(po, ph, TP_STATUS_AVAILABLE); @@ -2442,13 +2450,15 @@ static int packet_release(struct socket *sock) packet_flush_mclist(sk); - memset(&req_u, 0, sizeof(req_u)); - - if (po->rx_ring.pg_vec) + if (po->rx_ring.pg_vec) { + memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); + } - if (po->tx_ring.pg_vec) + if (po->tx_ring.pg_vec) { + memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); + } fanout_release(sk); diff --git a/net/rds/recv.c b/net/rds/recv.c index 5c6e9f132026..9f0f17cf6bf9 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -410,6 +410,8 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); + msg->msg_namelen = 0; + if (msg_flags & MSG_OOB) goto out; @@ -485,6 +487,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, sin->sin_port = inc->i_hdr.h_sport; sin->sin_addr.s_addr = inc->i_saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); } break; } diff --git a/net/rds/send.c b/net/rds/send.c index 96531d4033a2..88eace57dd6b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1122,7 +1122,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_pong); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_xmit(conn); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); rds_message_put(rm); return 0; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b77f5a06a658..bdacd8df318c 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -67,6 +67,9 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, struct tcf_common *pc; int ret = 0; int err; +#ifdef CONFIG_GACT_PROB + struct tc_gact_p *p_parm = NULL; +#endif if (nla == NULL) return -EINVAL; @@ -82,6 +85,12 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, #ifndef CONFIG_GACT_PROB if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; +#else + if (tb[TCA_GACT_PROB]) { + p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm->ptype >= MAX_RAND) + return -EINVAL; + } #endif pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); @@ -103,8 +112,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, spin_lock_bh(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB - if (tb[TCA_GACT_PROB] != NULL) { - struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm) { gact->tcfg_paction = p_parm->paction; gact->tcfg_pval = p_parm->pval; gact->tcfg_ptype = p_parm->ptype; @@ -133,7 +141,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) + if (gact->tcfg_ptype) action = gact_rand[gact->tcfg_ptype](gact); else action = gact->tcf_action; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 24d94c097b35..599f67ada1ed 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -250,10 +250,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; - if (cl == NULL || cl->level >= head->level) + if (cl == NULL) goto fallback; } - + if (cl->level >= head->level) + goto fallback; #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 29b942ce9e82..f08b9166119b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -876,7 +876,7 @@ ok: q->now = psched_get_time(); start_at = jiffies; - next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; + next_event = q->now + 5LLU * PSCHED_TICKS_PER_SEC; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index e68cb440756a..cdd474afbbd2 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -830,7 +830,10 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { - cl->S = next->F; + if (qfq_gt(limit, next->F)) + cl->S = next->F; + else /* preserve timestamp correctness */ + cl->S = limit; return; } } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 6c8556459a75..0018b653cb1f 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -183,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) - return NULL; + return ERR_PTR(-ENOMEM); /* Note: Calculate this outside of the loop, so that all fragments * have the same expiration. @@ -280,11 +280,14 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0); - if (!chunk) + if (!chunk) { + err = -ENOMEM; goto errout; + } + err = sctp_user_addto_chunk(chunk, offset, len, msgh->msg_iov); if (err < 0) - goto errout; + goto errout_chunk_free; offset += len; @@ -315,8 +318,10 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0); - if (!chunk) + if (!chunk) { + err = -ENOMEM; goto errout; + } err = sctp_user_addto_chunk(chunk, offset, over,msgh->msg_iov); @@ -324,7 +329,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - (__u8 *)chunk->skb->data); if (err < 0) - goto errout; + goto errout_chunk_free; sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); @@ -332,6 +337,9 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, return msg; +errout_chunk_free: + sctp_chunk_free(chunk); + errout: list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); @@ -339,7 +347,7 @@ errout: sctp_chunk_free(chunk); } sctp_datamsg_put(msg); - return NULL; + return ERR_PTR(err); } /* Check whether this message has expired. */ diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 68a385d7c3bd..58cd035dcd20 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -248,6 +248,8 @@ void sctp_endpoint_free(struct sctp_endpoint *ep) /* Final destructor for endpoint. */ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) { + int i; + SCTP_ASSERT(ep->base.dead, "Endpoint is not dead", return); /* Free up the HMAC transform. */ @@ -270,6 +272,9 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); + for (i = 0; i < SCTP_HOW_MANY_SECRETS; ++i) + memset(&ep->secret_key[i], 0, SCTP_SECRET_SIZE); + /* Remove and free the port */ if (sctp_sk(ep->base.sk)->bind_hash) sctp_put_port(ep->base.sk); diff --git a/net/sctp/output.c b/net/sctp/output.c index 8fc4dcd294ab..32ba8d0e50e2 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -334,6 +334,25 @@ finish: return retval; } +static void sctp_packet_release_owner(struct sk_buff *skb) +{ + sk_free(skb->sk); +} + +static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sctp_packet_release_owner; + + /* + * The data chunks have already been accounted for in sctp_sendmsg(), + * therefore only reserve a single byte to keep socket around until + * the packet has been transmitted. + */ + atomic_inc(&sk->sk_wmem_alloc); +} + /* All packets are sent to the network through this function from * sctp_outq_tail(). * @@ -375,7 +394,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) /* Set the owning socket so that we know where to get the * destination IP address. */ - skb_set_owner_w(nskb, sk); + sctp_packet_set_owner_w(nskb, sk); if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index cfeb1d4a1ee6..96eb168a1f47 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -223,7 +223,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) /* Free the outqueue structure and any related pending chunks. */ -void sctp_outq_teardown(struct sctp_outq *q) +static void __sctp_outq_teardown(struct sctp_outq *q) { struct sctp_transport *transport; struct list_head *lchunk, *temp; @@ -276,8 +276,6 @@ void sctp_outq_teardown(struct sctp_outq *q) sctp_chunk_free(chunk); } - q->error = 0; - /* Throw away any leftover control chunks. */ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { list_del_init(&chunk->list); @@ -285,11 +283,17 @@ void sctp_outq_teardown(struct sctp_outq *q) } } +void sctp_outq_teardown(struct sctp_outq *q) +{ + __sctp_outq_teardown(q); + sctp_outq_init(q->asoc, q); +} + /* Free the outqueue structure and any related pending chunks. */ void sctp_outq_free(struct sctp_outq *q) { /* Throw away leftover chunks. */ - sctp_outq_teardown(q); + __sctp_outq_teardown(q); /* If we were kmalloc()'d, free the memory. */ if (q->malloced) diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 1ff51c9d18d5..2fdb05d8aac8 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1610,8 +1610,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, asoc->outqueue.outstanding_bytes; sackh.num_gap_ack_blocks = 0; sackh.num_dup_tsns = 0; + chunk->subh.sack_hdr = &sackh; sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, - SCTP_SACKH(&sackh)); + SCTP_CHUNK(chunk)); break; case SCTP_CMD_DISCARD_PACKET: diff --git a/net/sctp/socket.c b/net/sctp/socket.c index dba20d6e3247..9fd05edef190 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1908,8 +1908,8 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, /* Break the message into multiple chunks of maximum size. */ datamsg = sctp_datamsg_from_user(asoc, sinfo, msg, msg_len); - if (!datamsg) { - err = -ENOMEM; + if (IS_ERR(datamsg)) { + err = PTR_ERR(datamsg); goto out_free; } @@ -3375,7 +3375,7 @@ static int sctp_setsockopt_auth_key(struct sock *sk, ret = sctp_auth_set_key(sctp_sk(sk)->ep, asoc, authkey); out: - kfree(authkey); + kzfree(authkey); return ret; } diff --git a/net/socket.c b/net/socket.c index 573b26152a30..dab317686ad3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2605,7 +2605,7 @@ static int do_siocgstamp(struct net *net, struct socket *sock, err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); set_fs(old_fs); if (!err) - err = compat_put_timeval(up, &ktv); + err = compat_put_timeval(&ktv, up); return err; } @@ -2621,7 +2621,7 @@ static int do_siocgstampns(struct net *net, struct socket *sock, err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); set_fs(old_fs); if (!err) - err = compat_put_timespec(up, &kts); + err = compat_put_timespec(&kts, up); return err; } @@ -2658,6 +2658,7 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) return -EFAULT; + memset(&ifc, 0, sizeof(ifc)); if (ifc32.ifcbuf == 0) { ifc32.ifc_len = 0; ifc.ifc_len = 0; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index de0b0f39d9d8..76cb304f3f1a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1406,11 +1406,11 @@ static ssize_t read_flush(struct file *file, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { - char tbuf[20]; + char tbuf[22]; unsigned long p = *ppos; size_t len; - sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time)); + snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time)); len = strlen(tbuf); if (p >= len) return 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 57f2731d957a..a28a2111297e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -236,7 +236,7 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) spin_lock(&sn->rpc_client_lock); list_for_each_entry(clnt, &sn->all_clients, cl_clients) { if (clnt->cl_program->pipe_dir_name == NULL) - break; + continue; if (rpc_clnt_skip_event(clnt, event)) continue; if (atomic_inc_not_zero(&clnt->cl_count) == 0) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index faa078f74b27..b8bda4434c05 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1157,14 +1157,19 @@ static void rpc_kill_sb(struct super_block *sb) struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_lock(&sn->pipefs_sb_lock); + if (sn->pipefs_sb != sb) { + mutex_unlock(&sn->pipefs_sb_lock); + goto out; + } sn->pipefs_sb = NULL; mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", net, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); + put_net(net); +out: kill_litter_super(sb); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index eda32ae7dec4..85b9235fbee2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -915,16 +915,35 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) return task; } +/* + * rpc_free_task - release rpc task and perform cleanups + * + * Note that we free up the rpc_task _after_ rpc_release_calldata() + * in order to work around a workqueue dependency issue. + * + * Tejun Heo states: + * "Workqueue currently considers two work items to be the same if they're + * on the same address and won't execute them concurrently - ie. it + * makes a work item which is queued again while being executed wait + * for the previous execution to complete. + * + * If a work function frees the work item, and then waits for an event + * which should be performed by another work item and *that* work item + * recycles the freed work item, it can create a false dependency loop. + * There really is no reliable way to detect this short of verifying + * every memory free." + * + */ static void rpc_free_task(struct rpc_task *task) { - const struct rpc_call_ops *tk_ops = task->tk_ops; - void *calldata = task->tk_calldata; + unsigned short tk_flags = task->tk_flags; + + rpc_release_calldata(task->tk_ops, task->tk_calldata); - if (task->tk_flags & RPC_TASK_DYNAMIC) { + if (tk_flags & RPC_TASK_DYNAMIC) { dprintk("RPC: %5u freeing task\n", task->tk_pid); mempool_free(task, rpc_task_mempool); } - rpc_release_calldata(tk_ops, calldata); } static void rpc_async_release(struct work_struct *work) @@ -934,8 +953,7 @@ static void rpc_async_release(struct work_struct *work) static void rpc_release_resources_task(struct rpc_task *task) { - if (task->tk_rqstp) - xprt_release(task); + xprt_release(task); if (task->tk_msg.rpc_cred) { put_rpccred(task->tk_msg.rpc_cred); task->tk_msg.rpc_cred = NULL; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 4bda09d7e1a4..aec7dbb9d3dd 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -316,7 +316,6 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) */ void svc_xprt_enqueue(struct svc_xprt *xprt) { - struct svc_serv *serv = xprt->xpt_server; struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; @@ -362,8 +361,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) rqstp, rqstp->rq_xprt); rqstp->rq_xprt = xprt; svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); pool->sp_stats.threads_woken++; wake_up(&rqstp->rq_wait); } else { @@ -643,8 +640,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (xprt) { rqstp->rq_xprt = xprt; svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); /* As there is a shortage of threads and this request * had to be queued, don't allow the thread to wait so @@ -741,6 +736,8 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) else len = xprt->xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } svc_xprt_received(xprt); @@ -797,7 +794,8 @@ int svc_send(struct svc_rqst *rqstp) /* Grab mutex to serialize outgoing data. */ mutex_lock(&xprt->xpt_mutex); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + if (test_bit(XPT_DEAD, &xprt->xpt_flags) + || test_bit(XPT_CLOSE, &xprt->xpt_flags)) len = -ENOTCONN; else len = xprt->xpt_ops->xpo_sendto(rqstp); @@ -819,7 +817,6 @@ static void svc_age_temp_xprts(unsigned long closure) struct svc_serv *serv = (struct svc_serv *)closure; struct svc_xprt *xprt; struct list_head *le, *next; - LIST_HEAD(to_be_aged); dprintk("svc_age_temp_xprts\n"); @@ -840,25 +837,15 @@ static void svc_age_temp_xprts(unsigned long closure) if (atomic_read(&xprt->xpt_ref.refcount) > 1 || test_bit(XPT_BUSY, &xprt->xpt_flags)) continue; - svc_xprt_get(xprt); - list_move(le, &to_be_aged); + list_del_init(le); set_bit(XPT_CLOSE, &xprt->xpt_flags); set_bit(XPT_DETACHED, &xprt->xpt_flags); - } - spin_unlock_bh(&serv->sv_lock); - - while (!list_empty(&to_be_aged)) { - le = to_be_aged.next; - /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ - list_del_init(le); - xprt = list_entry(le, struct svc_xprt, xpt_list); - dprintk("queuing xprt %p for closing\n", xprt); /* a thread will dequeue and close it soon */ svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); } + spin_unlock_bh(&serv->sv_lock); mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 824d32fb3121..f190ea96f112 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1137,9 +1137,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len >= 0) svsk->sk_tcplen += len; if (len != want) { + svc_tcp_save_pages(svsk, rqstp); if (len < 0 && len != -EAGAIN) goto err_other; - svc_tcp_save_pages(svsk, rqstp); dprintk("svc: incomplete TCP record (%d of %d)\n", svsk->sk_tcplen, svsk->sk_reclen); goto err_noclose; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index da72492360b8..feea4741edda 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -969,11 +969,11 @@ static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) return false; } -static void xprt_alloc_slot(struct rpc_task *task) +void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; + spin_lock(&xprt->reserve_lock); if (!list_empty(&xprt->free)) { req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); list_del(&req->rq_list); @@ -994,12 +994,29 @@ static void xprt_alloc_slot(struct rpc_task *task) default: task->tk_status = -EAGAIN; } + spin_unlock(&xprt->reserve_lock); return; out_init_req: task->tk_status = 0; task->tk_rqstp = req; xprt_request_init(task, xprt); + spin_unlock(&xprt->reserve_lock); +} +EXPORT_SYMBOL_GPL(xprt_alloc_slot); + +void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + /* Note: grabbing the xprt_lock_write() ensures that we throttle + * new slot allocation if the transport is congested (i.e. when + * reconnecting a stream transport or when out of socket write + * buffer space). + */ + if (xprt_lock_write(xprt, task)) { + xprt_alloc_slot(xprt, task); + xprt_release_write(xprt, task); + } } +EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { @@ -1083,20 +1100,9 @@ void xprt_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - /* Note: grabbing the xprt_lock_write() here is not strictly needed, - * but ensures that we throttle new slot allocation if the transport - * is congested (e.g. if reconnecting or if we're out of socket - * write buffer space). - */ task->tk_timeout = 0; task->tk_status = -EAGAIN; - if (!xprt_lock_write(xprt, task)) - return; - - spin_lock(&xprt->reserve_lock); - xprt_alloc_slot(task); - spin_unlock(&xprt->reserve_lock); - xprt_release_write(xprt, task); + xprt->ops->alloc_slot(xprt, task); } static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) @@ -1133,10 +1139,18 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) void xprt_release(struct rpc_task *task) { struct rpc_xprt *xprt; - struct rpc_rqst *req; + struct rpc_rqst *req = task->tk_rqstp; - if (!(req = task->tk_rqstp)) + if (req == NULL) { + if (task->tk_client) { + rcu_read_lock(); + xprt = rcu_dereference(task->tk_client->cl_xprt); + if (xprt->snd_task == task) + xprt_release_write(xprt, task); + rcu_read_unlock(); + } return; + } xprt = req->rq_xprt; if (task->tk_ops->rpc_count_stats != NULL) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 06cdbff79e4a..5d9202dc7cb1 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -713,6 +713,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) static struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_rdma_reserve_xprt, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b88c6bf657ba..79064471cd01 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -254,7 +254,6 @@ struct sock_xprt { void (*old_data_ready)(struct sock *, int); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); - void (*old_error_report)(struct sock *); }; /* @@ -737,10 +736,10 @@ static int xs_tcp_send_request(struct rpc_task *task) dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); case -ECONNRESET: - case -EPIPE: xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: + case -EPIPE: clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); } @@ -781,7 +780,6 @@ static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) transport->old_data_ready = sk->sk_data_ready; transport->old_state_change = sk->sk_state_change; transport->old_write_space = sk->sk_write_space; - transport->old_error_report = sk->sk_error_report; } static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) @@ -789,7 +787,6 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; sk->sk_write_space = transport->old_write_space; - sk->sk_error_report = transport->old_error_report; } static void xs_reset_transport(struct sock_xprt *transport) @@ -1028,6 +1025,16 @@ static void xs_udp_data_ready(struct sock *sk, int len) read_unlock_bh(&sk->sk_callback_lock); } +/* + * Helper function to force a TCP close if the server is sending + * junk and/or it has put us in CLOSE_WAIT + */ +static void xs_tcp_force_close(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + xprt_force_disconnect(xprt); +} + static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1054,7 +1061,7 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_rea /* Sanity check of the record length */ if (unlikely(transport->tcp_reclen < 8)) { dprintk("RPC: invalid TCP record fragment length\n"); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); return; } dprintk("RPC: reading TCP record fragment of length %d\n", @@ -1135,7 +1142,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport, break; default: dprintk("RPC: invalid request message type\n"); - xprt_force_disconnect(&transport->xprt); + xs_tcp_force_close(&transport->xprt); } xs_tcp_check_fraghdr(transport); } @@ -1455,12 +1462,19 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) xprt_clear_connecting(xprt); } -static void xs_sock_mark_closed(struct rpc_xprt *xprt) +static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); +} + +static void xs_sock_mark_closed(struct rpc_xprt *xprt) +{ + xs_sock_reset_connection_flags(xprt); /* Mark transport as closed and wake up all pending tasks */ xprt_disconnect_done(xprt); } @@ -1515,8 +1529,9 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ - xprt_force_disconnect(xprt); xprt->connect_cookie++; + clear_bit(XPRT_CONNECTED, &xprt->state); + xs_tcp_force_close(xprt); case TCP_CLOSING: /* * If the server closed down the connection, make sure that @@ -1540,25 +1555,6 @@ static void xs_tcp_state_change(struct sock *sk) read_unlock_bh(&sk->sk_callback_lock); } -/** - * xs_error_report - callback mainly for catching socket errors - * @sk: socket - */ -static void xs_error_report(struct sock *sk) -{ - struct rpc_xprt *xprt; - - read_lock_bh(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) - goto out; - dprintk("RPC: %s client %p...\n" - "RPC: error %d\n", - __func__, xprt, sk->sk_err); - xprt_wake_pending_tasks(xprt, -EAGAIN); -out: - read_unlock_bh(&sk->sk_callback_lock); -} - static void xs_write_space(struct sock *sk) { struct socket *sock; @@ -1858,7 +1854,6 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_local_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; xprt_clear_connected(xprt); @@ -1947,7 +1942,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_error_report = xs_error_report; sk->sk_no_check = UDP_CSUM_NORCV; sk->sk_allocation = GFP_ATOMIC; @@ -2015,10 +2009,8 @@ static void xs_abort_connection(struct sock_xprt *transport) any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); if (!result) - xs_sock_mark_closed(&transport->xprt); - else - dprintk("RPC: AF_UNSPEC connect return code %d\n", - result); + xs_sock_reset_connection_flags(&transport->xprt); + dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } static void xs_tcp_reuse_connection(struct sock_xprt *transport) @@ -2063,7 +2055,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ @@ -2159,8 +2150,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) /* We're probably in TIME_WAIT. Get rid of existing socket, * and retry */ - set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); break; case -ECONNREFUSED: case -ECONNRESET: @@ -2433,6 +2423,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, .connect = xs_connect, @@ -2449,6 +2440,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, @@ -2466,6 +2458,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_lock_and_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, @@ -2485,6 +2478,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { static struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .buf_alloc = bc_malloc, .buf_free = bc_free, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d510353ef431..109e30beaa69 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1446,7 +1446,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; @@ -1607,7 +1607,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; diff --git a/net/wireless/core.c b/net/wireless/core.c index bb5302dd592d..7917c74b25b2 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -552,8 +552,7 @@ int wiphy_register(struct wiphy *wiphy) for (i = 0; i < sband->n_channels; i++) { sband->channels[i].orig_flags = sband->channels[i].flags; - sband->channels[i].orig_mag = - sband->channels[i].max_antenna_gain; + sband->channels[i].orig_mag = INT_MAX; sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 460af03d8149..4dc83474db2e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -134,9 +134,8 @@ static const struct ieee80211_regdomain world_regdom = { .reg_rules = { /* IEEE 802.11b/g, channels 1..11 */ REG_RULE(2412-10, 2462+10, 40, 6, 20, 0), - /* IEEE 802.11b/g, channels 12..13. No HT40 - * channel fits here. */ - REG_RULE(2467-10, 2472+10, 20, 6, 20, + /* IEEE 802.11b/g, channels 12..13. */ + REG_RULE(2467-10, 2472+10, 40, 6, 20, NL80211_RRF_PASSIVE_SCAN | NL80211_RRF_NO_IBSS), /* IEEE 802.11 channel 14 - Only JP enables @@ -340,6 +339,9 @@ static void reg_regdb_search(struct work_struct *work) struct reg_regdb_search_request *request; const struct ieee80211_regdomain *curdom, *regdom; int i, r; + bool set_reg = false; + + mutex_lock(&cfg80211_mutex); mutex_lock(®_regdb_search_mutex); while (!list_empty(®_regdb_search_list)) { @@ -355,9 +357,7 @@ static void reg_regdb_search(struct work_struct *work) r = reg_copy_regd(®dom, curdom); if (r) break; - mutex_lock(&cfg80211_mutex); - set_regdom(regdom); - mutex_unlock(&cfg80211_mutex); + set_reg = true; break; } } @@ -365,6 +365,11 @@ static void reg_regdb_search(struct work_struct *work) kfree(request); } mutex_unlock(®_regdb_search_mutex); + + if (set_reg) + set_regdom(regdom); + + mutex_unlock(&cfg80211_mutex); } static DECLARE_WORK(reg_regdb_work, reg_regdb_search); diff --git a/net/wireless/util.c b/net/wireless/util.c index d835377b4da7..d22dce7b8b80 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -290,23 +290,21 @@ unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); -static int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) +unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) { int ae = meshhdr->flags & MESH_FLAGS_AE; - /* 7.1.3.5a.2 */ + /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { + default: case 0: return 6; case MESH_FLAGS_AE_A4: return 12; case MESH_FLAGS_AE_A5_A6: return 18; - case (MESH_FLAGS_AE_A4 | MESH_FLAGS_AE_A5_A6): - return 24; - default: - return 6; } } +EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, enum nl80211_iftype iftype) @@ -354,6 +352,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, /* make sure meshdr->flags is on the linear part */ if (!pskb_may_pull(skb, hdrlen + 1)) return -1; + if (meshdr->flags & MESH_FLAGS_AE_A4) + return -1; if (meshdr->flags & MESH_FLAGS_AE_A5_A6) { skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), @@ -378,6 +378,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, /* make sure meshdr->flags is on the linear part */ if (!pskb_may_pull(skb, hdrlen + 1)) return -1; + if (meshdr->flags & MESH_FLAGS_AE_A5_A6) + return -1; if (meshdr->flags & MESH_FLAGS_AE_A4) skb_copy_bits(skb, hdrlen + offsetof(struct ieee80211s_hdr, eaddr1), diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 54a0dc2e2f8d..ab2bb42fe094 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -212,7 +212,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (async && x->repl->check(x, skb, seq)) { + if (async && x->repl->recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a15d2a03172a..71c80c763d57 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1761,7 +1761,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, if (!afinfo) { dst_release(dst_orig); - ret = ERR_PTR(-EINVAL); + return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 2f6d11d04a2b..3efb07d3eb27 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -420,6 +420,18 @@ err: return -EINVAL; } +static int xfrm_replay_recheck_esn(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + if (unlikely(XFRM_SKB_CB(skb)->seq.input.hi != + htonl(xfrm_replay_seqhi(x, net_seq)))) { + x->stats.replay_window++; + return -EINVAL; + } + + return xfrm_replay_check_esn(x, skb, net_seq); +} + static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) { unsigned int bitnr, nr, i; @@ -479,6 +491,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) static struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, + .recheck = xfrm_replay_check, .notify = xfrm_replay_notify, .overflow = xfrm_replay_overflow, }; @@ -486,6 +499,7 @@ static struct xfrm_replay xfrm_replay_legacy = { static struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, + .recheck = xfrm_replay_check_bmp, .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_bmp, }; @@ -493,6 +507,7 @@ static struct xfrm_replay xfrm_replay_bmp = { static struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, + .recheck = xfrm_replay_recheck_esn, .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_esn, }; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7128dde0fe1a..c8b903df943f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -123,9 +123,21 @@ static inline int verify_replay(struct xfrm_usersa_info *p, struct nlattr **attrs) { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; + struct xfrm_replay_state_esn *rs; - if ((p->flags & XFRM_STATE_ESN) && !rt) - return -EINVAL; + if (p->flags & XFRM_STATE_ESN) { + if (!rt) + return -EINVAL; + + rs = nla_data(rt); + + if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) + return -EINVAL; + + if (nla_len(rt) < xfrm_replay_state_esn_len(rs) && + nla_len(rt) != sizeof(*rs)) + return -EINVAL; + } if (!rt) return 0; @@ -370,14 +382,15 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es struct nlattr *rp) { struct xfrm_replay_state_esn *up; + int ulen; if (!replay_esn || !rp) return 0; up = nla_data(rp); + ulen = xfrm_replay_state_esn_len(up); - if (xfrm_replay_state_esn_len(replay_esn) != - xfrm_replay_state_esn_len(up)) + if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) return -EINVAL; return 0; @@ -388,22 +401,28 @@ static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn struct nlattr *rta) { struct xfrm_replay_state_esn *p, *pp, *up; + int klen, ulen; if (!rta) return 0; up = nla_data(rta); + klen = xfrm_replay_state_esn_len(up); + ulen = nla_len(rta) >= klen ? klen : sizeof(*up); - p = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + p = kzalloc(klen, GFP_KERNEL); if (!p) return -ENOMEM; - pp = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + pp = kzalloc(klen, GFP_KERNEL); if (!pp) { kfree(p); return -ENOMEM; } + memcpy(p, up, ulen); + memcpy(pp, up, ulen); + *replay_esn = p; *preplay_esn = pp; @@ -442,10 +461,11 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info * * somehow made shareable and move it to xfrm_state.c - JHS * */ -static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs) +static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, + int update_esn) { struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; - struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; + struct nlattr *re = update_esn ? attrs[XFRMA_REPLAY_ESN_VAL] : NULL; struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; @@ -555,7 +575,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; /* override default values from above */ - xfrm_update_ae_params(x, attrs); + xfrm_update_ae_params(x, attrs, 0); return x; @@ -689,6 +709,7 @@ out: static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { + memset(p, 0, sizeof(*p)); memcpy(&p->id, &x->id, sizeof(p->id)); memcpy(&p->sel, &x->sel, sizeof(p->sel)); memcpy(&p->lft, &x->lft, sizeof(p->lft)); @@ -742,7 +763,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) return -EMSGSIZE; algo = nla_data(nla); - strcpy(algo->alg_name, auth->alg_name); + strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); algo->alg_key_len = auth->alg_key_len; @@ -862,6 +883,7 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, { struct xfrm_dump_info info; struct sk_buff *skb; + int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) @@ -872,9 +894,10 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, info.nlmsg_seq = seq; info.nlmsg_flags = 0; - if (dump_one_state(x, 0, &info)) { + err = dump_one_state(x, 0, &info); + if (err) { kfree_skb(skb); - return NULL; + return ERR_PTR(err); } return skb; @@ -1297,6 +1320,7 @@ static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) { + memset(p, 0, sizeof(*p)); memcpy(&p->sel, &xp->selector, sizeof(p->sel)); memcpy(&p->lft, &xp->lft, sizeof(p->lft)); memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft)); @@ -1401,6 +1425,7 @@ static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) struct xfrm_user_tmpl *up = &vec[i]; struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; + memset(up, 0, sizeof(*up)); memcpy(&up->id, &kp->id, sizeof(up->id)); up->family = kp->encap_family; memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr)); @@ -1529,6 +1554,7 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, { struct xfrm_dump_info info; struct sk_buff *skb; + int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) @@ -1539,9 +1565,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, info.nlmsg_seq = seq; info.nlmsg_flags = 0; - if (dump_one_policy(xp, dir, 0, &info) < 0) { + err = dump_one_policy(xp, dir, 0, &info); + if (err) { kfree_skb(skb); - return NULL; + return ERR_PTR(err); } return skb; @@ -1794,7 +1821,7 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; spin_lock_bh(&x->lock); - xfrm_update_ae_params(x, attrs); + xfrm_update_ae_params(x, attrs, 1); spin_unlock_bh(&x->lock); c.event = nlh->nlmsg_type; |