From 500404ebcbd074ca11aa0c3fd9a268aa4054fd8b Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 3 Nov 2015 12:28:10 +0200 Subject: dmaengine: of_dma: Correct return code for of_dma_request_slave_channel in case !CONFIG_OF of_dma_request_slave_channel should return either pointer for valid dma_chan or ERR_PTR() error code, NULL is not expected to be returned. Signed-off-by: Peter Ujfalusi Acked-by: Arnd Bergmann Signed-off-by: Vinod Koul --- include/linux/of_dma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index 36112cdd665a..b90d8ec57c1f 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -80,7 +80,7 @@ static inline int of_dma_router_register(struct device_node *np, static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name) { - return NULL; + return ERR_PTR(-ENODEV); } static inline struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, -- cgit v1.2.3 From aedf17f4515b12ba1cd73298e66baa69cf93010e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:36 +0100 Subject: lightnvm: change max_phys_sect to uint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The max_phys_sect variable is defined as a char. We do a boundary check to maximally allow 256 physical page descriptors per command. As we are not indexing from zero. This expression is always false. Bump the max_phys_sect to an unsigned int to support the range check. Signed-off-by: Matias Bjørling Reported-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 69c9057e1ab8..32b5369e814e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -220,7 +220,7 @@ struct nvm_dev_ops { nvm_dev_dma_alloc_fn *dev_dma_alloc; nvm_dev_dma_free_fn *dev_dma_free; - uint8_t max_phys_sect; + unsigned int max_phys_sect; }; struct nvm_lun { -- cgit v1.2.3 From 11450469830f2481a9e7cb181609288d40f41323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:37 +0100 Subject: lightnvm: update bad block table format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification was changed to reflect a multi-value bad block table. Instead of bit-based bad block table, the bad block table now allows eight bad block categories. Currently four are defined: * Factory bad blocks * Grown bad blocks * Device-side reserved blocks * Host-side reserved blocks The factory and grown bad blocks are the regular bad blocks. The reserved blocks are either for internal use or external use. In particular, the device-side reserved blocks allows the host to bootstrap from a limited number of flash blocks. Reducing the flash blocks to scan upon super block initialization. Support for both get bad block table and set bad block table is added. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32b5369e814e..9b3dc1bc9296 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -191,11 +191,11 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(u32, void *, unsigned int, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, int, unsigned int, +typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); @@ -210,7 +210,7 @@ struct nvm_dev_ops { nvm_id_fn *identity; nvm_get_l2p_tbl_fn *get_l2p_tbl; nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb; + nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; nvm_erase_blk_fn *erase_block; -- cgit v1.2.3 From 12be5edf68e785dd5dc8665db5a88152b49c1fe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:39 +0100 Subject: lightnvm: expose mccap in identify command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mccap field is required for I/O command option support. It defines the following flash access modes: * SLC mode * Erase/Program Suspension * Scramble On/Off * Encryption It is slotted in between mpos and cpar, changing the offset for cpar as well. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9b3dc1bc9296..2572856e2a89 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -74,6 +74,7 @@ struct nvm_id_group { u32 tbet; u32 tbem; u32 mpos; + u32 mccap; u16 cpar; u8 res[913]; } __packed; -- cgit v1.2.3 From 73387e7bed260c89628fc6a4e3632b45be9776b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:40 +0100 Subject: lightnvm: remove unused attrs in nvm_id structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_id, nvm_id_group and nvm_addr_format data structures contain reserved attributes. They are unused by media managers and targets. Remove them. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2572856e2a89..e6ef8aaf533f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -58,7 +58,6 @@ enum { struct nvm_id_group { u8 mtype; u8 fmtype; - u16 res16; u8 num_ch; u8 num_lun; u8 num_pln; @@ -76,8 +75,7 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; - u8 res[913]; -} __packed; +}; struct nvm_addr_format { u8 ch_offset; @@ -92,19 +90,16 @@ struct nvm_addr_format { u8 pg_len; u8 sect_offset; u8 sect_len; - u8 res[4]; }; struct nvm_id { u8 ver_id; u8 vmnt; u8 cgrps; - u8 res[5]; u32 cap; u32 dom; struct nvm_addr_format ppaf; u8 ppat; - u8 resv[224]; struct nvm_id_group groups[4]; } __packed; -- cgit v1.2.3 From 7386af270c72be65c7cb2ba4ad0d4e70dc373106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 16 Nov 2015 15:34:44 +0100 Subject: lightnvm: remove linear and device addr modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linear and device specific address modes can be replaced with a simple offset and bit length conversion that is generic across all devices. This both simplifies the specification and removes the special case for qemu nvme, that previously relied on the linear address mapping. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 154 +++++++++++------------------------------------ 1 file changed, 34 insertions(+), 120 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e6ef8aaf533f..cbe288acb1de 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -99,7 +99,6 @@ struct nvm_id { u32 cap; u32 dom; struct nvm_addr_format ppaf; - u8 ppat; struct nvm_id_group groups[4]; } __packed; @@ -119,39 +118,28 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (6) -#define NVM_PG_BITS (16) #define NVM_BLK_BITS (16) -#define NVM_LUN_BITS (10) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) #define NVM_CH_BITS (8) struct ppa_addr { + /* Generic structure for all addresses */ union { - /* Channel-based PPA format in nand 4x2x2x2x8x10 */ - struct { - u64 ch : 4; - u64 sec : 2; /* 4 sectors per page */ - u64 pl : 2; /* 4 planes per LUN */ - u64 lun : 2; /* 4 LUNs per channel */ - u64 pg : 8; /* 256 pages per block */ - u64 blk : 10;/* 1024 blocks per plane */ - u64 resved : 36; - } chnl; - - /* Generic structure for all addresses */ struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; u64 sec : NVM_SEC_BITS; u64 pl : NVM_PL_BITS; - u64 pg : NVM_PG_BITS; - u64 blk : NVM_BLK_BITS; u64 lun : NVM_LUN_BITS; u64 ch : NVM_CH_BITS; } g; u64 ppa; }; -} __packed; +}; struct nvm_rq { struct nvm_tgt_instance *ins; @@ -259,8 +247,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; - int addr_mode; - struct nvm_addr_format addr_format; + struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable * blocks at run-time. @@ -286,118 +273,45 @@ struct nvm_dev { char name[DISK_NAME_LEN]; }; -/* fallback conversion */ -static struct ppa_addr __generic_to_linear_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - - l.ppa = r.g.sec + - r.g.pg * dev->sec_per_pg + - r.g.blk * (dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.lun * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.ch * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->luns_per_chnl * - dev->sec_per_pg); - - return l; -} - -/* fallback conversion */ -static struct ppa_addr __linear_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - int secs, pgs, blks, luns; - sector_t ppa = r.ppa; - - l.ppa = 0; - - div_u64_rem(ppa, dev->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, dev->sec_per_pg); - div_u64_rem(ppa, dev->sec_per_blk, &pgs); - l.g.pg = pgs; - - sector_div(ppa, dev->pgs_per_blk); - div_u64_rem(ppa, dev->blks_per_lun, &blks); - l.g.blk = blks; - - sector_div(ppa, dev->blks_per_lun); - div_u64_rem(ppa, dev->luns_per_chnl, &luns); - l.g.lun = luns; - - sector_div(ppa, dev->luns_per_chnl); - l.g.ch = ppa; - - return l; -} - -static struct ppa_addr __generic_to_chnl_addr(struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.chnl.sec = r.g.sec; - l.chnl.pl = r.g.pl; - l.chnl.pg = r.g.pg; - l.chnl.blk = r.g.blk; - l.chnl.lun = r.g.lun; - l.chnl.ch = r.g.ch; + l.ppa = ((u64)r.g.blk) << dev->ppaf.blk_offset; + l.ppa |= ((u64)r.g.pg) << dev->ppaf.pg_offset; + l.ppa |= ((u64)r.g.sec) << dev->ppaf.sect_offset; + l.ppa |= ((u64)r.g.pl) << dev->ppaf.pln_offset; + l.ppa |= ((u64)r.g.lun) << dev->ppaf.lun_offset; + l.ppa |= ((u64)r.g.ch) << dev->ppaf.ch_offset; return l; } -static struct ppa_addr __chnl_to_generic_addr(struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.g.sec = r.chnl.sec; - l.g.pl = r.chnl.pl; - l.g.pg = r.chnl.pg; - l.g.blk = r.chnl.blk; - l.g.lun = r.chnl.lun; - l.g.ch = r.chnl.ch; + /* + * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. + */ + l.g.blk = (r.ppa >> dev->ppaf.blk_offset) & + (((1 << dev->ppaf.blk_len) - 1)); + l.g.pg |= (r.ppa >> dev->ppaf.pg_offset) & + (((1 << dev->ppaf.pg_len) - 1)); + l.g.sec |= (r.ppa >> dev->ppaf.sect_offset) & + (((1 << dev->ppaf.sect_len) - 1)); + l.g.pl |= (r.ppa >> dev->ppaf.pln_offset) & + (((1 << dev->ppaf.pln_len) - 1)); + l.g.lun |= (r.ppa >> dev->ppaf.lun_offset) & + (((1 << dev->ppaf.lun_len) - 1)); + l.g.ch |= (r.ppa >> dev->ppaf.ch_offset) & + (((1 << dev->ppaf.ch_len) - 1)); return l; } -static inline struct ppa_addr addr_to_generic_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __linear_to_generic_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __chnl_to_generic_addr(gppa); - default: - BUG(); - } - return gppa; -} - -static inline struct ppa_addr generic_to_addr_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __generic_to_linear_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __generic_to_chnl_addr(gppa); - default: - BUG(); - } - return gppa; -} - static inline int ppa_empty(struct ppa_addr ppa_addr) { return (ppa_addr.ppa == ADDR_EMPTY); -- cgit v1.2.3 From 0f45c26fc302c02b0576db37d4849baa53a2bb41 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 11 Nov 2015 11:29:09 +0100 Subject: drm/atomic: add a drm_atomic_clean_old_fb helper. This is useful for all the boilerplate code about cleaning old_fb. Signed-off-by: Maarten Lankhorst Reviewed-by: Daniel Vetter Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/1447237751-9663-4-git-send-email-maarten.lankhorst@ubuntu.com --- include/drm/drm_atomic.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index e67aeac2aee0..4b74c97d297a 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -136,6 +136,9 @@ drm_atomic_connectors_for_crtc(struct drm_atomic_state *state, void drm_atomic_legacy_backoff(struct drm_atomic_state *state); +void +drm_atomic_clean_old_fb(struct drm_device *dev, unsigned plane_mask, int ret); + int __must_check drm_atomic_check_only(struct drm_atomic_state *state); int __must_check drm_atomic_commit(struct drm_atomic_state *state); int __must_check drm_atomic_async_commit(struct drm_atomic_state *state); -- cgit v1.2.3 From 451c2b5caf37b526ae34a1081b71115e1de2d063 Mon Sep 17 00:00:00 2001 From: Aya Mahfouz Date: Wed, 18 Nov 2015 08:36:44 +0200 Subject: net: dns_resolver: convert time_t to time64_t Changes the definition of the pointer _expiry from time_t to time64_t. This is to handle the Y2038 problem where time_t will overflow in the year 2038. The change is safe because the kernel subsystems that call dns_query pass NULL. Signed-off-by: Arnd Bergmann Signed-off-by: Aya Mahfouz Signed-off-by: David S. Miller --- include/linux/dns_resolver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h index cc92268af89a..6ac3cad9aef1 100644 --- a/include/linux/dns_resolver.h +++ b/include/linux/dns_resolver.h @@ -27,7 +27,7 @@ #ifdef __KERNEL__ extern int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry); + const char *options, char **_result, time64_t *_expiry); #endif /* KERNEL */ -- cgit v1.2.3 From db27a7a37aa0b1f8b373f8b0fb72a2ccaafb85b7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 5 Nov 2015 09:03:50 +0100 Subject: KVM: Provide function for VCPU lookup by id Let's provide a function to lookup a VCPU by id. Reviewed-by: Christian Borntraeger Reviewed-by: Dominik Dingel Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger [split patch from refactoring patch] --- include/linux/kvm_host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5706a2108f0a..c923350ca20a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -460,6 +460,17 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ idx++) +static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->vcpu_id == id) + return vcpu; + return NULL; +} + #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ -- cgit v1.2.3 From 851df3dc11136fde86ebd78ee7527cb43c7cd349 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Nov 2015 22:34:58 +0100 Subject: scpi: hide get_scpi_ops in module from built-in code The scpi_clock driver can be built-in when CONFIG_COMPILE_TEST is set even when ARM_SCPI_PROTOCOL is a loadable module, and that results in a link error: drivers/built-in.o: In function `scpi_clocks_probe': (.text+0x14453c): undefined reference to `get_scpi_ops' Using #if IS_REACHABLE() around the get_scpi_ops() declaration makes it build successfully in this case for compile-testing, but the effect is the same as when ARM_SCPI_PROTOCOL is disabled, as the code will not be used. Signed-off-by: Arnd Bergmann Acked-by: Punit Agrawal --- include/linux/scpi_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/scpi_protocol.h b/include/linux/scpi_protocol.h index 80af3cd35ae4..72ce932c69b2 100644 --- a/include/linux/scpi_protocol.h +++ b/include/linux/scpi_protocol.h @@ -71,7 +71,7 @@ struct scpi_ops { int (*sensor_get_value)(u16, u32 *); }; -#if IS_ENABLED(CONFIG_ARM_SCPI_PROTOCOL) +#if IS_REACHABLE(CONFIG_ARM_SCPI_PROTOCOL) struct scpi_ops *get_scpi_ops(void); #else static inline struct scpi_ops *get_scpi_ops(void) { return NULL; } -- cgit v1.2.3 From f5c4a42a7549cbc29dbac2b5ede05a3bc2bb4522 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 5 Nov 2015 07:09:59 +0100 Subject: Bluetooth: Add hci_skb_* helper wrappers for bt_cb(skb) access For all the HCI driver related variables accesssed via bt_cb(skb), provide helper wrappers. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 42844d7b154a..663e0ef1eaa0 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -316,6 +316,10 @@ struct bt_skb_cb { }; #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) +#define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type +#define hci_skb_expect(skb) bt_cb((skb))->expect +#define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode + static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how) { struct sk_buff *skb; -- cgit v1.2.3 From 5fcc86bd2695d4ccad2d87cb424f4c01a4e544f3 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 5 Nov 2015 09:31:39 +0200 Subject: Bluetooth: Remove redundant setting to zero of bt_cb The socket allocation functions will always memset skb->cb to zero so there's no need to make other initializations needing the same thing. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 663e0ef1eaa0..a85e6d3d75ef 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -325,10 +325,8 @@ static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how) struct sk_buff *skb; skb = alloc_skb(len + BT_SKB_RESERVE, how); - if (skb) { + if (skb) skb_reserve(skb, BT_SKB_RESERVE); - bt_cb(skb)->incoming = 0; - } return skb; } @@ -338,10 +336,8 @@ static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk, struct sk_buff *skb; skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err); - if (skb) { + if (skb) skb_reserve(skb, BT_SKB_RESERVE); - bt_cb(skb)->incoming = 0; - } if (!skb && *err) return NULL; -- cgit v1.2.3 From 44d271377479c4d4fe7f2d07d188656684773fbd Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 5 Nov 2015 09:31:40 +0200 Subject: Bluetooth: Compress the size of struct hci_ctrl We can reduce the size of the hci_ctrl struct by converting 'bool req_start' to 'u8 req_flags' and making the two function pointers a union (since only one is ever set at a time). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index a85e6d3d75ef..8d38f411009c 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -296,12 +296,17 @@ typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb); +#define HCI_REQ_START BIT(0) +#define HCI_REQ_SKB BIT(1) + struct hci_ctrl { __u16 opcode; - bool req_start; + u8 req_flags; u8 req_event; - hci_req_complete_t req_complete; - hci_req_complete_skb_t req_complete_skb; + union { + hci_req_complete_t req_complete; + hci_req_complete_skb_t req_complete_skb; + }; }; struct bt_skb_cb { -- cgit v1.2.3 From dd31506d4aece48943802c2bca3f1f7d2e7266b4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 8 Nov 2015 07:47:12 +0100 Subject: Bluetooth: Add support for sending system notes to monitor channel The monitor channel can be used to send generic system notes as text strings for debugging purposes. This adds the system note monitor code and uses it for including kernel and subsystem version into traces. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 2 ++ include/net/bluetooth/hci_mon.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 8d38f411009c..bfd1590821d6 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,6 +29,8 @@ #include #include +#define BT_SUBSYS_VERSION "2.21" + #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 #define PF_BLUETOOTH AF_BLUETOOTH diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 2b67567cf28d..c91bb23eb29e 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -43,6 +43,7 @@ struct hci_mon_hdr { #define HCI_MON_CLOSE_INDEX 9 #define HCI_MON_INDEX_INFO 10 #define HCI_MON_VENDOR_DIAG 11 +#define HCI_MON_SYSTEM_NOTE 12 struct hci_mon_new_index { __u8 type; -- cgit v1.2.3 From ac71494934c475e3f51e5e3e64a12f57618d82a4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 8 Nov 2015 07:47:13 +0100 Subject: Bluetooth: Add support for controller specific logging To enable controller specific logging, the userspace daemon has to have the ability to log per controller. To facilitate this support, provide a dedicated logging channel. Messages in this channel will be included in the monitor queue and with that also forwarded to monitoring tools along with the actual hardware traces. All messages from the logging channel are timestamped and with that allow an easy correlation between userspace messages and hardware events. This will increase the ability to debug problems faster. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_mon.h | 1 + include/net/bluetooth/hci_sock.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index c91bb23eb29e..587d0131b349 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -44,6 +44,7 @@ struct hci_mon_hdr { #define HCI_MON_INDEX_INFO 10 #define HCI_MON_VENDOR_DIAG 11 #define HCI_MON_SYSTEM_NOTE 12 +#define HCI_MON_USER_LOGGING 13 struct hci_mon_new_index { __u8 type; diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h index 9a46d665c1b5..8e9138acdae1 100644 --- a/include/net/bluetooth/hci_sock.h +++ b/include/net/bluetooth/hci_sock.h @@ -45,6 +45,7 @@ struct sockaddr_hci { #define HCI_CHANNEL_USER 1 #define HCI_CHANNEL_MONITOR 2 #define HCI_CHANNEL_CONTROL 3 +#define HCI_CHANNEL_LOGGING 4 struct hci_filter { unsigned long type_mask; -- cgit v1.2.3 From 030e7f8141a262e32dc064d7cf12377d769d45c2 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 10 Nov 2015 09:44:53 +0200 Subject: Bluetooth: Remove unnecessary call to hci_update_background_scan The hci_conn_params_clear_all() function is only called from hci_unregister_dev() at which point it's completely futile to try to do any LE scanning updates. Simply remove this unnecessary function call. At the same time we can make the function static since it's only accessed from within the same c-file. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1878d0a96333..15e6a2bffc2b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1036,7 +1036,6 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); -void hci_conn_params_clear_all(struct hci_dev *hdev); void hci_conn_params_clear_disabled(struct hci_dev *hdev); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, -- cgit v1.2.3 From 2e93e53b8f86fb38a9a3c3bd08e539c40b3f8d89 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:11:17 +0200 Subject: Bluetooth: Run all background scan updates through req_workqueue Instead of firing off a simple async request queue all background scan updates through req_workqueue and use hci_req_sync() there to ensure that no two updates overlap with each other. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 15e6a2bffc2b..c2ca6a58d1e0 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -327,6 +327,8 @@ struct hci_dev { struct work_struct cmd_work; struct work_struct tx_work; + struct work_struct bg_scan_update; + struct sk_buff_head rx_q; struct sk_buff_head raw_q; struct sk_buff_head cmd_q; -- cgit v1.2.3 From 4ebeee2dff9815619be6ff9a845d33716f48468c Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:11:19 +0200 Subject: Bluetooth: Add HCI status return parameter to hci_req_sync() In some cases it may be important to get the exact HCI status rather than the converted HCI-to-errno value. Add an optional return parameter to the hci_req_sync() API to allow for this. Since there are no good HCI translation candidates for cancelation and timeout, use the "unknown" status code for those cases. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0205b80cc90b..cc2216727655 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -452,7 +452,8 @@ enum { #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 -#define HCI_ERROR_INVALID_LL_PARAMS 0x1E +#define HCI_ERROR_INVALID_LL_PARAMS 0x1e +#define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c /* Flow control modes */ -- cgit v1.2.3 From 7c1fbed23981faff2840ddc8909e7c78d80ade30 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:11:23 +0200 Subject: Bluetooth: Move LE scan disable/restart behind req_workqueue To avoid any risks of races, place also these LE scan modification work callbacks behind the same work queue as the other LE scan changes. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c2ca6a58d1e0..1f75aebbd8c4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -328,6 +328,8 @@ struct hci_dev { struct work_struct tx_work; struct work_struct bg_scan_update; + struct delayed_work le_scan_disable; + struct delayed_work le_scan_restart; struct sk_buff_head rx_q; struct sk_buff_head raw_q; @@ -372,9 +374,6 @@ struct hci_dev { DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS); - struct delayed_work le_scan_disable; - struct delayed_work le_scan_restart; - __s8 adv_tx_power; __u8 adv_data[HCI_MAX_AD_LENGTH]; __u8 adv_data_len; -- cgit v1.2.3 From e68f072b7396574df5324e1cf93e4b0c92460735 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:30:30 +0200 Subject: Bluetooth: Move Start Discovery to req_workqueue Since discovery also deals with LE scanning it makes sense to move it behind the same req_workqueue as other LE scanning changes. This also simplifies the logic since we do many of the actions in a synchronous manner. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1f75aebbd8c4..72ea8a6d7d70 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -327,6 +327,7 @@ struct hci_dev { struct work_struct cmd_work; struct work_struct tx_work; + struct work_struct discov_update; struct work_struct bg_scan_update; struct delayed_work le_scan_disable; struct delayed_work le_scan_restart; @@ -1473,6 +1474,7 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status); void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); +void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len); -- cgit v1.2.3 From 2154d3f4fb83c812a161c4910948dd876997e111 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:30:45 +0200 Subject: Bluetooth: Move Stop Discovery to req_workqueue Since discovery also deals with LE scanning it makes sense to move it behind the same req_workqueue as other LE scanning changes. This also simplifies the logic since we do many of the actions in a synchronous manner. Part of this refactoring is moving hci_req_stop_discovery() to hci_request.c. At the same time the function receives support for properly handling the STOPPING state since that's the state we'll be in when stopping through the req_workqueue. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 72ea8a6d7d70..609f4a03899c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1475,6 +1475,7 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); +void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len); -- cgit v1.2.3 From 0ad06aa6a7682319bb1adcc187a1fa8db6b2da2c Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 14:44:57 +0200 Subject: Bluetooth: Fix specifying role for LE connections The hci_connect_le_scan() is (as the name implies) a master/central role API, so it makes no sense in passing a role parameter to it. At the same time this patch also fixes the direct advertising support for LE L2CAP sockets where we now call the more appropriate hci_le_connect() API if slave/peripheral role is desired. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 609f4a03899c..55ce209157b1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -877,7 +877,7 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle); struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, - u16 conn_timeout, u8 role); + u16 conn_timeout); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, u8 role); -- cgit v1.2.3 From 2e6edc95382cc36423aff18a237173ad62d5ab52 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 19 Nov 2015 13:29:28 -0800 Subject: block: protect rw_page against device teardown Fix use after free crashes like the following: general protection fault: 0000 [#1] SMP Call Trace: [] ? pmem_do_bvec.isra.12+0xa6/0xf0 [nd_pmem] [] pmem_rw_page+0x42/0x80 [nd_pmem] [] bdev_read_page+0x50/0x60 [] do_mpage_readpage+0x510/0x770 [] ? I_BDEV+0x20/0x20 [] ? lru_cache_add+0x1c/0x50 [] mpage_readpages+0x107/0x170 [] ? I_BDEV+0x20/0x20 [] ? I_BDEV+0x20/0x20 [] blkdev_readpages+0x1d/0x20 [] __do_page_cache_readahead+0x28f/0x310 [] ? __do_page_cache_readahead+0x169/0x310 [] ? pagecache_get_page+0x2d/0x1d0 [] filemap_fault+0x396/0x530 [] __do_fault+0x4e/0xf0 [] handle_mm_fault+0x11bd/0x1b50 Cc: Cc: Jens Axboe Cc: Alexander Viro Reported-by: kbuild test robot Acked-by: Matthew Wilcox [willy: symmetry fixups] Signed-off-by: Dan Williams --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3fe27f8d91f0..c0d2b7927c1f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -794,6 +794,8 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); +extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); +extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -- cgit v1.2.3 From ac0621971a26526cad8cf9db7626d5e50562a441 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Tue, 17 Nov 2015 10:24:38 +0200 Subject: mac80211: always set the buf_size in AddBA req to 64 Advertising reordering window in ADDBA less than 64 can crash some APs, an example is LinkSys WRT120N (with FW v1.0.07 build 002 Jun 18 2012). On the other hand, a driver may need to limit Tx A-MPDU size for its own reasons, like specific HW limitations. Signed-off-by: Gregory Greenman Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 82045fca388b..760bc4d5a2cf 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2003,8 +2003,10 @@ enum ieee80211_hw_flags { * it shouldn't be set. * * @max_tx_aggregation_subframes: maximum number of subframes in an - * aggregate an HT driver will transmit, used by the peer as a - * hint to size its reorder buffer. + * aggregate an HT driver will transmit. Though ADDBA will advertise + * a constant value of 64 as some older APs can crash if the window + * size is smaller (an example is LinkSys WRT120N with FW v1.0.07 + * build 002 Jun 18 2012). * * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX * (if %IEEE80211_HW_QUEUE_CONTROL is set) -- cgit v1.2.3 From 0b59733b95f9d7af6bee6e6a4d0d444eb694c514 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:56 +0100 Subject: lightnvm: keep track of block counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain number of in use blocks, free blocks, and bad blocks in a per lun basis. This allows the upper layers to get information about the state of each lun. Also, account for blocks reserved to the device on the free block count. nr_free_blocks matches now the actual number of blocks on the free list when the device is booted. Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cbe288acb1de..831a20cf070c 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -213,7 +213,9 @@ struct nvm_lun { int lun_id; int chnl_id; + unsigned int nr_inuse_blocks; /* Number of used blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ + unsigned int nr_bad_blocks; /* Number of bad blocks */ struct nvm_block *blocks; spinlock_t lock; -- cgit v1.2.3 From 2fde0e482db2b43bb4ed0e9aebfbe78ebcbbf5a6 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:57 +0100 Subject: lightnvm: add free and bad lun info to show luns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add free block, used block, and bad block information to the show debug interface. This information is used to debug how targets track blocks. Also, change debug function name to make it more generic. Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 831a20cf070c..3db5552b17d5 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -380,7 +380,7 @@ typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); -typedef void (nvmm_free_blocks_print_fn)(struct nvm_dev *); +typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); struct nvmm_type { const char *name; @@ -404,7 +404,7 @@ struct nvmm_type { nvmm_get_lun_fn *get_lun; /* Statistics */ - nvmm_free_blocks_print_fn *free_blocks_print; + nvmm_lun_info_print_fn *lun_info_print; struct list_head list; }; -- cgit v1.2.3 From b11cfb5807e30333b36c02701382b820b7dcf0d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: record ancestor IDs and reimplement cgroup_is_descendant() using it cgroup_is_descendant() currently walks up the hierarchy and compares each ancestor to the cgroup in question. While enough for cgroup core usages, this can't be used in hot paths to test cgroup membership. This patch adds cgroup->ancestor_ids[] which records the IDs of all ancestors including self and cgroup->level for the nesting level. This allows testing whether a given cgroup is a descendant of another in three finite steps - testing whether the two belong to the same hierarchy, whether the descendant candidate is at the same or a higher level than the ancestor and comparing the recorded ancestor_id at the matching level. cgroup_is_descendant() is accordingly reimplmented and made inline. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 14 ++++++++++++++ include/linux/cgroup.h | 18 +++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b26276d..504d8591b6d3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -234,6 +234,14 @@ struct cgroup { */ int id; + /* + * The depth this cgroup is at. The root is at depth zero and each + * step down the hierarchy increments the level. This along with + * ancestor_ids[] can determine whether a given cgroup is a + * descendant of another without traversing the hierarchy. + */ + int level; + /* * Each non-empty css_set associated with this cgroup contributes * one to populated_cnt. All children with non-zero popuplated_cnt @@ -289,6 +297,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + + /* ids of the ancestors at each level including self */ + int ancestor_ids[]; }; /* @@ -308,6 +319,9 @@ struct cgroup_root { /* The root cgroup. Root is destroyed on its release. */ struct cgroup cgrp; + /* for cgrp->ancestor_ids[0] */ + int cgrp_ancestor_id_storage; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754f89c5..b5ee2c4210f9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,7 +81,6 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -459,6 +458,23 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +static inline bool cgroup_is_descendant(struct cgroup *cgrp, + struct cgroup *ancestor) +{ + if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) + return false; + return cgrp->ancestor_ids[ancestor->level] == ancestor->id; +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { -- cgit v1.2.3 From bd96f76a2454c6b97d70945902e30b4c31510678 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: kernfs: implement kernfs_walk_and_get() Implement kernfs_walk_and_get() which is similar to kernfs_find_and_get() but can walk a path instead of just a name. v2: Use strlcpy() instead of strlen() + memcpy() as suggested by David. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: David Miller --- include/linux/kernfs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5d4e9c4b821d..af51df35d749 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -274,6 +274,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); @@ -350,6 +352,10 @@ static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } +static inline struct kernfs_node * +kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, + const void *ns) +{ return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } @@ -430,6 +436,12 @@ kernfs_find_and_get(struct kernfs_node *kn, const char *name) return kernfs_find_and_get_ns(kn, name, NULL); } +static inline struct kernfs_node * +kernfs_walk_and_get(struct kernfs_node *kn, const char *path) +{ + return kernfs_walk_and_get_ns(kn, path, NULL); +} + static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) -- cgit v1.2.3 From 16af439645455fbf36984ca5e72f31073ee19ab7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: implement cgroup_get_from_path() and expose cgroup_put() Implement cgroup_get_from_path() using kernfs_walk_and_get() which obtains a default hierarchy cgroup from its path. This will be used to allow cgroup path based matching from outside cgroup proper - e.g. networking and perf. v2: Add EXPORT_SYMBOL_GPL(cgroup_get_from_path). Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b5ee2c4210f9..4c3ffab81ba7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,6 +81,8 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); +struct cgroup *cgroup_get_from_path(const char *path); + int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -351,6 +353,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } +static inline void cgroup_put(struct cgroup *cgrp) +{ + css_put(&cgrp->self); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for -- cgit v1.2.3 From 94a58c360a45c066ab5472cfd2bf2a4ba63aa532 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 20 Nov 2015 15:56:48 -0800 Subject: slab.h: sprinkle __assume_aligned attributes The various allocators return aligned memory. Telling the compiler that allows it to generate better code in many cases, for example when the return value is immediately passed to memset(). Some code does become larger, but at least we win twice as much as we lose: $ scripts/bloat-o-meter /tmp/vmlinux vmlinux add/remove: 0/0 grow/shrink: 13/52 up/down: 995/-2140 (-1145) An example of the different (and smaller) code can be seen in mm_alloc(). Before: : 48 8d 78 08 lea 0x8(%rax),%rdi : 48 89 c1 mov %rax,%rcx : 48 89 c2 mov %rax,%rdx : 48 c7 00 00 00 00 00 movq $0x0,(%rax) : 48 c7 80 48 03 00 00 movq $0x0,0x348(%rax) : 00 00 00 00 : 31 c0 xor %eax,%eax : 48 83 e7 f8 and $0xfffffffffffffff8,%rdi : 48 29 f9 sub %rdi,%rcx : 81 c1 50 03 00 00 add $0x350,%ecx : c1 e9 03 shr $0x3,%ecx : f3 48 ab rep stos %rax,%es:(%rdi) After: : 48 89 c2 mov %rax,%rdx : b9 6a 00 00 00 mov $0x6a,%ecx : 31 c0 xor %eax,%eax : 48 89 d7 mov %rdx,%rdi : f3 48 ab rep stos %rax,%es:(%rdi) So gcc's strategy is to do two possibly (but not really, of course) unaligned stores to the first and last word, then do an aligned rep stos covering the middle part with a little overlap. Maybe arches which do not allow unaligned stores gain even more. I don't know if gcc can actually make use of alignments greater than 8 for anything, so one could probably drop the __assume_xyz_alignment macros and just use __assume_aligned(8). The increases in code size are mostly caused by gcc deciding to opencode strlen() using the check-four-bytes-at-a-time trick when it knows the buffer is sufficiently aligned (one function grew by 200 bytes). Now it turns out that many of these strlen() calls showing up were in fact redundant, and they're gone from -next. Applying the two patches to next-20151001 bloat-o-meter instead says add/remove: 0/0 grow/shrink: 6/52 up/down: 244/-2140 (-1896) Signed-off-by: Rasmus Villemoes Acked-by: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 7c82e3b307a3..96940772bb92 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -157,6 +157,24 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #endif +/* + * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. + * Intended for arches that get misalignment faults even for 64 bit integer + * aligned buffers. + */ +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) +#endif + +/* + * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned + * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN + * aligned pointers. + */ +#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN) +#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN) +#define __assume_page_alignment __assume_aligned(PAGE_SIZE) + /* * Kmalloc array related definitions */ @@ -286,8 +304,8 @@ static __always_inline int kmalloc_index(size_t size) } #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags); -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment; +void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment; void kmem_cache_free(struct kmem_cache *, void *); /* @@ -301,8 +319,8 @@ void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node); -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; +void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { @@ -316,12 +334,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t); +extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment; #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size); + int node, size_t size) __assume_slab_alignment; #else static __always_inline void * kmem_cache_alloc_node_trace(struct kmem_cache *s, @@ -354,10 +372,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; #else static __always_inline void * kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) @@ -482,15 +500,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) return __kmalloc_node(size, flags, node); } -/* - * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. - * Intended for arches that get misalignment faults even for 64 bit integer - * aligned buffers. - */ -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) -#endif - struct memcg_cache_array { struct rcu_head rcu; struct kmem_cache *entries[0]; -- cgit v1.2.3 From 5cf6a51e6062afe7cc507f32f1e5f7e6497ae844 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Fri, 20 Nov 2015 15:56:53 -0800 Subject: configfs: allow dynamic group creation This patchset introduces IIO software triggers, offers a way of configuring them via configfs and adds the IIO hrtimer based interrupt source to be used with software triggers. The architecture is now split in 3 parts, to remove all IIO trigger specific parts from IIO configfs core: (1) IIO configfs - creates the root of the IIO configfs subsys. (2) IIO software triggers - software trigger implementation, dynamically creating /config/iio/triggers group. (3) IIO hrtimer trigger - is the first interrupt source for software triggers (with syfs to follow). Each trigger type can implement its own set of attributes. Lockdep seems to be happy with the locking in configfs patch. This patch (of 5): We don't want to hardcode default groups at subsystem creation time. We export: * configfs_register_group * configfs_unregister_group to allow drivers to programatically create/destroy groups later, after module init time. This is needed for IIO configfs support. (akpm: the other 4 patches to be merged via the IIO tree) Signed-off-by: Daniel Baluta Suggested-by: Lars-Peter Clausen Reviewed-by: Christoph Hellwig Acked-by: Joel Becker Cc: Hartmut Knaack Cc: Octavian Purdila Cc: Paul Bolle Cc: Adriana Reus Cc: Cristina Opriceana Cc: Peter Meerwald Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/configfs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index a8a335b7fce0..758a029011b1 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -197,6 +197,16 @@ static inline struct configfs_subsystem *to_configfs_subsystem(struct config_gro int configfs_register_subsystem(struct configfs_subsystem *subsys); void configfs_unregister_subsystem(struct configfs_subsystem *subsys); +int configfs_register_group(struct config_group *parent_group, + struct config_group *group); +void configfs_unregister_group(struct config_group *group); + +struct config_group * +configfs_register_default_group(struct config_group *parent_group, + const char *name, + struct config_item_type *item_type); +void configfs_unregister_default_group(struct config_group *group); + /* These functions can sleep and can alloc with GFP_KERNEL */ /* WARNING: These cannot be called underneath configfs callbacks!! */ int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target); -- cgit v1.2.3 From 9d8a765211335cfdad464b90fb19f546af5706ae Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 20 Nov 2015 15:57:21 -0800 Subject: kernel/signal.c: unexport sigsuspend() sigsuspend() is nowhere used except in signal.c itself, so we can mark it static do not pollute the global namespace. But this patch is more than a boring cleanup patch, it fixes a real issue on UserModeLinux. UML has a special console driver to display ttys using xterm, or other terminal emulators, on the host side. Vegard reported that sometimes UML is unable to spawn a xterm and he's facing the following warning: WARNING: CPU: 0 PID: 908 at include/linux/thread_info.h:128 sigsuspend+0xab/0xc0() It turned out that this warning makes absolutely no sense as the UML xterm code calls sigsuspend() on the host side, at least it tries. But as the kernel itself offers a sigsuspend() symbol the linker choose this one instead of the glibc wrapper. Interestingly this code used to work since ever but always blocked signals on the wrong side. Some recent kernel change made the WARN_ON() trigger and uncovered the bug. It is a wonderful example of how much works by chance on computers. :-) Fixes: 68f3f16d9ad0f1 ("new helper: sigsuspend()") Signed-off-by: Richard Weinberger Reported-by: Vegard Nossum Tested-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/signal.h b/include/linux/signal.h index ab1e0392b5ac..92557bbce7e7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -239,7 +239,6 @@ extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; -extern int sigsuspend(sigset_t *); struct sigaction { #ifndef __ARCH_HAS_IRIX_SIGACTION -- cgit v1.2.3 From 21fa8442799945beaca074cb5bcf7cfe24969d59 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 20 Nov 2015 15:57:32 -0800 Subject: mm: fix up sparse warning in gfpflags_allow_blocking sparse says: include/linux/gfp.h:274:26: warning: incorrect type in return expression (different base types) include/linux/gfp.h:274:26: expected bool include/linux/gfp.h:274:26: got restricted gfp_t ...add a forced cast to silence the warning. Signed-off-by: Jeff Layton Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6523109e136d..8942af0813e3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -271,7 +271,7 @@ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { - return gfp_flags & __GFP_DIRECT_RECLAIM; + return (bool __force)(gfp_flags & __GFP_DIRECT_RECLAIM); } #ifdef CONFIG_HIGHMEM -- cgit v1.2.3 From 6b2a3d628aa752f0ab825fc6d4d07b09e274d1c1 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 8 Nov 2015 08:52:31 -0500 Subject: tty: audit: Fix audit source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The data to audit/record is in the 'from' buffer (ie., the input read buffer). Fixes: 72586c6061ab ("n_tty: Fix auditing support for cannonical mode") Cc: stable # 4.1+ Cc: Miloslav Trmač Signed-off-by: Peter Hurley Acked-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 5b04b0a5375b..5e31f1b99037 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -607,7 +607,7 @@ extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); /* tty_audit.c */ #ifdef CONFIG_AUDIT -extern void tty_audit_add_data(struct tty_struct *tty, unsigned char *data, +extern void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size, unsigned icanon); extern void tty_audit_exit(void); extern void tty_audit_fork(struct signal_struct *sig); @@ -615,8 +615,8 @@ extern void tty_audit_tiocsti(struct tty_struct *tty, char ch); extern void tty_audit_push(struct tty_struct *tty); extern int tty_audit_push_current(void); #else -static inline void tty_audit_add_data(struct tty_struct *tty, - unsigned char *data, size_t size, unsigned icanon) +static inline void tty_audit_add_data(struct tty_struct *tty, const void *data, + size_t size, unsigned icanon) { } static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch) -- cgit v1.2.3 From 40b25fe5dc57a6557b96241b75ae63dce716a487 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 19 Nov 2015 16:16:43 +0100 Subject: Bluetooth: Add support for Get Advertising Size Information command The Get Advertising Size Information command allows to retrieve size information for advertising data and scan response data fields depending on the selected flags. This is useful if applications want to know the available size ahead of time. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/mgmt.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index b831242d48a4..af17774c9416 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -571,6 +571,19 @@ struct mgmt_rp_remove_advertising { __u8 instance; } __packed; +#define MGMT_OP_GET_ADV_SIZE_INFO 0x0040 +struct mgmt_cp_get_adv_size_info { + __u8 instance; + __le32 flags; +} __packed; +#define MGMT_GET_ADV_SIZE_INFO_SIZE 5 +struct mgmt_rp_get_adv_size_info { + __u8 instance; + __le32 flags; + __u8 max_adv_data_len; + __u8 max_scan_rsp_len; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; -- cgit v1.2.3 From 865762a8119e74b5f0e236d2d8eaaf8be9292a06 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:58 -0800 Subject: slab/slub: adjust kmem_cache_alloc_bulk API Adjust kmem_cache_alloc_bulk API before we have any real users. Adjust API to return type 'int' instead of previously type 'bool'. This is done to allow future extension of the bulk alloc API. A future extension could be to allow SLUB to stop at a page boundary, when specified by a flag, and then return the number of objects. The advantage of this approach, would make it easier to make bulk alloc run without local IRQs disabled. With an approach of cmpxchg "stealing" the entire c->freelist or page->freelist. To avoid overshooting we would stop processing at a slab-page boundary. Else we always end up returning some objects at the cost of another cmpxchg. To keep compatible with future users of this API linking against an older kernel when using the new flag, we need to return the number of allocated objects with this API change. Signed-off-by: Jesper Dangaard Brouer Cc: Vladimir Davydov Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 96940772bb92..2037a861e367 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -316,7 +316,7 @@ void kmem_cache_free(struct kmem_cache *, void *); * Note that interrupts must be enabled when calling these functions. */ void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; -- cgit v1.2.3 From cc30c16344fc3a25153175c7eb9037b2136cd466 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 20 Nov 2015 03:56:23 +0100 Subject: net: dsa: Add support for a switch reset gpio Some boards have a gpio line tied to the switch reset pin. Allow this gpio to be retrieved from the device tree, and take the switch out of reset before performing the probe. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 82a4c6011173..3f23dd9d6a69 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,13 @@ struct dsa_chip_data { * NULL if there is only one switch chip. */ s8 *rtable; + + /* + * A switch may have a GPIO line tied to its reset pin. Parse + * this from the device tree, and use it before performing + * switch soft reset. + */ + struct gpio_desc *reset; }; struct dsa_platform_data { -- cgit v1.2.3 From 7d267278a9ece963d77eefec61630223fce08c6c Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Fri, 20 Nov 2015 22:07:23 +0000 Subject: unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b36d837c701e..2a91a0561a47 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -62,6 +62,7 @@ struct unix_sock { #define UNIX_GC_CANDIDATE 0 #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; + wait_queue_t peer_wake; }; static inline struct unix_sock *unix_sk(const struct sock *sk) -- cgit v1.2.3 From 7ef8f65df976369588fa1b6466668b1b6a26eb3c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 21 Nov 2015 15:57:27 +0100 Subject: net: ipmr: fix code and comment style Trivial code and comment style fixes, also removed some extra newlines, spaces and tabs. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 59 +++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index a382d2c04a42..cf943016930f 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -4,15 +4,13 @@ #include #include -/* - * Based on the MROUTING 3.5 defines primarily to keep - * source compatibility with BSD. +/* Based on the MROUTING 3.5 defines primarily to keep + * source compatibility with BSD. * - * See the mrouted code for the original history. - * - * Protocol Independent Multicast (PIM) data structures included - * Carlos Picoto (cap@di.fc.ul.pt) + * See the mrouted code for the original history. * + * Protocol Independent Multicast (PIM) data structures included + * Carlos Picoto (cap@di.fc.ul.pt) */ #define MRT_BASE 200 @@ -34,15 +32,13 @@ #define SIOCGETSGCNT (SIOCPROTOPRIVATE+1) #define SIOCGETRPF (SIOCPROTOPRIVATE+2) -#define MAXVIFS 32 +#define MAXVIFS 32 typedef unsigned long vifbitmap_t; /* User mode code depends on this lot */ typedef unsigned short vifi_t; #define ALL_VIFS ((vifi_t)(-1)) -/* - * Same idea as select - */ - +/* Same idea as select */ + #define VIFM_SET(n,m) ((m)|=(1<<(n))) #define VIFM_CLR(n,m) ((m)&=~(1<<(n))) #define VIFM_ISSET(n,m) ((m)&(1<<(n))) @@ -50,11 +46,9 @@ typedef unsigned short vifi_t; #define VIFM_COPY(mfrom,mto) ((mto)=(mfrom)) #define VIFM_SAME(m1,m2) ((m1)==(m2)) -/* - * Passed by mrouted for an MRT_ADD_VIF - again we use the - * mrouted 3.6 structures for compatibility +/* Passed by mrouted for an MRT_ADD_VIF - again we use the + * mrouted 3.6 structures for compatibility */ - struct vifctl { vifi_t vifc_vifi; /* Index of VIF */ unsigned char vifc_flags; /* VIFF_ flags */ @@ -73,10 +67,7 @@ struct vifctl { #define VIFF_USE_IFINDEX 0x8 /* use vifc_lcl_ifindex instead of vifc_lcl_addr to find an interface */ -/* - * Cache manipulation structures for mrouted and PIMd - */ - +/* Cache manipulation structures for mrouted and PIMd */ struct mfcctl { struct in_addr mfcc_origin; /* Origin of mcast */ struct in_addr mfcc_mcastgrp; /* Group in question */ @@ -88,10 +79,7 @@ struct mfcctl { int mfcc_expire; }; -/* - * Group count retrieval for mrouted - */ - +/* Group count retrieval for mrouted */ struct sioc_sg_req { struct in_addr src; struct in_addr grp; @@ -100,10 +88,7 @@ struct sioc_sg_req { unsigned long wrong_if; }; -/* - * To get vif packet counts - */ - +/* To get vif packet counts */ struct sioc_vif_req { vifi_t vifi; /* Which iface */ unsigned long icount; /* In packets */ @@ -112,11 +97,9 @@ struct sioc_vif_req { unsigned long obytes; /* Out bytes */ }; -/* - * This is the format the mroute daemon expects to see IGMP control - * data. Magically happens to be like an IP packet as per the original +/* This is the format the mroute daemon expects to see IGMP control + * data. Magically happens to be like an IP packet as per the original */ - struct igmpmsg { __u32 unused1,unused2; unsigned char im_msgtype; /* What is this */ @@ -126,21 +109,13 @@ struct igmpmsg { struct in_addr im_src,im_dst; }; -/* - * That's all usermode folks - */ - - +/* That's all usermode folks */ #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ -/* - * Pseudo messages used by mrouted - */ - +/* Pseudo messages used by mrouted */ #define IGMPMSG_NOCACHE 1 /* Kern cache fill request to mrouted */ #define IGMPMSG_WRONGVIF 2 /* For PIM assert processing (unused) */ #define IGMPMSG_WHOLEPKT 3 /* For PIM Register processing */ - #endif /* _UAPI__LINUX_MROUTE_H */ -- cgit v1.2.3 From c86b3de8c8b02d7e474fdc002c8df533b844524c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Nov 2015 17:48:52 +0100 Subject: thermal: fix thermal_zone_bind_cooling_device prototype When the prototype for thermal_zone_bind_cooling_device changed, the static inline wrapper function was left alone, which in theory can cause build warnings: I have seen this error in the past: drivers/thermal/db8500_thermal.c: In function 'db8500_cdev_bind': drivers/thermal/db8500_thermal.c:78:9: error: too many arguments to function 'thermal_zone_bind_cooling_device' ret = thermal_zone_bind_cooling_device(thermal, i, cdev, while this one no longer shows up, there is no doubt that the prototype is still wrong, so let's just fix it anyway. Signed-off-by: Arnd Bergmann Fixes: 6cd9e9f629f1 ("thermal: of: fix cooling device weights in device tree") Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4014a59828fc..613c29bd6baf 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -438,7 +438,8 @@ static inline void thermal_zone_device_unregister( static inline int thermal_zone_bind_cooling_device( struct thermal_zone_device *tz, int trip, struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower) + unsigned long upper, unsigned long lower, + unsigned int weight) { return -ENODEV; } static inline int thermal_zone_unbind_cooling_device( struct thermal_zone_device *tz, int trip, -- cgit v1.2.3 From 0f42a6a9b807b092841f7e1b381f8c7e80a0d86a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Nov 2015 09:38:48 +0100 Subject: nfs: use btrfs ioctl defintions for clone The NFS CLONE_RANGE defintion was wrong and thus never worked. Fix this by simply using the btrfs ioctl defintion. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- include/uapi/linux/nfs.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h index 654bae3f1a38..5e6296160361 100644 --- a/include/uapi/linux/nfs.h +++ b/include/uapi/linux/nfs.h @@ -33,17 +33,6 @@ #define NFS_PIPE_DIRNAME "nfs" -/* NFS ioctls */ -/* Let's follow btrfs lead on CLONE to avoid messing userspace */ -#define NFS_IOC_CLONE _IOW(0x94, 9, int) -#define NFS_IOC_CLONE_RANGE _IOW(0x94, 13, int) - -struct nfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_off, count; - __u64 dst_off; -}; - /* * NFS stats. The good thing with these values is that NFSv3 errors are * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which -- cgit v1.2.3 From 91ab4b4d16e6649fbbf65f303c0c4e20ed680bd1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Nov 2015 14:30:26 -0500 Subject: nfs: use sliding delay when LAYOUTGET gets NFS4ERR_DELAY When LAYOUTGET gets NFS4ERR_DELAY, we currently will wait 15s before retrying the call. That is a _very_ long time, so add a timeout value to struct nfs4_layoutget and pass nfs4_async_handle_error a pointer to it. This allows the RPC engine to use a sliding delay window, instead of a 15s delay. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 570d630f98ae..11bbae44f4cb 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -251,6 +251,7 @@ struct nfs4_layoutget { struct nfs4_layoutget_res res; struct rpc_cred *cred; gfp_t gfp_flags; + long timeout; }; struct nfs4_getdeviceinfo_args { -- cgit v1.2.3 From 0e3dfda91d9fe8e2c4d0b5d21434b173a241eeaf Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 24 Nov 2015 16:23:05 +0100 Subject: KVM: arm/arm64: arch_timer: Preserve physical dist. active state on LR.active We were incorrectly removing the active state from the physical distributor on the timer interrupt when the timer output level was deasserted. We shouldn't be doing this without considering the virtual interrupt's active state, because the architecture requires that when an LR has the HW bit set and the pending or active bits set, then the physical interrupt must also have the corresponding bits set. This addresses an issue where we have been observing an inconsistency between the LR state and the physical distributor state where the LR state was active and the physical distributor was not active, which shouldn't happen. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9c747cb14ad8..d2f41477f8ae 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -342,10 +342,10 @@ int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, struct irq_phys_map *map, bool level); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); -int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu); struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int irq); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map); +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) -- cgit v1.2.3 From 264640fc2c5f4f913db5c73fa3eb1ead2c45e9d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Tue, 24 Nov 2015 15:07:11 +0100 Subject: ipv6: distinguish frag queues by device for multicast and link-local packets If a fragmented multicast packet is received on an ethernet device which has an active macvlan on top of it, each fragment is duplicated and received both on the underlying device and the macvlan. If some fragments for macvlan are processed before the whole packet for the underlying device is reassembled, the "overlapping fragments" test in ip6_frag_queue() discards the whole fragment queue. To resolve this, add device ifindex to the search key and require it to match reassembling multicast packets and packets to link-local addresses. Note: similar patch has been already submitted by Yoshifuji Hideaki in http://patchwork.ozlabs.org/patch/220979/ but got lost and forgotten for some reason. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index e1a10b0ac0b0..ea5a13ef85a6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -490,6 +490,7 @@ struct ip6_create_arg { u32 user; const struct in6_addr *src; const struct in6_addr *dst; + int iif; u8 ecn; }; -- cgit v1.2.3 From fbc416ff86183e2203cdf975e2881d7c164b0271 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Nov 2015 12:12:21 +0100 Subject: arm64: fix building without CONFIG_UID16 As reported by Michal Simek, building an ARM64 kernel with CONFIG_UID16 disabled currently fails because the system call table still needs to reference the individual function entry points that are provided by kernel/sys_ni.c in this case, and the declarations are hidden inside of #ifdef CONFIG_UID16: arch/arm64/include/asm/unistd32.h:57:8: error: 'sys_lchown16' undeclared here (not in a function) __SYSCALL(__NR_lchown, sys_lchown16) I believe this problem only exists on ARM64, because older architectures tend to not need declarations when their system call table is built in assembly code, while newer architectures tend to not need UID16 support. ARM64 only uses these system calls for compatibility with 32-bit ARM binaries. This changes the CONFIG_UID16 check into CONFIG_HAVE_UID16, which is set unconditionally on ARM64 with CONFIG_COMPAT, so we see the declarations whenever we need them, but otherwise the behavior is unchanged. Fixes: af1839eb4bd4 ("Kconfig: clean up the long arch list for the UID16 config option") Signed-off-by: Arnd Bergmann Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Catalin Marinas --- include/linux/syscalls.h | 2 +- include/linux/types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a156b82dd14c..c2b66a277e98 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -524,7 +524,7 @@ asmlinkage long sys_chown(const char __user *filename, asmlinkage long sys_lchown(const char __user *filename, uid_t user, gid_t group); asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); -#ifdef CONFIG_UID16 +#ifdef CONFIG_HAVE_UID16 asmlinkage long sys_chown16(const char __user *filename, old_uid_t user, old_gid_t group); asmlinkage long sys_lchown16(const char __user *filename, diff --git a/include/linux/types.h b/include/linux/types.h index 70d8500bddf1..70dd3dfde631 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -35,7 +35,7 @@ typedef __kernel_gid16_t gid16_t; typedef unsigned long uintptr_t; -#ifdef CONFIG_UID16 +#ifdef CONFIG_HAVE_UID16 /* This is defined by include/asm-{arch}/posix_types.h */ typedef __kernel_old_uid_t old_uid_t; typedef __kernel_old_gid_t old_gid_t; -- cgit v1.2.3 From 9458ceab49179b7fd2d5192fd9dcf316ca195dc0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 24 Nov 2015 15:30:21 -0800 Subject: net: phy: bcm7xxx: Add entry for Broadcom BCM7435 Add a PHY entry for the Broadcom BCM7435 chips, this is a 40nm generation Ethernet PHY which is analogous to its 7425 and 7429 counter parts. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 59f4a7304419..f0ba9c2ec639 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -26,6 +26,7 @@ #define PHY_ID_BCM7366 0x600d8490 #define PHY_ID_BCM7425 0x600d86b0 #define PHY_ID_BCM7429 0x600d8730 +#define PHY_ID_BCM7435 0x600d8750 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7439_2 0xae025080 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3 From c9da161c6517ba12154059d3b965c2cbaf16f90f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Nov 2015 21:28:15 +0100 Subject: bpf: fix clearing on persistent program array maps Currently, when having map file descriptors pointing to program arrays, there's still the issue that we unconditionally flush program array contents via bpf_fd_array_map_clear() in bpf_map_release(). This happens when such a file descriptor is released and is independent of the map's refcount. Having this flush independent of the refcount is for a reason: there can be arbitrary complex dependency chains among tail calls, also circular ones (direct or indirect, nesting limit determined during runtime), and we need to make sure that the map drops all references to eBPF programs it holds, so that the map's refcount can eventually drop to zero and initiate its freeing. Btw, a walk of the whole dependency graph would not be possible for various reasons, one being complexity and another one inconsistency, i.e. new programs can be added to parts of the graph at any time, so there's no guaranteed consistent state for the time of such a walk. Now, the program array pinning itself works, but the issue is that each derived file descriptor on close would nevertheless call unconditionally into bpf_fd_array_map_clear(). Instead, keep track of users and postpone this flush until the last reference to a user is dropped. As this only concerns a subset of references (f.e. a prog array could hold a program that itself has reference on the prog array holding it, etc), we need to track them separately. Short analysis on the refcounting: on map creation time usercnt will be one, so there's no change in behaviour for bpf_map_release(), if unpinned. If we already fail in map_create(), we are immediately freed, and no file descriptor has been made public yet. In bpf_obj_pin_user(), we need to probe for a possible map in bpf_fd_probe_obj() already with a usercnt reference, so before we drop the reference on the fd with fdput(). Therefore, if actual pinning fails, we need to drop that reference again in bpf_any_put(), otherwise we keep holding it. When last reference drops on the inode, the bpf_any_put() in bpf_evict_inode() will take care of dropping the usercnt again. In the bpf_obj_get_user() case, the bpf_any_get() will grab a reference on the usercnt, still at a time when we have the reference on the path. Should we later on fail to grab a new file descriptor, bpf_any_put() will drop it, otherwise we hold it until bpf_map_release() time. Joint work with Alexei. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de464e6683b6..83d1926c61e4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -40,6 +40,7 @@ struct bpf_map { struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; + atomic_t usercnt; }; struct bpf_map_type_list { @@ -167,8 +168,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_put_rcu(struct bpf_prog *prog); -struct bpf_map *bpf_map_get(u32 ufd); +struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); +void bpf_map_inc(struct bpf_map *map, bool uref); +void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); extern int sysctl_unprivileged_bpf_disabled; -- cgit v1.2.3 From 7c7a0e945349a3d0d497d7f32db6ed33d4031110 Mon Sep 17 00:00:00 2001 From: Gabriele Paoloni Date: Wed, 11 Nov 2015 09:12:25 +0800 Subject: ARM/PCI: Move align_resource function pointer to pci_host_bridge structure Commit b3a72384fe29 ("ARM/PCI: Replace pci_sys_data->align_resource with global function pointer") introduced an ARM-specific align_resource() function pointer. This is not portable to other arches and doesn't work for platforms with two different PCIe host bridge controllers. Move the function pointer to the pci_host_bridge structure so each host bridge driver can specify its own align_resource() function. Signed-off-by: Gabriele Paoloni Signed-off-by: Bjorn Helgaas Reviewed-by: Arnd Bergmann --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index e828e7b4afec..6ae25aae88fd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -412,9 +412,18 @@ struct pci_host_bridge { void (*release_fn)(struct pci_host_bridge *); void *release_data; unsigned int ignore_reset_delay:1; /* for entire hierarchy */ + /* Resource alignment requirements */ + resource_size_t (*align_resource)(struct pci_dev *dev, + const struct resource *res, + resource_size_t start, + resource_size_t size, + resource_size_t align); }; #define to_pci_host_bridge(n) container_of(n, struct pci_host_bridge, dev) + +struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); + void pci_set_host_bridge_release(struct pci_host_bridge *bridge, void (*release_fn)(struct pci_host_bridge *), void *release_data); -- cgit v1.2.3 From 057085e522f8bf94c2e691a5b76880f68060f8ba Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 5 Nov 2015 23:37:59 -0800 Subject: target: Fix race for SCF_COMPARE_AND_WRITE_POST checking This patch addresses a race + use after free where the first stage of COMPARE_AND_WRITE in compare_and_write_callback() is rescheduled after the backend sends the secondary WRITE, resulting in second stage compare_and_write_post() callback completing in target_complete_ok_work() before the first can return. Because current code depends on checking se_cmd->se_cmd_flags after return from se_cmd->transport_complete_callback(), this results in first stage having SCF_COMPARE_AND_WRITE_POST set, which incorrectly falls through into second stage CAW processing code, eventually triggering a NULL pointer dereference due to use after free. To address this bug, pass in a new *post_ret parameter into se_cmd->transport_complete_callback(), and depend upon this value instead of ->se_cmd_flags to determine when to return or fall through into ->queue_status() code for CAW. Cc: Sagi Grimberg Cc: # v3.12+ Signed-off-by: Nicholas Bellinger --- include/target/target_core_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 0a2c74008e53..aabf0aca0171 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -474,7 +474,7 @@ struct se_cmd { struct completion cmd_wait_comp; const struct target_core_fabric_ops *se_tfo; sense_reason_t (*execute_cmd)(struct se_cmd *); - sense_reason_t (*transport_complete_callback)(struct se_cmd *, bool); + sense_reason_t (*transport_complete_callback)(struct se_cmd *, bool, int *); void *protocol_data; unsigned char *t_task_cdb; -- cgit v1.2.3 From 3a66d7dca186ebdef9b0bf55e216778fa598062c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 22 Oct 2015 16:02:14 -0700 Subject: kref: Remove kref_put_spinlock_irqsave() The last user is gone. Hence remove this function. Signed-off-by: Bart Van Assche Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Joern Engel Signed-off-by: Nicholas Bellinger --- include/linux/kref.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include') diff --git a/include/linux/kref.h b/include/linux/kref.h index 484604d184be..e15828fd71f1 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -19,7 +19,6 @@ #include #include #include -#include struct kref { atomic_t refcount; @@ -99,38 +98,6 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref) return kref_sub(kref, 1, release); } -/** - * kref_put_spinlock_irqsave - decrement refcount for object. - * @kref: object. - * @release: pointer to the function that will clean up the object when the - * last reference to the object is released. - * This pointer is required, and it is not acceptable to pass kfree - * in as this function. - * @lock: lock to take in release case - * - * Behaves identical to kref_put with one exception. If the reference count - * drops to zero, the lock will be taken atomically wrt dropping the reference - * count. The release function has to call spin_unlock() without _irqrestore. - */ -static inline int kref_put_spinlock_irqsave(struct kref *kref, - void (*release)(struct kref *kref), - spinlock_t *lock) -{ - unsigned long flags; - - WARN_ON(release == NULL); - if (atomic_add_unless(&kref->refcount, -1, 1)) - return 0; - spin_lock_irqsave(lock, flags); - if (atomic_dec_and_test(&kref->refcount)) { - release(kref); - local_irq_restore(flags); - return 1; - } - spin_unlock_irqrestore(lock, flags); - return 0; -} - static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) -- cgit v1.2.3 From 08236c6bb2980561fba657c58fdc76f2865f236c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sat, 28 Nov 2015 16:49:27 +0100 Subject: lightnvm: unconverted ppa returned in get_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_bb_tbl function takes ppa as a generic address, which is converted to the ppa device address within the device driver. When the update_bbtbl callback is called from get_bb_tbl, the device specific ppa is used, instead of the generic ppa. Make sure to pass the generic ppa. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 3db5552b17d5..c6916aec43b6 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -179,7 +179,7 @@ typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -- cgit v1.2.3 From bf4e6b4e757488dee1b6a581f49c7ac34cd217f8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 26 Nov 2015 08:46:57 +0100 Subject: block: Always check queue limits for cloned requests When a cloned request is retried on other queues it always needs to be checked against the queue limits of that queue. Otherwise the calculations for nr_phys_segments might be wrong, leading to a crash in scsi_init_sgtable(). To clarify this the patch renames blk_rq_check_limits() to blk_cloned_rq_check_limits() and removes the symbol export, as the new function should only be used for cloned requests and never exported. Cc: Mike Snitzer Cc: Ewan Milne Cc: Jeff Moyer Signed-off-by: Hannes Reinecke Fixes: e2a60da74 ("block: Clean up special command handling logic") Cc: stable@vger.kernel.org # 3.7+ Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0d2b7927c1f..c06f8eaa42ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -773,7 +773,6 @@ extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); -extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, -- cgit v1.2.3 From 880621c2605b82eb5af91a2c94223df6f5a3fb64 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 22 Nov 2015 17:46:09 +0100 Subject: packet: Allow packets with only a header (but no payload) Commit 9c7077622dd91 ("packet: make packet_snd fail on len smaller than l2 header") added validation for the packet size in packet_snd. This change enforces that every packet needs a header (with at least hard_header_len bytes) plus a payload with at least one byte. Before this change the payload was optional. This fixes PPPoE connections which do not have a "Service" or "Host-Uniq" configured (which is violating the spec, but is still widely used in real-world setups). Those are currently failing with the following message: "pppd: packet size is too short (24 <= 24)" Signed-off-by: Martin Blumenstingl Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bfac1abfc1..3b5d134e945a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1398,7 +1398,8 @@ enum netdev_priv_flags { * @dma: DMA channel * @mtu: Interface MTU value * @type: Interface hardware type - * @hard_header_len: Hardware header length + * @hard_header_len: Hardware header length, which means that this is the + * minimum size of a packet. * * @needed_headroom: Extra headroom the hardware may need, but not in all * cases can this be guaranteed -- cgit v1.2.3 From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Nov 2015 13:55:39 +0800 Subject: net: Generalise wq_has_sleeper helper The memory barrier in the helper wq_has_sleeper is needed by just about every user of waitqueue_active. This patch generalises it by making it take a wait_queue_head_t directly. The existing helper is renamed to skwq_has_sleeper. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/wait.h | 21 +++++++++++++++++++++ include/net/sock.h | 15 +++++---------- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..6aa09a875fbd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -107,6 +107,27 @@ static inline int waitqueue_active(wait_queue_head_t *q) return !list_empty(&q->task_list); } +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) +{ + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return waitqueue_active(wq); +} + extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..62d35afcb3ac 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1879,12 +1880,12 @@ static inline bool sk_has_allocations(const struct sock *sk) } /** - * wq_has_sleeper - check if there are any waiting processes + * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Returns true if socket_wq has waiting processes * - * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory + * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths: @@ -1910,15 +1911,9 @@ static inline bool sk_has_allocations(const struct sock *sk) * data on the socket. * */ -static inline bool wq_has_sleeper(struct socket_wq *wq) +static inline bool skwq_has_sleeper(struct socket_wq *wq) { - /* We need to be sure we are in sync with the - * add_wait_queue modifications to the wait queue. - * - * This memory barrier is paired in the sock_poll_wait. - */ - smp_mb(); - return wq && waitqueue_active(&wq->wait); + return wq && wq_has_sleeper(&wq->wait); } /** -- cgit v1.2.3 From 06bd6c0370bb88a2256c6763a32bc4e4ade06521 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:45 +0100 Subject: net: ipmr: remove unused MFC_NOTIFY flag and make the flags enum MFC_NOTIFY was introduced in kernel 2.1.68 but afaik it hasn't been used and I couldn't find any users currently so just remove it. Only MFC_STATIC is left, so move it into an enum, add a description and use BIT(). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 79aaa9fc1a15..fa66ebc1fed6 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -64,6 +64,13 @@ struct vif_device { #define VIFF_STATIC 0x8000 +/* mfc_flags: + * MFC_STATIC - the entry was added statically (not by a routing daemon) + */ +enum { + MFC_STATIC = BIT(0), +}; + struct mfc_cache { struct list_head list; __be32 mfc_mcastgrp; /* Group the entry belongs to */ @@ -89,9 +96,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_STATIC 1 -#define MFC_NOTIFY 2 - #define MFC_LINES 64 #ifdef __BIG_ENDIAN -- cgit v1.2.3 From 520191bb404c4b7b4cdb70a5480ada974b0c2d60 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:46 +0100 Subject: net: ipmr: adjust mroute.h style and drop extern Remove extra spaces and tabs, adjust function definitions, remove an unnecessary ifdef (already used below, just move code) and drop extern from the functions. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index fa66ebc1fed6..7c567a2679ce 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -9,38 +9,28 @@ #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) { - return (opt >= MRT_BASE) && (opt <= MRT_MAX); + return opt >= MRT_BASE && opt <= MRT_MAX; } -#else -static inline int ip_mroute_opt(int opt) -{ - return 0; -} -#endif -#ifdef CONFIG_IP_MROUTE -extern int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); -extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); -extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); -extern int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); -extern int ip_mr_init(void); +int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); +int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); +int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); +int ip_mr_init(void); #else -static inline -int ip_mroute_setsockopt(struct sock *sock, - int optname, char __user *optval, unsigned int optlen) +static inline int ip_mroute_setsockopt(struct sock *sock, int optname, + char __user *optval, unsigned int optlen) { return -ENOPROTOOPT; } -static inline -int ip_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sock, int optname, + char __user *optval, int __user *optlen) { return -ENOPROTOOPT; } -static inline -int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { return -ENOIOCTLCMD; } @@ -49,6 +39,11 @@ static inline int ip_mr_init(void) { return 0; } + +static inline int ip_mroute_opt(int opt) +{ + return 0; +} #endif struct vif_device { @@ -96,16 +91,16 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 +#define MFC_LINES 64 #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else #define MFC_HASH(a,b) ((((__force u32)(__be32)a)^(((__force u32)(__be32)b)>>2))&(MFC_LINES-1)) -#endif +#endif struct rtmsg; -extern int ipmr_get_route(struct net *net, struct sk_buff *skb, - __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait); +int ipmr_get_route(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr, + struct rtmsg *rtm, int nowait); #endif -- cgit v1.2.3 From 5ea1f13299d8b8edcb2969eda4c81f8e3264b706 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:47 +0100 Subject: net: ipmr: move struct mr_table and VIF_EXISTS to mroute.h Move the definitions of VIF_EXISTS() and struct mr_table to mroute.h Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7c567a2679ce..bf9b322cb0b0 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -59,6 +59,25 @@ struct vif_device { #define VIFF_STATIC 0x8000 +#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) +#define MFC_LINES 64 + +struct mr_table { + struct list_head list; + possible_net_t net; + u32 id; + struct sock __rcu *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct list_head mfc_cache_array[MFC_LINES]; + struct vif_device vif_table[MAXVIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; + int mroute_reg_vif_num; +}; + /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) */ @@ -91,8 +110,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 - #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else -- cgit v1.2.3 From 1973a4ea6ceaa47671227c3077f90508ea30897b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:48 +0100 Subject: net: ipmr: move pimsm_enabled to pim.h and rename Move the inline pimsm_enabled() to pim.h and rename it to ipmr_pimsm_enabled to show it's for the ipv4 ipmr code since pim.h is used by IPv6 too. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/pim.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pim.h b/include/linux/pim.h index 252bf6644c51..e1d756f81348 100644 --- a/include/linux/pim.h +++ b/include/linux/pim.h @@ -13,6 +13,11 @@ #define PIM_NULL_REGISTER cpu_to_be32(0x40000000) +static inline bool ipmr_pimsm_enabled(void) +{ + return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2); +} + /* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */ struct pimreghdr { -- cgit v1.2.3 From 304d888b29cf96f1dd53511ee686499cd8cdf249 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 27 Nov 2015 18:17:05 +0100 Subject: Revert "ipv6: ndisc: inherit metadata dst when creating ndisc requests" This reverts commit ab450605b35caa768ca33e86db9403229bf42be4. In IPv6, we cannot inherit the dst of the original dst. ndisc packets are IPv6 packets and may take another route than the original packet. This patch breaks the following scenario: a packet comes from eth0 and is forwarded through vxlan1. The encapsulated packet triggers an NS which cannot be sent because of the wrong route. CC: Jiri Benc CC: Thomas Graf Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ndisc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index bf3937431030..2d8edaad29cb 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -181,8 +181,7 @@ void ndisc_cleanup(void); int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr, - struct sk_buff *oskb); + const struct in6_addr *daddr, const struct in6_addr *saddr); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); -- cgit v1.2.3 From 9cd3e072b0be17446e37d7414eac8a3499e0601e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:10 -0800 Subject: net: rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA This patch is a cleanup to make following patch easier to review. Goal is to move SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA from (struct socket)->flags to a (struct socket_wq)->flags to benefit from RCU protection in sock_wake_async() To ease backports, we rename both constants. Two new helpers, sk_set_bit(int nr, struct sock *sk) and sk_clear_bit(int net, struct sock *sk) are added so that following patch can change their implementation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 6 +++--- include/net/sock.h | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 70ac5e28e6b7..f514e4dd5521 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -34,8 +34,8 @@ struct inode; struct file; struct net; -#define SOCK_ASYNC_NOSPACE 0 -#define SOCK_ASYNC_WAITDATA 1 +#define SOCKWQ_ASYNC_NOSPACE 0 +#define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 @@ -96,7 +96,7 @@ struct socket_wq { * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) - * @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc) + * @flags: socket flags (%SOCK_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..c155d09d8af4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2005,6 +2005,16 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +static inline void sk_set_bit(int nr, struct sock *sk) +{ + set_bit(nr, &sk->sk_socket->flags); +} + +static inline void sk_clear_bit(int nr, struct sock *sk) +{ + clear_bit(nr, &sk->sk_socket->flags); +} + static inline void sk_wake_async(struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) -- cgit v1.2.3 From ceb5d58b217098a657f3850b7a2640f995032e62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:11 -0800 Subject: net: fix sock_wake_async() rcu protection Dmitry provided a syzkaller (http://github.com/google/syzkaller) triggering a fault in sock_wake_async() when async IO is requested. Said program stressed af_unix sockets, but the issue is generic and should be addressed in core networking stack. The problem is that by the time sock_wake_async() is called, we should not access the @flags field of 'struct socket', as the inode containing this socket might be freed without further notice, and without RCU grace period. We already maintain an RCU protected structure, "struct socket_wq" so moving SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA into it is the safe route. It also reduces number of cache lines needing dirtying, so might provide a performance improvement anyway. In followup patches, we might move remaining flags (SOCK_NOSPACE, SOCK_PASSCRED, SOCK_PASSSEC) to save 8 bytes and let 'struct socket' being mostly read and let it being shared between cpus. Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 7 ++++++- include/net/sock.h | 23 ++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index f514e4dd5521..0b4ac7da583a 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -34,6 +34,10 @@ struct inode; struct file; struct net; +/* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located + * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. + * Eventually all flags will be in sk->sk_wq_flags. + */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 @@ -89,6 +93,7 @@ struct socket_wq { /* Note: wait MUST be first field of socket_wq */ wait_queue_head_t wait; struct fasync_struct *fasync_list; + unsigned long flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */ struct rcu_head rcu; } ____cacheline_aligned_in_smp; @@ -202,7 +207,7 @@ enum { SOCK_WAKE_URG, }; -int sock_wake_async(struct socket *sk, int how, int band); +int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); int __sock_create(struct net *net, int family, int type, int proto, diff --git a/include/net/sock.h b/include/net/sock.h index c155d09d8af4..0434138c5f95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -384,8 +384,10 @@ struct sock { int sk_rcvbuf; struct sk_filter __rcu *sk_filter; - struct socket_wq __rcu *sk_wq; - + union { + struct socket_wq __rcu *sk_wq; + struct socket_wq *sk_wq_raw; + }; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2005,20 +2007,27 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +/* Note: + * We use sk->sk_wq_raw, from contexts knowing this + * pointer is not NULL and cannot disappear/change. + */ static inline void sk_set_bit(int nr, struct sock *sk) { - set_bit(nr, &sk->sk_socket->flags); + set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { - clear_bit(nr, &sk->sk_socket->flags); + clear_bit(nr, &sk->sk_wq_raw->flags); } -static inline void sk_wake_async(struct sock *sk, int how, int band) +static inline void sk_wake_async(const struct sock *sk, int how, int band) { - if (sock_flag(sk, SOCK_FASYNC)) - sock_wake_async(sk->sk_socket, how, band); + if (sock_flag(sk, SOCK_FASYNC)) { + rcu_read_lock(); + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); + rcu_read_unlock(); + } } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might -- cgit v1.2.3 From 91420b83baa046ada1a899c97f3b2c52a9045705 Mon Sep 17 00:00:00 2001 From: Sudarsana Kalluru Date: Mon, 30 Nov 2015 12:25:03 +0200 Subject: qed: Add support for changing LED state Physical LEDs are being controlled by the management FW. This adds the qed functionality required to request management FW to change the LED configuration, as well as the necessary APIs for this functionality to later be used by the protocol drivers. Signed-off-by: Sudarsana Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index dc9a1353f971..d4a32e878180 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -25,6 +25,12 @@ #include #include +enum qed_led_mode { + QED_LED_MODE_OFF, + QED_LED_MODE_ON, + QED_LED_MODE_RESTORE +}; + #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ (void __iomem *)(reg_addr)) @@ -252,6 +258,17 @@ struct qed_common_ops { void (*chain_free)(struct qed_dev *cdev, struct qed_chain *p_chain); + +/** + * @brief set_led - Configure LED mode + * + * @param cdev + * @param mode - LED mode + * + * @return 0 on success, error otherwise. + */ + int (*set_led)(struct qed_dev *cdev, + enum qed_led_mode mode); }; /** -- cgit v1.2.3 From 45f6fad84cc305103b28d73482b344d7f5b76f39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 19:37:57 -0800 Subject: ipv6: add complete rcu protection around np->opt This patch addresses multiple problems : UDP/RAW sendmsg() need to get a stable struct ipv6_txoptions while socket is not locked : Other threads can change np->opt concurrently. Dmitry posted a syzkaller (http://github.com/google/syzkaller) program desmonstrating use-after-free. Starting with TCP/DCCP lockless listeners, tcp_v6_syn_recv_sock() and dccp_v6_request_recv_sock() also need to use RCU protection to dereference np->opt once (before calling ipv6_dup_options()) This patch adds full RCU protection to np->opt Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- include/net/ipv6.h | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0ef2a97ccdb5..402753bccafa 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -227,7 +227,7 @@ struct ipv6_pinfo { struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; - struct ipv6_txoptions *opt; + struct ipv6_txoptions __rcu *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct inet6_cork cork; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ea5a13ef85a6..9a5c9f013784 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -205,6 +205,7 @@ extern rwlock_t ip6_ra_lock; */ struct ipv6_txoptions { + atomic_t refcnt; /* Length of this structure */ int tot_len; @@ -217,7 +218,7 @@ struct ipv6_txoptions { struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; - + struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; @@ -252,6 +253,24 @@ struct ipv6_fl_socklist { struct rcu_head rcu; }; +static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) +{ + struct ipv6_txoptions *opt; + + rcu_read_lock(); + opt = rcu_dereference(np->opt); + if (opt && !atomic_inc_not_zero(&opt->refcnt)) + opt = NULL; + rcu_read_unlock(); + return opt; +} + +static inline void txopt_put(struct ipv6_txoptions *opt) +{ + if (opt && atomic_dec_and_test(&opt->refcnt)) + kfree_rcu(opt, rcu); +} + struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, -- cgit v1.2.3 From 38ee8fb67c3457f36f5137073c4b8ac2436d2393 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 30 Nov 2015 12:17:06 -0200 Subject: sctp: convert sack_needed and sack_generation to bits They don't need to be any bigger than that and with this we start a new bitfield for tracking association runtime stuff, like zero window situation. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 495c87e367b3..7bbb71081aeb 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -775,10 +775,10 @@ struct sctp_transport { hb_sent:1, /* Is the Path MTU update pending on this tranport */ - pmtu_pending:1; + pmtu_pending:1, - /* Has this transport moved the ctsn since we last sacked */ - __u32 sack_generation; + /* Has this transport moved the ctsn since we last sacked */ + sack_generation:1; u32 dst_cookie; struct flowi fl; @@ -1482,19 +1482,19 @@ struct sctp_association { prsctp_capable:1, /* Can peer do PR-SCTP? */ auth_capable:1; /* Is peer doing SCTP-AUTH? */ - /* Ack State : This flag indicates if the next received + /* sack_needed : This flag indicates if the next received * : packet is to be responded to with a - * : SACK. This is initializedto 0. When a packet - * : is received it is incremented. If this value + * : SACK. This is initialized to 0. When a packet + * : is received sack_cnt is incremented. If this value * : reaches 2 or more, a SACK is sent and the * : value is reset to 0. Note: This is used only * : when no DATA chunks are received out of * : order. When DATA chunks are out of order, * : SACK's are not delayed (see Section 6). */ - __u8 sack_needed; /* Do we need to sack the peer? */ + __u8 sack_needed:1, /* Do we need to sack the peer? */ + sack_generation:1; __u32 sack_cnt; - __u32 sack_generation; __u32 adaptation_ind; /* Adaptation Code point. */ -- cgit v1.2.3 From c0eb454034aab783dc602739237a63b30867f5bd Mon Sep 17 00:00:00 2001 From: KY Srinivasan Date: Tue, 1 Dec 2015 16:43:10 -0800 Subject: hv_netvsc: Don't ask for additional head room in the skb The rndis header is 116 bytes big and can be placed in the default head room that will be available in the skb. Since the netvsc packet is less than 48 bytes, we can use the skb control buffer for the netvsc packet. With these changes we don't need to ask for additional head room. Signed-off-by: K. Y. Srinivasan Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d2d1d7aaec7..fcbc5259c630 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -132,7 +132,9 @@ static inline bool dev_xmit_complete(int rc) * used. */ -#if defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) +#if defined(CONFIG_HYPERV_NET) +# define LL_MAX_HEADER 128 +#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else -- cgit v1.2.3 From 7450aaf61f0ae2ee6cc6491138d11df2c25e7609 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Nov 2015 08:57:28 -0800 Subject: tcp: suppress too verbose messages in tcp_send_ack() If tcp_send_ack() can not allocate skb, we properly handle this and setup a timer to try later. Use __GFP_NOWARN to avoid polluting syslog in the case host is under memory pressure, so that pertinent messages are not lost under a flood of useless information. sk_gfp_atomic() can use its gfp_mask argument (all callers currently were using GFP_ATOMIC before this patch) We rename sk_gfp_atomic() to sk_gfp_mask() to clearly express this function now takes into account its second argument (gfp_mask) Note that when tcp_transmit_skb() is called with clone_it set to false, we do not attempt memory allocations, so can pass a 0 gfp_mask, which most compilers can emit faster than a non zero or constant value. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 62d35afcb3ac..9065f8b7e646 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -775,9 +775,9 @@ static inline int sk_memalloc_socks(void) #endif -static inline gfp_t sk_gfp_atomic(const struct sock *sk, gfp_t gfp_mask) +static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { - return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC); + return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) -- cgit v1.2.3 From 6bd4f355df2eae80b8a5c7b097371cd1e05f20d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Dec 2015 21:53:57 -0800 Subject: ipv6: kill sk_dst_lock While testing the np->opt RCU conversion, I found that UDP/IPv6 was using a mixture of xchg() and sk_dst_lock to protect concurrent changes to sk->sk_dst_cache, leading to possible corruptions and crashes. ip6_sk_dst_lookup_flow() uses sk_dst_check() anyway, so the simplest way to fix the mess is to remove sk_dst_lock completely, as we did for IPv4. __ip6_dst_store() and ip6_dst_store() share same implementation. sk_setup_caps() being called with socket lock being held or not, we have to use sk_dst_set() instead of __sk_dst_set() Note that I had to move the "np->dst_cookie = rt6_get_cookie(rt);" in ip6_dst_store() before the sk_setup_caps(sk, dst) call. This is because ip6_dst_store() can be called from process context, without any lock held. As soon as the dst is installed in sk->sk_dst_cache, dst can be freed from another cpu doing a concurrent ip6_dst_store() Doing the dst dereference before doing the install is needed to make sure no use after free would trigger. Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- include/net/ip6_route.h | 17 ++++------------- include/net/sock.h | 3 +-- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 2bfb2ad2fab1..877f682989b8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -133,27 +133,18 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); /* * Store a destination cache entry in a socket */ -static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct ipv6_pinfo *np = inet6_sk(sk); - struct rt6_info *rt = (struct rt6_info *) dst; + np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr; #endif - np->dst_cookie = rt6_get_cookie(rt); -} - -static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - struct in6_addr *daddr, struct in6_addr *saddr) -{ - spin_lock(&sk->sk_dst_lock); - __ip6_dst_store(sk, dst, daddr, saddr); - spin_unlock(&sk->sk_dst_lock); } static inline bool ipv6_unicast_destination(const struct sk_buff *skb) diff --git a/include/net/sock.h b/include/net/sock.h index 0434138c5f95..52d27ee924f4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -254,7 +254,6 @@ struct cg_proto; * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_dst_cache: destination cache - * @sk_dst_lock: destination cache lock * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed @@ -393,7 +392,7 @@ struct sock { #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; - spinlock_t sk_dst_lock; + /* Note: 32bit hole on 64bit arches */ atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; -- cgit v1.2.3 From c981e4213e9d2d4ec79501bd607722ec712742a2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:06 +0100 Subject: net: add netif_is_team_master helper Similar to other helpers, caller can use this to find out if device is team master. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fcbc5259c630..2b889be65d88 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1273,6 +1273,7 @@ struct net_device_ops { * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device + * @IFF_TEAM: device is a team device */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1299,6 +1300,7 @@ enum netdev_priv_flags { IFF_NO_QUEUE = 1<<21, IFF_OPENVSWITCH = 1<<22, IFF_L3MDEV_SLAVE = 1<<23, + IFF_TEAM = 1<<24, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1325,6 +1327,7 @@ enum netdev_priv_flags { #define IFF_NO_QUEUE IFF_NO_QUEUE #define IFF_OPENVSWITCH IFF_OPENVSWITCH #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE +#define IFF_TEAM IFF_TEAM /** * struct net_device - The DEVICE structure. @@ -3889,6 +3892,11 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } +static inline bool netif_is_team_master(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From f7f019ee6d117de5007d0b10e7960696bbf111eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:07 +0100 Subject: net: add netif_is_team_port helper Similar to other helpers, caller can use this to find out if device is team port. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b889be65d88..b3601f8a9b42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3897,6 +3897,11 @@ static inline bool netif_is_team_master(struct net_device *dev) return dev->priv_flags & IFF_TEAM; } +static inline bool netif_is_team_port(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM_PORT; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From 7be61833042e7757745345eedc7b0efee240c189 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:08 +0100 Subject: net: add netif_is_lag_master helper Some code does not mind if the master is bond or team and treats them the same, as generic LAG. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b3601f8a9b42..3ca083efa560 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3902,6 +3902,11 @@ static inline bool netif_is_team_port(struct net_device *dev) return dev->priv_flags & IFF_TEAM_PORT; } +static inline bool netif_is_lag_master(struct net_device *dev) +{ + return netif_is_bond_master(dev) || netif_is_team_master(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From e0ba1414f310c83bf425fe26fa2cd5f1befcd6dc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:09 +0100 Subject: net: add netif_is_lag_port helper Some code does not mind if a device is bond slave or team port and treats them the same, as generic LAG ports. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ca083efa560..1506be58c59a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3907,6 +3907,11 @@ static inline bool netif_is_lag_master(struct net_device *dev) return netif_is_bond_master(dev) || netif_is_team_master(dev); } +static inline bool netif_is_lag_port(struct net_device *dev) +{ + return netif_is_bond_slave(dev) || netif_is_team_port(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From 6dffb0447c25476f499d205dfceb1972e8dae919 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:10 +0100 Subject: net: propagate upper priv via netdev_master_upper_dev_link Eliminate netdev_master_upper_dev_link_private and pass priv directly as a parameter of netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1506be58c59a..939b8f3de810 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3619,10 +3619,8 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private); + struct net_device *upper_dev, + void *upper_priv); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); -- cgit v1.2.3 From 29bf24afb29042f568fa67b1b0eee46796725ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:11 +0100 Subject: net: add possibility to pass information about upper device via notifier Sometimes the drivers and other code would find it handy to know some internal information about upper device being changed. So allow upper-code to pass information down to notifier listeners during linking. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 939b8f3de810..aea556c64f2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2163,6 +2163,7 @@ struct netdev_notifier_changeupper_info { struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the nofication for link or unlink */ + void *upper_info; /* upper dev info */ }; static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, @@ -3620,7 +3621,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv); + void *upper_priv, void *upper_info); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); -- cgit v1.2.3 From 764f5e544118508add420724789f46e04dba91eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:12 +0100 Subject: net: add info struct for LAG changeupper This struct will be shared by bonding and team to pass internal information to notifier listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aea556c64f2c..3ab90ea0ed03 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2110,6 +2110,19 @@ struct pcpu_sw_netstats { #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL); +enum netdev_lag_tx_type { + NETDEV_LAG_TX_TYPE_UNKNOWN, + NETDEV_LAG_TX_TYPE_RANDOM, + NETDEV_LAG_TX_TYPE_BROADCAST, + NETDEV_LAG_TX_TYPE_ROUNDROBIN, + NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, + NETDEV_LAG_TX_TYPE_HASH, +}; + +struct netdev_lag_upper_info { + enum netdev_lag_tx_type tx_type; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3 From 8fd728566a354f7bc9cb6e781f185b8c39cf505b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:13 +0100 Subject: team: fill-up LAG changeupper info struct and pass it along Initialize netdev_lag_upper_info structure by TX type according to current team mode and pass it along via netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_team.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index a6aa970758a2..b84e49c3a738 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -164,6 +164,7 @@ struct team_mode { size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; + enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 -- cgit v1.2.3 From 04d482660a07039fc4e9a42bb3517db236d98f96 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:15 +0100 Subject: net: introduce change lower state notifier When lower device like bonding slave, team/bridge port, etc changes its state, it is useful for others to notice this change. Currently this is implemented specificly for bonding as NETDEV_BONDING_INFO notifier. This patch aims to replace this specific usage and make this more generic to be used for all upper-lower devices. Introduce NETDEV_CHANGELOWERSTATE netdev notifier type and netdev_lower_state_changed() helper. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ab90ea0ed03..ad69f237aa78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2158,6 +2158,7 @@ struct netdev_lag_upper_info { #define NETDEV_CHANGEINFODATA 0x0018 #define NETDEV_BONDING_INFO 0x0019 #define NETDEV_PRECHANGEUPPER 0x001A +#define NETDEV_CHANGELOWERSTATE 0x001B int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -2179,6 +2180,11 @@ struct netdev_notifier_changeupper_info { void *upper_info; /* upper dev info */ }; +struct netdev_notifier_changelowerstate_info { + struct netdev_notifier_info info; /* must be first */ + void *lower_state_info; /* is lower dev state */ +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { @@ -3640,6 +3646,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info); /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 -- cgit v1.2.3 From fb1b2e3ce53aef80b3cef71f3885d584cdbdc6b8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:16 +0100 Subject: net: introduce lower state changed info structure for LAG lowers This is shared info structure for bonding and team. Serves to pass down info about link state and port activity to notification listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad69f237aa78..fa84b59eb197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2123,6 +2123,11 @@ struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; }; +struct netdev_lag_lower_state_info { + u8 link_up : 1, + tx_enabled : 1; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3 From 5d397061ca2081d8a99e4bee5792122faa6aaf86 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:19 +0100 Subject: bonding: allow notifications for bond_set_slave_link_state Similar to state notifications. We allow caller to indicate if the notification should happen now or later, depending on if he holds rtnl mutex or not. Introduce bond_slave_link_notify function (similar to bond_slave_state_notify) which is later on called with rtnl mutex and goes over slaves and executes delayed notification. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/bonding.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index c1740a2794a3..1df437715e2f 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -165,7 +165,8 @@ struct slave { u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ - should_notify:1; /* indicateds whether the state changed */ + should_notify:1, /* indicates whether the state changed */ + should_notify_link:1; /* indicates whether the link changed */ u8 duplex; u32 original_mtu; u32 link_failure_count; @@ -504,10 +505,35 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } -static inline void bond_set_slave_link_state(struct slave *slave, int state) +static inline void bond_set_slave_link_state(struct slave *slave, int state, + bool notify) { + if (slave->link == state) + return; + slave->link = state; - bond_queue_slave_event(slave); + if (notify) { + bond_queue_slave_event(slave); + slave->should_notify_link = 0; + } else { + if (slave->should_notify_link) + slave->should_notify_link = 0; + else + slave->should_notify_link = 1; + } +} + +static inline void bond_slave_link_notify(struct bonding *bond) +{ + struct list_head *iter; + struct slave *tmp; + + bond_for_each_slave(bond, tmp, iter) { + if (tmp->should_notify_link) { + bond_queue_slave_event(tmp); + tmp->should_notify_link = 0; + } + } } static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) -- cgit v1.2.3 From f7c7eb7f7af7f87e0fc150994785fd139576e43a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:20 +0100 Subject: bonding: implement lower state change propagation Let netdev notifier listeners know about link and slave state change. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/bonding.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 1df437715e2f..ee6c52053aa3 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -247,6 +247,7 @@ struct bonding { ((struct slave *) rtnl_dereference(dev->rx_handler_data)) void bond_queue_slave_event(struct slave *slave); +void bond_lower_state_changed(struct slave *slave); struct bond_vlan_tag { __be16 vlan_proto; @@ -328,6 +329,7 @@ static inline void bond_set_active_slave(struct slave *slave) if (slave->backup) { slave->backup = 0; bond_queue_slave_event(slave); + bond_lower_state_changed(slave); rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -337,6 +339,7 @@ static inline void bond_set_backup_slave(struct slave *slave) if (!slave->backup) { slave->backup = 1; bond_queue_slave_event(slave); + bond_lower_state_changed(slave); rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -349,6 +352,7 @@ static inline void bond_set_slave_state(struct slave *slave, slave->backup = slave_state; if (notify) { + bond_lower_state_changed(slave); rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); bond_queue_slave_event(slave); slave->should_notify = 0; @@ -380,6 +384,7 @@ static inline void bond_slave_state_notify(struct bonding *bond) bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { + bond_lower_state_changed(tmp); rtmsg_ifinfo(RTM_NEWLINK, tmp->dev, 0, GFP_ATOMIC); tmp->should_notify = 0; } @@ -514,6 +519,7 @@ static inline void bond_set_slave_link_state(struct slave *slave, int state, slave->link = state; if (notify) { bond_queue_slave_event(slave); + bond_lower_state_changed(slave); slave->should_notify_link = 0; } else { if (slave->should_notify_link) @@ -531,6 +537,7 @@ static inline void bond_slave_link_notify(struct bonding *bond) bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify_link) { bond_queue_slave_event(tmp); + bond_lower_state_changed(tmp); tmp->should_notify_link = 0; } } -- cgit v1.2.3 From fc50db98ff872372f266695858f87a12eb1b4f05 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 1 Dec 2015 18:03:09 +0200 Subject: net/mlx5_core: Add base sriov support This patch adds SRIOV base support for mlx5 supported devices. The same driver is used for both PFs and VFs; VFs are identified by the driver through the flag MLX5_PCI_DEV_IS_VF added to the pci table entries. Virtual functions are created as usual through writing a value to the sriov_numvs sysfs file of the PF device. Upon instantiating VFs, they will all be probed by the driver on the hypervisor. One can gracefully unbind them through /sys/bus/pci/drivers/mlx5_core/unbind. mlx5_wait_for_vf_pages() was added to ensure that when a VF dies without executing proper teardown, the hypervisor driver waits till all of the pages that were allocated at the hypervisor to maintain its operation are returned. In order for the VF to be operational, the PF needs to call enable_hca for it. This can be done before the VFs are created through a call to pci_enable_sriov. If the there are VFs assigned to a VMs when the driver of the PF is unloaded, all the VF will experience system error and PF driver unloads cleanly; in this case pci_disable_sriov is not called and the devices will show when running lspci. Once the PF driver is reloaded, it will sync its data structures which maintain state on its VFs. Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 24 ++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5c857f2a20d7..efebb87163c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -426,6 +426,16 @@ struct mlx5_mr_table { struct radix_tree_root tree; }; +struct mlx5_vf_context { + int enabled; +}; + +struct mlx5_core_sriov { + struct mlx5_vf_context *vfs_ctx; + int num_vfs; + int enabled_vfs; +}; + struct mlx5_irq_info { cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; @@ -447,6 +457,7 @@ struct mlx5_priv { int fw_pages; atomic_t reg_pages; struct list_head free_list; + int vfs_pages; struct mlx5_core_health health; @@ -485,6 +496,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + struct mlx5_core_sriov sriov; + unsigned long pci_dev_data; }; enum mlx5_device_state { @@ -739,6 +752,8 @@ void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +int mlx5_sriov_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); @@ -884,6 +899,15 @@ struct mlx5_profile { } mr_cache[MAX_MR_CACHE_ENTRIES]; }; +enum { + MLX5_PCI_DEV_IS_VF = 1 << 0, +}; + +static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev) +{ + return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF); +} + static inline int mlx5_get_gid_table_len(u16 param) { if (param > 4) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1565324eb620..9b76fddd696b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18[0x4]; + u8 reserved_18_0; + u8 early_vf_enable; + u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; u8 reserved_19[0x6]; u8 port_type[0x2]; -- cgit v1.2.3 From 54f0a411ec72cb437d57d0c9654dcbd0f198ff3a Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:10 +0200 Subject: net/mlx5: Add HW capabilities and structs for SR-IOV E-Switch Update HCA capabilities and HW struct to include needed capabilities for upcoming Ethernet Switch (SR-IOV E-Switch). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9b76fddd696b..836cf0e43174 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18_0; + u8 eswitch_flow_table[0x1]; u8 early_vf_enable; u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; @@ -789,22 +789,30 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_60[0x1b]; u8 log_max_wq_sz[0x5]; - u8 reserved_61[0xa0]; - + u8 nic_vport_change_event[0x1]; + u8 reserved_61[0xa]; + u8 log_max_vlan_list[0x5]; u8 reserved_62[0x3]; + u8 log_max_current_mc_list[0x5]; + u8 reserved_63[0x3]; + u8 log_max_current_uc_list[0x5]; + + u8 reserved_64[0x80]; + + u8 reserved_65[0x3]; u8 log_max_l2_table[0x5]; - u8 reserved_63[0x8]; + u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_64[0x100]; + u8 reserved_67[0xe0]; - u8 reserved_65[0x1f]; + u8 reserved_68[0x1f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; u8 cqe_zip_max_num[0x10]; - u8 reserved_66[0x220]; + u8 reserved_69[0x220]; }; enum { @@ -2135,10 +2143,6 @@ struct mlx5_ifc_rmpc_bits { struct mlx5_ifc_wq_bits wq; }; -enum { - MLX5_NIC_VPORT_CONTEXT_ALLOWED_LIST_TYPE_CURRENT_UC_MAC_ADDRESS = 0x0, -}; - struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; -- cgit v1.2.3 From e1d7d349c69d12721c420f1fe673ce9aa462aadd Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:11 +0200 Subject: net/mlx5: Update access functions to Query/Modify vport MAC address In preparation for SR-IOV we add here an API to enable each e-switch client (PF/VF) to configure its L2 MAC addresses and for the e-switch manager (usually the PF) to access them in order to be able to configure them into the e-switch. Therefore we now pass vport num parameter to mlx5_query_nic_vport_context, so PF can access other vports contexts. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/vport.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 967e0fd06e89..43e82d9f5463 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,10 @@ #include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); -void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr); +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr); +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, + u16 vport, u8 *addr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid); -- cgit v1.2.3 From e16aea2744abea612c27ee0eef606c6a6a8204de Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:12 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport mac lists Those functions are needed to notify the upcoming L2 table and SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) UC/MC mac lists changes. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/vport.h | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0b473cbfa7ef..0d2f0435a9f0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1102,6 +1102,12 @@ enum { MLX5_FLOW_CONTEXT_DEST_TYPE_TIR = 2, }; +enum mlx5_list_type { + MLX5_NVPRT_LIST_TYPE_UC = 0x0, + MLX5_NVPRT_LIST_TYPE_MC = 0x1, + MLX5_NVPRT_LIST_TYPE_VLAN = 0x2, +}; + enum { MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 43e82d9f5463..00bbec8d9527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -34,6 +34,7 @@ #define __MLX5_VPORT_H__ #include +#include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, @@ -54,5 +55,14 @@ int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, u64 *sys_image_guid); int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, u64 *node_guid); +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + u32 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size); +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From e75465148b7df7f2796c75bf98bf33f171edeb2b Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:13 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport state In preparation for SR-IOV we add here an API to enable each e-switch manager (PF) to configure its VFs link states in e-switch preparation for ethernet sriov. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 1 + include/linux/mlx5/vport.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 836cf0e43174..655184702ea2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2946,6 +2946,7 @@ struct mlx5_ifc_query_vport_state_out_bits { enum { MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT = 0x1, }; struct mlx5_ifc_query_vport_state_in_bits { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 00bbec8d9527..c1bba5948851 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,11 @@ #include #include -u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); +u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport); +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 state); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, -- cgit v1.2.3 From d82b73186dab70d6d332dd2afdb48608be2e5230 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:14 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport promisc mode Those functions are needed to notify the upcoming SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) promisc mode changes. Preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 28 +++++++++++++++++++++++----- include/linux/mlx5/vport.h | 9 +++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 655184702ea2..2728b5f6c017 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2147,16 +2147,31 @@ struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; - u8 reserved_1[0x760]; + u8 arm_change_event[0x1]; + u8 reserved_1[0x1a]; + u8 event_on_mtu[0x1]; + u8 event_on_promisc_change[0x1]; + u8 event_on_vlan_change[0x1]; + u8 event_on_mc_address_change[0x1]; + u8 event_on_uc_address_change[0x1]; - u8 reserved_2[0x5]; + u8 reserved_2[0xf0]; + + u8 mtu[0x10]; + + u8 reserved_3[0x640]; + + u8 promisc_uc[0x1]; + u8 promisc_mc[0x1]; + u8 promisc_all[0x1]; + u8 reserved_4[0x2]; u8 allowed_list_type[0x3]; - u8 reserved_3[0xc]; + u8 reserved_5[0xc]; u8 allowed_list_size[0xc]; struct mlx5_ifc_mac_address_layout_bits permanent_address; - u8 reserved_4[0x20]; + u8 reserved_6[0x20]; u8 current_uc_mac_address[0][0x40]; }; @@ -4235,7 +4250,10 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_0[0x1c]; + u8 reserved_0[0x19]; + u8 mtu[0x1]; + u8 change_event[0x1]; + u8 promisc[0x1]; u8 permanent_address[0x1]; u8 addresses_list[0x1]; u8 roce_en[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c1bba5948851..dbbaed9f975a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -68,5 +68,14 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, enum mlx5_list_type list_type, u8 addr_list[][ETH_ALEN], int list_size); +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u32 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all); +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From c0046cf7b81ac55b8bf056c71918ec04edd99379 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:15 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport vlans Those functions are needed to notify the upcoming L2 table and SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) vlan table changes. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 7 +++++++ include/linux/mlx5/vport.h | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2728b5f6c017..39487d0c305d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -910,6 +910,13 @@ struct mlx5_ifc_mac_address_layout_bits { u8 mac_addr_31_0[0x20]; }; +struct mlx5_ifc_vlan_layout_bits { + u8 reserved_0[0x14]; + u8 vlan[0x0c]; + + u8 reserved_1[0x20]; +}; + struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { u8 reserved_0[0xa0]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index dbbaed9f975a..638f2ca7a527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -77,5 +77,12 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, int promisc_uc, int promisc_mc, int promisc_all); +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, + u32 vport, + u16 vlans[], + int *size); +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 073bb189a41d7bbad509b576a690611c46c4858f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:18 +0200 Subject: net/mlx5: Introducing E-Switch and l2 table E-Switch is the software entity that represents and manages ConnectX4 inter-HCA ethernet l2 switching. E-Switch has its own Virtual Ports, each Vport/vNIC/VF can be connected to the device through a vport of an e-switch. Each e-switch is managed by one vNIC identified by HCA_CAP.vport_group_manager (usually it is the PF/vport[0]), and its main responsibility is to forward each packet to the right vport. e-Switch needs to manage its own l2-table and FDB tables. L2 table is a flow table that is managed by FW, it is needed for Multi-host (Multi PF) configuration for inter HCA switching between PFs. FDB table is a flow table that is totally managed by e-Switch driver, its main responsibility is to switch packets between e-Swtich internal vports and uplink vport that belong to the same. This patch introduces only e-Swtich l2 table management, FDB managemnt will come later when ethernet SRIOV/VFs will be enabled. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 8 ++++++++ include/linux/mlx5/driver.h | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0d2f0435a9f0..90a4cb6dc4cb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -251,6 +251,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, + MLX5_EVENT_TYPE_NIC_VPORT_CHANGE = 0xd, }; enum { @@ -520,6 +521,12 @@ struct mlx5_eqe_page_fault { __be32 flags_qpn; } __packed; +struct mlx5_eqe_vport_change { + u8 rsvd0[2]; + __be16 vport_num; + __be32 rsvd1[6]; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -532,6 +539,7 @@ union ev_data { struct mlx5_eqe_stall_vl stall_vl; struct mlx5_eqe_page_req req_pages; struct mlx5_eqe_page_fault page_fault; + struct mlx5_eqe_vport_change vport_change; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index efebb87163c8..ac098b6b97bf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -441,6 +441,8 @@ struct mlx5_irq_info { char name[MLX5_MAX_IRQ_NAME]; }; +struct mlx5_eswitch; + struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; @@ -496,6 +498,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + + struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; }; -- cgit v1.2.3 From 495716b191f607b2cb2175f7499966daef79f663 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:19 +0200 Subject: net/mlx5: E-Switch, Introduce FDB hardware capabilities Define needed hardware structures and capabilities needed for E-Switch FDB flow tables and read them on driver load. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 15 +++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 13 +++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 90a4cb6dc4cb..bce9caed1eed 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1138,6 +1138,7 @@ enum mlx5_cap_type { MLX5_CAP_IPOIB_OFFLOADS, MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, + MLX5_CAP_ESWITCH_FLOW_TABLE, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1175,6 +1176,20 @@ enum mlx5_cap_type { #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap) +#define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 39487d0c305d..ae7c08adba4a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -447,6 +447,18 @@ struct mlx5_ifc_flow_table_nic_cap_bits { u8 reserved_3[0x7200]; }; +struct mlx5_ifc_flow_table_eswitch_cap_bits { + u8 reserved_0[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; + + u8 reserved_1[0x7800]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1846,6 +1858,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_roce_cap_bits roce_cap; struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; u8 reserved_0[0x8000]; }; -- cgit v1.2.3 From 81848731ff4070a3e4136efa6a99d507177a53fe Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:20 +0200 Subject: net/mlx5: E-Switch, Add SR-IOV (FDB) support Enabling E-Switch SRIOV for nvfs+1 vports. Create E-Switch FDB for L2 UC/MC mac steering between VFs/PF and external vport (Uplink). FDB contains forwarding rules such as: UC MAC0 -> vport0(PF). UC MAC1 -> vport1. UC MAC2 -> vport2. MC MACX -> vport0, vport2, Uplink. MC MACY -> vport1, Uplink. For unmatched traffic FDB has the following default rules: Unmached Traffic (src vport != Uplink) -> Uplink. Unmached Traffic (src vport == Uplink) -> vport0(PF). FDB rules population: Each NIC vport (VF) will notify E-Switch manager of its UC/MC vport context changes via modify vport context command, which will be translated to an event that will be handled by E-Switch manager (PF) which will update FDB table accordingly. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/flow_table.h | 9 +++++++++ include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index bce9caed1eed..88eb4490a8b3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1074,6 +1074,12 @@ enum { VPORT_STATE_UP = 0x1, }; +enum { + MLX5_ESW_VPORT_ADMIN_STATE_DOWN = 0x0, + MLX5_ESW_VPORT_ADMIN_STATE_UP = 0x1, + MLX5_ESW_VPORT_ADMIN_STATE_AUTO = 0x2, +}; + enum { MLX5_L3_PROT_TYPE_IPV4 = 0, MLX5_L3_PROT_TYPE_IPV6 = 1, diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h index 5f922c6d4fc2..0f2a15cf3317 100644 --- a/include/linux/mlx5/flow_table.h +++ b/include/linux/mlx5/flow_table.h @@ -41,6 +41,15 @@ struct mlx5_flow_table_group { u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; }; +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + void *ft; + u32 vport_num; + }; +}; + void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, u16 num_groups, struct mlx5_flow_table_group *group); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ae7c08adba4a..a81b008738fd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -827,9 +827,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_69[0x220]; }; -enum { - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_FLOW_TABLE_ = 0x1, - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_TIR = 0x2, +enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, + MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, }; struct mlx5_ifc_dest_format_struct_bits { -- cgit v1.2.3 From d6666753c6e85834f1669c7b831cc2b7fc9e4390 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:22 +0200 Subject: net/mlx5: E-Switch, Introduce HCA cap and E-Switch vport context E-Switch vport context is unlike NIC vport context, managed by the E-Switch manager or vport_group_manager and not by the NIC(VF) driver. The E-Switch manager can access (read/modify) any of its vports E-Switch context. Currently E-Switch vport context includes only clietnt and server vlan insertion and striping data (for later support of VST mode). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 9 +++++ include/linux/mlx5/mlx5_ifc.h | 90 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 88eb4490a8b3..7d3a85faefb7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1145,6 +1145,7 @@ enum mlx5_cap_type { MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, + MLX5_CAP_ESWITCH, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1196,6 +1197,14 @@ enum mlx5_cap_type { #define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) +#define MLX5_CAP_ESW(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH], cap) + +#define MLX5_CAP_ESW_MAX(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH], cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a81b008738fd..f5d94495758a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -459,6 +459,17 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 reserved_1[0x7800]; }; +struct mlx5_ifc_e_switch_cap_bits { + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert_if_not_exist[0x1]; + u8 vport_cvlan_insert_overwrite[0x1]; + u8 reserved_0[0x1b]; + + u8 reserved_1[0x7e0]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1860,6 +1871,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_e_switch_cap_bits e_switch_cap; u8 reserved_0[0x8000]; }; @@ -2305,6 +2317,26 @@ struct mlx5_ifc_hca_vport_context_bits { u8 reserved_6[0xca0]; }; +struct mlx5_ifc_esw_vport_context_bits { + u8 reserved_0[0x3]; + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert[0x2]; + u8 reserved_1[0x18]; + + u8 reserved_2[0x20]; + + u8 svlan_cfi[0x1]; + u8 svlan_pcp[0x3]; + u8 svlan_id[0xc]; + u8 cvlan_cfi[0x1]; + u8 cvlan_pcp[0x3]; + u8 cvlan_id[0xc]; + + u8 reserved_3[0x7a0]; +}; + enum { MLX5_EQC_STATUS_OK = 0x0, MLX5_EQC_STATUS_EQ_WRITE_FAILURE = 0xa, @@ -3743,6 +3775,64 @@ struct mlx5_ifc_query_flow_group_in_bits { u8 reserved_5[0x120]; }; +struct mlx5_ifc_query_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct mlx5_ifc_query_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_modify_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_esw_vport_context_fields_select_bits { + u8 reserved[0x1c]; + u8 vport_cvlan_insert[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_strip[0x1]; +}; + +struct mlx5_ifc_modify_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_esw_vport_context_fields_select_bits field_select; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + struct mlx5_ifc_query_eq_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; -- cgit v1.2.3 From 2d1e0254ef8310e4f0756130a7ffc007ad1d58df Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 1 Dec 2015 14:55:21 +0000 Subject: pci_ids: add Netronome Systems vendor Add PCI vendor id for Netronome Systems. Signed-off-by: Jakub Kicinski Signed-off-by: Rolf Neugebauer Signed-off-by: David S. Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d9ba49cedc5d..1acbefc4bbda 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2495,6 +2495,8 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF2 0x1700 #define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff +#define PCI_VENDOR_ID_NETRONOME 0x19ee + #define PCI_VENDOR_ID_QMI 0x1a32 #define PCI_VENDOR_ID_AZWAVE 0x1a3b -- cgit v1.2.3 From 4eaf3b84f2881c9c028f1d5e76c52ab575fe3a66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Dec 2015 20:08:51 -0800 Subject: net_sched: fix qdisc_tree_decrease_qlen() races qdisc_tree_decrease_qlen() suffers from two problems on multiqueue devices. One problem is that it updates sch->q.qlen and sch->qstats.drops on the mq/mqprio root qdisc, while it should not : Daniele reported underflows errors : [ 681.774821] PAX: sch->q.qlen: 0 n: 1 [ 681.774825] PAX: size overflow detected in function qdisc_tree_decrease_qlen net/sched/sch_api.c:769 cicus.693_49 min, count: 72, decl: qlen; num: 0; context: sk_buff_head; [ 681.774954] CPU: 2 PID: 19 Comm: ksoftirqd/2 Tainted: G O 4.2.6.201511282239-1-grsec #1 [ 681.774955] Hardware name: ASUSTeK COMPUTER INC. X302LJ/X302LJ, BIOS X302LJ.202 03/05/2015 [ 681.774956] ffffffffa9a04863 0000000000000000 0000000000000000 ffffffffa990ff7c [ 681.774959] ffffc90000d3bc38 ffffffffa95d2810 0000000000000007 ffffffffa991002b [ 681.774960] ffffc90000d3bc68 ffffffffa91a44f4 0000000000000001 0000000000000001 [ 681.774962] Call Trace: [ 681.774967] [] dump_stack+0x4c/0x7f [ 681.774970] [] report_size_overflow+0x34/0x50 [ 681.774972] [] qdisc_tree_decrease_qlen+0x152/0x160 [ 681.774976] [] fq_codel_dequeue+0x7b1/0x820 [sch_fq_codel] [ 681.774978] [] ? qdisc_peek_dequeued+0xa0/0xa0 [sch_fq_codel] [ 681.774980] [] __qdisc_run+0x4d/0x1d0 [ 681.774983] [] net_tx_action+0xc2/0x160 [ 681.774985] [] __do_softirq+0xf1/0x200 [ 681.774987] [] run_ksoftirqd+0x1e/0x30 [ 681.774989] [] smpboot_thread_fn+0x150/0x260 [ 681.774991] [] ? sort_range+0x40/0x40 [ 681.774992] [] kthread+0xe4/0x100 [ 681.774994] [] ? kthread_worker_fn+0x170/0x170 [ 681.774995] [] ret_from_fork+0x3e/0x70 mq/mqprio have their own ways to report qlen/drops by folding stats on all their queues, with appropriate locking. A second problem is that qdisc_tree_decrease_qlen() calls qdisc_lookup() without proper locking : concurrent qdisc updates could corrupt the list that qdisc_match_from_root() parses to find a qdisc given its handle. Fix first problem adding a TCQ_F_NOPARENT qdisc flag that qdisc_tree_decrease_qlen() can use to abort its tree traversal, as soon as it meets a mq/mqprio qdisc children. Second problem can be fixed by RCU protection. Qdisc are already freed after RCU grace period, so qdisc_list_add() and qdisc_list_del() simply have to use appropriate rcu list variants. A future patch will add a per struct netdev_queue list anchor, so that qdisc_tree_decrease_qlen() can have more efficient lookups. Reported-by: Daniele Fucini Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4c79ce8c1f92..b2a8e6338576 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -61,6 +61,9 @@ struct Qdisc { */ #define TCQ_F_WARN_NONWC (1 << 16) #define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ +#define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : + * qdisc_tree_decrease_qlen() should stop. + */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; -- cgit v1.2.3 From 357ab2234d57f6c74386f64ded42dff8e3c0500b Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 2 Dec 2015 14:43:59 +0800 Subject: VSOCK: Introduce vsock_find_unbound_socket and vsock_bind_dgram_generic Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index e9eb2d6791b3..a0c8fa2ababf 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -175,8 +175,10 @@ void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); +struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr); #endif /* __AF_VSOCK_H__ */ -- cgit v1.2.3 From 80a19e338d458abb5a700df3fd00795c51361f06 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 2 Dec 2015 14:44:00 +0800 Subject: VSOCK: Introduce virtio-vsock-common.ko This module contains the common code and header files for the following virtio-vsock and virtio-vhost kernel modules. Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_vsock.h | 89 ++++++++++++++++ 3 files changed, 299 insertions(+) create mode 100644 include/linux/virtio_vsock.h create mode 100644 include/uapi/linux/virtio_vsock.h (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..a5f3ecc038f7 --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,209 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) +#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) +#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) + +struct vsock_transport_recv_notify_data; +struct vsock_transport_send_notify_data; +struct sockaddr_vm; +struct vsock_sock; + +enum { + VSOCK_VQ_CTRL = 0, + VSOCK_VQ_RX = 1, /* for host to guest data */ + VSOCK_VQ_TX = 2, /* for guest to host data */ + VSOCK_VQ_MAX = 3, +}; + +/* virtio transport socket state */ +struct virtio_transport { + struct virtio_transport_pkt_ops *ops; + struct vsock_sock *vsk; + + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + struct mutex tx_lock; + struct mutex rx_lock; + + struct list_head rx_queue; + u32 rx_bytes; + + /* Protected by trans->tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + /* Protected by trans->rx_lock */ + u32 fwd_cnt; + + /* Protected by sk_lock */ + u16 dgram_id; + struct list_head incomplete_dgrams; /* dgram fragments */ +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct virtio_transport *trans; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + u16 dgram_id; + u16 dgram_len; +}; + +struct virtio_transport_pkt_ops { + int (*send_pkt)(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info); +}; + +void virtio_vsock_dumppkt(const char *func, + const struct virtio_vsock_pkt *pkt); + +struct sock * +virtio_transport_get_pending(struct sock *listener, + struct virtio_vsock_pkt *pkt); +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port); +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); +void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); +#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 77925f587b15..16dcf5d06cd7 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,6 +39,7 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h new file mode 100644 index 000000000000..8cf9b5682628 --- /dev/null +++ b/include/uapi/linux/virtio_vsock.h @@ -0,0 +1,89 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H +#define _UAPI_LINUX_VIRTIO_VOSCK_H + +#include +#include +#include + +struct virtio_vsock_config { + __le32 guest_cid; + __le32 max_virtqueue_pairs; +}; + +struct virtio_vsock_hdr { + __le32 src_cid; + __le32 src_port; + __le32 dst_cid; + __le32 dst_port; + __le32 len; + __le16 type; /* enum virtio_vsock_type */ + __le16 op; /* enum virtio_vsock_op */ + __le32 flags; + __le32 buf_alloc; + __le32 fwd_cnt; +}; + +enum virtio_vsock_type { + VIRTIO_VSOCK_TYPE_STREAM = 1, + VIRTIO_VSOCK_TYPE_DGRAM = 2, +}; + +enum virtio_vsock_op { + VIRTIO_VSOCK_OP_INVALID = 0, + + /* Connect operations */ + VIRTIO_VSOCK_OP_REQUEST = 1, + VIRTIO_VSOCK_OP_RESPONSE = 2, + VIRTIO_VSOCK_OP_ACK = 3, + VIRTIO_VSOCK_OP_RST = 4, + VIRTIO_VSOCK_OP_SHUTDOWN = 5, + + /* To send payload */ + VIRTIO_VSOCK_OP_RW = 6, + + /* Tell the peer our credit info */ + VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, + /* Request the peer to send the credit info to us */ + VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, +}; + +/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ +enum virtio_vsock_shutdown { + VIRTIO_VSOCK_SHUTDOWN_RCV = 1, + VIRTIO_VSOCK_SHUTDOWN_SEND = 2, +}; + +#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From 3110489117581a980537b6d999a3724214ba772c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Oct 2015 17:35:19 +0200 Subject: mac80211: allow driver to prevent two stations w/ same address Some devices or drivers cannot deal with having the same station address for different virtual interfaces, say as a client to two virtual AP interfaces. Rather than requiring each driver with a limitation like that to enforce it, add a hardware flag for it. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 760bc4d5a2cf..8628118214cc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1901,6 +1901,11 @@ struct ieee80211_txq { * @IEEE80211_HW_BEACON_TX_STATUS: The device/driver provides TX status * for sent beacons. * + * @IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR: Hardware (or driver) requires that each + * station has a unique address, i.e. each station entry can be identified + * by just its MAC address; this prevents, for example, the same station + * from connecting to two virtual AP interfaces at the same time. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -1936,6 +1941,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TDLS_WIDER_BW, IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU, IEEE80211_HW_BEACON_TX_STATUS, + IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 0483eeac59876ac37d4edbabd48727a468416d5b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 23 Oct 2015 09:50:03 +0200 Subject: cfg80211: replace ieee80211_ie_split() with an inline The function is a very simple wrapper around another one, just adds a few default parameters, so replace it with a static inline instead of using EXPORT_SYMBOL, reducing the module size slightly. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2c7bdb81d30c..e568872203a5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5173,8 +5173,11 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * buffer starts, which may be @ielen if the entire (remainder) * of the buffer should be used. */ -size_t ieee80211_ie_split(const u8 *ies, size_t ielen, - const u8 *ids, int n_ids, size_t offset); +static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset) +{ + return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0, offset); +} /** * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN -- cgit v1.2.3 From cf595922b9a3b61e308224fd29909f54ecb557d4 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Mon, 9 Nov 2015 12:17:37 +0200 Subject: nl80211: clarify NL80211_ATTR_SCHED_SCAN_DELAY usage with net-detect In this attribute's documentation, it was not clear whether the delay started counting when WoWLAN net-detect was enabled or when the system was suspended. The correct answer is that it starts when the system suspends (which is when, in practice, the scan is scheduled). Clarify that in the nl80211.h documentation. Suggested-by: Samuel Tan Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1f0b4cf5dd03..07099cb14778 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1764,8 +1764,9 @@ enum nl80211_commands { * over all channels. * * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before the first cycle of a - * scheduled scan (or a WoWLAN net-detect scan) is started, u32 - * in seconds. + * scheduled scan is started. Or the delay before a WoWLAN + * net-detect scan is started, counting from the moment the + * system is suspended. This value is a u32, in seconds. * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device * is operating in an indoor environment. -- cgit v1.2.3 From 0ead2510f8cec11ce96308d79a1b4ee272fb5238 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 17 Nov 2015 10:24:36 +0200 Subject: mac80211: allow the driver to send EOSP when needed This can happen when the driver needs to send less frames than expected and then needs to close the SP. Mac80211 still needs to set the more_data properly based on its buffer state (ps_tx_buffer and buffered frames on other TIDs). To that end, refactor the code that delivers frames upon uAPSD trigger frames to be able to get only the more_data bit without actually delivering those frames in case the driver is just asking to set a NDP with EOSP and MORE_DATA bit properly set. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8628118214cc..18ac733afc91 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4868,6 +4868,28 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, */ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta); +/** + * ieee80211_send_eosp_nullfunc - ask mac80211 to send NDP with EOSP + * @pubsta: the station + * @tid: the tid of the NDP + * + * Sometimes the device understands that it needs to close + * the Service Period unexpectedly. This can happen when + * sending frames that are filling holes in the BA window. + * In this case, the device can ask mac80211 to send a + * Nullfunc frame with EOSP set. When that happens, the + * driver must have called ieee80211_sta_set_buffered() to + * let mac80211 know that there are no buffered frames any + * more, otherwise mac80211 will get the more_data bit wrong. + * The low level driver must have made sure that the frame + * will be sent despite the station being in power-save. + * Mac80211 won't call allow_buffered_frames(). + * Note that calling this function, doesn't exempt the driver + * from closing the EOSP properly, it will still have to call + * ieee80211_sta_eosp when the NDP is sent. + */ +void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); + /** * ieee80211_iter_keys - iterate keys programmed into the device * @hw: pointer obtained from ieee80211_alloc_hw() -- cgit v1.2.3 From ef044763a3ca6b9e0bb65a9ce0cb38c0eca62756 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 17 Nov 2015 10:24:37 +0200 Subject: mac80211: add atomic uploaded keys iterator add ieee80211_iter_keys_rcu() to iterate over uploaded keys in atomic context (when rcu is locked) The station removal code removes the keys only after calling synchronize_net(), so it's not safe to iterate the keys at this point (and postponing the actual key deletion with call_rcu() might result in some badly-ordered ops calls). Add a flag to indicate a station is being removed, and skip the configured keys if it's set. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 18ac733afc91..a68051c41ac3 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4917,6 +4917,30 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, void *data), void *iter_data); +/** + * ieee80211_iter_keys_rcu - iterate keys programmed into the device + * @hw: pointer obtained from ieee80211_alloc_hw() + * @vif: virtual interface to iterate, may be %NULL for all + * @iter: iterator function that will be called for each key + * @iter_data: custom data to pass to the iterator function + * + * This function can be used to iterate all the keys known to + * mac80211, even those that weren't previously programmed into + * the device. Note that due to locking reasons, keys of station + * in removal process will be skipped. + * + * This function requires being called in an RCU critical section, + * and thus iter must be atomic. + */ +void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key, + void *data), + void *iter_data); + /** * ieee80211_iter_chan_contexts_atomic - iterate channel contexts * @hw: pointre obtained from ieee80211_alloc_hw(). -- cgit v1.2.3 From b115b972997428b9134aba377721fea6486adbd0 Mon Sep 17 00:00:00 2001 From: "Janusz.Dziedzic@tieto.com" Date: Tue, 27 Oct 2015 08:38:40 +0100 Subject: mac80211: add new IEEE80211_VIF_GET_NOA_UPDATE flag Add new VIF flag, that will allow get NOA update notification when driver will request this, even this is not pure P2P vif (eg. STA vif). Signed-off-by: Janusz Dziedzic Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a68051c41ac3..7c30faff245f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1321,11 +1321,15 @@ struct ieee80211_channel_switch { * interface. This flag should be set during interface addition, * but may be set/cleared as late as authentication to an AP. It is * only valid for managed/station mode interfaces. + * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes + * and send P2P_PS notification to the driver if NOA changed, even + * this is not pure P2P vif. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), IEEE80211_VIF_SUPPORTS_CQM_RSSI = BIT(1), IEEE80211_VIF_SUPPORTS_UAPSD = BIT(2), + IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), }; /** -- cgit v1.2.3 From 91d3ab46730379e89e1e908c6f62fbcadb3d8f08 Mon Sep 17 00:00:00 2001 From: Vidyullatha Kanchanapally Date: Fri, 30 Oct 2015 19:14:49 +0530 Subject: cfg80211: Add support for aborting an ongoing scan Implement new functionality for aborting an ongoing scan. Add NL80211_CMD_ABORT_SCAN to the nl80211 interface. After aborting the scan, driver shall provide the scan status by calling cfg80211_scan_done(). Reviewed-by: Jouni Malinen Signed-off-by: Vidyullatha Kanchanapally Signed-off-by: Sunil Dutt [change command to take wdev instead of netdev so that it can be used on p2p-device scans] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e568872203a5..9bcaaf7cd15a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2321,6 +2321,8 @@ struct cfg80211_qos_map { * the driver, and will be valid until passed to cfg80211_scan_done(). * For scan results, call cfg80211_inform_bss(); you can call this outside * the scan/scan_done bracket too. + * @abort_scan: Tell the driver to abort an ongoing scan. The driver shall + * indicate the status of the scan through cfg80211_scan_done(). * * @auth: Request to authenticate with the specified peer * (invoked with the wireless_dev mutex held) @@ -2593,6 +2595,7 @@ struct cfg80211_ops { int (*scan)(struct wiphy *wiphy, struct cfg80211_scan_request *request); + void (*abort_scan)(struct wiphy *wiphy, struct wireless_dev *wdev); int (*auth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_auth_request *req); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 07099cb14778..5b7b5ebe7ca8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -820,6 +820,10 @@ * as an event to indicate changes for devices with wiphy-specific regdom * management. * + * @NL80211_CMD_ABORT_SCAN: Stop an ongoing scan. Returns -ENOENT if a scan is + * not running. The driver indicates the status of the scan through + * cfg80211_scan_done(). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1006,6 +1010,8 @@ enum nl80211_commands { NL80211_CMD_WIPHY_REG_CHANGE, + NL80211_CMD_ABORT_SCAN, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ -- cgit v1.2.3 From 2f8364a291e8adde25c93f97a76abbcaf4b1ed3f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 3 Dec 2015 21:12:31 +0100 Subject: WAN: HDLC: Call notifiers before and after changing device type An HDLC device can change type when the protocol driver is changed. Calling the notifier change allows potential users of the interface know about this planned change, and even block it. After the change has occurred, send a second notification to users can evaluate the new device type etc. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/hdlc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index 1acb1445e05f..e31bcd4c7859 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -101,7 +101,7 @@ netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size); /* May be used by hardware driver to gain control over HDLC device */ -void detach_hdlc_protocol(struct net_device *dev); +int detach_hdlc_protocol(struct net_device *dev); static __inline__ __be16 hdlc_type_trans(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From b618aaa91b5870e7bd139987ac4b7bf0851142d0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Dec 2015 15:01:31 +0100 Subject: net: constify netif_is_* helpers net_device param As suggested by Eric, these helpers should have const dev param. Suggested-by: Eric Dumazet Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- include/linux/netdevice.h | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 67ce5bd3b56a..05f5879821b8 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -73,7 +73,7 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); -static inline bool is_vlan_dev(struct net_device *dev) +static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3efe017fe419..1bb21ff0fa64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3661,7 +3661,7 @@ extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)); + bool (*type_check)(const struct net_device *dev)); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); @@ -3858,32 +3858,32 @@ static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, skb->mac_len = mac_len; } -static inline bool netif_is_macvlan(struct net_device *dev) +static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; } -static inline bool netif_is_macvlan_port(struct net_device *dev) +static inline bool netif_is_macvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN_PORT; } -static inline bool netif_is_ipvlan(struct net_device *dev) +static inline bool netif_is_ipvlan(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_SLAVE; } -static inline bool netif_is_ipvlan_port(struct net_device *dev) +static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_MASTER; } -static inline bool netif_is_bond_master(struct net_device *dev) +static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; } -static inline bool netif_is_bond_slave(struct net_device *dev) +static inline bool netif_is_bond_slave(const struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; } @@ -3918,22 +3918,22 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } -static inline bool netif_is_team_master(struct net_device *dev) +static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; } -static inline bool netif_is_team_port(struct net_device *dev) +static inline bool netif_is_team_port(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM_PORT; } -static inline bool netif_is_lag_master(struct net_device *dev) +static inline bool netif_is_lag_master(const struct net_device *dev) { return netif_is_bond_master(dev) || netif_is_team_master(dev); } -static inline bool netif_is_lag_port(struct net_device *dev) +static inline bool netif_is_lag_port(const struct net_device *dev) { return netif_is_bond_slave(dev) || netif_is_team_port(dev); } -- cgit v1.2.3 From 5f61385d2ebc2bd62bc389c7da0d8d2f263be1eb Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Sun, 6 Dec 2015 18:07:41 +0200 Subject: net/mlx4_core: Keep VLAN/MAC tables mirrored in multifunc HA mode Due to HW limitations, indexes to MAC and VLAN tables are always taken from the table of the actual port. So, if a resource holds an index to a table, it may refer to different values during the lifetime of the resource, unless the tables are mirrored. Also, even when driver is not in HA mode the policy of allocating an index to these tables is such to make sure, as much as possible, that when the time comes the mirroring will be successful. This means that in multifunction mode the allocation of a free index in a port's table tries to make sure that the same index in the other's port table is also free. Signed-off-by: Moni Shoua Reviewed-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 5a06d969338e..2e8af001c5da 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -75,6 +75,11 @@ static inline int mlx4_is_bonded(struct mlx4_dev *dev) return !!(dev->flags & MLX4_FLAG_BONDED); } +static inline int mlx4_is_mf_bonded(struct mlx4_dev *dev) +{ + return (mlx4_is_bonded(dev) && mlx4_is_mfunc(dev)); +} + struct mlx4_port_map { u8 port1; u8 port2; -- cgit v1.2.3 From ea3793ee29d3621faf857fa8ef5425e9ff9a756d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Sun, 6 Dec 2015 21:11:34 +0000 Subject: core: enable more fine-grained datagram reception control The __skb_recv_datagram routine in core/ datagram.c provides a general skb reception factility supposed to be utilized by protocol modules providing datagram sockets. It encompasses both the actual recvmsg code and a surrounding 'sleep until data is available' loop. This is inconvenient if a protocol module has to use additional locking in order to maintain some per-socket state the generic datagram socket code is unaware of (as the af_unix code does). The patch below moves the recvmsg proper code into a new __skb_try_recv_datagram routine which doesn't sleep and renames wait_for_more_packets to __skb_wait_for_more_packets, both routines being exported interfaces. The original __skb_recv_datagram routine is reimplemented on top of these two functions such that its user-visible behaviour remains unchanged. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9c394bf0771..9b9b9ead7bb3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2785,6 +2785,12 @@ static inline void skb_frag_list_init(struct sk_buff *skb) #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) + +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb); +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags, + int *peeked, int *off, int *err, + struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, -- cgit v1.2.3 From 4baee937b8d551c89f61542a575378e407b63415 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 7 Dec 2015 13:57:32 +0100 Subject: net: dsa: remove DSA link polling Since no more DSA driver uses the polling callback, and since the phylib handles the link detection, remove the link polling work and timer code. Signed-off-by: Neil Armstrong Signed-off-by: David S. Miller --- include/net/dsa.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3f23dd9d6a69..26a0e86e611e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,13 +116,6 @@ struct dsa_switch_tree { s8 cpu_switch; s8 cpu_port; - /* - * Link state polling. - */ - int link_poll_needed; - struct work_struct link_poll_work; - struct timer_list link_poll_timer; - /* * Data for the individual switch chips. */ @@ -231,11 +224,6 @@ struct dsa_switch_driver { int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); - /* - * Link state polling and IRQ handling. - */ - void (*poll_link)(struct dsa_switch *ds); - /* * Link state adjustment (called from libphy) */ -- cgit v1.2.3 From 8ac2837c89c8c0fcad557e4380aeef80580390f9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Dec 2015 10:51:12 +0800 Subject: Revert "Merge branch 'vsock-virtio'" This reverts commit 0d76d6e8b2507983a2cae4c09880798079007421 and merge commit c402293bd76fbc93e52ef8c0947ab81eea3ae019, reversing changes made to c89359a42e2a49656451569c382eed63e781153c. The virtio-vsock device specification is not finalized yet. Michael Tsirkin voiced concerned about merging this code when the hardware interface (and possibly the userspace interface) could still change. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 -------------------------------------- include/net/af_vsock.h | 2 - include/uapi/linux/virtio_ids.h | 1 - include/uapi/linux/virtio_vsock.h | 89 ---------------- 4 files changed, 301 deletions(-) delete mode 100644 include/linux/virtio_vsock.h delete mode 100644 include/uapi/linux/virtio_vsock.h (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h deleted file mode 100644 index a5f3ecc038f7..000000000000 --- a/include/linux/virtio_vsock.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _LINUX_VIRTIO_VSOCK_H -#define _LINUX_VIRTIO_VSOCK_H - -#include -#include -#include - -#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 -#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) -#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL -#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) -#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) -#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) - -struct vsock_transport_recv_notify_data; -struct vsock_transport_send_notify_data; -struct sockaddr_vm; -struct vsock_sock; - -enum { - VSOCK_VQ_CTRL = 0, - VSOCK_VQ_RX = 1, /* for host to guest data */ - VSOCK_VQ_TX = 2, /* for guest to host data */ - VSOCK_VQ_MAX = 3, -}; - -/* virtio transport socket state */ -struct virtio_transport { - struct virtio_transport_pkt_ops *ops; - struct vsock_sock *vsk; - - u32 buf_size; - u32 buf_size_min; - u32 buf_size_max; - - struct mutex tx_lock; - struct mutex rx_lock; - - struct list_head rx_queue; - u32 rx_bytes; - - /* Protected by trans->tx_lock */ - u32 tx_cnt; - u32 buf_alloc; - u32 peer_fwd_cnt; - u32 peer_buf_alloc; - /* Protected by trans->rx_lock */ - u32 fwd_cnt; - - /* Protected by sk_lock */ - u16 dgram_id; - struct list_head incomplete_dgrams; /* dgram fragments */ -}; - -struct virtio_vsock_pkt { - struct virtio_vsock_hdr hdr; - struct virtio_transport *trans; - struct work_struct work; - struct list_head list; - void *buf; - u32 len; - u32 off; -}; - -struct virtio_vsock_pkt_info { - u32 remote_cid, remote_port; - struct msghdr *msg; - u32 pkt_len; - u16 type; - u16 op; - u32 flags; - u16 dgram_id; - u16 dgram_len; -}; - -struct virtio_transport_pkt_ops { - int (*send_pkt)(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info); -}; - -void virtio_vsock_dumppkt(const char *func, - const struct virtio_vsock_pkt *pkt); - -struct sock * -virtio_transport_get_pending(struct sock *listener, - struct virtio_vsock_pkt *pkt); -struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port); -ssize_t -virtio_transport_stream_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, - int type); -int -virtio_transport_dgram_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags); - -s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); -s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); - -int virtio_transport_do_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); -int -virtio_transport_notify_poll_in(struct vsock_sock *vsk, - size_t target, - bool *data_ready_now); -int -virtio_transport_notify_poll_out(struct vsock_sock *vsk, - size_t target, - bool *space_available_now); - -int virtio_transport_notify_recv_init(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, - size_t target, ssize_t copied, bool data_read, - struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_send_init(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, - ssize_t written, struct vsock_transport_send_notify_data *data); - -u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); -bool virtio_transport_stream_is_active(struct vsock_sock *vsk); -bool virtio_transport_stream_allow(u32 cid, u32 port); -int virtio_transport_dgram_bind(struct vsock_sock *vsk, - struct sockaddr_vm *addr); -bool virtio_transport_dgram_allow(u32 cid, u32 port); - -int virtio_transport_connect(struct vsock_sock *vsk); - -int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); - -void virtio_transport_release(struct vsock_sock *vsk); - -ssize_t -virtio_transport_stream_enqueue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len); -int -virtio_transport_dgram_enqueue(struct vsock_sock *vsk, - struct sockaddr_vm *remote_addr, - struct msghdr *msg, - size_t len); - -void virtio_transport_destruct(struct vsock_sock *vsk); - -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); -u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); -void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); -#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index a0c8fa2ababf..e9eb2d6791b3 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -175,10 +175,8 @@ void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); -struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); -int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr); #endif /* __AF_VSOCK_H__ */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 16dcf5d06cd7..77925f587b15 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,7 +39,6 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ -#define VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h deleted file mode 100644 index 8cf9b5682628..000000000000 --- a/include/uapi/linux/virtio_vsock.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H -#define _UAPI_LINUX_VIRTIO_VOSCK_H - -#include -#include -#include - -struct virtio_vsock_config { - __le32 guest_cid; - __le32 max_virtqueue_pairs; -}; - -struct virtio_vsock_hdr { - __le32 src_cid; - __le32 src_port; - __le32 dst_cid; - __le32 dst_port; - __le32 len; - __le16 type; /* enum virtio_vsock_type */ - __le16 op; /* enum virtio_vsock_op */ - __le32 flags; - __le32 buf_alloc; - __le32 fwd_cnt; -}; - -enum virtio_vsock_type { - VIRTIO_VSOCK_TYPE_STREAM = 1, - VIRTIO_VSOCK_TYPE_DGRAM = 2, -}; - -enum virtio_vsock_op { - VIRTIO_VSOCK_OP_INVALID = 0, - - /* Connect operations */ - VIRTIO_VSOCK_OP_REQUEST = 1, - VIRTIO_VSOCK_OP_RESPONSE = 2, - VIRTIO_VSOCK_OP_ACK = 3, - VIRTIO_VSOCK_OP_RST = 4, - VIRTIO_VSOCK_OP_SHUTDOWN = 5, - - /* To send payload */ - VIRTIO_VSOCK_OP_RW = 6, - - /* Tell the peer our credit info */ - VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, - /* Request the peer to send the credit info to us */ - VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, -}; - -/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ -enum virtio_vsock_shutdown { - VIRTIO_VSOCK_SHUTDOWN_RCV = 1, - VIRTIO_VSOCK_SHUTDOWN_SEND = 2, -}; - -#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From 297dbde19cf6a0ccb6fd4396c6220a5912ed61e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:51 -0500 Subject: netprio_cgroup: limit the maximum css->id to USHRT_MAX netprio builds per-netdev contiguous priomap array which is indexed by css->id. The array is allocated using kzalloc() effectively limiting the maximum ID supported to some thousand range. This patch caps the maximum supported css->id to USHRT_MAX which should be way above what is actually useable. This allows reducing sock->sk_cgrp_prioidx to u16 from u32. The freed up part will be used to overload the cgroup related fields. sock->sk_cgrp_prioidx's position is swapped with sk_mark so that the two cgroup related fields are adjacent. Signed-off-by: Tejun Heo Acked-by: Daniel Wagner Cc: Daniel Borkmann CC: Neil Horman Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 6f58b84fc742..a95bcf7d6efa 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -288,7 +288,6 @@ struct cg_proto; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_priority: %SO_PRIORITY setting - * @sk_cgrp_prioidx: socket group's priority map index * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_pid: &struct pid for this socket's peer @@ -309,6 +308,7 @@ struct cg_proto; * @sk_send_head: front of stuff to transmit * @sk_security: used by security modules * @sk_mark: generic packet mark + * @sk_cgrp_prioidx: socket group's priority map index * @sk_classid: this socket's cgroup classid * @sk_cgrp: this socket's cgroup-specific proto data * @sk_write_pending: a write to stream socket waits to start @@ -425,9 +425,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; __u32 sk_priority; -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) - __u32 sk_cgrp_prioidx; -#endif + __u32 sk_mark; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; @@ -445,7 +443,9 @@ struct sock { #ifdef CONFIG_SECURITY void *sk_security; #endif - __u32 sk_mark; +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) + u16 sk_cgrp_prioidx; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID u32 sk_classid; #endif -- cgit v1.2.3 From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:52 -0500 Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data. ->sk_cgroup_prioidx and ->sk_classid are moved into it. The struct and its accessors are defined in cgroup-defs.h. This is to prepare for overloading the fields with a cgroup pointer. This patch mostly performs equivalent conversions but the followings are noteworthy. * Equality test before updating classid is removed from sock_update_classid(). This shouldn't make any noticeable difference and a similar test will be implemented on the helper side later. * sock_update_netprioidx() now takes struct sock_cgroup_data and can be moved to netprio_cgroup.h without causing include dependency loop. Moved. * The dummy version of sock_update_netprioidx() converted to a static inline function while at it. Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 36 ++++++++++++++++++++++++++++++++++++ include/net/cls_cgroup.h | 11 +++++------ include/net/netprio_cgroup.h | 16 +++++++++++++--- include/net/sock.h | 11 +++-------- 4 files changed, 57 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 504d8591b6d3..ed128fed0335 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -542,4 +542,40 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +struct sock_cgroup_data { + u16 prioidx; + u32 classid; +}; + +static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +{ + return skcd->prioidx; +} + +static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +{ + return skcd->classid; +} + +static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, + u16 prioidx) +{ + skcd->prioidx = prioidx; +} + +static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, + u32 classid) +{ + skcd->classid = classid; +} + +#else /* CONFIG_SOCK_CGROUP_DATA */ + +struct sock_cgroup_data { +}; + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index ccd6d8bffa4d..c0a92e2c286d 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -41,13 +41,12 @@ static inline u32 task_cls_classid(struct task_struct *p) return classid; } -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { u32 classid; classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; + sock_cgroup_set_classid(skcd, classid); } static inline u32 task_get_classid(const struct sk_buff *skb) @@ -64,17 +63,17 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * softirqs always disables bh. */ if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ + /* If there is an sock_cgroup_classid we'll use that. */ if (!skb->sk) return 0; - classid = skb->sk->sk_classid; + classid = sock_cgroup_classid(&skb->sk->sk_cgrp_data); } return classid; } #else /* !CONFIG_CGROUP_NET_CLASSID */ -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { } diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index f2a9597ff53c..604190596cde 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -25,8 +25,6 @@ struct netprio_map { u32 priomap[]; }; -void sock_update_netprioidx(struct sock *sk); - static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; @@ -38,13 +36,25 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_unlock(); return idx; } + +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ + if (in_interrupt()) + return; + + sock_cgroup_set_prioidx(skcd, task_netprioidx(current)); +} + #else /* !CONFIG_CGROUP_NET_PRIO */ + static inline u32 task_netprioidx(struct task_struct *p) { return 0; } -#define sock_update_netprioidx(sk) +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ +} #endif /* CONFIG_CGROUP_NET_PRIO */ #endif /* _NET_CLS_CGROUP_H */ diff --git a/include/net/sock.h b/include/net/sock.h index a95bcf7d6efa..0ca22b014de1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -308,8 +309,7 @@ struct cg_proto; * @sk_send_head: front of stuff to transmit * @sk_security: used by security modules * @sk_mark: generic packet mark - * @sk_cgrp_prioidx: socket group's priority map index - * @sk_classid: this socket's cgroup classid + * @sk_cgrp_data: cgroup data for this cgroup * @sk_cgrp: this socket's cgroup-specific proto data * @sk_write_pending: a write to stream socket waits to start * @sk_state_change: callback to indicate change in the state of the sock @@ -443,12 +443,7 @@ struct sock { #ifdef CONFIG_SECURITY void *sk_security; #endif -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) - u16 sk_cgrp_prioidx; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - u32 sk_classid; -#endif + struct sock_cgroup_data sk_cgrp_data; struct cg_proto *sk_cgrp; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); -- cgit v1.2.3 From bd1060a1d67128bb8fbe2e1384c518912cbe54e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:53 -0500 Subject: sock, cgroup: add sock->sk_cgroup In cgroup v1, dealing with cgroup membership was difficult because the number of membership associations was unbound. As a result, cgroup v1 grew several controllers whose primary purpose is either tagging membership or pull in configuration knobs from other subsystems so that cgroup membership test can be avoided. net_cls and net_prio controllers are examples of the latter. They allow configuring network-specific attributes from cgroup side so that network subsystem can avoid testing cgroup membership; unfortunately, these are not only cumbersome but also problematic. Both net_cls and net_prio aren't properly hierarchical. Both inherit configuration from the parent on creation but there's no interaction afterwards. An ancestor doesn't restrict the behavior in its subtree in anyway and configuration changes aren't propagated downwards. Especially when combined with cgroup delegation, this is problematic because delegatees can mess up whatever network configuration implemented at the system level. net_prio would allow the delegatees to set whatever priority value regardless of CAP_NET_ADMIN and net_cls the same for classid. While it is possible to solve these issues from controller side by implementing hierarchical allowable ranges in both controllers, it would involve quite a bit of complexity in the controllers and further obfuscate network configuration as it becomes even more difficult to tell what's actually being configured looking from the network side. While not much can be done for v1 at this point, as membership handling is sane on cgroup v2, it'd be better to make cgroup matching behave like other network matches and classifiers than introducing further complications. In preparation, this patch updates sock->sk_cgrp_data handling so that it points to the v2 cgroup that sock was created in until either net_prio or net_cls is used. Once either of the two is used, sock->sk_cgrp_data reverts to its previous role of carrying prioidx and classid. This is to avoid adding yet another cgroup related field to struct sock. As the mode switching can happen at most once per boot, the switching mechanism is aimed at lowering hot path overhead. It may leak a finite, likely small, number of cgroup refs and report spurious prioidx or classid on switching; however, dynamic updates of prioidx and classid have always been racy and lossy - socks between creation and fd installation are never updated, config changes don't update existing sockets at all, and prioidx may index with dead and recycled cgroup IDs. Non-critical inaccuracies from small race windows won't make any noticeable difference. This patch doesn't make use of the pointer yet. The following patch will implement netfilter match for cgroup2 membership. v2: Use sock_cgroup_data to avoid inflating struct sock w/ another cgroup specific field. v3: Add comments explaining why sock_data_prioidx() and sock_data_classid() use different fallback values. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 88 +++++++++++++++++++++++++++++++++++++++++---- include/linux/cgroup.h | 41 +++++++++++++++++++++ 2 files changed, 123 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ed128fed0335..9dc226345e4e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -544,31 +544,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #ifdef CONFIG_SOCK_CGROUP_DATA +/* + * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains + * per-socket cgroup information except for memcg association. + * + * On legacy hierarchies, net_prio and net_cls controllers directly set + * attributes on each sock which can then be tested by the network layer. + * On the default hierarchy, each sock is associated with the cgroup it was + * created in and the networking layer can match the cgroup directly. + * + * To avoid carrying all three cgroup related fields separately in sock, + * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. + * On boot, sock_cgroup_data records the cgroup that the sock was created + * in so that cgroup2 matches can be made; however, once either net_prio or + * net_cls starts being used, the area is overriden to carry prioidx and/or + * classid. The two modes are distinguished by whether the lowest bit is + * set. Clear bit indicates cgroup pointer while set bit prioidx and + * classid. + * + * While userland may start using net_prio or net_cls at any time, once + * either is used, cgroup2 matching no longer works. There is no reason to + * mix the two and this is in line with how legacy and v2 compatibility is + * handled. On mode switch, cgroup references which are already being + * pointed to by socks may be leaked. While this can be remedied by adding + * synchronization around sock_cgroup_data, given that the number of leaked + * cgroups is bound and highly unlikely to be high, this seems to be the + * better trade-off. + */ struct sock_cgroup_data { - u16 prioidx; - u32 classid; + union { +#ifdef __LITTLE_ENDIAN + struct { + u8 is_data; + u8 padding; + u16 prioidx; + u32 classid; + } __packed; +#else + struct { + u32 classid; + u16 prioidx; + u8 padding; + u8 is_data; + } __packed; +#endif + u64 val; + }; }; +/* + * There's a theoretical window where the following accessors race with + * updaters and return part of the previous pointer as the prioidx or + * classid. Such races are short-lived and the result isn't critical. + */ static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) { - return skcd->prioidx; + /* fallback to 1 which is always the ID of the root cgroup */ + return (skcd->is_data & 1) ? skcd->prioidx : 1; } static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) { - return skcd->classid; + /* fallback to 0 which is the unconfigured default classid */ + return (skcd->is_data & 1) ? skcd->classid : 0; } +/* + * If invoked concurrently, the updaters may clobber each other. The + * caller is responsible for synchronization. + */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - skcd->prioidx = prioidx; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_prioidx(&skcd_buf) == prioidx) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.prioidx = prioidx; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - skcd->classid = classid; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_classid(&skcd_buf) == classid) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.classid = classid; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4c3ffab81ba7..a8ba1ea0ea5a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -578,4 +578,45 @@ static inline int cgroup_init(void) { return 0; } #endif /* !CONFIG_CGROUPS */ +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) +extern spinlock_t cgroup_sk_update_lock; +#endif + +void cgroup_sk_alloc_disable(void); +void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_free(struct sock_cgroup_data *skcd); + +static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) +{ +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + unsigned long v; + + /* + * @skcd->val is 64bit but the following is safe on 32bit too as we + * just need the lower ulong to be written and read atomically. + */ + v = READ_ONCE(skcd->val); + + if (v & 1) + return &cgrp_dfl_root.cgrp; + + return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; +#else + return (struct cgroup *)(unsigned long)skcd->val; +#endif +} + +#else /* CONFIG_CGROUP_DATA */ + +static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} + +#endif /* CONFIG_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_H */ -- cgit v1.2.3 From ad2c8c73d29702c3193f739390f6661f9a4ecad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Dec 2015 12:30:46 -0500 Subject: cgroup: fix sock_cgroup_data initialization on earlier compilers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sock_cgroup_data is a struct containing an anonymous union. sock_cgroup_set_prioidx() and sock_cgroup_set_classid() were initializing a field inside the anonymous union as follows. struct sock_ccgroup_data skcd_buf = { .val = VAL }; While this is fine on more recent compilers, gcc-4.4.7 triggers the following errors. include/linux/cgroup-defs.h: In function ‘sock_cgroup_set_prioidx’: include/linux/cgroup-defs.h:619: error: unknown field ‘val’ specified in initializer include/linux/cgroup-defs.h:619: warning: missing braces around initializer include/linux/cgroup-defs.h:619: warning: (near initialization for ‘skcd_buf.’) This is because .val belongs to the anonymous union nested inside the struct but the initializer is missing the nesting. Fix it by adding an extra pair of braces. Signed-off-by: Tejun Heo Reported-by: Alaa Hleihel Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9dc226345e4e..097901a68671 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -616,7 +616,7 @@ static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_prioidx(&skcd_buf) == prioidx) return; @@ -633,7 +633,7 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_classid(&skcd_buf) == classid) return; -- cgit v1.2.3 From 899077791403ff7a2d8cfaa87bd1a82d729463e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 8 Dec 2015 16:32:27 +0100 Subject: netcp: try to reduce type confusion in descriptors The netcp driver produces tons of warnings when CONFIG_LPAE is enabled on ARM: drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_tx_map_skb': drivers/net/ethernet/ti/netcp_core.c:1084:13: warning: passing argument 1 of 'set_words' from incompatible pointer type [-Wincompatible-pointer-types] This is the result of trying to pass a pointer to a dma_addr_t to a function that expects a u32 pointer to copy that into a DMA descriptor. Looking at that code in more detail to fix the warnings, I see multiple related problems: * The conversion functions are not endian-safe, as the DMA descriptors are almost certainly fixed-endian, but the CPU is not. * On 64-bit machines, passing a pointer through a u32 variable is a bug, accessing an indirect pointer as a u32 pointer even more so. * The handling of epib and psdata mixes native-endian and device-endian data. In this patch, I try to sort out the types for most accesses here, adding le32_to_cpu/cpu_to_le32 where appropriate, and passing pointers through two 32-bit words in the descriptor padding, to make it plausible that the driver does the right thing if compiled for big-endian or 64-bit systems. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/soc/ti/knav_dma.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index dad035c16d94..343c13ac4f71 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -144,17 +144,17 @@ struct knav_dma_cfg { * @psdata: Protocol specific */ struct knav_dma_desc { - u32 desc_info; - u32 tag_info; - u32 packet_info; - u32 buff_len; - u32 buff; - u32 next_desc; - u32 orig_len; - u32 orig_buff; - u32 epib[KNAV_DMA_NUM_EPIB_WORDS]; - u32 psdata[KNAV_DMA_NUM_PS_WORDS]; - u32 pad[4]; + __le32 desc_info; + __le32 tag_info; + __le32 packet_info; + __le32 buff_len; + __le32 buff; + __le32 next_desc; + __le32 orig_len; + __le32 orig_buff; + __le32 epib[KNAV_DMA_NUM_EPIB_WORDS]; + __le32 psdata[KNAV_DMA_NUM_PS_WORDS]; + __le32 pad[4]; } ____cacheline_aligned; #if IS_ENABLED(CONFIG_KEYSTONE_NAVIGATOR_DMA) -- cgit v1.2.3 From 26a8145390b36cbe97a5bd0b9e97249f21af6aea Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:39 +0200 Subject: net/mlx5_core: Introduce flow steering firmware commands Introduce new Flow Steering (FS) firmware commands, in-order to support the new flow steering infrastructure. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 47 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 32 ++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 10 deletions(-) create mode 100644 include/linux/mlx5/fs.h (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h new file mode 100644 index 000000000000..34fd8dc0b3e1 --- /dev/null +++ b/include/linux/mlx5/fs.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_ +#define _MLX5_FS_ + +#include + +struct mlx5_flow_table; + +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + struct mlx5_flow_table *ft; + }; +}; +#endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f5d94495758a..131a2737cfa3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -256,25 +256,27 @@ struct mlx5_ifc_flow_table_fields_supported_bits { struct mlx5_ifc_flow_table_prop_layout_bits { u8 ft_support[0x1]; - u8 reserved_0[0x1f]; + u8 reserved_0[0x2]; + u8 flow_modify_en[0x1]; + u8 reserved_1[0x1c]; - u8 reserved_1[0x2]; + u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; - u8 reserved_2[0x10]; + u8 reserved_3[0x10]; u8 max_ft_level[0x8]; - u8 reserved_3[0x20]; + u8 reserved_4[0x20]; - u8 reserved_4[0x18]; + u8 reserved_5[0x18]; u8 log_max_ft_num[0x8]; - u8 reserved_5[0x18]; + u8 reserved_6[0x18]; u8 log_max_destination[0x8]; - u8 reserved_6[0x18]; + u8 reserved_7[0x18]; u8 log_max_flow[0x8]; - u8 reserved_7[0x40]; + u8 reserved_8[0x40]; struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; @@ -2843,6 +2845,13 @@ struct mlx5_ifc_set_hca_cap_in_bits { union mlx5_ifc_hca_cap_union_bits capability; }; +enum { + MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION = 0x0, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_TAG = 0x1, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST = 0x2, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3 +}; + struct mlx5_ifc_set_fte_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; @@ -2867,11 +2876,14 @@ struct mlx5_ifc_set_fte_in_bits { u8 reserved_4[0x8]; u8 table_id[0x18]; - u8 reserved_5[0x40]; + u8 reserved_5[0x18]; + u8 modify_enable_mask[0x8]; + + u8 reserved_6[0x20]; u8 flow_index[0x20]; - u8 reserved_6[0xe0]; + u8 reserved_7[0xe0]; struct mlx5_ifc_flow_context_bits flow_context; }; -- cgit v1.2.3 From 2530236303d9e705db6a28eb9a10c8d79b288b37 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:43 +0200 Subject: net/mlx5_core: Flow steering tree initialization Flow steering initialization is based on static tree which illustrates the flow steering tree when the driver is loaded. The initialization considers the max supported flow table level of the device, a minimum of 2 kernel flow tables(vlan and mac) are required to have kernel flow table functionality. The tree structures when the driver is loaded: root_namespace(receive nic) | priority-0 (kernel priority) | namespace(kernel namespace) | priority-0 (flow tables priority) In the following patches, When the EN driver will use the flow steering API, it create two flow tables and their flow groups under priority-0(flow tables priority). Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/fs.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ac098b6b97bf..2fd7019f69db 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -502,6 +502,8 @@ struct mlx5_priv { struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_root_namespace *fdb_root_ns; }; enum mlx5_device_state { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 34fd8dc0b3e1..16ae5233dc7b 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -35,6 +35,13 @@ #include +#define MLX5_FS_DEFAULT_FLOW_TAG 0x0 + +enum mlx5_flow_namespace_type { + MLX5_FLOW_NAMESPACE_KERNEL, + MLX5_FLOW_NAMESPACE_FDB, +}; + struct mlx5_flow_table; struct mlx5_flow_destination { @@ -42,6 +49,7 @@ struct mlx5_flow_destination { union { u32 tir_num; struct mlx5_flow_table *ft; + u32 vport_num; }; }; #endif -- cgit v1.2.3 From 86d722ad2c3bd2f0536b196b7fd67ae2a7e2a492 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:44 +0200 Subject: net/mlx5: Use flow steering infrastructure for mlx5_en Expose the new flow steering API and remove the old one. Few changes are required: 1. The Ethernet flow steering follows the existing implementation, but uses the new steering API. The old flow steering implementation is removed. 2. Move the E-switch FDB management to use the new API. 3. When driver is loaded call to mlx5_init_fs which initialize the flow steering tree structure, open namespaces for NIC receive and for E-switch FDB. 4. Call to mlx5_cleanup_fs when the driver is unloaded. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/flow_table.h | 63 ----------------------------------------- include/linux/mlx5/fs.h | 38 +++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 63 deletions(-) delete mode 100644 include/linux/mlx5/flow_table.h (limited to 'include') diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h deleted file mode 100644 index 0f2a15cf3317..000000000000 --- a/include/linux/mlx5/flow_table.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef MLX5_FLOW_TABLE_H -#define MLX5_FLOW_TABLE_H - -#include - -struct mlx5_flow_table_group { - u8 log_sz; - u8 match_criteria_enable; - u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; -}; - -struct mlx5_flow_destination { - enum mlx5_flow_destination_type type; - union { - u32 tir_num; - void *ft; - u32 vport_num; - }; -}; - -void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, - u16 num_groups, - struct mlx5_flow_table_group *group); -void mlx5_destroy_flow_table(void *flow_table); -int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable, - void *match_criteria, void *flow_context, - u32 *flow_index); -void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index); -u32 mlx5_get_flow_table_id(void *flow_table); - -#endif /* MLX5_FLOW_TABLE_H */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 16ae5233dc7b..bc7ad019afde 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -33,6 +33,7 @@ #ifndef _MLX5_FS_ #define _MLX5_FS_ +#include #include #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 @@ -43,6 +44,9 @@ enum mlx5_flow_namespace_type { }; struct mlx5_flow_table; +struct mlx5_flow_group; +struct mlx5_flow_rule; +struct mlx5_flow_namespace; struct mlx5_flow_destination { enum mlx5_flow_destination_type type; @@ -52,4 +56,38 @@ struct mlx5_flow_destination { u32 vport_num; }; }; + +struct mlx5_flow_namespace * +mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type); + +struct mlx5_flow_table * +mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries); +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); + +/* inbox should be set with the following values: + * start_flow_index + * end_flow_index + * match_criteria_enable + * match_criteria + */ +struct mlx5_flow_group * +mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in); +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); + +/* Single destination per rule. + * Group ID is implied by the match criteria. + */ +struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest); +void mlx5_del_flow_rule(struct mlx5_flow_rule *fr); + #endif -- cgit v1.2.3 From 369620a09bc5ab867342d51f1820c66b00d78a2c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 10 Dec 2015 12:37:44 -0800 Subject: rco: Clean up casting errors Fixe a couple of cast errors found by sparse. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/checksum.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 9fcaedf994ee..10a16b5bd1c7 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -165,7 +165,8 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, csum = csum_sub(csum, csum_partial(ptr, start, 0)); /* Set derived checksum in packet */ - delta = csum_sub(csum_fold(csum), *psum); + delta = csum_sub((__force __wsum)csum_fold(csum), + (__force __wsum)*psum); *psum = csum_fold(csum); return delta; -- cgit v1.2.3 From abe492b4f50c3ae2ebcfaa2f5c16176aebaa1c68 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 10 Dec 2015 12:37:45 -0800 Subject: geneve: UDP checksum configuration via netlink Add support to enable and disable UDP checksums via netlink. This is similar to how VXLAN and GUE allow this. This includes support for enabling the UDP zero checksum (for both TX and RX). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5ad57375a99f..2be1dd5a103f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -462,6 +462,9 @@ enum { IFLA_GENEVE_PORT, /* destination port */ IFLA_GENEVE_COLLECT_METADATA, IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3