From 747ada36ee23225d81657e4d633ac93b8ccbea7d Mon Sep 17 00:00:00 2001 From: Olaf Dabrunz Date: Wed, 11 Jun 2008 16:35:13 +0200 Subject: pci: add PCI IDs for devices that need boot irq quirks Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- include/linux/pci_ids.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 65953822c9cb..7f3f101e03c1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2235,6 +2235,10 @@ #define PCI_DEVICE_ID_INTEL_PXH_0 0x0329 #define PCI_DEVICE_ID_INTEL_PXH_1 0x032A #define PCI_DEVICE_ID_INTEL_PXHV 0x032C +#define PCI_DEVICE_ID_INTEL_80332_0 0x0330 +#define PCI_DEVICE_ID_INTEL_80332_1 0x0332 +#define PCI_DEVICE_ID_INTEL_80333_0 0x0370 +#define PCI_DEVICE_ID_INTEL_80333_1 0x0372 #define PCI_DEVICE_ID_INTEL_82375 0x0482 #define PCI_DEVICE_ID_INTEL_82424 0x0483 #define PCI_DEVICE_ID_INTEL_82378 0x0484 @@ -2307,6 +2311,7 @@ #define PCI_DEVICE_ID_INTEL_ESB_4 0x25a4 #define PCI_DEVICE_ID_INTEL_ESB_5 0x25a6 #define PCI_DEVICE_ID_INTEL_ESB_9 0x25ab +#define PCI_DEVICE_ID_INTEL_ESB_10 0x25ac #define PCI_DEVICE_ID_INTEL_82820_HB 0x2500 #define PCI_DEVICE_ID_INTEL_82820_UP_HB 0x2501 #define PCI_DEVICE_ID_INTEL_82850_HB 0x2530 -- cgit v1.2.3 From e1d3a90846b40ad3160bf4b648d36c6badad39ac Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:17 +0200 Subject: pci, acpi: reroute PCI interrupt to legacy boot interrupt equivalent Some chipsets (e.g. intel 6700PXH) generate a legacy INTx when the IRQ entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel does during interrupt handling). On chipsets where this INTx generation cannot be disabled, we reroute the valid interrupts to their legacy equivalent to get rid of spurious interrupts that might otherwise bring down (vital) interrupt lines through spurious interrupt detection in note_interrupt(). This patch benefited from discussions with Alexander Graf, Torsten Duwe, Ihno Krumreich, Daniel Gollub, Hannes Reinecke. The conclusions we drew and the patch itself are the authors' responsibility alone. Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- include/linux/pci.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index d18b1dd49fab..6755cf5ac109 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -117,6 +117,11 @@ enum pci_dev_flags { PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG = (__force pci_dev_flags_t) 1, }; +enum pci_irq_reroute_variant { + INTEL_IRQ_REROUTE_VARIANT = 1, + MAX_IRQ_REROUTE_VARIANTS = 3 +}; + typedef unsigned short __bitwise pci_bus_flags_t; enum pci_bus_flags { PCI_BUS_FLAGS_NO_MSI = (__force pci_bus_flags_t) 1, @@ -194,6 +199,7 @@ struct pci_dev { unsigned int no_d1d2:1; /* only allow d0 or d3 */ unsigned int block_ucfg_access:1; /* userspace config space access is blocked */ unsigned int broken_parity_status:1; /* Device generates false positive parity */ + unsigned int irq_reroute_variant:2; /* device needs IRQ rerouting variant */ unsigned int msi_enabled:1; unsigned int msix_enabled:1; unsigned int is_managed:1; -- cgit v1.2.3 From a53ccab3ccac9e8676a683df9822a2daec83ef54 Mon Sep 17 00:00:00 2001 From: Matthew Ranostay Date: Sat, 25 Oct 2008 01:05:04 -0400 Subject: ALSA: jack: lineout support to jack abstraction layer This patch introduces support for reporting SW_LINEOUT_INSERT detection events via the jack abstraction layer. Also adds a SND_JACK_LINEOUT define to the input system header. Signed-off-by: Matthew Ranostay Cc: Dmitry Torokhov Acked-by: Mark Brown Signed-off-by: Takashi Iwai --- include/linux/input.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index a5802c9c81a4..7323d2ff5151 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -644,6 +644,7 @@ struct input_absinfo { #define SW_RADIO SW_RFKILL_ALL /* deprecated */ #define SW_MICROPHONE_INSERT 0x04 /* set = inserted */ #define SW_DOCK 0x05 /* set = plugged into dock */ +#define SW_LINEOUT_INSERT 0x06 /* set = inserted */ #define SW_MAX 0x0f #define SW_CNT (SW_MAX+1) -- cgit v1.2.3 From def8b4faff5ca349beafbbfeb2c51f3602a6ef3a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 28 Oct 2008 13:24:06 -0700 Subject: net: reduce structures when XFRM=n ifdef out * struct sk_buff::sp (pointer) * struct dst_entry::xfrm (pointer) * struct sock::sk_policy (2 pointers) Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2725f4e5a9bf..487e34507b41 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -269,8 +269,9 @@ struct sk_buff { struct dst_entry *dst; struct rtable *rtable; }; +#ifdef CONFIG_XFRM struct sec_path *sp; - +#endif /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you @@ -1864,6 +1865,18 @@ static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_bu to->queue_mapping = from->queue_mapping; } +#ifdef CONFIG_XFRM +static inline struct sec_path *skb_sec_path(struct sk_buff *skb) +{ + return skb->sp; +} +#else +static inline struct sec_path *skb_sec_path(struct sk_buff *skb) +{ + return NULL; +} +#endif + static inline int skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; -- cgit v1.2.3 From 3a2dfbe8acb154905fdc2fd03ec56df42e6c4cc4 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 28 Oct 2008 16:01:07 -0700 Subject: xfrm: Notify changes in UDP encapsulation via netlink Add new_mapping() implementation to the netlink xfrm_mgr to notify address/port changes detected in UDP encapsulated ESP packets. Signed-off-by: Martin Willi Signed-off-by: David S. Miller --- include/linux/xfrm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 4bc1e6b86cb2..52f3abd453a1 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -199,6 +199,9 @@ enum { #define XFRM_MSG_NEWSPDINFO XFRM_MSG_NEWSPDINFO XFRM_MSG_GETSPDINFO, #define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO + + XFRM_MSG_MAPPING, +#define XFRM_MSG_MAPPING XFRM_MSG_MAPPING __XFRM_MSG_MAX }; #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1) @@ -438,6 +441,15 @@ struct xfrm_user_migrate { __u16 new_family; }; +struct xfrm_user_mapping { + struct xfrm_usersa_id id; + __u32 reqid; + xfrm_address_t old_saddr; + xfrm_address_t new_saddr; + __be16 old_sport; + __be16 new_sport; +}; + #ifndef __KERNEL__ /* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 @@ -464,6 +476,8 @@ enum xfrm_nlgroups { #define XFRMNLGRP_REPORT XFRMNLGRP_REPORT XFRMNLGRP_MIGRATE, #define XFRMNLGRP_MIGRATE XFRMNLGRP_MIGRATE + XFRMNLGRP_MAPPING, +#define XFRMNLGRP_MAPPING XFRMNLGRP_MAPPING __XFRMNLGRP_MAX }; #define XFRMNLGRP_MAX (__XFRMNLGRP_MAX - 1) -- cgit v1.2.3 From 0c6ce78abf6e228d44c3840edb8a4ae0c1299825 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 28 Oct 2008 16:09:23 -0700 Subject: net: replace uses of NIP6_FMT with %p6 Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- include/linux/sunrpc/svc_xprt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 6fd7b016517f..42e01c93c7ea 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -145,8 +145,8 @@ static inline char *__svc_print_addr(struct sockaddr *addr, break; case AF_INET6: - snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u", - NIP6(((struct sockaddr_in6 *) addr)->sin6_addr), + snprintf(buf, len, "%p6, port=%u", + &((struct sockaddr_in6 *)addr)->sin6_addr, ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); break; -- cgit v1.2.3 From b189db5d299c6824780af5590564ff608adb3dea Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 28 Oct 2008 22:38:52 -0700 Subject: net: remove NIP6(), NIP6_FMT, NIP6_SEQFMT and final users Open code NIP6_FMT in the one call inside sscanf and one user of NIP6() that could use %p6 in the netfilter code. Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- include/linux/kernel.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 396a350b87a6..77777c460099 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -357,18 +357,6 @@ static inline char *pack_hex_byte(char *buf, u8 byte) ((unsigned char *)&addr)[3] #define NIPQUAD_FMT "%u.%u.%u.%u" -#define NIP6(addr) \ - ntohs((addr).s6_addr16[0]), \ - ntohs((addr).s6_addr16[1]), \ - ntohs((addr).s6_addr16[2]), \ - ntohs((addr).s6_addr16[3]), \ - ntohs((addr).s6_addr16[4]), \ - ntohs((addr).s6_addr16[5]), \ - ntohs((addr).s6_addr16[6]), \ - ntohs((addr).s6_addr16[7]) -#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x" -#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x" - #if defined(__LITTLE_ENDIAN) #define HIPQUAD(addr) \ ((unsigned char *)&addr)[3], \ -- cgit v1.2.3 From 96631ed16c514cf8b28fab991a076985ce378c26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Oct 2008 11:19:58 -0700 Subject: udp: introduce sk_for_each_rcu_safenext() Corey Minyard found a race added in commit 271b72c7fa82c2c7a795bc16896149933110672d (udp: RCU handling for Unicast packets.) "If the socket is moved from one list to another list in-between the time the hash is calculated and the next field is accessed, and the socket has moved to the end of the new list, the traversal will not complete properly on the list it should have, since the socket will be on the end of the new list and there's not a way to tell it's on a new list and restart the list traversal. I think that this can be solved by pre-fetching the "next" field (with proper barriers) before checking the hash." This patch corrects this problem, introducing a new sk_for_each_rcu_safenext() macro. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rculist.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e649bd3f2c97..3ba2998b22ba 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -383,5 +383,22 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference(pos->next)) +/** + * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * @next: the &struct hlist_node to use as a next cursor + * + * Special version of hlist_for_each_entry_rcu that make sure + * each next pointer is fetched before each iteration. + */ +#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \ + for (pos = rcu_dereference((head)->first); \ + pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference(next)) + #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 5b095d98928fdb9e3b75be20a54b7a6cbf6ca9ad Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 29 Oct 2008 12:52:50 -0700 Subject: net: replace %p6 with %pI6 Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- include/linux/sunrpc/svc_xprt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 42e01c93c7ea..51cb75ea42d5 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -145,7 +145,7 @@ static inline char *__svc_print_addr(struct sockaddr *addr, break; case AF_INET6: - snprintf(buf, len, "%p6, port=%u", + snprintf(buf, len, "%pI6, port=%u", &((struct sockaddr_in6 *)addr)->sin6_addr, ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); break; -- cgit v1.2.3 From 2cb1599f9b2ecdd7a9e59feeee647eb258966839 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 30 Oct 2008 17:32:23 +1100 Subject: Inode: Allow external initialisers To allow XFS to combine the XFS and linux inodes into a single structure, we need to drive inode lookup from the XFS inode cache, not the generic inode cache. This means that we need initialise a struct inode from a context outside alloc_inode() as it is no longer used by XFS. Factor and export the struct inode initialisation code from alloc_inode() to inode_init_always() as a counterpart to inode_init_once(). i.e. we have to call this init function for each inode instantiation (always), as opposed inode_init_once() which is only called on slab object instantiation (once). Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b248d61430c..04abead4b021 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1881,6 +1881,7 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); +extern struct inode * inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); -- cgit v1.2.3 From 8290c35f87304a6b73d4fd17b03580b4f7425de8 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 30 Oct 2008 17:35:24 +1100 Subject: Inode: Allow external list initialisation To allow XFS to combine the XFS and linux inodes into a single structure, we need to drive inode lookup from the XFS inode cache, not the generic inode cache. This means that we need initialise a struct inode from a context outside alloc_inode() as it is no longer used by XFS. After inode allocation and initialisation, we need to add the inode to the superblock list, the in-use list, hash it and do some accounting. This all needs to be done with the inode_lock held and there are already several places in fs/inode.c that do this list manipulation. Factor out the common code, add a locking wrapper and export the function so ti can be called from XFS. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04abead4b021..1deedf235d55 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1883,6 +1883,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); extern struct inode * inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); +extern void inode_add_to_lists(struct super_block *, struct inode *); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); -- cgit v1.2.3 From 3685f25de1b0447fff381c420de1e25bd57c9efb Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 31 Oct 2008 00:56:49 -0700 Subject: misc: replace NIPQUAD() Using NIPQUAD() with NIPQUAD_FMT, %d.%d.%d.%d or %u.%u.%u.%u can be replaced with %pI4 Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- include/linux/sunrpc/svc_xprt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 51cb75ea42d5..0127daca4354 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -139,8 +139,8 @@ static inline char *__svc_print_addr(struct sockaddr *addr, { switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%u.%u.%u.%u, port=%u", - NIPQUAD(((struct sockaddr_in *) addr)->sin_addr), + snprintf(buf, len, "%pI4, port=%u", + &((struct sockaddr_in *)addr)->sin_addr, ntohs(((struct sockaddr_in *) addr)->sin_port)); break; -- cgit v1.2.3 From d9fe60dea7779d412b34679f1177c5ca1940ea8d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 9 Oct 2008 12:13:49 +0200 Subject: 802.11: clean up/fix HT support This patch cleans up a number of things: * the unusable definition of the HT capabilities/HT information information elements * variable names that are hard to understand * mac80211: move ieee80211_handle_ht to ht.c and remove the unused enable_ht parameter * mac80211: fix bug with MCS rate 32 in ieee80211_handle_ht * mac80211: fix bug with casting the result of ieee80211_bss_get_ie to an information element _contents_ rather than the whole element, add size checking (another out-of-bounds access bug fixed!) * mac80211: remove some unused return values in favour of BUG_ON checking * a few minor other things Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 133 +++++++++++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 14126bc36641..64a4abce6d91 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -685,28 +685,88 @@ struct ieee80211_bar { #define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL 0x0000 #define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA 0x0004 + +#define IEEE80211_HT_MCS_MASK_LEN 10 + +/** + * struct ieee80211_mcs_info - MCS information + * @rx_mask: RX mask + * @rx_highest: highest supported RX rate + * @tx_params: TX parameters + */ +struct ieee80211_mcs_info { + u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN]; + __le16 rx_highest; + u8 tx_params; + u8 reserved[3]; +} __attribute__((packed)); + +/* 802.11n HT capability MSC set */ +#define IEEE80211_HT_MCS_RX_HIGHEST_MASK 0x3ff +#define IEEE80211_HT_MCS_TX_DEFINED 0x01 +#define IEEE80211_HT_MCS_TX_RX_DIFF 0x02 +/* value 0 == 1 stream etc */ +#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK 0x0C +#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT 2 +#define IEEE80211_HT_MCS_TX_MAX_STREAMS 4 +#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION 0x10 + +/* + * 802.11n D5.0 20.3.5 / 20.6 says: + * - indices 0 to 7 and 32 are single spatial stream + * - 8 to 31 are multiple spatial streams using equal modulation + * [8..15 for two streams, 16..23 for three and 24..31 for four] + * - remainder are multiple spatial streams using unequal modulation + */ +#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33 +#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \ + (IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8) + /** * struct ieee80211_ht_cap - HT capabilities * - * This structure refers to "HT capabilities element" as - * described in 802.11n draft section 7.3.2.52 + * This structure is the "HT capabilities element" as + * described in 802.11n D5.0 7.3.2.57 */ struct ieee80211_ht_cap { __le16 cap_info; u8 ampdu_params_info; - u8 supp_mcs_set[16]; + + /* 16 bytes MCS information */ + struct ieee80211_mcs_info mcs; + __le16 extended_ht_cap_info; __le32 tx_BF_cap_info; u8 antenna_selection_info; } __attribute__ ((packed)); +/* 802.11n HT capabilities masks (for cap_info) */ +#define IEEE80211_HT_CAP_LDPC_CODING 0x0001 +#define IEEE80211_HT_CAP_SUP_WIDTH_20_40 0x0002 +#define IEEE80211_HT_CAP_SM_PS 0x000C +#define IEEE80211_HT_CAP_GRN_FLD 0x0010 +#define IEEE80211_HT_CAP_SGI_20 0x0020 +#define IEEE80211_HT_CAP_SGI_40 0x0040 +#define IEEE80211_HT_CAP_TX_STBC 0x0080 +#define IEEE80211_HT_CAP_RX_STBC 0x0300 +#define IEEE80211_HT_CAP_DELAY_BA 0x0400 +#define IEEE80211_HT_CAP_MAX_AMSDU 0x0800 +#define IEEE80211_HT_CAP_DSSSCCK40 0x1000 +#define IEEE80211_HT_CAP_PSMP_SUPPORT 0x2000 +#define IEEE80211_HT_CAP_40MHZ_INTOLERANT 0x4000 +#define IEEE80211_HT_CAP_LSIG_TXOP_PROT 0x8000 + +/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */ +#define IEEE80211_HT_AMPDU_PARM_FACTOR 0x03 +#define IEEE80211_HT_AMPDU_PARM_DENSITY 0x1C + /** - * struct ieee80211_ht_cap - HT additional information + * struct ieee80211_ht_info - HT information * - * This structure refers to "HT information element" as - * described in 802.11n draft section 7.3.2.53 + * This structure is the "HT information element" as + * described in 802.11n D5.0 7.3.2.58 */ -struct ieee80211_ht_addt_info { +struct ieee80211_ht_info { u8 control_chan; u8 ht_param; __le16 operation_mode; @@ -714,36 +774,33 @@ struct ieee80211_ht_addt_info { u8 basic_set[16]; } __attribute__ ((packed)); -/* 802.11n HT capabilities masks */ -#define IEEE80211_HT_CAP_SUP_WIDTH 0x0002 -#define IEEE80211_HT_CAP_SM_PS 0x000C -#define IEEE80211_HT_CAP_GRN_FLD 0x0010 -#define IEEE80211_HT_CAP_SGI_20 0x0020 -#define IEEE80211_HT_CAP_SGI_40 0x0040 -#define IEEE80211_HT_CAP_DELAY_BA 0x0400 -#define IEEE80211_HT_CAP_MAX_AMSDU 0x0800 -#define IEEE80211_HT_CAP_DSSSCCK40 0x1000 -/* 802.11n HT capability AMPDU settings */ -#define IEEE80211_HT_CAP_AMPDU_FACTOR 0x03 -#define IEEE80211_HT_CAP_AMPDU_DENSITY 0x1C -/* 802.11n HT capability MSC set */ -#define IEEE80211_SUPP_MCS_SET_UEQM 4 -#define IEEE80211_HT_CAP_MAX_STREAMS 4 -#define IEEE80211_SUPP_MCS_SET_LEN 10 -/* maximum streams the spec allows */ -#define IEEE80211_HT_CAP_MCS_TX_DEFINED 0x01 -#define IEEE80211_HT_CAP_MCS_TX_RX_DIFF 0x02 -#define IEEE80211_HT_CAP_MCS_TX_STREAMS 0x0C -#define IEEE80211_HT_CAP_MCS_TX_UEQM 0x10 -/* 802.11n HT IE masks */ -#define IEEE80211_HT_IE_CHA_SEC_OFFSET 0x03 -#define IEEE80211_HT_IE_CHA_SEC_NONE 0x00 -#define IEEE80211_HT_IE_CHA_SEC_ABOVE 0x01 -#define IEEE80211_HT_IE_CHA_SEC_BELOW 0x03 -#define IEEE80211_HT_IE_CHA_WIDTH 0x04 -#define IEEE80211_HT_IE_HT_PROTECTION 0x0003 -#define IEEE80211_HT_IE_NON_GF_STA_PRSNT 0x0004 -#define IEEE80211_HT_IE_NON_HT_STA_PRSNT 0x0010 +/* for ht_param */ +#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET 0x03 +#define IEEE80211_HT_PARAM_CHA_SEC_NONE 0x00 +#define IEEE80211_HT_PARAM_CHA_SEC_ABOVE 0x01 +#define IEEE80211_HT_PARAM_CHA_SEC_BELOW 0x03 +#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY 0x04 +#define IEEE80211_HT_PARAM_RIFS_MODE 0x08 +#define IEEE80211_HT_PARAM_SPSMP_SUPPORT 0x10 +#define IEEE80211_HT_PARAM_SERV_INTERVAL_GRAN 0xE0 + +/* for operation_mode */ +#define IEEE80211_HT_OP_MODE_PROTECTION 0x0003 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONE 0 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER 1 +#define IEEE80211_HT_OP_MODE_PROTECTION_20MHZ 2 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED 3 +#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT 0x0004 +#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT 0x0010 + +/* for stbc_param */ +#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON 0x0040 +#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT 0x0080 +#define IEEE80211_HT_STBC_PARAM_STBC_BEACON 0x0100 +#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT 0x0200 +#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE 0x0400 +#define IEEE80211_HT_STBC_PARAM_PCO_PHASE 0x0800 + /* block-ack parameters */ #define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 @@ -949,7 +1006,7 @@ enum ieee80211_eid { WLAN_EID_EXT_SUPP_RATES = 50, /* 802.11n */ WLAN_EID_HT_CAPABILITY = 45, - WLAN_EID_HT_EXTRA_INFO = 61, + WLAN_EID_HT_INFORMATION = 61, /* 802.11i */ WLAN_EID_RSN = 48, WLAN_EID_WPA = 221, -- cgit v1.2.3 From d51626df5747efaa8d2c00678f64cb503845effe Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 9 Oct 2008 12:20:13 +0200 Subject: nl80211: export HT capabilities This exports the local HT capabilities in nl80211. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 9bad65400fba..41720d47d618 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -452,17 +452,29 @@ enum nl80211_mpath_info { * an array of nested frequency attributes * @NL80211_BAND_ATTR_RATES: supported bitrates in this band, * an array of nested bitrate attributes + * @NL80211_BAND_ATTR_HT_MCS_SET: 16-byte attribute containing the MCS set as + * defined in 802.11n + * @NL80211_BAND_ATTR_HT_CAPA: HT capabilities, as in the HT information IE + * @NL80211_BAND_ATTR_HT_AMPDU_FACTOR: A-MPDU factor, as in 11n + * @NL80211_BAND_ATTR_HT_AMPDU_DENSITY: A-MPDU density, as in 11n */ enum nl80211_band_attr { __NL80211_BAND_ATTR_INVALID, NL80211_BAND_ATTR_FREQS, NL80211_BAND_ATTR_RATES, + NL80211_BAND_ATTR_HT_MCS_SET, + NL80211_BAND_ATTR_HT_CAPA, + NL80211_BAND_ATTR_HT_AMPDU_FACTOR, + NL80211_BAND_ATTR_HT_AMPDU_DENSITY, + /* keep last */ __NL80211_BAND_ATTR_AFTER_LAST, NL80211_BAND_ATTR_MAX = __NL80211_BAND_ATTR_AFTER_LAST - 1 }; +#define NL80211_BAND_ATTR_HT_CAPA NL80211_BAND_ATTR_HT_CAPA + /** * enum nl80211_frequency_attr - frequency attributes * @NL80211_FREQUENCY_ATTR_FREQ: Frequency in MHz -- cgit v1.2.3 From 93da9cc17c5ae8a751886fd4732db89ad5e9bdb9 Mon Sep 17 00:00:00 2001 From: "colin@cozybit.com" Date: Tue, 21 Oct 2008 12:03:48 -0700 Subject: Add nl80211 commands to get and set o11s mesh networking parameters The two new commands are NL80211_CMD_GET_MESH_PARAMS and NL80211_CMD_SET_MESH_PARAMS. There is a new attribute enum, NL80211_ATTR_MESH_PARAMS, which enumerates the various mesh configuration parameters. Moved struct mesh_config from mac80211/ieee80211_i.h to net/cfg80211.h. nl80211_get_mesh_params and nl80211_set_mesh_params unpack the netlink messages and ask the driver to get or set the configuration. This is done via two new function stubs, get_mesh_params and set_mesh_params, in struct cfg80211_ops. Signed-off-by: Colin McCabe Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 41720d47d618..e4cc7869b22f 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -106,6 +106,12 @@ * to the the specified ISO/IEC 3166-1 alpha2 country code. The core will * store this as a valid request and then query userspace for it. * + * @NL80211_CMD_GET_MESH_PARAMS: Get mesh networking properties for the + * interface identified by %NL80211_ATTR_IFINDEX + * + * @NL80211_CMD_SET_MESH_PARAMS: Set mesh networking properties for the + * interface identified by %NL80211_ATTR_IFINDEX + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -148,6 +154,9 @@ enum nl80211_commands { NL80211_CMD_SET_REG, NL80211_CMD_REQ_SET_REG, + NL80211_CMD_GET_MESH_PARAMS, + NL80211_CMD_SET_MESH_PARAMS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -296,6 +305,8 @@ enum nl80211_attrs { NL80211_ATTR_REG_ALPHA2, NL80211_ATTR_REG_RULES, + NL80211_ATTR_MESH_PARAMS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -606,4 +617,79 @@ enum nl80211_mntr_flags { NL80211_MNTR_FLAG_MAX = __NL80211_MNTR_FLAG_AFTER_LAST - 1 }; +/** + * enum nl80211_meshconf_params - mesh configuration parameters + * + * Mesh configuration parameters + * + * @__NL80211_MESHCONF_INVALID: internal use + * + * @NL80211_MESHCONF_RETRY_TIMEOUT: specifies the initial retry timeout in + * millisecond units, used by the Peer Link Open message + * + * @NL80211_MESHCONF_CONFIRM_TIMEOUT: specifies the inital confirm timeout, in + * millisecond units, used by the peer link management to close a peer link + * + * @NL80211_MESHCONF_HOLDING_TIMEOUT: specifies the holding timeout, in + * millisecond units + * + * @NL80211_MESHCONF_MAX_PEER_LINKS: maximum number of peer links allowed + * on this mesh interface + * + * @NL80211_MESHCONF_MAX_RETRIES: specifies the maximum number of peer link + * open retries that can be sent to establish a new peer link instance in a + * mesh + * + * @NL80211_MESHCONF_TTL: specifies the value of TTL field set at a source mesh + * point. + * + * @NL80211_MESHCONF_AUTO_OPEN_PLINKS: whether we should automatically + * open peer links when we detect compatible mesh peers. + * + * @NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES: the number of action frames + * containing a PREQ that an MP can send to a particular destination (path + * target) + * + * @NL80211_MESHCONF_PATH_REFRESH_TIME: how frequently to refresh mesh paths + * (in milliseconds) + * + * @NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT: minimum length of time to wait + * until giving up on a path discovery (in milliseconds) + * + * @NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT: The time (in TUs) for which mesh + * points receiving a PREQ shall consider the forwarding information from the + * root to be valid. (TU = time unit) + * + * @NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL: The minimum interval of time (in + * TUs) during which an MP can send only one action frame containing a PREQ + * reference element + * + * @NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME: The interval of time (in TUs) + * that it takes for an HWMP information element to propagate across the mesh + * + * @NL80211_MESHCONF_ATTR_MAX: highest possible mesh configuration attribute + * + * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use + */ +enum nl80211_meshconf_params { + __NL80211_MESHCONF_INVALID, + NL80211_MESHCONF_RETRY_TIMEOUT, + NL80211_MESHCONF_CONFIRM_TIMEOUT, + NL80211_MESHCONF_HOLDING_TIMEOUT, + NL80211_MESHCONF_MAX_PEER_LINKS, + NL80211_MESHCONF_MAX_RETRIES, + NL80211_MESHCONF_TTL, + NL80211_MESHCONF_AUTO_OPEN_PLINKS, + NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, + NL80211_MESHCONF_PATH_REFRESH_TIME, + NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, + NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, + NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, + NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, + + /* keep last */ + __NL80211_MESHCONF_ATTR_AFTER_LAST, + NL80211_MESHCONF_ATTR_MAX = __NL80211_MESHCONF_ATTR_AFTER_LAST - 1 +}; + #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From 9387b7caf3049168fc97a8a9111af8fe2143af18 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Tue, 30 Sep 2008 20:59:05 -0400 Subject: wireless: use individual buffers for printing ssid values Also change escape_ssid to print_ssid to match print_mac semantics. Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 64a4abce6d91..b0726e2079b5 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -12,8 +12,8 @@ * published by the Free Software Foundation. */ -#ifndef IEEE80211_H -#define IEEE80211_H +#ifndef LINUX_IEEE80211_H +#define LINUX_IEEE80211_H #include #include @@ -1114,4 +1114,4 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr) return hdr->addr1; } -#endif /* IEEE80211_H */ +#endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 72118015271e6d3852cb9f647efe0987d131adaa Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Tue, 30 Sep 2008 21:43:03 -0400 Subject: wireless: avoid some net/ieee80211.h vs. linux/ieee80211.h conflicts There is quite a lot of overlap in definitions between these headers... Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b0726e2079b5..aad99195a4cc 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -826,8 +826,7 @@ struct ieee80211_ht_info { /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 -#define WLAN_AUTH_FAST_BSS_TRANSITION 2 -#define WLAN_AUTH_LEAP 128 +#define WLAN_AUTH_LEAP 2 #define WLAN_AUTH_CHALLENGE_LEN 128 -- cgit v1.2.3 From 8b30b1fe368ab03049435884c11c5c50e4c4ef0b Mon Sep 17 00:00:00 2001 From: Sujith Date: Fri, 24 Oct 2008 09:55:27 +0530 Subject: mac80211: Re-enable aggregation Wireless HW without any dedicated queues for aggregation do not need the ampdu_queues mechanism present right now in mac80211. Since mac80211 is still incomplete wrt TX MQ changes, do not allow aggregation sessions for drivers that set ampdu_queues. This is only an interim hack until Intel fixes the requeue issue. Signed-off-by: Sujith Signed-off-by: Luis Rodriguez Signed-off-by: John W. Linville --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 487e34507b41..a01b6f84e3bc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -250,6 +250,9 @@ typedef unsigned char *sk_buff_data_t; * @tc_verd: traffic control verdict * @ndisc_nodetype: router type (from link layer) * @do_not_encrypt: set to prevent encryption of this frame + * @requeue: set to indicate that the wireless core should attempt + * a software retry on this frame if we failed to + * receive an ACK for it * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions * @secmark: security marking @@ -326,6 +329,7 @@ struct sk_buff { #endif #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) __u8 do_not_encrypt:1; + __u8 requeue:1; #endif /* 0/13/14 bit hole */ -- cgit v1.2.3 From 6cf3f41e6c08bca6641a695449791c38a25f35ff Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Mon, 3 Nov 2008 18:16:50 -0800 Subject: bonding, net: Move last_rx update into bonding recv logic The only user of the net_device->last_rx field is bonding. This patch adds a conditional update of last_rx to the bonding special logic in skb_bond_should_drop, causing last_rx to only be updated when the ARP monitor is running. This frees network device drivers from the necessity of updating last_rx, which can have cache line thrash issues. Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- include/linux/if.h | 1 + include/linux/netdevice.h | 32 ++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if.h b/include/linux/if.h index 65246846c844..2a6e29620a96 100644 --- a/include/linux/if.h +++ b/include/linux/if.h @@ -65,6 +65,7 @@ #define IFF_BONDING 0x20 /* bonding master or slave */ #define IFF_SLAVE_NEEDARP 0x40 /* need ARPs for validation */ #define IFF_ISATAP 0x80 /* ISATAP interface (RFC4214) */ +#define IFF_MASTER_ARPMON 0x100 /* bonding master, ARP mon in use */ #define IF_GET_IFACE 0x0001 /* for querying only */ #define IF_GET_PROTO 0x0002 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d77b1d7dca8..f1b0dbe58464 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1742,22 +1742,26 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) struct net_device *dev = skb->dev; struct net_device *master = dev->master; - if (master && - (dev->priv_flags & IFF_SLAVE_INACTIVE)) { - if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && - skb->protocol == __constant_htons(ETH_P_ARP)) - return 0; - - if (master->priv_flags & IFF_MASTER_ALB) { - if (skb->pkt_type != PACKET_BROADCAST && - skb->pkt_type != PACKET_MULTICAST) + if (master) { + if (master->priv_flags & IFF_MASTER_ARPMON) + dev->last_rx = jiffies; + + if (dev->priv_flags & IFF_SLAVE_INACTIVE) { + if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && + skb->protocol == __constant_htons(ETH_P_ARP)) return 0; - } - if (master->priv_flags & IFF_MASTER_8023AD && - skb->protocol == __constant_htons(ETH_P_SLOW)) - return 0; - return 1; + if (master->priv_flags & IFF_MASTER_ALB) { + if (skb->pkt_type != PACKET_BROADCAST && + skb->pkt_type != PACKET_MULTICAST) + return 0; + } + if (master->priv_flags & IFF_MASTER_8023AD && + skb->protocol == __constant_htons(ETH_P_SLOW)) + return 0; + + return 1; + } } return 0; } -- cgit v1.2.3 From 511061e2dd1b84bb21bb97c9216a19606c29ac02 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 4 Nov 2008 14:22:55 +0100 Subject: netfilter: netns ebtables: part 1 * propagate netns from userspace, register table in passed netns * remporarily register every ebt_table in init_net P. S.: one needs to add ".netns_ok = 1" to igmp_protocol to test with ebtables(8) in netns. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter_bridge/ebtables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index d45e29cd1cfb..624e7883068c 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -300,7 +300,7 @@ struct ebt_table #define EBT_ALIGN(s) (((s) + (__alignof__(struct ebt_replace)-1)) & \ ~(__alignof__(struct ebt_replace)-1)) -extern int ebt_register_table(struct ebt_table *table); +extern int ebt_register_table(struct net *net, struct ebt_table *table); extern void ebt_unregister_table(struct ebt_table *table); extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, -- cgit v1.2.3 From 6beceee5aa2cb94c4ae9f0784c7d3135d343f5b5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 4 Nov 2008 14:27:15 +0100 Subject: netfilter: netns ebtables: part 2 * return ebt_table from ebt_register_table(), module code will save it into per-netns data for unregistration * duplicate ebt_table at the very beginning of registration -- it's added into list, so one ebt_table wouldn't end up in many lists (and each netns has different one) * introduce underscored tables in individial modules, this is temporary to not break bisection. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter_bridge/ebtables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 624e7883068c..e40ddb94b1af 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -300,7 +300,8 @@ struct ebt_table #define EBT_ALIGN(s) (((s) + (__alignof__(struct ebt_replace)-1)) & \ ~(__alignof__(struct ebt_replace)-1)) -extern int ebt_register_table(struct net *net, struct ebt_table *table); +extern struct ebt_table *ebt_register_table(struct net *net, + struct ebt_table *table); extern void ebt_unregister_table(struct ebt_table *table); extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, -- cgit v1.2.3 From fd8cd7e1919fc1c27fe2fdccd2a1cd32f791ef0f Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 3 Nov 2008 15:50:38 -0800 Subject: x86: vmware: look for DMI string in the product serial key Impact: Should permit VMware detection on older platforms where the vendor is changed. Could theoretically cause a regression if some weird serial number scheme contains the string "VMware" by pure chance. Seems unlikely, especially with the mixed case. In some user configured cases, VMware may choose not to put a VMware specific DMI string, but the product serial key is always there and is VMware specific. Add a interface to check the serial key, when checking for VMware in the DMI information. Signed-off-by: Alok N Kataria Signed-off-by: H. Peter Anvin --- include/linux/dmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index e5084eb5943a..2bfda178f274 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -44,6 +44,7 @@ extern const struct dmi_device * dmi_find_device(int type, const char *name, extern void dmi_scan_machine(void); extern int dmi_get_year(int field); extern int dmi_name_in_vendors(const char *str); +extern int dmi_name_in_serial(const char *str); extern int dmi_available; extern int dmi_walk(void (*decode)(const struct dmi_header *)); @@ -56,6 +57,7 @@ static inline const struct dmi_device * dmi_find_device(int type, const char *na static inline void dmi_scan_machine(void) { return; } static inline int dmi_get_year(int year) { return 0; } static inline int dmi_name_in_vendors(const char *s) { return 0; } +static inline int dmi_name_in_serial(const char *s) { return 0; } #define dmi_available 0 static inline int dmi_walk(void (*decode)(const struct dmi_header *)) { return -1; } -- cgit v1.2.3 From 7d43d1a0f2cf535167ec7247f110a1f85cecac43 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 4 Nov 2008 23:43:47 -0800 Subject: dccp: Implement lookup table for feature-negotiation information A lookup table for feature-negotiation information, extracted from RFC 4340/42, is provided by this patch. All currently known features can be found in this table, along with their feature location, their default value, and type. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/dccp.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6080449fbec9..3978aff197d9 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -176,19 +176,20 @@ enum { }; /* DCCP features (RFC 4340 section 6.4) */ -enum { +enum dccp_feature_numbers { DCCPF_RESERVED = 0, DCCPF_CCID = 1, - DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ + DCCPF_SHORT_SEQNOS = 2, DCCPF_SEQUENCE_WINDOW = 3, - DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ + DCCPF_ECN_INCAPABLE = 4, DCCPF_ACK_RATIO = 5, DCCPF_SEND_ACK_VECTOR = 6, DCCPF_SEND_NDP_COUNT = 7, DCCPF_MIN_CSUM_COVER = 8, - DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ + DCCPF_DATA_CHECKSUM = 9, /* 10-127 reserved */ DCCPF_MIN_CCID_SPECIFIC = 128, + DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ DCCPF_MAX_CCID_SPECIFIC = 255, }; -- cgit v1.2.3 From ac75773c2742d82cbcb078708df406e9017224b7 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 4 Nov 2008 23:55:49 -0800 Subject: dccp: Per-socket initialisation of feature negotiation This provides feature-negotiation initialisation for both DCCP sockets and DCCP request_sockets, to support feature negotiation during connection setup. It also resolves a FIXME regarding the congestion control initialisation. Thanks to Wei Yongjun for help with the IPv6 side of this patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 3978aff197d9..484b8a1fb023 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -412,6 +412,7 @@ extern void dccp_minisock_init(struct dccp_minisock *dmsk); * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) + * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -421,6 +422,7 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; + struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -498,6 +500,7 @@ struct dccp_ackvec; * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) * @dccps_minisock - associated minisock (accessed via dccp_msk) + * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) @@ -535,6 +538,7 @@ struct dccp_sock { __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; + struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; -- cgit v1.2.3 From 1f29fae29709b4668979e244c09b2fa78ff1ad59 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 5 Nov 2008 16:08:52 -0600 Subject: file capabilities: add no_file_caps switch (v4) Add a no_file_caps boot option when file capabilities are compiled into the kernel (CONFIG_SECURITY_FILE_CAPABILITIES=y). This allows distributions to ship a kernel with file capabilities compiled in, without forcing users to use (and understand and trust) them. When no_file_caps is specified at boot, then when a process executes a file, any file capabilities stored with that file will not be used in the calculation of the process' new capability sets. This means that booting with the no_file_caps boot option will not be the same as booting a kernel with file capabilities compiled out - in particular a task with CAP_SETPCAP will not have any chance of passing capabilities to another task (which isn't "really" possible anyway, and which may soon by killed altogether by David Howells in any case), and it will instead be able to put new capabilities in its pI. However since fI will always be empty and pI is masked with fI, it gains the task nothing. We also support the extra prctl options, setting securebits and dropping capabilities from the per-process bounding set. The other remaining difference is that killpriv, task_setscheduler, setioprio, and setnice will continue to be hooked. That will be noticable in the case where a root task changed its uid while keeping some caps, and another task owned by the new uid tries to change settings for the more privileged task. Changelog: Nov 05 2008: (v4) trivial port on top of always-start-\ with-clear-caps patch Sep 23 2008: nixed file_caps_enabled when file caps are not compiled in as it isn't used. Document no_file_caps in kernel-parameters.txt. Signed-off-by: Serge Hallyn Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- include/linux/capability.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 9d1fe30b6f6c..5bc145bd759a 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -68,6 +68,9 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32 VFS_CAP_U32_2 #define VFS_CAP_REVISION VFS_CAP_REVISION_2 +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +extern int file_caps_enabled; +#endif struct vfs_cap_data { __le32 magic_etc; /* Little endian */ -- cgit v1.2.3 From ae33bc40c0d96d02f51a996482ea7e41c5152695 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 5 Nov 2008 16:00:02 -0800 Subject: net: Guaranetee the proper ordering of the loopback device. I was recently hunting a bug that occurred in network namespace cleanup. In looking at the code it became apparrent that we have and will continue to have cases where if we have anything going on in a network namespace there will be assumptions that the loopback device is present. Things like sending igmp unsubscribe messages when we bring down network devices invokes the routing code which assumes that at least the loopback driver is present. Therefore to avoid magic initcall ordering hackery that is hard to follow and hard to get right insert a call to register the loopback device directly from net_dev_init(). This guarantes that the loopback device is the first device registered and the last network device to go away. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f1b0dbe58464..12d7f4469dc9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1766,6 +1766,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) return 0; } +extern struct pernet_operations __net_initdata loopback_net_ops; #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ -- cgit v1.2.3 From fd9abb3d97c2ab883e4732ec1214fe64190236e7 Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Wed, 5 Nov 2008 00:35:37 +0000 Subject: SMSC LAN911x and LAN921x vendor driver Attached is a driver for SMSC's LAN911x and LAN921x families of embedded ethernet controllers. There is an existing smc911x driver in the tree; this is intended to replace it. Dustin McIntire (the author of the smc911x driver) has expressed his support for switching to this driver. This driver contains workarounds for all known hardware issues, and has been tested on all flavours of the chip on multiple architectures. This driver now uses phylib, so this patch also adds support for the device's internal phy Signed-off-by: Steve Glendinning Signed-off-by: Bahadir Balban Signed-off-by: Dustin Mcintire Signed-off-by: Bill Gatliff Signed-off-by: Jeff Garzik --- include/linux/smsc911x.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 include/linux/smsc911x.h (limited to 'include/linux') diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h new file mode 100644 index 000000000000..47c4ffd10dbb --- /dev/null +++ b/include/linux/smsc911x.h @@ -0,0 +1,42 @@ +/*************************************************************************** + * + * Copyright (C) 2004-2008 SMSC + * Copyright (C) 2005-2008 ARM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + ***************************************************************************/ +#ifndef __LINUX_SMSC911X_H__ +#define __LINUX_SMSC911X_H__ + +#include + +/* platform_device configuration data, should be assigned to + * the platform_device's dev.platform_data */ +struct smsc911x_platform_config { + unsigned int irq_polarity; + unsigned int irq_type; + phy_interface_t phy_interface; +}; + +/* Constants for platform_device irq polarity configuration */ +#define SMSC911X_IRQ_POLARITY_ACTIVE_LOW 0 +#define SMSC911X_IRQ_POLARITY_ACTIVE_HIGH 1 + +/* Constants for platform_device irq type configuration */ +#define SMSC911X_IRQ_TYPE_OPEN_DRAIN 0 +#define SMSC911X_IRQ_TYPE_PUSH_PULL 1 + +#endif /* __LINUX_SMSC911X_H__ */ -- cgit v1.2.3 From 3d8160b1493bcadca74fbb635d79b3928b8999cf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 7 Nov 2008 22:52:14 -0800 Subject: Revert "net: Guaranetee the proper ordering of the loopback device." This reverts commit ae33bc40c0d96d02f51a996482ea7e41c5152695. --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 12d7f4469dc9..f1b0dbe58464 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1766,7 +1766,6 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) return 0; } -extern struct pernet_operations __net_initdata loopback_net_ops; #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ -- cgit v1.2.3 From 505d4f73dda9e20d59da05008f1f5eb432613e71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Nov 2008 22:54:20 -0800 Subject: net: Guaranetee the proper ordering of the loopback device. v2 I was recently hunting a bug that occurred in network namespace cleanup. In looking at the code it became apparrent that we have and will continue to have cases where if we have anything going on in a network namespace there will be assumptions that the loopback device is present. Things like sending igmp unsubscribe messages when we bring down network devices invokes the routing code which assumes that at least the loopback driver is present. Therefore to avoid magic initcall ordering hackery that is hard to follow and hard to get right insert a call to register the loopback device directly from net_dev_init(). This guarantes that the loopback device is the first device registered and the last network device to go away. But do it carefully so we register the loopback device after we clear dev_boot_phase. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f1b0dbe58464..12d7f4469dc9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1766,6 +1766,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) return 0; } +extern struct pernet_operations __net_initdata loopback_net_ops; #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ -- cgit v1.2.3 From f400923735ecbb67cbe4a3606c9479f694754f51 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 7 Nov 2008 22:56:00 -0800 Subject: pkt_sched: Control group classifier The classifier should cover the most common use case and will work without any special configuration. The principle of the classifier is to directly access the task_struct via get_current(). In order for this to work, classification requests from softirqs must be ignored. This is not a problem because the vast majority of packets in softirq context are not assigned to a task anyway. For this to work, a mechanism is needed to trace softirq context. This repost goes back to the method of relying on the number of nested bh disable calls for the sake of not adding too much complexity and the option to come up with something more reliable if actually needed. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/cgroup_subsys.h | 6 ++++++ include/linux/pkt_cls.h | 14 ++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c22396e8b50..9c8d31bacf46 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -54,3 +54,9 @@ SUBSYS(freezer) #endif /* */ + +#ifdef CONFIG_NET_CLS_CGROUP +SUBSYS(net_cls) +#endif + +/* */ diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index 7cf7824df778..e6aa8482ad7a 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -394,6 +394,20 @@ enum #define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1) + +/* Cgroup classifier */ + +enum +{ + TCA_CGROUP_UNSPEC, + TCA_CGROUP_ACT, + TCA_CGROUP_POLICE, + TCA_CGROUP_EMATCHES, + __TCA_CGROUP_MAX, +}; + +#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1) + /* Extended Matches */ struct tcf_ematch_tree_hdr -- cgit v1.2.3 From 1239cd58d237fa6ad501acaec8776262a5784ec8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 28 Oct 2008 11:12:57 +0100 Subject: wireless: move mesh config length constant This is a constant from the 802.11 specification. Signed-off-by: Johannes Berg Cc: Javier Cardona Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index aad99195a4cc..9dc288b920c8 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -97,7 +97,10 @@ #define IEEE80211_MAX_FRAME_LEN 2352 #define IEEE80211_MAX_SSID_LEN 32 + #define IEEE80211_MAX_MESH_ID_LEN 32 +#define IEEE80211_MESH_CONFIG_LEN 19 + #define IEEE80211_QOS_CTL_LEN 2 #define IEEE80211_QOS_CTL_TID_MASK 0x000F #define IEEE80211_QOS_CTL_TAG1D_MASK 0x0007 -- cgit v1.2.3 From 90c97a040d6b08cc4890328aa262fdc37336ab01 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 30 Oct 2008 16:59:22 +0200 Subject: nl80211: Add basic rate configuration for AP mode Add a new attribute, NL80211_ATTR_BSS_BASIC_RATES, that can be used with NL80211_CMD_SET_BSS for userspace (e.g., hostapd) to set which rates are in the basic rate set. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index e4cc7869b22f..5009809588c0 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -243,6 +243,9 @@ enum nl80211_commands { * (u8, 0 or 1) * @NL80211_ATTR_BSS_SHORT_SLOT_TIME: whether short slot time enabled * (u8, 0 or 1) + * @NL80211_ATTR_BSS_BASIC_RATES: basic rates, array of basic + * rates in format defined by IEEE 802.11 7.3.2.2 but without the length + * restriction (at most %NL80211_MAX_SUPP_RATES). * * @NL80211_ATTR_HT_CAPABILITY: HT Capability information element (from * association request when used with NL80211_CMD_NEW_STATION) @@ -307,6 +310,8 @@ enum nl80211_attrs { NL80211_ATTR_MESH_PARAMS, + NL80211_ATTR_BSS_BASIC_RATES, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -318,6 +323,7 @@ enum nl80211_attrs { * here */ #define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY +#define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 -- cgit v1.2.3 From 318884875bdddca663ecc373c813cf8e117d9e43 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 30 Oct 2008 16:59:24 +0200 Subject: nl80211: Add TX queue parameter configuration Add a new attribute, NL80211_ATTR_WIPHY_TXQ_PARAMS, that can be used with NL80211_CMD_SET_WIPHY for userspace (e.g., hostapd) to set TX queue parameters (txop, cwmin, cwmax, aifs). Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 5009809588c0..79827345351d 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -25,8 +25,9 @@ * * @NL80211_CMD_GET_WIPHY: request information about a wiphy or dump request * to get a list of all present wiphys. - * @NL80211_CMD_SET_WIPHY: set wiphy name, needs %NL80211_ATTR_WIPHY and - * %NL80211_ATTR_WIPHY_NAME. + * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or + * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME + * and/or %NL80211_ATTR_WIPHY_TXQ_PARAMS. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -178,6 +179,7 @@ enum nl80211_commands { * @NL80211_ATTR_WIPHY: index of wiphy to operate on, cf. * /sys/class/ieee80211//index * @NL80211_ATTR_WIPHY_NAME: wiphy name (used for renaming) + * @NL80211_ATTR_WIPHY_TXQ_PARAMS: a nested array of TX queue parameters * * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on * @NL80211_ATTR_IFNAME: network interface name @@ -312,6 +314,8 @@ enum nl80211_attrs { NL80211_ATTR_BSS_BASIC_RATES, + NL80211_ATTR_WIPHY_TXQ_PARAMS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -324,6 +328,7 @@ enum nl80211_attrs { */ #define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY #define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES +#define NL80211_ATTR_WIPHY_TXQ_PARAMS NL80211_ATTR_WIPHY_TXQ_PARAMS #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 @@ -698,4 +703,38 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_ATTR_MAX = __NL80211_MESHCONF_ATTR_AFTER_LAST - 1 }; +/** + * enum nl80211_txq_attr - TX queue parameter attributes + * @__NL80211_TXQ_ATTR_INVALID: Attribute number 0 is reserved + * @NL80211_TXQ_ATTR_QUEUE: TX queue identifier (NL80211_TXQ_Q_*) + * @NL80211_TXQ_ATTR_TXOP: Maximum burst time in units of 32 usecs, 0 meaning + * disabled + * @NL80211_TXQ_ATTR_CWMIN: Minimum contention window [a value of the form + * 2^n-1 in the range 1..32767] + * @NL80211_TXQ_ATTR_CWMAX: Maximum contention window [a value of the form + * 2^n-1 in the range 1..32767] + * @NL80211_TXQ_ATTR_AIFS: Arbitration interframe space [0..255] + * @__NL80211_TXQ_ATTR_AFTER_LAST: Internal + * @NL80211_TXQ_ATTR_MAX: Maximum TXQ attribute number + */ +enum nl80211_txq_attr { + __NL80211_TXQ_ATTR_INVALID, + NL80211_TXQ_ATTR_QUEUE, + NL80211_TXQ_ATTR_TXOP, + NL80211_TXQ_ATTR_CWMIN, + NL80211_TXQ_ATTR_CWMAX, + NL80211_TXQ_ATTR_AIFS, + + /* keep last */ + __NL80211_TXQ_ATTR_AFTER_LAST, + NL80211_TXQ_ATTR_MAX = __NL80211_TXQ_ATTR_AFTER_LAST - 1 +}; + +enum nl80211_txq_q { + NL80211_TXQ_Q_VO, + NL80211_TXQ_Q_VI, + NL80211_TXQ_Q_BE, + NL80211_TXQ_Q_BK +}; + #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From fc6971d491517ba15e800540ff88caa55dc65b01 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 30 Oct 2008 19:59:05 +0200 Subject: mac80211_hwsim: Add support for client PS mode This introduces a debugfs file (ieee80211/phy#/hwsim/ps) that can be used to force a simulated radio into power save mode. Following values can be written into this file to change PS mode: 0 = power save disabled (constantly awake) 1 = power save enabled (drop all frames; do not send PS-Poll) 2 = power save enabled (send PS-Poll frames automatically to receive buffered unicast frames); not yet fully implemented 3 = manual PS-Poll trigger (send a single PS-Poll frame) Two different behavior for power save mode processing can be tested: - move between modes 1 and 0 (i.e., receive all buffered frames at a time) - move to mode 1 and use manual PS-Poll frames (write 3 to the 'ps' debugfs file) to fetch power save buffered frames one at a time Mode 2 (automatic PS-Poll) does not yet parse Beacon frames, but eventually, it should take a look at TIM IE and send PS-Poll if a traffic bit is set for our AID. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9dc288b920c8..56b0eb25d927 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -669,6 +669,13 @@ struct ieee80211_cts { u8 ra[6]; } __attribute__ ((packed)); +struct ieee80211_pspoll { + __le16 frame_control; + __le16 aid; + u8 bssid[6]; + u8 ta[6]; +} __attribute__ ((packed)); + /** * struct ieee80211_bar - HT Block Ack Request * -- cgit v1.2.3 From 9d36be76c55ad2c2bb29683b752b0d9ad2e4eeef Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:07 +1100 Subject: Document the order of arguments for cap_issubset. It's not instantly clear which order the argument should be in. So give an example. Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 5bc145bd759a..b5750d0b96e0 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -457,6 +457,13 @@ static inline int cap_isclear(const kernel_cap_t a) return 1; } +/* + * Check if "a" is a subset of "set". + * return 1 if ALL of the capabilities in "a" are also in "set" + * cap_issubset(0101, 1111) will return 1 + * return 0 if ANY of the capabilities in "a" are not in "set" + * cap_issubset(1111, 0101) will return 0 + */ static inline int cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { kernel_cap_t dest; -- cgit v1.2.3 From c0b004413a46a0a5744e6d2b85220fe9d2c33d48 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:10 +1100 Subject: This patch add a generic cpu endian caps structure and externally available functions which retrieve fcaps information from disk. This information is necessary so fcaps information can be collected and recorded by the audit system. Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index b5750d0b96e0..d567af247ed8 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -99,6 +99,13 @@ typedef struct kernel_cap_struct { __u32 cap[_KERNEL_CAPABILITY_U32S]; } kernel_cap_t; +/* exact same as vfs_cap_data but in cpu endian and always filled completely */ +struct cpu_vfs_cap_data { + __u32 magic_etc; + kernel_cap_t permitted; + kernel_cap_t inheritable; +}; + #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) -- cgit v1.2.3 From 851f7ff56d9c21272f289dd85fb3f1b6cf7a6e10 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:14 +1100 Subject: This patch will print cap_permitted and cap_inheritable data in the PATH records of any file that has file capabilities set. Files which do not have fcaps set will not have different PATH records. An example audit record if you run: setcap "cap_net_admin+pie" /bin/bash /bin/bash type=SYSCALL msg=audit(1225741937.363:230): arch=c000003e syscall=59 success=yes exit=0 a0=2119230 a1=210da30 a2=20ee290 a3=8 items=2 ppid=2149 pid=2923 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=EXECVE msg=audit(1225741937.363:230): argc=2 a0="ping" a1="www.google.com" type=CWD msg=audit(1225741937.363:230): cwd="/root" type=PATH msg=audit(1225741937.363:230): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0104755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fi=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225741937.363:230): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index d567af247ed8..0f1950181102 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -53,6 +53,7 @@ typedef struct __user_cap_data_struct { #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX #define VFS_CAP_REVISION_MASK 0xFF000000 +#define VFS_CAP_REVISION_SHIFT 24 #define VFS_CAP_FLAGS_MASK ~VFS_CAP_REVISION_MASK #define VFS_CAP_FLAGS_EFFECTIVE 0x000001 @@ -534,6 +535,10 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); extern int capable(int cap); +/* audit system wants to get cap info from files as well */ +struct dentry; +extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); + #endif /* __KERNEL__ */ #endif /* !_LINUX_CAPABILITY_H */ -- cgit v1.2.3 From 3fc689e96c0c90b6fede5946d6c31075e9464f69 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:18 +1100 Subject: Any time fcaps or a setuid app under SECURE_NOROOT is used to result in a non-zero pE we will crate a new audit record which contains the entire set of known information about the executable in question, fP, fI, fE, fversion and includes the process's pE, pI, pP. Before and after the bprm capability are applied. This record type will only be emitted from execve syscalls. an example of making ping use fcaps instead of setuid: setcap "cat_net_raw+pe" /bin/ping type=SYSCALL msg=audit(1225742021.015:236): arch=c000003e syscall=59 success=yes exit=0 a0=1457f30 a1=14606b0 a2=1463940 a3=321b770a70 items=2 ppid=2929 pid=2963 auid=0 uid=500 gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1321] msg=audit(1225742021.015:236): fver=2 fp=0000000000002000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 new_pp=0000000000002000 new_pi=0000000000000000 new_pe=0000000000002000 type=EXECVE msg=audit(1225742021.015:236): argc=2 a0="ping" a1="127.0.0.1" type=CWD msg=audit(1225742021.015:236): cwd="/home/test" type=PATH msg=audit(1225742021.015:236): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225742021.015:236): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/audit.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 6272a395d43c..8cfb9feb2a05 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -99,6 +99,7 @@ #define AUDIT_OBJ_PID 1318 /* ptrace target */ #define AUDIT_TTY 1319 /* Input on an administrative TTY */ #define AUDIT_EOE 1320 /* End of multi-record event */ +#define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -452,6 +453,7 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout); extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); +extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -501,6 +503,29 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return __audit_mq_getsetattr(mqdes, mqstat); return 0; } + +/* + * ieieeeeee, an audit function without a return code! + * + * This function might fail! I decided that it didn't matter. We are too late + * to fail the syscall and the information isn't REQUIRED for any purpose. It's + * just nice to have. We should be able to look at past audit logs to figure + * out this process's current cap set along with the fcaps from the PATH record + * and use that to come up with the final set. Yeah, its ugly, but all the info + * is still in the audit log. So I'm not going to bother mentioning we failed + * if we couldn't allocate memory. + * + * If someone changes their mind they could create the aux record earlier and + * then search here and use that earlier allocation. But I don't wanna. + * + * -Eric + */ +static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +{ + if (unlikely(!audit_dummy_context())) + __audit_log_bprm_fcaps(bprm, pP, pE); +} + extern int audit_n_rules; extern int audit_signals; #else @@ -532,6 +557,7 @@ extern int audit_signals; #define audit_mq_timedreceive(d,l,p,t) ({ 0; }) #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) +#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -- cgit v1.2.3 From e68b75a027bb94066576139ee33676264f867b87 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:22 +1100 Subject: When the capset syscall is used it is not possible for audit to record the actual capbilities being added/removed. This patch adds a new record type which emits the target pid and the eff, inh, and perm cap sets. example output if you audit capset syscalls would be: type=SYSCALL msg=audit(1225743140.465:76): arch=c000003e syscall=126 success=yes exit=0 a0=17f2014 a1=17f201c a2=80000000 a3=7fff2ab7f060 items=0 ppid=2160 pid=2223 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=1 comm="setcap" exe="/usr/sbin/setcap" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1322] msg=audit(1225743140.465:76): pid=0 cap_pi=ffffffffffffffff cap_pp=ffffffffffffffff cap_pe=ffffffffffffffff Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/audit.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8cfb9feb2a05..6fbebac7b1bf 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -100,6 +100,7 @@ #define AUDIT_TTY 1319 /* Input on an administrative TTY */ #define AUDIT_EOE 1320 /* End of multi-record event */ #define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ +#define AUDIT_CAPSET 1322 /* Record showing argument to sys_capset */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -454,6 +455,7 @@ extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __u extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); +extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -526,6 +528,13 @@ static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t __audit_log_bprm_fcaps(bprm, pP, pE); } +static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +{ + if (unlikely(!audit_dummy_context())) + return __audit_log_capset(pid, eff, inh, perm); + return 0; +} + extern int audit_n_rules; extern int audit_signals; #else @@ -558,6 +567,7 @@ extern int audit_signals; #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) #define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) +#define audit_log_capset(pid, e, i, p) ({ 0; }) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -- cgit v1.2.3 From 06112163f5fd9e491a7f810443d81efa9d88e247 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 22:02:50 +1100 Subject: Add a new capable interface that will be used by systems that use audit to make an A or B type decision instead of a security decision. Currently this is the case at least for filesystems when deciding if a process can use the reserved 'root' blocks and for the case of things like the oom algorithm determining if processes are root processes and should be less likely to be killed. These types of security system requests should not be audited or logged since they are not really security decisions. It would be possible to solve this problem like the vm_enough_memory security check did by creating a new LSM interface and moving all of the policy into that interface but proves the needlessly bloat the LSM and provide complex indirection. This merely allows those decisions to be made where they belong and to not flood logs or printk with denials for thing that are not security decisions. Signed-off-by: Eric Paris Acked-by: Stephen Smalley Signed-off-by: James Morris --- include/linux/capability.h | 3 +++ include/linux/security.h | 16 +++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 0f1950181102..b313ba1dd5d1 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -521,6 +521,8 @@ extern const kernel_cap_t __cap_init_eff_set; kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); +extern int security_capable(struct task_struct *t, int cap); +extern int security_capable_noaudit(struct task_struct *t, int cap); /** * has_capability - Determine if a task has a superior capability available * @t: The task in question @@ -532,6 +534,7 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); * Note that this does not set PF_SUPERPRIV on the task. */ #define has_capability(t, cap) (security_capable((t), (cap)) == 0) +#define has_capability_noaudit(t, cap) (security_capable_noaudit((t), (cap)) == 0) extern int capable(int cap); diff --git a/include/linux/security.h b/include/linux/security.h index c13f1cec9abb..5fe28a671cd3 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -37,6 +37,10 @@ /* Maximum number of letters for an LSM name string */ #define SECURITY_NAME_MAX 10 +/* If capable should audit the security request */ +#define SECURITY_CAP_NOAUDIT 0 +#define SECURITY_CAP_AUDIT 1 + struct ctl_table; struct audit_krule; @@ -44,7 +48,7 @@ struct audit_krule; * These functions are in security/capability.c and are used * as the default capabilities functions */ -extern int cap_capable(struct task_struct *tsk, int cap); +extern int cap_capable(struct task_struct *tsk, int cap, int audit); extern int cap_settime(struct timespec *ts, struct timezone *tz); extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); @@ -1307,7 +1311,7 @@ struct security_operations { kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capable) (struct task_struct *tsk, int cap); + int (*capable) (struct task_struct *tsk, int cap, int audit); int (*acct) (struct file *file); int (*sysctl) (struct ctl_table *table, int op); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); @@ -1577,6 +1581,7 @@ void security_capset_set(struct task_struct *target, kernel_cap_t *inheritable, kernel_cap_t *permitted); int security_capable(struct task_struct *tsk, int cap); +int security_capable_noaudit(struct task_struct *tsk, int cap); int security_acct(struct file *file); int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); @@ -1782,7 +1787,12 @@ static inline void security_capset_set(struct task_struct *target, static inline int security_capable(struct task_struct *tsk, int cap) { - return cap_capable(tsk, cap); + return cap_capable(tsk, cap, SECURITY_CAP_AUDIT); +} + +static inline int security_capable_noaudit(struct task_struct *tsk, int cap) +{ + return cap_capable(tsk, cap, SECURITY_CAP_NOAUDIT); } static inline int security_acct(struct file *file) -- cgit v1.2.3 From d90ebcbfa7f5a8b4e20518c9f94c5c4e4cd3c2e5 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Wed, 12 Nov 2008 00:47:26 -0800 Subject: dccp: Query supported CCIDs This provides a data structure to record which CCIDs are locally supported and three accessor functions: - a test function for internal use which is used to validate CCID requests made by the user; - a copy function so that the list can be used for feature-negotiation; - documented getsockopt() support so that the user can query capabilities. The data structure is a table which is filled in at compile-time with the list of available CCIDs (which in turn depends on the Kconfig choices). Using the copy function for cloning the list of supported CCIDs is useful for feature negotiation, since the negotiation is now with the full list of available CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation will not fail if the peer requests to use CCID3 instead of CCID2. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 484b8a1fb023..d3ac1bde60b4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -209,6 +209,7 @@ struct dccp_so_feat { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 +#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 -- cgit v1.2.3 From 92a77aac9812d5397abbe6f1920e085e50838635 Mon Sep 17 00:00:00 2001 From: James Morris Date: Wed, 12 Nov 2008 21:20:00 +1100 Subject: security: remove broken and useless declarations Remove broken declarations for security_capable* functions, which were not needed anyway. Signed-off-by: James Morris --- include/linux/capability.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index b313ba1dd5d1..7f26580a5a4d 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -521,8 +521,6 @@ extern const kernel_cap_t __cap_init_eff_set; kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); -extern int security_capable(struct task_struct *t, int cap); -extern int security_capable_noaudit(struct task_struct *t, int cap); /** * has_capability - Determine if a task has a superior capability available * @t: The task in question -- cgit v1.2.3 From da9592edebceeba1b9301beafe80ec8b9c2db0ce Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:05 +1100 Subject: CRED: Wrap task credential accesses in the filesystem subsystem Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: Al Viro Signed-off-by: James Morris --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0dcdd9458f4b..b3d404aaabed 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1193,7 +1193,7 @@ enum { #define has_fs_excl() atomic_read(¤t->fs_excl) #define is_owner_or_cap(inode) \ - ((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER)) + ((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER)) /* not quite ready to be deprecated, but... */ extern void lock_super(struct super_block *); -- cgit v1.2.3 From e9e349b051d98799b743ebf248cc2d986fedf090 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:13 +1100 Subject: KEYS: Disperse linux/key_ui.h Disperse the bits of linux/key_ui.h as the reason they were put here (keyfs) didn't get in. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/key-ui.h | 66 -------------------------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 include/linux/key-ui.h (limited to 'include/linux') diff --git a/include/linux/key-ui.h b/include/linux/key-ui.h deleted file mode 100644 index e8b8a7a5c496..000000000000 --- a/include/linux/key-ui.h +++ /dev/null @@ -1,66 +0,0 @@ -/* key-ui.h: key userspace interface stuff - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _LINUX_KEY_UI_H -#define _LINUX_KEY_UI_H - -#include - -/* the key tree */ -extern struct rb_root key_serial_tree; -extern spinlock_t key_serial_lock; - -/* required permissions */ -#define KEY_VIEW 0x01 /* require permission to view attributes */ -#define KEY_READ 0x02 /* require permission to read content */ -#define KEY_WRITE 0x04 /* require permission to update / modify */ -#define KEY_SEARCH 0x08 /* require permission to search (keyring) or find (key) */ -#define KEY_LINK 0x10 /* require permission to link */ -#define KEY_SETATTR 0x20 /* require permission to change attributes */ -#define KEY_ALL 0x3f /* all the above permissions */ - -/* - * the keyring payload contains a list of the keys to which the keyring is - * subscribed - */ -struct keyring_list { - struct rcu_head rcu; /* RCU deletion hook */ - unsigned short maxkeys; /* max keys this list can hold */ - unsigned short nkeys; /* number of keys currently held */ - unsigned short delkey; /* key to be unlinked by RCU */ - struct key *keys[0]; -}; - -/* - * check to see whether permission is granted to use a key in the desired way - */ -extern int key_task_permission(const key_ref_t key_ref, - struct task_struct *context, - key_perm_t perm); - -static inline int key_permission(const key_ref_t key_ref, key_perm_t perm) -{ - return key_task_permission(key_ref, current, perm); -} - -extern key_ref_t lookup_user_key(struct task_struct *context, - key_serial_t id, int create, int partial, - key_perm_t perm); - -extern long join_session_keyring(const char *name); - -extern struct key_type *key_type_lookup(const char *type); -extern void key_type_put(struct key_type *ktype); - -#define key_negative_timeout 60 /* default timeout on a negative key's existence */ - - -#endif /* _LINUX_KEY_UI_H */ -- cgit v1.2.3 From 8bbf4976b59fc9fc2861e79cab7beb3f6d647640 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: KEYS: Alter use of key instantiation link-to-keyring argument Alter the use of the key instantiation and negation functions' link-to-keyring arguments. Currently this specifies a keyring in the target process to link the key into, creating the keyring if it doesn't exist. This, however, can be a problem for copy-on-write credentials as it means that the instantiating process can alter the credentials of the requesting process. This patch alters the behaviour such that: (1) If keyctl_instantiate_key() or keyctl_negate_key() are given a specific keyring by ID (ringid >= 0), then that keyring will be used. (2) If keyctl_instantiate_key() or keyctl_negate_key() are given one of the special constants that refer to the requesting process's keyrings (KEY_SPEC_*_KEYRING, all <= 0), then: (a) If sys_request_key() was given a keyring to use (destringid) then the key will be attached to that keyring. (b) If sys_request_key() was given a NULL keyring, then the key being instantiated will be attached to the default keyring as set by keyctl_set_reqkey_keyring(). (3) No extra link will be made. Decision point (1) follows current behaviour, and allows those instantiators who've searched for a specifically named keyring in the requestor's keyring so as to partition the keys by type to still have their named keyrings. Decision point (2) allows the requestor to make sure that the key or keys that get produced by request_key() go where they want, whilst allowing the instantiator to request that the key is retained. This is mainly useful for situations where the instantiator makes a secondary request, the key for which should be retained by the initial requestor: +-----------+ +--------------+ +--------------+ | | | | | | | Requestor |------->| Instantiator |------->| Instantiator | | | | | | | +-----------+ +--------------+ +--------------+ request_key() request_key() This might be useful, for example, in Kerberos, where the requestor requests a ticket, and then the ticket instantiator requests the TGT, which someone else then has to go and fetch. The TGT, however, should be retained in the keyrings of the requestor, not the first instantiator. To make this explict an extra special keyring constant is also added. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/key.h | 16 ++++++++-------- include/linux/keyctl.h | 4 +++- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 1b70e35a71e3..df709e1af3cd 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -287,11 +287,11 @@ extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); -#define __install_session_keyring(tsk, keyring) \ -({ \ - struct key *old_session = tsk->signal->session_keyring; \ - tsk->signal->session_keyring = keyring; \ - old_session; \ +#define __install_session_keyring(keyring) \ +({ \ + struct key *old_session = current->signal->session_keyring; \ + current->signal->session_keyring = keyring; \ + old_session; \ }) #else /* CONFIG_KEYS */ @@ -302,11 +302,11 @@ extern void key_init(void); #define key_revoke(k) do { } while(0) #define key_put(k) do { } while(0) #define key_ref_put(k) do { } while(0) -#define make_key_ref(k, p) ({ NULL; }) -#define key_ref_to_ptr(k) ({ NULL; }) +#define make_key_ref(k, p) NULL +#define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 #define switch_uid_keyring(u) do { } while(0) -#define __install_session_keyring(t, k) ({ NULL; }) +#define __install_session_keyring(k) ({ NULL; }) #define copy_keys(f,t) 0 #define copy_thread_group_keys(t) 0 #define exit_keys(t) do { } while(0) diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h index 656ee6b77a4a..c0688eb72093 100644 --- a/include/linux/keyctl.h +++ b/include/linux/keyctl.h @@ -1,6 +1,6 @@ /* keyctl.h: keyctl command IDs * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2004, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -20,6 +20,7 @@ #define KEY_SPEC_USER_SESSION_KEYRING -5 /* - key ID for UID-session keyring */ #define KEY_SPEC_GROUP_KEYRING -6 /* - key ID for GID-specific keyring */ #define KEY_SPEC_REQKEY_AUTH_KEY -7 /* - key ID for assumed request_key auth key */ +#define KEY_SPEC_REQUESTOR_KEYRING -8 /* - key ID for request_key() dest keyring */ /* request-key default keyrings */ #define KEY_REQKEY_DEFL_NO_CHANGE -1 @@ -30,6 +31,7 @@ #define KEY_REQKEY_DEFL_USER_KEYRING 4 #define KEY_REQKEY_DEFL_USER_SESSION_KEYRING 5 #define KEY_REQKEY_DEFL_GROUP_KEYRING 6 +#define KEY_REQKEY_DEFL_REQUESTOR_KEYRING 7 /* keyctl commands */ #define KEYCTL_GET_KEYRING_ID 0 /* ask for a keyring's ID */ -- cgit v1.2.3 From 1cdcbec1a3372c0c49c59d292e708fd07b509f18 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: CRED: Neuter sys_capset() Take away the ability for sys_capset() to affect processes other than current. This means that current will not need to lock its own credentials when reading them against interference by other processes. This has effectively been the case for a while anyway, since: (1) Without LSM enabled, sys_capset() is disallowed. (2) With file-based capabilities, sys_capset() is neutered. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Andrew G. Morgan Acked-by: James Morris Signed-off-by: James Morris --- include/linux/security.h | 48 ++++++++++++++++-------------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 5fe28a671cd3..d1ce8beddbd7 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,8 +53,8 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz); extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); +extern int cap_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); +extern void cap_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_bprm_set_security(struct linux_binprm *bprm); extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); extern int cap_bprm_secureexec(struct linux_binprm *bprm); @@ -1191,24 +1191,14 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Return 0 if the capability sets were successfully obtained. * @capset_check: * Check permission before setting the @effective, @inheritable, and - * @permitted capability sets for the @target process. - * Caveat: @target is also set to current if a set of processes is - * specified (i.e. all processes other than current and init or a - * particular process group). Hence, the capset_set hook may need to - * revalidate permission to the actual target process. - * @target contains the task_struct structure for target process. + * @permitted capability sets for the current process. * @effective contains the effective capability set. * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. * Return 0 if permission is granted. * @capset_set: * Set the @effective, @inheritable, and @permitted capability sets for - * the @target process. Since capset_check cannot always check permission - * to the real @target process, this hook may also perform permission - * checking to determine if the current process is allowed to set the - * capability sets of the @target process. However, this hook has no way - * of returning an error due to the structure of the sys_capset code. - * @target contains the task_struct structure for target process. + * the current process. * @effective contains the effective capability set. * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. @@ -1303,12 +1293,10 @@ struct security_operations { int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset_check) (struct task_struct *target, - kernel_cap_t *effective, + int (*capset_check) (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - void (*capset_set) (struct task_struct *target, - kernel_cap_t *effective, + void (*capset_set) (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, int cap, int audit); @@ -1572,12 +1560,10 @@ int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, +int security_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, +void security_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); int security_capable(struct task_struct *tsk, int cap); @@ -1769,20 +1755,18 @@ static inline int security_capget(struct task_struct *target, return cap_capget(target, effective, inheritable, permitted); } -static inline int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline int security_capset_check(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - return cap_capset_check(target, effective, inheritable, permitted); + return cap_capset_check(effective, inheritable, permitted); } -static inline void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline void security_capset_set(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - cap_capset_set(target, effective, inheritable, permitted); + cap_capset_set(effective, inheritable, permitted); } static inline int security_capable(struct task_struct *tsk, int cap) -- cgit v1.2.3 From 15a2460ed0af7538ca8e6c610fe607a2cd9da142 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:15 +1100 Subject: CRED: Constify the kernel_cap_t arguments to the capset LSM hooks Constify the kernel_cap_t arguments to the capset LSM hooks. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: James Morris Signed-off-by: James Morris --- include/linux/security.h | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index d1ce8beddbd7..9f305d4a31a7 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,8 +53,12 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz); extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern int cap_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern void cap_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); +extern int cap_capset_check(const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); +extern void cap_capset_set(const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); extern int cap_bprm_set_security(struct linux_binprm *bprm); extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); extern int cap_bprm_secureexec(struct linux_binprm *bprm); @@ -1293,12 +1297,12 @@ struct security_operations { int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset_check) (kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted); - void (*capset_set) (kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted); + int (*capset_check) (const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); + void (*capset_set) (const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, int cap, int audit); int (*acct) (struct file *file); int (*sysctl) (struct ctl_table *table, int op); @@ -1560,12 +1564,12 @@ int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -int security_capset_check(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted); -void security_capset_set(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted); +int security_capset_check(const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); +void security_capset_set(const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int security_capable(struct task_struct *tsk, int cap); int security_capable_noaudit(struct task_struct *tsk, int cap); int security_acct(struct file *file); @@ -1755,16 +1759,16 @@ static inline int security_capget(struct task_struct *target, return cap_capget(target, effective, inheritable, permitted); } -static inline int security_capset_check(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline int security_capset_check(const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { return cap_capset_check(effective, inheritable, permitted); } -static inline void security_capset_set(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline void security_capset_set(const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { cap_capset_set(effective, inheritable, permitted); } -- cgit v1.2.3 From b6dff3ec5e116e3af6f537d4caedcad6b9e5082a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:16 +1100 Subject: CRED: Separate task security context from task_struct Separate the task security context from task_struct. At this point, the security data is temporarily embedded in the task_struct with two pointers pointing to it. Note that the Alpha arch is altered as it refers to (E)UID and (E)GID in entry.S via asm-offsets. With comment fixes Signed-off-by: Marc Dionne Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/cred.h | 155 ++++++++++++++++++++++++++++++++++++++------- include/linux/init_task.h | 24 +++++-- include/linux/sched.h | 52 ++------------- include/linux/securebits.h | 2 +- 4 files changed, 155 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index b69222cc1fd2..3e65587a72e5 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -12,39 +12,150 @@ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H -#define get_current_user() (get_uid(current->user)) - -#define task_uid(task) ((task)->uid) -#define task_gid(task) ((task)->gid) -#define task_euid(task) ((task)->euid) -#define task_egid(task) ((task)->egid) - -#define current_uid() (current->uid) -#define current_gid() (current->gid) -#define current_euid() (current->euid) -#define current_egid() (current->egid) -#define current_suid() (current->suid) -#define current_sgid() (current->sgid) -#define current_fsuid() (current->fsuid) -#define current_fsgid() (current->fsgid) -#define current_cap() (current->cap_effective) +#include +#include +#include + +struct user_struct; +struct cred; + +/* + * COW Supplementary groups list + */ +#define NGROUPS_SMALL 32 +#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) + +struct group_info { + atomic_t usage; + int ngroups; + int nblocks; + gid_t small_block[NGROUPS_SMALL]; + gid_t *blocks[0]; +}; + +/** + * get_group_info - Get a reference to a group info structure + * @group_info: The group info to reference + * + * This must be called with the owning task locked (via task_lock()) when task + * != current. The reason being that the vast majority of callers are looking + * at current->group_info, which can not be changed except by the current task. + * Changing current->group_info requires the task lock, too. + */ +#define get_group_info(group_info) \ +do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +/** + * put_group_info - Release a reference to a group info structure + * @group_info: The group info to release + */ +#define put_group_info(group_info) \ +do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +extern struct group_info *groups_alloc(int); +extern void groups_free(struct group_info *); +extern int set_current_groups(struct group_info *); +extern int set_groups(struct cred *, struct group_info *); +extern int groups_search(struct group_info *, gid_t); + +/* access the groups "array" with this macro */ +#define GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +/* + * The security context of a task + * + * The parts of the context break down into two categories: + * + * (1) The objective context of a task. These parts are used when some other + * task is attempting to affect this one. + * + * (2) The subjective context. These details are used when the task is acting + * upon another object, be that a file, a task, a key or whatever. + * + * Note that some members of this structure belong to both categories - the + * LSM security pointer for instance. + * + * A task has two security pointers. task->real_cred points to the objective + * context that defines that task's actual details. The objective part of this + * context is used whenever that task is acted upon. + * + * task->cred points to the subjective context that defines the details of how + * that task is going to act upon another object. This may be overridden + * temporarily to point to another security context, but normally points to the + * same context as task->real_cred. + */ +struct cred { + atomic_t usage; + uid_t uid; /* real UID of the task */ + gid_t gid; /* real GID of the task */ + uid_t suid; /* saved UID of the task */ + gid_t sgid; /* saved GID of the task */ + uid_t euid; /* effective UID of the task */ + gid_t egid; /* effective GID of the task */ + uid_t fsuid; /* UID for VFS ops */ + gid_t fsgid; /* GID for VFS ops */ + unsigned securebits; /* SUID-less security management */ + kernel_cap_t cap_inheritable; /* caps our children can inherit */ + kernel_cap_t cap_permitted; /* caps we're permitted */ + kernel_cap_t cap_effective; /* caps we can actually use */ + kernel_cap_t cap_bset; /* capability bounding set */ +#ifdef CONFIG_KEYS + unsigned char jit_keyring; /* default keyring to attach requested + * keys to */ + struct key *thread_keyring; /* keyring private to this thread */ + struct key *request_key_auth; /* assumed request_key authority */ +#endif +#ifdef CONFIG_SECURITY + void *security; /* subjective LSM security */ +#endif + struct user_struct *user; /* real user ID subscription */ + struct group_info *group_info; /* supplementary groups for euid/fsgid */ + struct rcu_head rcu; /* RCU deletion hook */ + spinlock_t lock; /* lock for pointer changes */ +}; + +#define get_current_user() (get_uid(current->cred->user)) + +#define task_uid(task) ((task)->cred->uid) +#define task_gid(task) ((task)->cred->gid) +#define task_euid(task) ((task)->cred->euid) +#define task_egid(task) ((task)->cred->egid) + +#define current_uid() (current->cred->uid) +#define current_gid() (current->cred->gid) +#define current_euid() (current->cred->euid) +#define current_egid() (current->cred->egid) +#define current_suid() (current->cred->suid) +#define current_sgid() (current->cred->sgid) +#define current_fsuid() (current->cred->fsuid) +#define current_fsgid() (current->cred->fsgid) +#define current_cap() (current->cred->cap_effective) #define current_uid_gid(_uid, _gid) \ do { \ - *(_uid) = current->uid; \ - *(_gid) = current->gid; \ + *(_uid) = current->cred->uid; \ + *(_gid) = current->cred->gid; \ } while(0) #define current_euid_egid(_uid, _gid) \ do { \ - *(_uid) = current->euid; \ - *(_gid) = current->egid; \ + *(_uid) = current->cred->euid; \ + *(_gid) = current->cred->egid; \ } while(0) #define current_fsuid_fsgid(_uid, _gid) \ do { \ - *(_uid) = current->fsuid; \ - *(_gid) = current->fsgid; \ + *(_uid) = current->cred->fsuid; \ + *(_gid) = current->cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e5..9de41ccd67b5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,21 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +extern struct cred init_cred; + +#define INIT_CRED(p) \ +{ \ + .usage = ATOMIC_INIT(3), \ + .securebits = SECUREBITS_DEFAULT, \ + .cap_inheritable = CAP_INIT_INH_SET, \ + .cap_permitted = CAP_FULL_SET, \ + .cap_effective = CAP_INIT_EFF_SET, \ + .cap_bset = CAP_INIT_BSET, \ + .user = INIT_USER, \ + .group_info = &init_groups, \ + .lock = __SPIN_LOCK_UNLOCKED(p.lock), \ +} + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -147,13 +162,8 @@ extern struct group_info init_groups; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .group_info = &init_groups, \ - .cap_effective = CAP_INIT_EFF_SET, \ - .cap_inheritable = CAP_INIT_INH_SET, \ - .cap_permitted = CAP_FULL_SET, \ - .cap_bset = CAP_INIT_BSET, \ - .securebits = SECUREBITS_DEFAULT, \ - .user = INIT_USER, \ + .__temp_cred = INIT_CRED(tsk.__temp_cred), \ + .cred = &tsk.__temp_cred, \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index b483f39a7112..c8b92502354d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -660,6 +660,7 @@ extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; #define INIT_USER (&root_user) + struct backing_dev_info; struct reclaim_state; @@ -883,38 +884,7 @@ partition_sched_domains(int ndoms_new, cpumask_t *doms_new, #endif /* !CONFIG_SMP */ struct io_context; /* See blkdev.h */ -#define NGROUPS_SMALL 32 -#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -struct group_info { - int ngroups; - atomic_t usage; - gid_t small_block[NGROUPS_SMALL]; - int nblocks; - gid_t *blocks[0]; -}; - -/* - * get_group_info() must be called with the owning task locked (via task_lock()) - * when task != current. The reason being that the vast majority of callers are - * looking at current->group_info, which can not be changed except by the - * current task. Changing current->group_info requires the task lock, too. - */ -#define get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) -#define put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ -} while (0) - -extern struct group_info *groups_alloc(int gidsetsize); -extern void groups_free(struct group_info *group_info); -extern int set_current_groups(struct group_info *group_info); -extern int groups_search(struct group_info *group_info, gid_t grp); -/* access the groups "array" with this macro */ -#define GROUP_AT(gi, i) \ - ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK extern void prefetch_stack(struct task_struct *t); @@ -1181,17 +1151,9 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - uid_t uid,euid,suid,fsuid; - gid_t gid,egid,sgid,fsgid; - struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - struct user_struct *user; - unsigned securebits; -#ifdef CONFIG_KEYS - unsigned char jit_keyring; /* default keyring to attach requested keys to */ - struct key *request_key_auth; /* assumed request_key authority */ - struct key *thread_keyring; /* keyring private to this thread */ -#endif + struct cred __temp_cred __deprecated; /* temporary credentials to be removed */ + struct cred *cred; /* actual/objective task credentials */ + char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) @@ -1228,9 +1190,6 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; -#ifdef CONFIG_SECURITY - void *security; -#endif struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; @@ -1787,9 +1746,6 @@ extern void wake_up_new_task(struct task_struct *tsk, extern void sched_fork(struct task_struct *p, int clone_flags); extern void sched_dead(struct task_struct *p); -extern int in_group_p(gid_t); -extern int in_egroup_p(gid_t); - extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); diff --git a/include/linux/securebits.h b/include/linux/securebits.h index 92f09bdf1175..6d389491bfa2 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -32,7 +32,7 @@ setting is locked or not. A setting which is locked cannot be changed from user-level. */ #define issecure_mask(X) (1 << (X)) -#define issecure(X) (issecure_mask(X) & current->securebits) +#define issecure(X) (issecure_mask(X) & current->cred->securebits) #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ -- cgit v1.2.3 From f1752eec6145c97163dbce62d17cf5d928e28a27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:17 +1100 Subject: CRED: Detach the credentials from task_struct Detach the credentials from task_struct, duplicating them in copy_process() and releasing them in __put_task_struct(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/cred.h | 29 +++++++++++++++++++++++++++++ include/linux/init_task.h | 16 +--------------- include/linux/sched.h | 1 - include/linux/security.h | 26 +++++++++++++------------- 4 files changed, 43 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 3e65587a72e5..a7a686074cb0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -158,4 +158,33 @@ do { \ *(_gid) = current->cred->fsgid; \ } while(0) +extern void __put_cred(struct cred *); +extern int copy_creds(struct task_struct *, unsigned long); + +/** + * get_cred - Get a reference on a set of credentials + * @cred: The credentials to reference + * + * Get a reference on the specified set of credentials. The caller must + * release the reference. + */ +static inline struct cred *get_cred(struct cred *cred) +{ + atomic_inc(&cred->usage); + return cred; +} + +/** + * put_cred - Release a reference to a set of credentials + * @cred: The credentials to release + * + * Release a reference to a set of credentials, deleting them when the last ref + * is released. + */ +static inline void put_cred(struct cred *cred) +{ + if (atomic_dec_and_test(&(cred)->usage)) + __put_cred(cred); +} + #endif /* _LINUX_CRED_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9de41ccd67b5..5e24c54b6dfd 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -115,19 +115,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#define INIT_CRED(p) \ -{ \ - .usage = ATOMIC_INIT(3), \ - .securebits = SECUREBITS_DEFAULT, \ - .cap_inheritable = CAP_INIT_INH_SET, \ - .cap_permitted = CAP_FULL_SET, \ - .cap_effective = CAP_INIT_EFF_SET, \ - .cap_bset = CAP_INIT_BSET, \ - .user = INIT_USER, \ - .group_info = &init_groups, \ - .lock = __SPIN_LOCK_UNLOCKED(p.lock), \ -} - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -162,8 +149,7 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .__temp_cred = INIT_CRED(tsk.__temp_cred), \ - .cred = &tsk.__temp_cred, \ + .cred = &init_cred, \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index c8b92502354d..740cf946c8cc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1151,7 +1151,6 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - struct cred __temp_cred __deprecated; /* temporary credentials to be removed */ struct cred *cred; /* actual/objective task credentials */ char comm[TASK_COMM_LEN]; /* executable name excluding path diff --git a/include/linux/security.h b/include/linux/security.h index 9f305d4a31a7..9239cc11eb9c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -593,15 +593,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. - * @task_alloc_security: - * @p contains the task_struct for child process. - * Allocate and attach a security structure to the p->security field. The - * security field is initialized to NULL when the task structure is + * @cred_alloc_security: + * @cred contains the cred struct for child process. + * Allocate and attach a security structure to the cred->security field. + * The security field is initialized to NULL when the task structure is * allocated. * Return 0 if operation was successful. - * @task_free_security: - * @p contains the task_struct for process. - * Deallocate and clear the p->security field. + * @cred_free: + * @cred points to the credentials. + * Deallocate and clear the cred->security field in a set of credentials. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -1405,8 +1405,8 @@ struct security_operations { int (*dentry_open) (struct file *file); int (*task_create) (unsigned long clone_flags); - int (*task_alloc_security) (struct task_struct *p); - void (*task_free_security) (struct task_struct *p); + int (*cred_alloc_security) (struct cred *cred); + void (*cred_free) (struct cred *cred); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ , uid_t old_euid, uid_t old_suid, int flags); @@ -1660,8 +1660,8 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file); int security_task_create(unsigned long clone_flags); -int security_task_alloc(struct task_struct *p); -void security_task_free(struct task_struct *p); +int security_cred_alloc(struct cred *cred); +void security_cred_free(struct cred *cred); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); @@ -2181,12 +2181,12 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } -static inline int security_task_alloc(struct task_struct *p) +static inline int security_cred_alloc(struct cred *cred) { return 0; } -static inline void security_task_free(struct task_struct *p) +static inline void security_cred_free(struct cred *cred) { } static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, -- cgit v1.2.3 From 86a264abe542cfececb4df129bc45a0338d8cdb9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:18 +1100 Subject: CRED: Wrap current->cred and a few other accessors Wrap current->cred and a few other accessors to hide their actual implementation. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/cred.h | 187 ++++++++++++++++++++++++++++++++++----------- include/linux/securebits.h | 2 +- 2 files changed, 144 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index a7a686074cb0..4221ec6000c1 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -37,15 +37,16 @@ struct group_info { * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * - * This must be called with the owning task locked (via task_lock()) when task - * != current. The reason being that the vast majority of callers are looking - * at current->group_info, which can not be changed except by the current task. - * Changing current->group_info requires the task lock, too. + * This gets a reference to a set of supplementary groups. + * + * If the caller is accessing a task's credentials, they must hold the RCU read + * lock when reading. */ -#define get_group_info(group_info) \ -do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) +static inline struct group_info *get_group_info(struct group_info *gi) +{ + atomic_inc(&gi->usage); + return gi; +} /** * put_group_info - Release a reference to a group info structure @@ -61,7 +62,7 @@ extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); extern int set_groups(struct cred *, struct group_info *); -extern int groups_search(struct group_info *, gid_t); +extern int groups_search(const struct group_info *, gid_t); /* access the groups "array" with this macro */ #define GROUP_AT(gi, i) \ @@ -123,41 +124,6 @@ struct cred { spinlock_t lock; /* lock for pointer changes */ }; -#define get_current_user() (get_uid(current->cred->user)) - -#define task_uid(task) ((task)->cred->uid) -#define task_gid(task) ((task)->cred->gid) -#define task_euid(task) ((task)->cred->euid) -#define task_egid(task) ((task)->cred->egid) - -#define current_uid() (current->cred->uid) -#define current_gid() (current->cred->gid) -#define current_euid() (current->cred->euid) -#define current_egid() (current->cred->egid) -#define current_suid() (current->cred->suid) -#define current_sgid() (current->cred->sgid) -#define current_fsuid() (current->cred->fsuid) -#define current_fsgid() (current->cred->fsgid) -#define current_cap() (current->cred->cap_effective) - -#define current_uid_gid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->uid; \ - *(_gid) = current->cred->gid; \ -} while(0) - -#define current_euid_egid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->euid; \ - *(_gid) = current->cred->egid; \ -} while(0) - -#define current_fsuid_fsgid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->fsuid; \ - *(_gid) = current->cred->fsgid; \ -} while(0) - extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); @@ -187,4 +153,137 @@ static inline void put_cred(struct cred *cred) __put_cred(cred); } +/** + * current_cred - Access the current task's credentials + * + * Access the credentials of the current task. + */ +#define current_cred() \ + (current->cred) + +/** + * __task_cred - Access another task's credentials + * @task: The task to query + * + * Access the credentials of another task. The caller must hold the + * RCU readlock. + * + * The caller must make sure task doesn't go away, either by holding a ref on + * task or by holding tasklist_lock to prevent it from being unlinked. + */ +#define __task_cred(task) \ + ((const struct cred *)(rcu_dereference((task)->cred))) + +/** + * get_task_cred - Get another task's credentials + * @task: The task to query + * + * Get the credentials of a task, pinning them so that they can't go away. + * Accessing a task's credentials directly is not permitted. + * + * The caller must make sure task doesn't go away, either by holding a ref on + * task or by holding tasklist_lock to prevent it from being unlinked. + */ +#define get_task_cred(task) \ +({ \ + struct cred *__cred; \ + rcu_read_lock(); \ + __cred = (struct cred *) __task_cred((task)); \ + get_cred(__cred); \ + rcu_read_unlock(); \ + __cred; \ +}) + +/** + * get_current_cred - Get the current task's credentials + * + * Get the credentials of the current task, pinning them so that they can't go + * away. Accessing the current task's credentials directly is not permitted. + */ +#define get_current_cred() \ + (get_cred(current_cred())) + +/** + * get_current_user - Get the current task's user_struct + * + * Get the user record of the current task, pinning it so that it can't go + * away. + */ +#define get_current_user() \ +({ \ + struct user_struct *__u; \ + struct cred *__cred; \ + __cred = (struct cred *) current_cred(); \ + __u = get_uid(__cred->user); \ + __u; \ +}) + +/** + * get_current_groups - Get the current task's supplementary group list + * + * Get the supplementary group list of the current task, pinning it so that it + * can't go away. + */ +#define get_current_groups() \ +({ \ + struct group_info *__groups; \ + struct cred *__cred; \ + __cred = (struct cred *) current_cred(); \ + __groups = get_group_info(__cred->group_info); \ + __groups; \ +}) + +#define task_cred_xxx(task, xxx) \ +({ \ + __typeof__(task->cred->xxx) ___val; \ + rcu_read_lock(); \ + ___val = __task_cred((task))->xxx; \ + rcu_read_unlock(); \ + ___val; \ +}) + +#define task_uid(task) (task_cred_xxx((task), uid)) +#define task_euid(task) (task_cred_xxx((task), euid)) + +#define current_cred_xxx(xxx) \ +({ \ + current->cred->xxx; \ +}) + +#define current_uid() (current_cred_xxx(uid)) +#define current_gid() (current_cred_xxx(gid)) +#define current_euid() (current_cred_xxx(euid)) +#define current_egid() (current_cred_xxx(egid)) +#define current_suid() (current_cred_xxx(suid)) +#define current_sgid() (current_cred_xxx(sgid)) +#define current_fsuid() (current_cred_xxx(fsuid)) +#define current_fsgid() (current_cred_xxx(fsgid)) +#define current_cap() (current_cred_xxx(cap_effective)) +#define current_user() (current_cred_xxx(user)) +#define current_security() (current_cred_xxx(security)) + +#define current_uid_gid(_uid, _gid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_uid) = __cred->uid; \ + *(_gid) = __cred->gid; \ +} while(0) + +#define current_euid_egid(_euid, _egid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_euid) = __cred->euid; \ + *(_egid) = __cred->egid; \ +} while(0) + +#define current_fsuid_fsgid(_fsuid, _fsgid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_fsuid) = __cred->fsuid; \ + *(_fsgid) = __cred->fsgid; \ +} while(0) + #endif /* _LINUX_CRED_H */ diff --git a/include/linux/securebits.h b/include/linux/securebits.h index 6d389491bfa2..d2c5ed845bcc 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -32,7 +32,7 @@ setting is locked or not. A setting which is locked cannot be changed from user-level. */ #define issecure_mask(X) (1 << (X)) -#define issecure(X) (issecure_mask(X) & current->cred->securebits) +#define issecure(X) (issecure_mask(X) & current_cred_xxx(securebits)) #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ -- cgit v1.2.3 From c69e8d9c01db2adc503464993c358901c9af9de4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:19 +1100 Subject: CRED: Use RCU to access another task's creds and to release a task's own creds Use RCU to access another task's creds and to release a task's own creds. This means that it will be possible for the credentials of a task to be replaced without another task (a) requiring a full lock to read them, and (b) seeing deallocated memory. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/cred.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 4221ec6000c1..166ce4ddba64 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -147,8 +147,9 @@ static inline struct cred *get_cred(struct cred *cred) * Release a reference to a set of credentials, deleting them when the last ref * is released. */ -static inline void put_cred(struct cred *cred) +static inline void put_cred(const struct cred *_cred) { + struct cred *cred = (struct cred *) _cred; if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } -- cgit v1.2.3 From bb952bb98a7e479262c7eb25d5592545a3af147d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:20 +1100 Subject: CRED: Separate per-task-group keyrings from signal_struct Separate per-task-group keyrings from signal_struct and dangle their anchor from the cred struct rather than the signal_struct. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/cred.h | 16 ++++++++++++++++ include/linux/key.h | 8 ++------ include/linux/sched.h | 6 ------ 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 166ce4ddba64..62b9e532422d 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -71,6 +71,21 @@ extern int groups_search(const struct group_info *, gid_t); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); +/* + * The common credentials for a thread group + * - shared by CLONE_THREAD + */ +#ifdef CONFIG_KEYS +struct thread_group_cred { + atomic_t usage; + pid_t tgid; /* thread group process ID */ + spinlock_t lock; + struct key *session_keyring; /* keyring inherited over fork */ + struct key *process_keyring; /* keyring private to this process */ + struct rcu_head rcu; /* RCU deletion hook */ +}; +#endif + /* * The security context of a task * @@ -114,6 +129,7 @@ struct cred { * keys to */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ + struct thread_group_cred *tgcred; /* thread-group shared credentials */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ diff --git a/include/linux/key.h b/include/linux/key.h index df709e1af3cd..0836cc838b0c 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,9 +278,7 @@ extern ctl_table key_sysctls[]; */ extern void switch_uid_keyring(struct user_struct *new_user); extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); -extern int copy_thread_group_keys(struct task_struct *tsk); extern void exit_keys(struct task_struct *tsk); -extern void exit_thread_group_keys(struct signal_struct *tg); extern int suid_keys(struct task_struct *tsk); extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); @@ -289,8 +287,8 @@ extern void key_init(void); #define __install_session_keyring(keyring) \ ({ \ - struct key *old_session = current->signal->session_keyring; \ - current->signal->session_keyring = keyring; \ + struct key *old_session = current->cred->tgcred->session_keyring; \ + current->cred->tgcred->session_keyring = keyring; \ old_session; \ }) @@ -308,9 +306,7 @@ extern void key_init(void); #define switch_uid_keyring(u) do { } while(0) #define __install_session_keyring(k) ({ NULL; }) #define copy_keys(f,t) 0 -#define copy_thread_group_keys(t) 0 #define exit_keys(t) do { } while(0) -#define exit_thread_group_keys(tg) do { } while(0) #define suid_keys(t) do { } while(0) #define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 740cf946c8cc..2913252989b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -571,12 +571,6 @@ struct signal_struct { */ struct rlimit rlim[RLIM_NLIMITS]; - /* keep the process-shared keyrings here so that they do the right - * thing in threads created with CLONE_THREAD */ -#ifdef CONFIG_KEYS - struct key *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ -#endif #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif -- cgit v1.2.3 From 745ca2475a6ac596e3d8d37c2759c0fbe2586227 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:22 +1100 Subject: CRED: Pass credentials through dentry_open() Pass credentials through dentry_open() so that the COW creds patch can have SELinux's flush_unauthorized_files() pass the appropriate creds back to itself when it opens its null chardev. The security_dentry_open() call also now takes a creds pointer, as does the dentry_open hook in struct security_operations. Signed-off-by: David Howells Acked-by: James Morris Signed-off-by: James Morris --- include/linux/fs.h | 4 +++- include/linux/security.h | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b3d404aaabed..3bfec1327b8d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; +struct cred; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -1673,7 +1674,8 @@ extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, extern long do_sys_open(int dfd, const char __user *filename, int flags, int mode); extern struct file *filp_open(const char *, int, int); -extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); +extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, + const struct cred *); extern int filp_close(struct file *, fl_owner_t id); extern char * getname(const char __user *); diff --git a/include/linux/security.h b/include/linux/security.h index 9239cc11eb9c..7e9fe046a0d1 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1402,7 +1402,7 @@ struct security_operations { int (*file_send_sigiotask) (struct task_struct *tsk, struct fown_struct *fown, int sig); int (*file_receive) (struct file *file); - int (*dentry_open) (struct file *file); + int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); int (*cred_alloc_security) (struct cred *cred); @@ -1658,7 +1658,7 @@ int security_file_set_fowner(struct file *file); int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig); int security_file_receive(struct file *file); -int security_dentry_open(struct file *file); +int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); int security_cred_alloc(struct cred *cred); void security_cred_free(struct cred *cred); @@ -2171,7 +2171,8 @@ static inline int security_file_receive(struct file *file) return 0; } -static inline int security_dentry_open(struct file *file) +static inline int security_dentry_open(struct file *file, + const struct cred *cred) { return 0; } -- cgit v1.2.3 From d84f4f992cbd76e8f39c488cf0c5d123843923b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:23 +1100 Subject: CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells Acked-by: James Morris Signed-off-by: James Morris --- include/linux/audit.h | 22 ++++-- include/linux/capability.h | 2 - include/linux/cred.h | 44 ++++++++--- include/linux/init_task.h | 2 + include/linux/key.h | 22 +----- include/linux/sched.h | 6 +- include/linux/security.h | 178 +++++++++++++++++++++------------------------ 7 files changed, 139 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 6fbebac7b1bf..0b2fcb698a63 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -454,8 +454,10 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout); extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); -extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); -extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm); +extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, + const struct cred *old); +extern int __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -522,16 +524,20 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) * * -Eric */ -static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, + const struct cred *old) { if (unlikely(!audit_dummy_context())) - __audit_log_bprm_fcaps(bprm, pP, pE); + return __audit_log_bprm_fcaps(bprm, new, old); + return 0; } -static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +static inline int audit_log_capset(pid_t pid, const struct cred *new, + const struct cred *old) { if (unlikely(!audit_dummy_context())) - return __audit_log_capset(pid, eff, inh, perm); + return __audit_log_capset(pid, new, old); return 0; } @@ -566,8 +572,8 @@ extern int audit_signals; #define audit_mq_timedreceive(d,l,p,t) ({ 0; }) #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) -#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) -#define audit_log_capset(pid, e, i, p) ({ 0; }) +#define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; }) +#define audit_log_capset(pid, ncr, ocr) ({ 0; }) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 diff --git a/include/linux/capability.h b/include/linux/capability.h index 7f26580a5a4d..e22f48c2a46f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -519,8 +519,6 @@ extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_full_set; extern const kernel_cap_t __cap_init_eff_set; -kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); - /** * has_capability - Determine if a task has a superior capability available * @t: The task in question diff --git a/include/linux/cred.h b/include/linux/cred.h index 62b9e532422d..eaf6fa695a04 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,6 +84,8 @@ struct thread_group_cred { struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; + +extern void release_tgcred(struct cred *cred); #endif /* @@ -137,11 +139,30 @@ struct cred { struct user_struct *user; /* real user ID subscription */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ - spinlock_t lock; /* lock for pointer changes */ }; extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); +extern struct cred *prepare_creds(void); +extern struct cred *prepare_usermodehelper_creds(void); +extern int commit_creds(struct cred *); +extern void abort_creds(struct cred *); +extern const struct cred *override_creds(const struct cred *) __deprecated; +extern void revert_creds(const struct cred *) __deprecated; +extern void __init cred_init(void); + +/** + * get_new_cred - Get a reference on a new set of credentials + * @cred: The new credentials to reference + * + * Get a reference on the specified set of new credentials. The caller must + * release the reference. + */ +static inline struct cred *get_new_cred(struct cred *cred) +{ + atomic_inc(&cred->usage); + return cred; +} /** * get_cred - Get a reference on a set of credentials @@ -150,10 +171,9 @@ extern int copy_creds(struct task_struct *, unsigned long); * Get a reference on the specified set of credentials. The caller must * release the reference. */ -static inline struct cred *get_cred(struct cred *cred) +static inline const struct cred *get_cred(const struct cred *cred) { - atomic_inc(&cred->usage); - return cred; + return get_new_cred((struct cred *) cred); } /** @@ -166,6 +186,8 @@ static inline struct cred *get_cred(struct cred *cred) static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; + + BUG_ON(atomic_read(&(cred)->usage) <= 0); if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } @@ -250,13 +272,13 @@ static inline void put_cred(const struct cred *_cred) __groups; \ }) -#define task_cred_xxx(task, xxx) \ -({ \ - __typeof__(task->cred->xxx) ___val; \ - rcu_read_lock(); \ - ___val = __task_cred((task))->xxx; \ - rcu_read_unlock(); \ - ___val; \ +#define task_cred_xxx(task, xxx) \ +({ \ + __typeof__(((struct cred *)NULL)->xxx) ___val; \ + rcu_read_lock(); \ + ___val = __task_cred((task))->xxx; \ + rcu_read_unlock(); \ + ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5e24c54b6dfd..08c3b24ad9a8 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -150,6 +150,8 @@ extern struct cred init_cred; .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ .cred = &init_cred, \ + .cred_exec_mutex = \ + __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/key.h b/include/linux/key.h index 0836cc838b0c..69ecf0934b02 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -73,6 +73,7 @@ struct key; struct seq_file; struct user_struct; struct signal_struct; +struct cred; struct key_type; struct key_owner; @@ -181,7 +182,7 @@ struct key { extern struct key *key_alloc(struct key_type *type, const char *desc, uid_t uid, gid_t gid, - struct task_struct *ctx, + const struct cred *cred, key_perm_t perm, unsigned long flags); @@ -249,7 +250,7 @@ extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - struct task_struct *ctx, + const struct cred *cred, unsigned long flags, struct key *dest); @@ -276,22 +277,12 @@ extern ctl_table key_sysctls[]; /* * the userspace interface */ -extern void switch_uid_keyring(struct user_struct *new_user); -extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_keys(struct task_struct *tsk); -extern int suid_keys(struct task_struct *tsk); +extern int install_thread_keyring_to_cred(struct cred *cred); extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); -#define __install_session_keyring(keyring) \ -({ \ - struct key *old_session = current->cred->tgcred->session_keyring; \ - current->cred->tgcred->session_keyring = keyring; \ - old_session; \ -}) - #else /* CONFIG_KEYS */ #define key_validate(k) 0 @@ -303,11 +294,6 @@ extern void key_init(void); #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 -#define switch_uid_keyring(u) do { } while(0) -#define __install_session_keyring(k) ({ NULL; }) -#define copy_keys(f,t) 0 -#define exit_keys(t) do { } while(0) -#define suid_keys(t) do { } while(0) #define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2913252989b3..121d655e460d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1145,7 +1145,8 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - struct cred *cred; /* actual/objective task credentials */ + const struct cred *cred; /* actual/objective task credentials (COW) */ + struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -1720,7 +1721,6 @@ static inline struct user_struct *get_uid(struct user_struct *u) return u; } extern void free_uid(struct user_struct *); -extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); #include @@ -1870,6 +1870,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +extern bool is_single_threaded(struct task_struct *); + /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. diff --git a/include/linux/security.h b/include/linux/security.h index 7e9fe046a0d1..68be11251447 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,24 +53,21 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz); extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern int cap_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); -extern void cap_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); +extern int cap_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); extern int cap_bprm_set_security(struct linux_binprm *bprm); -extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); -extern int cap_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); -extern void cap_task_reparent_to_init(struct task_struct *p); +extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p); + unsigned long arg4, unsigned long arg5); extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); @@ -170,8 +167,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Compute and set the security attributes of a process being transformed * by an execve operation based on the old attributes (current->security) * and the information saved in @bprm->security by the set_security hook. - * Since this hook function (and its caller) are void, this hook can not - * return an error. However, it can leave the security attributes of the + * Since this function may return an error, in which case the process will + * be killed. However, it can leave the security attributes of the * process unchanged if an access failure occurs at this point. * bprm_apply_creds is called under task_lock. @unsafe indicates various * reasons why it may be unsafe to change security state. @@ -593,15 +590,18 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. - * @cred_alloc_security: - * @cred contains the cred struct for child process. - * Allocate and attach a security structure to the cred->security field. - * The security field is initialized to NULL when the task structure is - * allocated. - * Return 0 if operation was successful. * @cred_free: * @cred points to the credentials. * Deallocate and clear the cred->security field in a set of credentials. + * @cred_prepare: + * @new points to the new credentials. + * @old points to the original credentials. + * @gfp indicates the atomicity of any memory allocations. + * Prepare a new set of credentials by copying the data from the old set. + * @cred_commit: + * @new points to the new credentials. + * @old points to the original credentials. + * Install a new set of credentials. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -614,15 +614,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @id2 contains a uid. * @flags contains one of the LSM_SETID_* values. * Return 0 if permission is granted. - * @task_post_setuid: + * @task_fix_setuid: * Update the module's state after setting one or more of the user * identity attributes of the current process. The @flags parameter * indicates which of the set*uid system calls invoked this hook. If - * @flags is LSM_SETID_FS, then @old_ruid is the old fs uid and the other - * parameters are not used. - * @old_ruid contains the old real uid (or fs uid if LSM_SETID_FS). - * @old_euid contains the old effective uid (or -1 if LSM_SETID_FS). - * @old_suid contains the old saved uid (or -1 if LSM_SETID_FS). + * @new is the set of credentials that will be installed. Modifications + * should be made to this rather than to @current->cred. + * @old is the set of credentials that are being replaces * @flags contains one of the LSM_SETID_* values. * Return 0 on success. * @task_setgid: @@ -725,13 +723,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @arg3 contains a argument. * @arg4 contains a argument. * @arg5 contains a argument. - * @rc_p contains a pointer to communicate back the forced return code - * Return 0 if permission is granted, and non-zero if the security module - * has taken responsibility (setting *rc_p) for the prctl call. - * @task_reparent_to_init: - * Set the security attributes in @p->security for a kernel thread that - * is being reparented to the init task. - * @p contains the task_struct for the kernel thread. + * Return -ENOSYS if no-one wanted to handle this op, any other value to + * cause prctl() to return immediately with that value. * @task_to_inode: * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. @@ -1008,7 +1001,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * See whether a specific operational right is granted to a process on a * key. * @key_ref refers to the key (key pointer + possession attribute bit). - * @context points to the process to provide the context against which to + * @cred points to the credentials to provide the context against which to * evaluate the security data on the key. * @perm describes the combination of permissions required of this key. * Return 1 if permission granted, 0 if permission denied and -ve it the @@ -1170,6 +1163,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @child process. * Security modules may also want to perform a process tracing check * during an execve in the set_security or apply_creds hooks of + * tracing check during an execve in the bprm_set_creds hook of * binprm_security_ops if the process is being traced and its security * attributes would be changed by the execve. * @child contains the task_struct structure for the target process. @@ -1193,19 +1187,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. * Return 0 if the capability sets were successfully obtained. - * @capset_check: - * Check permission before setting the @effective, @inheritable, and - * @permitted capability sets for the current process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 if permission is granted. - * @capset_set: + * @capset: * Set the @effective, @inheritable, and @permitted capability sets for * the current process. + * @new contains the new credentials structure for target process. + * @old contains the current credentials structure for target process. * @effective contains the effective capability set. * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. + * Return 0 and update @new if permission is granted. * @capable: * Check whether the @tsk process has the @cap capability. * @tsk contains the task_struct for the process. @@ -1297,12 +1287,11 @@ struct security_operations { int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset_check) (const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); - void (*capset_set) (const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); + int (*capset) (struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, int cap, int audit); int (*acct) (struct file *file); int (*sysctl) (struct ctl_table *table, int op); @@ -1314,7 +1303,7 @@ struct security_operations { int (*bprm_alloc_security) (struct linux_binprm *bprm); void (*bprm_free_security) (struct linux_binprm *bprm); - void (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); + int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); void (*bprm_post_apply_creds) (struct linux_binprm *bprm); int (*bprm_set_security) (struct linux_binprm *bprm); int (*bprm_check_security) (struct linux_binprm *bprm); @@ -1405,11 +1394,13 @@ struct security_operations { int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); - int (*cred_alloc_security) (struct cred *cred); void (*cred_free) (struct cred *cred); + int (*cred_prepare)(struct cred *new, const struct cred *old, + gfp_t gfp); + void (*cred_commit)(struct cred *new, const struct cred *old); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); - int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ , - uid_t old_euid, uid_t old_suid, int flags); + int (*task_fix_setuid) (struct cred *new, const struct cred *old, + int flags); int (*task_setgid) (gid_t id0, gid_t id1, gid_t id2, int flags); int (*task_setpgid) (struct task_struct *p, pid_t pgid); int (*task_getpgid) (struct task_struct *p); @@ -1429,8 +1420,7 @@ struct security_operations { int (*task_wait) (struct task_struct *p); int (*task_prctl) (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, long *rc_p); - void (*task_reparent_to_init) (struct task_struct *p); + unsigned long arg5); void (*task_to_inode) (struct task_struct *p, struct inode *inode); int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag); @@ -1535,10 +1525,10 @@ struct security_operations { /* key management security hooks */ #ifdef CONFIG_KEYS - int (*key_alloc) (struct key *key, struct task_struct *tsk, unsigned long flags); + int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags); void (*key_free) (struct key *key); int (*key_permission) (key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm); int (*key_getsecurity)(struct key *key, char **_buffer); #endif /* CONFIG_KEYS */ @@ -1564,12 +1554,10 @@ int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); -void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); +int security_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int security_capable(struct task_struct *tsk, int cap); int security_capable_noaudit(struct task_struct *tsk, int cap); int security_acct(struct file *file); @@ -1583,7 +1571,7 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); int security_bprm_alloc(struct linux_binprm *bprm); void security_bprm_free(struct linux_binprm *bprm); -void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); void security_bprm_post_apply_creds(struct linux_binprm *bprm); int security_bprm_set(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); @@ -1660,11 +1648,12 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); -int security_cred_alloc(struct cred *cred); void security_cred_free(struct cred *cred); +int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); +void security_commit_creds(struct cred *new, const struct cred *old); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); -int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags); +int security_task_fix_setuid(struct cred *new, const struct cred *old, + int flags); int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags); int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); @@ -1683,8 +1672,7 @@ int security_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid); int security_task_wait(struct task_struct *p); int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p); -void security_task_reparent_to_init(struct task_struct *p); + unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid); @@ -1759,18 +1747,13 @@ static inline int security_capget(struct task_struct *target, return cap_capget(target, effective, inheritable, permitted); } -static inline int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +static inline int security_capset(struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { - return cap_capset_check(effective, inheritable, permitted); -} - -static inline void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - cap_capset_set(effective, inheritable, permitted); + return cap_capset(new, old, effective, inheritable, permitted); } static inline int security_capable(struct task_struct *tsk, int cap) @@ -1837,9 +1820,9 @@ static inline int security_bprm_alloc(struct linux_binprm *bprm) static inline void security_bprm_free(struct linux_binprm *bprm) { } -static inline void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) { - cap_bprm_apply_creds(bprm, unsafe); + return cap_bprm_apply_creds(bprm, unsafe); } static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm) @@ -2182,13 +2165,20 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } -static inline int security_cred_alloc(struct cred *cred) +static inline void security_cred_free(struct cred *cred) +{ } + +static inline int security_prepare_creds(struct cred *new, + const struct cred *old, + gfp_t gfp) { return 0; } -static inline void security_cred_free(struct cred *cred) -{ } +static inline void security_commit_creds(struct cred *new, + const struct cred *old) +{ +} static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -2196,10 +2186,11 @@ static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, return 0; } -static inline int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags) +static inline int security_task_fix_setuid(struct cred *new, + const struct cred *old, + int flags) { - return cap_task_post_setuid(old_ruid, old_euid, old_suid, flags); + return cap_task_fix_setuid(new, old, flags); } static inline int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, @@ -2286,14 +2277,9 @@ static inline int security_task_wait(struct task_struct *p) static inline int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, long *rc_p) -{ - return cap_task_prctl(option, arg2, arg3, arg3, arg5, rc_p); -} - -static inline void security_task_reparent_to_init(struct task_struct *p) + unsigned long arg5) { - cap_task_reparent_to_init(p); + return cap_task_prctl(option, arg2, arg3, arg3, arg5); } static inline void security_task_to_inode(struct task_struct *p, struct inode *inode) @@ -2719,16 +2705,16 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY -int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags); +int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags); void security_key_free(struct key *key); int security_key_permission(key_ref_t key_ref, - struct task_struct *context, key_perm_t perm); + const struct cred *cred, key_perm_t perm); int security_key_getsecurity(struct key *key, char **_buffer); #else static inline int security_key_alloc(struct key *key, - struct task_struct *tsk, + const struct cred *cred, unsigned long flags) { return 0; @@ -2739,7 +2725,7 @@ static inline void security_key_free(struct key *key) } static inline int security_key_permission(key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm) { return 0; -- cgit v1.2.3 From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:24 +1100 Subject: CRED: Make execve() take advantage of copy-on-write credentials Make execve() take advantage of copy-on-write credentials, allowing it to set up the credentials in advance, and then commit the whole lot after the point of no return. This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). The credential bits from struct linux_binprm are, for the most part, replaced with a single credentials pointer (bprm->cred). This means that all the creds can be calculated in advance and then applied at the point of no return with no possibility of failure. I would like to replace bprm->cap_effective with: cap_isclear(bprm->cap_effective) but this seems impossible due to special behaviour for processes of pid 1 (they always retain their parent's capability masks where normally they'd be changed - see cap_bprm_set_creds()). The following sequence of events now happens: (a) At the start of do_execve, the current task's cred_exec_mutex is locked to prevent PTRACE_ATTACH from obsoleting the calculation of creds that we make. (a) prepare_exec_creds() is then called to make a copy of the current task's credentials and prepare it. This copy is then assigned to bprm->cred. This renders security_bprm_alloc() and security_bprm_free() unnecessary, and so they've been removed. (b) The determination of unsafe execution is now performed immediately after (a) rather than later on in the code. The result is stored in bprm->unsafe for future reference. (c) prepare_binprm() is called, possibly multiple times. (i) This applies the result of set[ug]id binaries to the new creds attached to bprm->cred. Personality bit clearance is recorded, but now deferred on the basis that the exec procedure may yet fail. (ii) This then calls the new security_bprm_set_creds(). This should calculate the new LSM and capability credentials into *bprm->cred. This folds together security_bprm_set() and parts of security_bprm_apply_creds() (these two have been removed). Anything that might fail must be done at this point. (iii) bprm->cred_prepared is set to 1. bprm->cred_prepared is 0 on the first pass of the security calculations, and 1 on all subsequent passes. This allows SELinux in (ii) to base its calculations only on the initial script and not on the interpreter. (d) flush_old_exec() is called to commit the task to execution. This performs the following steps with regard to credentials: (i) Clear pdeath_signal and set dumpable on certain circumstances that may not be covered by commit_creds(). (ii) Clear any bits in current->personality that were deferred from (c.i). (e) install_exec_creds() [compute_creds() as was] is called to install the new credentials. This performs the following steps with regard to credentials: (i) Calls security_bprm_committing_creds() to apply any security requirements, such as flushing unauthorised files in SELinux, that must be done before the credentials are changed. This is made up of bits of security_bprm_apply_creds() and security_bprm_post_apply_creds(), both of which have been removed. This function is not allowed to fail; anything that might fail must have been done in (c.ii). (ii) Calls commit_creds() to apply the new credentials in a single assignment (more or less). Possibly pdeath_signal and dumpable should be part of struct creds. (iii) Unlocks the task's cred_replace_mutex, thus allowing PTRACE_ATTACH to take place. (iv) Clears The bprm->cred pointer as the credentials it was holding are now immutable. (v) Calls security_bprm_committed_creds() to apply any security alterations that must be done after the creds have been changed. SELinux uses this to flush signals and signal handlers. (f) If an error occurs before (d.i), bprm_free() will call abort_creds() to destroy the proposed new credentials and will then unlock cred_replace_mutex. No changes to the credentials will have been made. (2) LSM interface. A number of functions have been changed, added or removed: (*) security_bprm_alloc(), ->bprm_alloc_security() (*) security_bprm_free(), ->bprm_free_security() Removed in favour of preparing new credentials and modifying those. (*) security_bprm_apply_creds(), ->bprm_apply_creds() (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds() Removed; split between security_bprm_set_creds(), security_bprm_committing_creds() and security_bprm_committed_creds(). (*) security_bprm_set(), ->bprm_set_security() Removed; folded into security_bprm_set_creds(). (*) security_bprm_set_creds(), ->bprm_set_creds() New. The new credentials in bprm->creds should be checked and set up as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the second and subsequent calls. (*) security_bprm_committing_creds(), ->bprm_committing_creds() (*) security_bprm_committed_creds(), ->bprm_committed_creds() New. Apply the security effects of the new credentials. This includes closing unauthorised files in SELinux. This function may not fail. When the former is called, the creds haven't yet been applied to the process; when the latter is called, they have. The former may access bprm->cred, the latter may not. (3) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) The bprm_security_struct struct has been removed in favour of using the credentials-under-construction approach. (c) flush_unauthorized_files() now takes a cred pointer and passes it on to inode_has_perm(), file_has_perm() and dentry_open(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/audit.h | 16 -------- include/linux/binfmts.h | 16 +++++--- include/linux/cred.h | 3 +- include/linux/key.h | 2 - include/linux/security.h | 103 +++++++++++++++++------------------------------ 5 files changed, 48 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 0b2fcb698a63..e8ce2c4c7ac7 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -508,22 +508,6 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return 0; } -/* - * ieieeeeee, an audit function without a return code! - * - * This function might fail! I decided that it didn't matter. We are too late - * to fail the syscall and the information isn't REQUIRED for any purpose. It's - * just nice to have. We should be able to look at past audit logs to figure - * out this process's current cap set along with the fcaps from the PATH record - * and use that to come up with the final set. Yeah, its ugly, but all the info - * is still in the audit log. So I'm not going to bother mentioning we failed - * if we couldn't allocate memory. - * - * If someone changes their mind they could create the aux record earlier and - * then search here and use that earlier allocation. But I don't wanna. - * - * -Eric - */ static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7394b5b349ff..6cbfbe297180 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -35,16 +35,20 @@ struct linux_binprm{ struct mm_struct *mm; unsigned long p; /* current top of mem */ unsigned int sh_bang:1, - misc_bang:1; + misc_bang:1, + cred_prepared:1,/* true if creds already prepared (multiple + * preps happen for interpreters) */ + cap_effective:1;/* true if has elevated effective capabilities, + * false if not; except for init which inherits + * its parent's caps anyway */ #ifdef __alpha__ unsigned int taso:1; #endif unsigned int recursion_depth; struct file * file; - int e_uid, e_gid; - kernel_cap_t cap_post_exec_permitted; - bool cap_effective; - void *security; + struct cred *cred; /* new credentials */ + int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ + unsigned int per_clear; /* bits to clear in current->personality */ int argc, envc; char * filename; /* Name of binary as seen by procps */ char * interp; /* Name of the binary really executed. Most @@ -101,7 +105,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack); extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); -extern void compute_creds(struct linux_binprm *binprm); +extern void install_exec_creds(struct linux_binprm *bprm); extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); extern int set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); diff --git a/include/linux/cred.h b/include/linux/cred.h index eaf6fa695a04..8edb4d1d5427 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,8 +84,6 @@ struct thread_group_cred { struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; - -extern void release_tgcred(struct cred *cred); #endif /* @@ -144,6 +142,7 @@ struct cred { extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); extern struct cred *prepare_creds(void); +extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); diff --git a/include/linux/key.h b/include/linux/key.h index 69ecf0934b02..21d32a142c00 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,7 +278,6 @@ extern ctl_table key_sysctls[]; * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); -extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); @@ -294,7 +293,6 @@ extern void key_init(void); #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 -#define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) diff --git a/include/linux/security.h b/include/linux/security.h index 68be11251447..56a0eed65673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -57,8 +57,7 @@ extern int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -extern int cap_bprm_set_security(struct linux_binprm *bprm); -extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +extern int cap_bprm_set_creds(struct linux_binprm *bprm); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); @@ -110,7 +109,7 @@ extern unsigned long mmap_min_addr; struct sched_param; struct request_sock; -/* bprm_apply_creds unsafe reasons */ +/* bprm->unsafe reasons */ #define LSM_UNSAFE_SHARE 1 #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 @@ -154,36 +153,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * * Security hooks for program execution operations. * - * @bprm_alloc_security: - * Allocate and attach a security structure to the @bprm->security field. - * The security field is initialized to NULL when the bprm structure is - * allocated. - * @bprm contains the linux_binprm structure to be modified. - * Return 0 if operation was successful. - * @bprm_free_security: - * @bprm contains the linux_binprm structure to be modified. - * Deallocate and clear the @bprm->security field. - * @bprm_apply_creds: - * Compute and set the security attributes of a process being transformed - * by an execve operation based on the old attributes (current->security) - * and the information saved in @bprm->security by the set_security hook. - * Since this function may return an error, in which case the process will - * be killed. However, it can leave the security attributes of the - * process unchanged if an access failure occurs at this point. - * bprm_apply_creds is called under task_lock. @unsafe indicates various - * reasons why it may be unsafe to change security state. - * @bprm contains the linux_binprm structure. - * @bprm_post_apply_creds: - * Runs after bprm_apply_creds with the task_lock dropped, so that - * functions which cannot be called safely under the task_lock can - * be used. This hook is a good place to perform state changes on - * the process such as closing open file descriptors to which access - * is no longer granted if the attributes were changed. - * Note that a security module might need to save state between - * bprm_apply_creds and bprm_post_apply_creds to store the decision - * on whether the process may proceed. - * @bprm contains the linux_binprm structure. - * @bprm_set_security: + * @bprm_set_creds: * Save security information in the bprm->security field, typically based * on information about the bprm->file, for later use by the apply_creds * hook. This hook may also optionally check permissions (e.g. for @@ -196,15 +166,30 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_check_security: - * This hook mediates the point when a search for a binary handler will - * begin. It allows a check the @bprm->security value which is set in - * the preceding set_security call. The primary difference from - * set_security is that the argv list and envp list are reliably - * available in @bprm. This hook may be called multiple times - * during a single execve; and in each pass set_security is called - * first. + * This hook mediates the point when a search for a binary handler will + * begin. It allows a check the @bprm->security value which is set in the + * preceding set_creds call. The primary difference from set_creds is + * that the argv list and envp list are reliably available in @bprm. This + * hook may be called multiple times during a single execve; and in each + * pass set_creds is called first. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. + * @bprm_committing_creds: + * Prepare to install the new security attributes of a process being + * transformed by an execve operation, based on the old credentials + * pointed to by @current->cred and the information set in @bprm->cred by + * the bprm_set_creds hook. @bprm points to the linux_binprm structure. + * This hook is a good place to perform state changes on the process such + * as closing open file descriptors to which access will no longer be + * granted when the attributes are changed. This is called immediately + * before commit_creds(). + * @bprm_committed_creds: + * Tidy up after the installation of the new security attributes of a + * process being transformed by an execve operation. The new credentials + * have, by this point, been set to @current->cred. @bprm points to the + * linux_binprm structure. This hook is a good place to perform state + * changes on the process such as clearing out non-inheritable signal + * state. This is called immediately after commit_creds(). * @bprm_secureexec: * Return a boolean value (0 or 1) indicating whether a "secure exec" * is required. The flag is passed in the auxiliary table @@ -1301,13 +1286,11 @@ struct security_operations { int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); - int (*bprm_alloc_security) (struct linux_binprm *bprm); - void (*bprm_free_security) (struct linux_binprm *bprm); - int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); - void (*bprm_post_apply_creds) (struct linux_binprm *bprm); - int (*bprm_set_security) (struct linux_binprm *bprm); + int (*bprm_set_creds) (struct linux_binprm *bprm); int (*bprm_check_security) (struct linux_binprm *bprm); int (*bprm_secureexec) (struct linux_binprm *bprm); + void (*bprm_committing_creds) (struct linux_binprm *bprm); + void (*bprm_committed_creds) (struct linux_binprm *bprm); int (*sb_alloc_security) (struct super_block *sb); void (*sb_free_security) (struct super_block *sb); @@ -1569,12 +1552,10 @@ int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); -int security_bprm_alloc(struct linux_binprm *bprm); -void security_bprm_free(struct linux_binprm *bprm); -int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); -void security_bprm_post_apply_creds(struct linux_binprm *bprm); -int security_bprm_set(struct linux_binprm *bprm); +int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); +void security_bprm_committing_creds(struct linux_binprm *bprm); +void security_bprm_committed_creds(struct linux_binprm *bprm); int security_bprm_secureexec(struct linux_binprm *bprm); int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); @@ -1812,32 +1793,22 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return cap_vm_enough_memory(mm, pages); } -static inline int security_bprm_alloc(struct linux_binprm *bprm) -{ - return 0; -} - -static inline void security_bprm_free(struct linux_binprm *bprm) -{ } - -static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static inline int security_bprm_set_creds(struct linux_binprm *bprm) { - return cap_bprm_apply_creds(bprm, unsafe); + return cap_bprm_set_creds(bprm); } -static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm) +static inline int security_bprm_check(struct linux_binprm *bprm) { - return; + return 0; } -static inline int security_bprm_set(struct linux_binprm *bprm) +static inline void security_bprm_committing_creds(struct linux_binprm *bprm) { - return cap_bprm_set_security(bprm); } -static inline int security_bprm_check(struct linux_binprm *bprm) +static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { - return 0; } static inline int security_bprm_secureexec(struct linux_binprm *bprm) -- cgit v1.2.3 From d76b0d9b2d87cfc95686e148767cbf7d0e22bdc0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:25 +1100 Subject: CRED: Use creds in file structs Attach creds to file structs and discard f_uid/f_gid. file_operations::open() methods (such as hppfs_open()) should use file->f_cred rather than current_cred(). At the moment file->f_cred will be current_cred() at this point. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3bfec1327b8d..c0fb6d81d89b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -827,7 +827,7 @@ struct file { fmode_t f_mode; loff_t f_pos; struct fown_struct f_owner; - unsigned int f_uid, f_gid; + const struct cred *f_cred; struct file_ra_state f_ra; u64 f_version; -- cgit v1.2.3 From 98870ab0a5a3f1822aee681d2997017e1c87d026 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Documentation Document credentials and the new credentials API. Signed-off-by: David Howells Signed-off-by: James Morris --- include/linux/cred.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 8edb4d1d5427..794aab5c66e5 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -1,4 +1,4 @@ -/* Credentials management +/* Credentials management - see Documentation/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -169,6 +169,12 @@ static inline struct cred *get_new_cred(struct cred *cred) * * Get a reference on the specified set of credentials. The caller must * release the reference. + * + * This is used to deal with a committed set of credentials. Although the + * pointer is const, this will temporarily discard the const and increment the + * usage count. The purpose of this is to attempt to catch at compile time the + * accidental alteration of a set of credentials that should be considered + * immutable. */ static inline const struct cred *get_cred(const struct cred *cred) { @@ -181,6 +187,10 @@ static inline const struct cred *get_cred(const struct cred *cred) * * Release a reference to a set of credentials, deleting them when the last ref * is released. + * + * This takes a const pointer to a set of credentials because the credentials + * on task_struct are attached by const pointers to prevent accidental + * alteration of otherwise immutable credential sets. */ static inline void put_cred(const struct cred *_cred) { -- cgit v1.2.3 From 3b11a1decef07c19443d24ae926982bc8ec9f4c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Differentiate objective and effective subjective credentials on a task Differentiate the objective and real subjective credentials from the effective subjective credentials on a task by introducing a second credentials pointer into the task_struct. task_struct::real_cred then refers to the objective and apparent real subjective credentials of a task, as perceived by the other tasks in the system. task_struct::cred then refers to the effective subjective credentials of a task, as used by that task when it's actually running. These are not visible to the other tasks in the system. __task_cred(task) then refers to the objective/real credentials of the task in question. current_cred() refers to the effective subjective credentials of the current task. prepare_creds() uses the objective creds as a base and commit_creds() changes both pointers in the task_struct (indeed commit_creds() requires them to be the same). override_creds() and revert_creds() change the subjective creds pointer only, and the former returns the old subjective creds. These are used by NFSD, faccessat() and do_coredump(), and will by used by CacheFiles. In SELinux, current_has_perm() is provided as an alternative to task_has_perm(). This uses the effective subjective context of current, whereas task_has_perm() uses the objective/real context of the subject. Signed-off-by: David Howells Signed-off-by: James Morris --- include/linux/cred.h | 29 +++++++++++++++-------------- include/linux/init_task.h | 1 + include/linux/sched.h | 5 ++++- 3 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 794aab5c66e5..55a9c995d694 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -146,8 +146,8 @@ extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); -extern const struct cred *override_creds(const struct cred *) __deprecated; -extern void revert_creds(const struct cred *) __deprecated; +extern const struct cred *override_creds(const struct cred *); +extern void revert_creds(const struct cred *); extern void __init cred_init(void); /** @@ -202,32 +202,32 @@ static inline void put_cred(const struct cred *_cred) } /** - * current_cred - Access the current task's credentials + * current_cred - Access the current task's subjective credentials * - * Access the credentials of the current task. + * Access the subjective credentials of the current task. */ #define current_cred() \ (current->cred) /** - * __task_cred - Access another task's credentials + * __task_cred - Access a task's objective credentials * @task: The task to query * - * Access the credentials of another task. The caller must hold the - * RCU readlock. + * Access the objective credentials of a task. The caller must hold the RCU + * readlock. * * The caller must make sure task doesn't go away, either by holding a ref on * task or by holding tasklist_lock to prevent it from being unlinked. */ #define __task_cred(task) \ - ((const struct cred *)(rcu_dereference((task)->cred))) + ((const struct cred *)(rcu_dereference((task)->real_cred))) /** - * get_task_cred - Get another task's credentials + * get_task_cred - Get another task's objective credentials * @task: The task to query * - * Get the credentials of a task, pinning them so that they can't go away. - * Accessing a task's credentials directly is not permitted. + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. * * The caller must make sure task doesn't go away, either by holding a ref on * task or by holding tasklist_lock to prevent it from being unlinked. @@ -243,10 +243,11 @@ static inline void put_cred(const struct cred *_cred) }) /** - * get_current_cred - Get the current task's credentials + * get_current_cred - Get the current task's subjective credentials * - * Get the credentials of the current task, pinning them so that they can't go - * away. Accessing the current task's credentials directly is not permitted. + * Get the subjective credentials of the current task, pinning them so that + * they can't go away. Accessing the current task's credentials directly is + * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 08c3b24ad9a8..2597858035cd 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,6 +149,7 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ + .real_cred = &init_cred, \ .cred = &init_cred, \ .cred_exec_mutex = \ __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 121d655e460d..3443123b0709 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1145,7 +1145,10 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - const struct cred *cred; /* actual/objective task credentials (COW) */ + const struct cred *real_cred; /* objective and real subjective task + * credentials (COW) */ + const struct cred *cred; /* effective (overridable) subjective task + * credentials (COW) */ struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ char comm[TASK_COMM_LEN]; /* executable name excluding path -- cgit v1.2.3 From 3a3b7ce9336952ea7b9564d976d068a238976c9d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:28 +1100 Subject: CRED: Allow kernel services to override LSM settings for task actions Allow kernel services to override LSM settings appropriate to the actions performed by a task by duplicating a set of credentials, modifying it and then using task_struct::cred to point to it when performing operations on behalf of a task. This is used, for example, by CacheFiles which has to transparently access the cache on behalf of a process that thinks it is doing, say, NFS accesses with a potentially inappropriate (with respect to accessing the cache) set of credentials. This patch provides two LSM hooks for modifying a task security record: (*) security_kernel_act_as() which allows modification of the security datum with which a task acts on other objects (most notably files). (*) security_kernel_create_files_as() which allows modification of the security datum that is used to initialise the security data on a file that a task creates. The patch also provides four new credentials handling functions, which wrap the LSM functions: (1) prepare_kernel_cred() Prepare a set of credentials for a kernel service to use, based either on a daemon's credentials or on init_cred. All the keyrings are cleared. (2) set_security_override() Set the LSM security ID in a set of credentials to a specific security context, assuming permission from the LSM policy. (3) set_security_override_from_ctx() As (2), but takes the security context as a string. (4) set_create_files_as() Set the file creation LSM security ID in a set of credentials to be the same as that on a particular inode. Signed-off-by: Casey Schaufler [Smack changes] Signed-off-by: David Howells Signed-off-by: James Morris --- include/linux/cred.h | 6 ++++++ include/linux/security.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 55a9c995d694..26c1ab179946 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -18,6 +18,7 @@ struct user_struct; struct cred; +struct inode; /* * COW Supplementary groups list @@ -148,6 +149,11 @@ extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); extern void revert_creds(const struct cred *); +extern struct cred *prepare_kernel_cred(struct task_struct *); +extern int change_create_files_as(struct cred *, struct inode *); +extern int set_security_override(struct cred *, u32); +extern int set_security_override_from_ctx(struct cred *, const char *); +extern int set_create_files_as(struct cred *, struct inode *); extern void __init cred_init(void); /** diff --git a/include/linux/security.h b/include/linux/security.h index 56a0eed65673..59a11e19b617 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -587,6 +587,19 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @new points to the new credentials. * @old points to the original credentials. * Install a new set of credentials. + * @kernel_act_as: + * Set the credentials for a kernel service to act as (subjective context). + * @new points to the credentials to be modified. + * @secid specifies the security ID to be set + * The current task must be the one that nominated @secid. + * Return 0 if successful. + * @kernel_create_files_as: + * Set the file creation context in a set of credentials to be the same as + * the objective context of the specified inode. + * @new points to the credentials to be modified. + * @inode points to the inode to use as a reference. + * The current task must be the one that nominated @inode. + * Return 0 if successful. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -1381,6 +1394,8 @@ struct security_operations { int (*cred_prepare)(struct cred *new, const struct cred *old, gfp_t gfp); void (*cred_commit)(struct cred *new, const struct cred *old); + int (*kernel_act_as)(struct cred *new, u32 secid); + int (*kernel_create_files_as)(struct cred *new, struct inode *inode); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); @@ -1632,6 +1647,8 @@ int security_task_create(unsigned long clone_flags); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); void security_commit_creds(struct cred *new, const struct cred *old); +int security_kernel_act_as(struct cred *new, u32 secid); +int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); @@ -2151,6 +2168,17 @@ static inline void security_commit_creds(struct cred *new, { } +static inline int security_kernel_act_as(struct cred *cred, u32 secid) +{ + return 0; +} + +static inline int security_kernel_create_files_as(struct cred *cred, + struct inode *inode) +{ + return 0; +} + static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { -- cgit v1.2.3 From f004f3ea34209d8b836426b26ade3dc502631b18 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Fri, 14 Nov 2008 00:24:34 +0000 Subject: phylib: make mdio-gpio work without OF (v4) make mdio-gpio work with non OpenFirmware gpio implementation. Aditional changes to mdio-gpio: - use gpio_request() and gpio_free() - place irq[] array in struct mdio_gpio_info - add module description, author and license - add note about compiling this driver as module - rename mdc and mdio function (were ugly names) - change MII to MDIO in bus name - add __init __exit to module (un)loading functions - probe fails if no phys added to the bus - kzalloc bitbang with sizeof(*bitbang) Changes since v3: - keep bus naming "%x" to be compatible with existing drivers. Changes since v2: - more #ifdefs reduction - platform driver will be registered on OF platforms also - unified platform and OF bus_id to phy%i Changes since v1: - removed NO_IRQ - reduced #idefs Laurent, please test this driver under OF. Signed-off-by: Paulius Zaleckas Signed-off-by: David S. Miller --- include/linux/mdio-gpio.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 include/linux/mdio-gpio.h (limited to 'include/linux') diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h new file mode 100644 index 000000000000..e9d3fdfe41d7 --- /dev/null +++ b/include/linux/mdio-gpio.h @@ -0,0 +1,25 @@ +/* + * MDIO-GPIO bus platform data structures + * + * Copyright (C) 2008, Paulius Zaleckas + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#ifndef __LINUX_MDIO_GPIO_H +#define __LINUX_MDIO_GPIO_H + +#include + +struct mdio_gpio_platform_data { + /* GPIO numbers for bus pins */ + unsigned int mdc; + unsigned int mdio; + + unsigned int phy_mask; + int irqs[PHY_MAX_ADDR]; +}; + +#endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From e8b2dfe9b4501ed0047459b2756ba26e5a940a69 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Sun, 16 Nov 2008 19:32:39 -0800 Subject: TPROXY: implemented IP_RECVORIGDSTADDR socket option In case UDP traffic is redirected to a local UDP socket, the originally addressed destination address/port cannot be recovered with the in-kernel tproxy. This patch adds an IP_RECVORIGDSTADDR sockopt that enables a IP_ORIGDSTADDR ancillary message in recvmsg(). This ancillary message contains the original destination address/port of the packet being received. Signed-off-by: Balazs Scheidler Signed-off-by: David S. Miller --- include/linux/in.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/in.h b/include/linux/in.h index db458beef19d..d60122a3a088 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -80,6 +80,10 @@ struct in_addr { /* BSD compatibility */ #define IP_RECVRETOPTS IP_RETOPTS +/* TProxy original addresses */ +#define IP_ORIGDSTADDR 20 +#define IP_RECVORIGDSTADDR IP_ORIGDSTADDR + /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ #define IP_PMTUDISC_WANT 1 /* Use per route hints */ -- cgit v1.2.3 From bbaffaca4810de1a25e32ecaf836eeaacc7a3d11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Nov 2008 19:37:55 -0800 Subject: rcu: Introduce hlist_nulls variant of hlist hlist uses NULL value to finish a chain. hlist_nulls variant use the low order bit set to 1 to signal an end-of-list marker. This allows to store many different end markers, so that some RCU lockless algos (used in TCP/UDP stack for example) can save some memory barriers in fast paths. Two new files are added : include/linux/list_nulls.h - mimics hlist part of include/linux/list.h, derived to hlist_nulls variant include/linux/rculist_nulls.h - mimics hlist part of include/linux/rculist.h, derived to hlist_nulls variant Only four helpers are declared for the moment : hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(), hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu() prefetches() were removed, since an end of list is not anymore NULL value. prefetches() could trigger useless (and possibly dangerous) memory transactions. Example of use (extracted from __udp4_lib_lookup()) struct sock *sk, *result; struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; int score, badness; rcu_read_lock(); begin: result = NULL; badness = -1; sk_nulls_for_each_rcu(sk, node, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { result = sk; badness = score; } } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != hash) goto begin; if (result) { if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, saddr, hnum, sport, daddr, dport, dif) < badness)) { sock_put(result); goto begin; } } rcu_read_unlock(); return result; Signed-off-by: Eric Dumazet Acked-by: Peter Zijlstra Signed-off-by: David S. Miller --- include/linux/list_nulls.h | 94 ++++++++++++++++++++++++++++++++++++ include/linux/rculist_nulls.h | 110 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 include/linux/list_nulls.h create mode 100644 include/linux/rculist_nulls.h (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h new file mode 100644 index 000000000000..93150ecf3ea4 --- /dev/null +++ b/include/linux/list_nulls.h @@ -0,0 +1,94 @@ +#ifndef _LINUX_LIST_NULLS_H +#define _LINUX_LIST_NULLS_H + +/* + * Special version of lists, where end of list is not a NULL pointer, + * but a 'nulls' marker, which can have many different values. + * (up to 2^31 different values guaranteed on all platforms) + * + * In the standard hlist, termination of a list is the NULL pointer. + * In this special 'nulls' variant, we use the fact that objects stored in + * a list are aligned on a word (4 or 8 bytes alignment). + * We therefore use the last significant bit of 'ptr' : + * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) + * Set to 0 : This is a pointer to some object (ptr) + */ + +struct hlist_nulls_head { + struct hlist_nulls_node *first; +}; + +struct hlist_nulls_node { + struct hlist_nulls_node *next, **pprev; +}; +#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ + ((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1))) + +#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) +/** + * ptr_is_a_nulls - Test if a ptr is a nulls + * @ptr: ptr to be tested + * + */ +static inline int is_a_nulls(const struct hlist_nulls_node *ptr) +{ + return ((unsigned long)ptr & 1); +} + +/** + * get_nulls_value - Get the 'nulls' value of the end of chain + * @ptr: end of chain + * + * Should be called only if is_a_nulls(ptr); + */ +static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) +{ + return ((unsigned long)ptr) >> 1; +} + +static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) +{ + return !h->pprev; +} + +static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) +{ + return is_a_nulls(h->first); +} + +static inline void __hlist_nulls_del(struct hlist_nulls_node *n) +{ + struct hlist_nulls_node *next = n->next; + struct hlist_nulls_node **pprev = n->pprev; + *pprev = next; + if (!is_a_nulls(next)) + next->pprev = pprev; +} + +/** + * hlist_nulls_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + */ +#define hlist_nulls_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @member: the name of the hlist_node within the struct. + * + */ +#define hlist_nulls_for_each_entry_from(tpos, pos, member) \ + for (; (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +#endif diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h new file mode 100644 index 000000000000..f9ddd03961a8 --- /dev/null +++ b/include/linux/rculist_nulls.h @@ -0,0 +1,110 @@ +#ifndef _LINUX_RCULIST_NULLS_H +#define _LINUX_RCULIST_NULLS_H + +#ifdef __KERNEL__ + +/* + * RCU-protected list version + */ +#include +#include + +/** + * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization + * @n: the element to delete from the hash list. + * + * Note: hlist_nulls_unhashed() on the node return true after this. It is + * useful for RCU based read lockfree traversal if the writer side + * must know if the list entry is still hashed or already unhashed. + * + * In particular, it means that we can not poison the forward pointers + * that may still be used for walking the hash list and we can only + * zero the pprev pointer so list_unhashed() will return true after + * this. + * + * The caller must take whatever precautions are necessary (such as + * holding appropriate locks) to avoid racing with another + * list-mutation primitive, such as hlist_nulls_add_head_rcu() or + * hlist_nulls_del_rcu(), running on this same list. However, it is + * perfectly legal to run concurrently with the _rcu list-traversal + * primitives, such as hlist_nulls_for_each_entry_rcu(). + */ +static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) +{ + if (!hlist_nulls_unhashed(n)) { + __hlist_nulls_del(n); + n->pprev = NULL; + } +} + +/** + * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization + * @n: the element to delete from the hash list. + * + * Note: hlist_nulls_unhashed() on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the hash list. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() + * or hlist_nulls_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_nulls_for_each_entry(). + */ +static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) +{ + __hlist_nulls_del(n); + n->pprev = LIST_POISON2; +} + +/** + * hlist_nulls_add_head_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the specified hlist_nulls, + * while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() + * or hlist_nulls_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *first = h->first; + + n->next = first; + n->pprev = &h->first; + rcu_assign_pointer(h->first, n); + if (!is_a_nulls(first)) + first->pprev = &n->next; +} +/** + * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_nulls_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_nulls_node within the struct. + * + */ +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference((head)->first); \ + (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference(pos->next)) + +#endif +#endif -- cgit v1.2.3 From 88ab1932eac721c6e7336708558fa5ed02c85c80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Nov 2008 19:39:21 -0800 Subject: udp: Use hlist_nulls in UDP RCU code This is a straightforward patch, using hlist_nulls infrastructure. RCUification already done on UDP two weeks ago. Using hlist_nulls permits us to avoid some memory barriers, both at lookup time and delete time. Patch is large because it adds new macros to include/net/sock.h. These macros will be used by TCP & DCCP in next patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rculist.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 3ba2998b22ba..e649bd3f2c97 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -383,22 +383,5 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference(pos->next)) -/** - * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct hlist_node to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. - * @next: the &struct hlist_node to use as a next cursor - * - * Special version of hlist_for_each_entry_rcu that make sure - * each next pointer is fetched before each iteration. - */ -#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \ - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference(next)) - #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 3f2c31d90327f21d76d296af34aa4ca547932ff4 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Sun, 16 Nov 2008 22:41:34 -0800 Subject: virtio_net: VIRTIO_NET_F_MSG_RXBUF (imprive rcv buffer allocation) If segmentation offload is enabled by the host, we currently allocate maximum sized packet buffers and pass them to the host. This uses up 20 ring entries, allowing us to supply only 20 packet buffers to the host with a 256 entry ring. This is a huge overhead when receiving small packets, and is most keenly felt when receiving MTU sized packets from off-host. The VIRTIO_NET_F_MRG_RXBUF feature flag is set by hosts which support using receive buffers which are smaller than the maximum packet size. In order to transfer large packets to the guest, the host merges together multiple receive buffers to form a larger logical buffer. The number of merged buffers is returned to the guest via a field in the virtio_net_hdr. Make use of this support by supplying single page receive buffers to the host. On receive, we extract the virtio_net_hdr, copy 128 bytes of the payload to the skb's linear data buffer and adjust the fragment offset to point to the remaining data. This ensures proper alignment and allows us to not use any paged data for small packets. If the payload occupies multiple pages, we simply append those pages as fragments and free the associated skbs. This scheme allows us to be efficient in our use of ring entries while still supporting large packets. Benchmarking using netperf from an external machine to a guest over a 10Gb/s network shows a 100% improvement from ~1Gb/s to ~2Gb/s. With a local host->guest benchmark with GSO disabled on the host side, throughput was seen to increase from 700Mb/s to 1.7Gb/s. Based on a patch from Herbert Xu. Signed-off-by: Mark McLoughlin Signed-off-by: Rusty Russell (use netdev_priv) Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 5e33761b9b8a..5cdd0aa8bde9 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -20,6 +20,7 @@ #define VIRTIO_NET_F_HOST_TSO6 12 /* Host can handle TSOv6 in. */ #define VIRTIO_NET_F_HOST_ECN 13 /* Host can handle TSO[6] w/ ECN in. */ #define VIRTIO_NET_F_HOST_UFO 14 /* Host can handle UFO in. */ +#define VIRTIO_NET_F_MRG_RXBUF 15 /* Host can merge receive buffers. */ struct virtio_net_config { @@ -44,4 +45,12 @@ struct virtio_net_hdr __u16 csum_start; /* Position to start checksumming from */ __u16 csum_offset; /* Offset after that to place checksum */ }; + +/* This is the version of the header to use when the MRG_RXBUF + * feature has been negotiated. */ +struct virtio_net_hdr_mrg_rxbuf { + struct virtio_net_hdr hdr; + __u16 num_buffers; /* Number of merged rx buffers */ +}; + #endif /* _LINUX_VIRTIO_NET_H */ -- cgit v1.2.3 From 49aebc66d6b896f9c7c5739d85c4548c00015aa7 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 16 Nov 2008 22:51:23 -0800 Subject: dccp: Deprecate old setsockopt framework The previous setsockopt interface, which passed socket options via struct dccp_so_feat, is complicated/difficult to use. Continuing to support it leads to ugly code since the old approach did not distinguish between NN and SP values. This patch removes the old setsockopt interface and replaces it with two new functions to register NN/SP values for feature negotiation. These are essentially wrappers around the internal __feat_register functions, with checking added to avoid * wrong usage (type); * changing values while the connection is in progress. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- include/linux/dccp.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index d3ac1bde60b4..6eaaca9b037a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -193,13 +193,6 @@ enum dccp_feature_numbers { DCCPF_MAX_CCID_SPECIFIC = 255, }; -/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ -struct dccp_so_feat { - __u8 dccpsf_feat; - __u8 __user *dccpsf_val; - __u8 dccpsf_len; -}; - /* DCCP socket options */ #define DCCP_SOCKOPT_PACKET_SIZE 1 /* XXX deprecated, without effect */ #define DCCP_SOCKOPT_SERVICE 2 -- cgit v1.2.3 From 29450559849da7066813601effb7666966869853 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 16 Nov 2008 22:53:48 -0800 Subject: dccp: Feature negotiation for minimum-checksum-coverage This provides feature negotiation for server minimum checksum coverage which so far has been missing. Since sender/receiver coverage values range only from 0...15, their type has also been reduced in size from u16 to u4. Feature-negotiation options are now generated for both sender and receiver coverage, i.e. when the peer has `forgotten' to enable partial coverage then feature negotiation will automatically enable (negotiate) the partial coverage value for this connection. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6eaaca9b037a..5a5a89935dbc 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -527,8 +527,8 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; - __u16 dccps_pcslen; - __u16 dccps_pcrlen; + __u8 dccps_pcslen:4; + __u8 dccps_pcrlen:4; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; -- cgit v1.2.3 From dd9c0e363cef32b7d6f23d4c87e8dfe4f91fd1c5 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 16 Nov 2008 22:55:08 -0800 Subject: dccp: Deprecate Ack Ratio sysctl This patch deprecates the Ack Ratio sysctl, since * Ack Ratio is entirely ignored by CCID-3 and CCID-4, * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1); * even if it would work in CCID-2, there is no point for a user to change it: - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2), - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts (since waiting for Acks which will never arrive in this window), - cwnd is not a user-configurable value. The only reasonable place for Ack Ratio is to print it for debugging. It is planned to do this later on, as part of e.g. dccp_probe. With this patch Ack Ratio is now under full control of feature negotiation: * Ack Ratio is resolved as a dependency of the selected CCID; * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to the default of 2, following RFC 4340, 11.3 - "New connections start with Ack Ratio 2 for both endpoints"; * what happens then is part of another patch set, since it concerns the dynamic update of Ack Ratio while the connection is in full flight. Thanks to Tomasz Grobelny for discussion leading up to this patch. Signed-off-by: Gerrit Renker Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/dccp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 5a5a89935dbc..eda389ce04f4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -368,7 +368,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) - * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ @@ -378,7 +377,6 @@ struct dccp_minisock { __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; - __u8 dccpms_ack_ratio; struct list_head dccpms_pending; struct list_head dccpms_conf; }; -- cgit v1.2.3 From 1e291b14c8f1101b9093434489bd4dc0e03f3d0f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 12 Nov 2008 18:54:42 +0000 Subject: of: Add helpers for finding device nodes which have a given property This commit adds a routine for finding a device node which has a certain property. The contents of the property are not taken into account, merely the presence or absence of the property. Based on that routine, we add a for_each_ macro for iterating over all nodes that have a certain property. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras --- include/linux/of.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index e2488f5e7cb2..6a7efa242f5e 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -57,6 +57,12 @@ extern struct device_node *of_get_next_child(const struct device_node *node, for (child = of_get_next_child(parent, NULL); child != NULL; \ child = of_get_next_child(parent, child)) +extern struct device_node *of_find_node_with_property( + struct device_node *from, const char *prop_name); +#define for_each_node_with_property(dn, prop_name) \ + for (dn = of_find_node_with_property(NULL, prop_name); dn; \ + dn = of_find_node_with_property(dn, prop_name)) + extern struct property *of_find_property(const struct device_node *np, const char *name, int *lenp); -- cgit v1.2.3 From d314774cf2cd5dfeb39a00d37deee65d4c627927 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 19 Nov 2008 21:32:24 -0800 Subject: netdev: network device operations infrastructure This patch changes the network device internal API to move adminstrative operations out of the network device structure and into a separate structure. This patch involves some hackery to maintain compatablity between the new and old model, so all 300+ drivers don't have to be changed at once. For drivers that aren't converted yet, the netdevice_ops virt function list still resides in the net_device structure. For old protocols, the new net_device_ops are copied out to the old net_device pointers. After the transistion is completed the nag message can be changed to an WARN_ON, and the compatiablity code can be made configurable. Some function pointers aren't moved: * destructor can't be in net_device_ops because it may need to be referenced after the module is unloaded. * neighbor setup is manipulated in a couple of places that need special consideration * hard_start_xmit is in the fast path for transmit. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 232 +++++++++++++++++++++++++++++++++------------- 1 file changed, 168 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 12d7f4469dc9..9060f5f3517a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -451,6 +451,131 @@ struct netdev_queue { struct Qdisc *qdisc_sleeping; } ____cacheline_aligned_in_smp; + +/* + * This structure defines the management hooks for network devices. + * The following hooks can bed defined and are optonal (can be null) + * unless otherwise noted. + * + * int (*ndo_init)(struct net_device *dev); + * This function is called once when network device is registered. + * The network device can use this to any late stage initializaton + * or semantic validattion. It can fail with an error code which will + * be propogated back to register_netdev + * + * void (*ndo_uninit)(struct net_device *dev); + * This function is called when device is unregistered or when registration + * fails. It is not called if init fails. + * + * int (*ndo_open)(struct net_device *dev); + * This function is called when network device transistions to the up + * state. + * + * int (*ndo_stop)(struct net_device *dev); + * This function is called when network device transistions to the down + * state. + * + * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); + * This function is called to allow device receiver to make + * changes to configuration when multicast or promiscious is enabled. + * + * void (*ndo_set_rx_mode)(struct net_device *dev); + * This function is called device changes address list filtering. + * + * void (*ndo_set_multicast_list)(struct net_device *dev); + * This function is called when the multicast address list changes. + * + * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); + * This function is called when the Media Access Control address + * needs to be changed. If not this interface is not defined, the + * mac address can not be changed. + * + * int (*ndo_validate_addr)(struct net_device *dev); + * Test if Media Access Control address is valid for the device. + * + * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); + * Called when a user request an ioctl which can't be handled by + * the generic interface code. If not defined ioctl's return + * not supported error code. + * + * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); + * Used to set network devices bus interface parameters. This interface + * is retained for legacy reason, new devices should use the bus + * interface (PCI) for low level management. + * + * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); + * Called when a user wants to change the Maximum Transfer Unit + * of a device. If not defined, any request to change MTU will + * will return an error. + * + * void (*ndo_tx_timeout) (struct net_device *dev); + * Callback uses when the transmitter has not made any progress + * for dev->watchdog ticks. + * + * struct net_device_stats* (*get_stats)(struct net_device *dev); + * Called when a user wants to get the network device usage + * statistics. If not defined, the counters in dev->stats will + * be used. + * + * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp); + * If device support VLAN receive accleration + * (ie. dev->features & NETIF_F_HW_VLAN_RX), then this function is called + * when vlan groups for the device changes. Note: grp is NULL + * if no vlan's groups are being used. + * + * void (*ndo_vlan_rx_add_vid)(struct net_device *dev, unsigned short vid); + * If device support VLAN filtering (dev->features & NETIF_F_HW_VLAN_FILTER) + * this function is called when a VLAN id is registered. + * + * void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid); + * If device support VLAN filtering (dev->features & NETIF_F_HW_VLAN_FILTER) + * this function is called when a VLAN id is unregistered. + * + * void (*ndo_poll_controller)(struct net_device *dev); + */ +struct net_device_ops { + int (*ndo_init)(struct net_device *dev); + void (*ndo_uninit)(struct net_device *dev); + int (*ndo_open)(struct net_device *dev); + int (*ndo_stop)(struct net_device *dev); +#define HAVE_CHANGE_RX_FLAGS + void (*ndo_change_rx_flags)(struct net_device *dev, + int flags); +#define HAVE_SET_RX_MODE + void (*ndo_set_rx_mode)(struct net_device *dev); +#define HAVE_MULTICAST + void (*ndo_set_multicast_list)(struct net_device *dev); +#define HAVE_SET_MAC_ADDR + int (*ndo_set_mac_address)(struct net_device *dev, + void *addr); +#define HAVE_VALIDATE_ADDR + int (*ndo_validate_addr)(struct net_device *dev); +#define HAVE_PRIVATE_IOCTL + int (*ndo_do_ioctl)(struct net_device *dev, + struct ifreq *ifr, int cmd); +#define HAVE_SET_CONFIG + int (*ndo_set_config)(struct net_device *dev, + struct ifmap *map); +#define HAVE_CHANGE_MTU + int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); + +#define HAVE_TX_TIMEOUT + void (*ndo_tx_timeout) (struct net_device *dev); + + struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); + + void (*ndo_vlan_rx_register)(struct net_device *dev, + struct vlan_group *grp); + void (*ndo_vlan_rx_add_vid)(struct net_device *dev, + unsigned short vid); + void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, + unsigned short vid); +#ifdef CONFIG_NET_POLL_CONTROLLER +#define HAVE_NETDEV_POLL + void (*ndo_poll_controller)(struct net_device *dev); +#endif +}; + /* * The DEVICE structure. * Actually, this whole structure is a big mistake. It mixes I/O @@ -498,11 +623,6 @@ struct net_device #ifdef CONFIG_NETPOLL struct list_head napi_list; #endif - - /* The device initialization function. Called only once. */ - int (*init)(struct net_device *dev); - - /* ------- Fields preinitialized in Space.c finish here ------- */ /* Net device features */ unsigned long features; @@ -546,15 +666,13 @@ struct net_device * for all in netdev_increment_features. */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ - NETIF_F_SG | NETIF_F_HIGHDMA | \ + NETIF_F_SG | NETIF_F_HIGHDMA | \ NETIF_F_FRAGLIST) /* Interface index. Unique device identifier */ int ifindex; int iflink; - - struct net_device_stats* (*get_stats)(struct net_device *dev); struct net_device_stats stats; #ifdef CONFIG_WIRELESS_EXT @@ -564,18 +682,13 @@ struct net_device /* Instance data managed by the core of Wireless Extensions. */ struct iw_public_data * wireless_data; #endif + /* Management operations */ + const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; /* Hardware header description */ const struct header_ops *header_ops; - /* - * This marks the end of the "visible" part of the structure. All - * fields hereafter are internal to the system, and may change at - * will (read: may be cleaned up at will). - */ - - unsigned int flags; /* interface flags (a la BSD) */ unsigned short gflags; unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */ @@ -634,7 +747,7 @@ struct net_device unsigned long last_rx; /* Time of last Rx */ /* Interface address info used in eth_type_trans() */ unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast - because most packets are unicast) */ + because most packets are unicast) */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ @@ -648,6 +761,10 @@ struct net_device /* Number of TX queues currently active in device */ unsigned int real_num_tx_queues; + /* Map buffer to appropriate transmit queue */ + u16 (*select_queue)(struct net_device *dev, + struct sk_buff *skb); + unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; /* @@ -662,9 +779,6 @@ struct net_device int watchdog_timeo; /* used by dev_watchdog() */ struct timer_list watchdog_timer; -/* - * refcnt is a very hot point, so align it on SMP - */ /* Number of references to this device */ atomic_t refcnt ____cacheline_aligned_in_smp; @@ -683,56 +797,14 @@ struct net_device NETREG_RELEASED, /* called free_netdev */ } reg_state; - /* Called after device is detached from network. */ - void (*uninit)(struct net_device *dev); - /* Called after last user reference disappears. */ - void (*destructor)(struct net_device *dev); + /* Called from unregister, can be used to call free_netdev */ + void (*destructor)(struct net_device *dev); - /* Pointers to interface service routines. */ - int (*open)(struct net_device *dev); - int (*stop)(struct net_device *dev); -#define HAVE_NETDEV_POLL -#define HAVE_CHANGE_RX_FLAGS - void (*change_rx_flags)(struct net_device *dev, - int flags); -#define HAVE_SET_RX_MODE - void (*set_rx_mode)(struct net_device *dev); -#define HAVE_MULTICAST - void (*set_multicast_list)(struct net_device *dev); -#define HAVE_SET_MAC_ADDR - int (*set_mac_address)(struct net_device *dev, - void *addr); -#define HAVE_VALIDATE_ADDR - int (*validate_addr)(struct net_device *dev); -#define HAVE_PRIVATE_IOCTL - int (*do_ioctl)(struct net_device *dev, - struct ifreq *ifr, int cmd); -#define HAVE_SET_CONFIG - int (*set_config)(struct net_device *dev, - struct ifmap *map); -#define HAVE_CHANGE_MTU - int (*change_mtu)(struct net_device *dev, int new_mtu); + int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); -#define HAVE_TX_TIMEOUT - void (*tx_timeout) (struct net_device *dev); - - void (*vlan_rx_register)(struct net_device *dev, - struct vlan_group *grp); - void (*vlan_rx_add_vid)(struct net_device *dev, - unsigned short vid); - void (*vlan_rx_kill_vid)(struct net_device *dev, - unsigned short vid); - - int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); #ifdef CONFIG_NETPOLL struct netpoll_info *npinfo; #endif -#ifdef CONFIG_NET_POLL_CONTROLLER - void (*poll_controller)(struct net_device *dev); -#endif - - u16 (*select_queue)(struct net_device *dev, - struct sk_buff *skb); #ifdef CONFIG_NET_NS /* Network namespace this network device is inside */ @@ -763,6 +835,38 @@ struct net_device /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; + +#ifdef CONFIG_COMPAT_NET_DEV_OPS + struct { + int (*init)(struct net_device *dev); + void (*uninit)(struct net_device *dev); + int (*open)(struct net_device *dev); + int (*stop)(struct net_device *dev); + void (*change_rx_flags)(struct net_device *dev, + int flags); + void (*set_rx_mode)(struct net_device *dev); + void (*set_multicast_list)(struct net_device *dev); + int (*set_mac_address)(struct net_device *dev, + void *addr); + int (*validate_addr)(struct net_device *dev); + int (*do_ioctl)(struct net_device *dev, + struct ifreq *ifr, int cmd); + int (*set_config)(struct net_device *dev, + struct ifmap *map); + int (*change_mtu)(struct net_device *dev, int new_mtu); + void (*tx_timeout) (struct net_device *dev); + struct net_device_stats* (*get_stats)(struct net_device *dev); + void (*vlan_rx_register)(struct net_device *dev, + struct vlan_group *grp); + void (*vlan_rx_add_vid)(struct net_device *dev, + unsigned short vid); + void (*vlan_rx_kill_vid)(struct net_device *dev, + unsigned short vid); +#ifdef CONFIG_NET_POLL_CONTROLLER + void (*poll_controller)(struct net_device *dev); +#endif +#endif + }; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From eeda3fd64f75bcbfaa70ce946513abaf3f23b8e0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 19 Nov 2008 21:40:23 -0800 Subject: netdev: introduce dev_get_stats() In order for the network device ops get_stats call to be immutable, the handling of the default internal network device stats block has to be changed. Add a new helper function which replaces the old use of internal_get_stats. Note: change return code to make it clear that the caller should not go changing the returned statistics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9060f5f3517a..981a089d5149 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -864,9 +864,9 @@ struct net_device unsigned short vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*poll_controller)(struct net_device *dev); -#endif #endif }; +#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -1780,6 +1780,8 @@ extern void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ extern void dev_load(struct net *net, const char *name); extern void dev_mcast_init(void); +extern const struct net_device_stats *dev_get_stats(struct net_device *dev); + extern int netdev_max_backlog; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); -- cgit v1.2.3 From ccad637b0c57de1825ffd34c311bf71487545ac2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 19 Nov 2008 22:42:31 -0800 Subject: netdev: expose ethernet address primitives When ethernet devices are converted, the function pointer setup by eth_setup() need to be done during intialization. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 25d62e6e3290..0e5e97060034 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -41,6 +41,10 @@ extern int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh); extern void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); +extern int eth_mac_addr(struct net_device *dev, void *p); +extern int eth_change_mtu(struct net_device *dev, int new_mtu); +extern int eth_validate_addr(struct net_device *dev); + extern struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count); -- cgit v1.2.3 From d214c7537bbf2f247991fb65b3420b0b3d712c67 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 20 Nov 2008 00:49:27 -0800 Subject: filter: add SKF_AD_NLATTR_NEST to look for nested attributes SKF_AD_NLATTR allows us to find the first matching attribute in a stream of netlink attributes from one offset to the end of the netlink message. This is not suitable to look for a specific matching inside a set of nested attributes. For example, in ctnetlink messages, if we look for the CTA_V6_SRC attribute in a message that talks about an IPv4 connection, SKF_AD_NLATTR returns the offset of CTA_STATUS which has the same value of CTA_V6_SRC but outside the nest. To differenciate CTA_STATUS and CTA_V6_SRC, we would have to make assumptions on the size of the attribute and the usual offset, resulting in horrible BSF code. This patch adds SKF_AD_NLATTR_NEST, which is a variant of SKF_AD_NLATTR, that looks for an attribute inside the limits of a nested attributes, but not further. This patch validates that we have enough room to look for the nested attributes - based on a suggestion from Patrick McHardy. Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index b6ea9aa9e853..1354aaf6abbe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -122,7 +122,8 @@ struct sock_fprog /* Required for SO_ATTACH_FILTER. */ #define SKF_AD_PKTTYPE 4 #define SKF_AD_IFINDEX 8 #define SKF_AD_NLATTR 12 -#define SKF_AD_MAX 16 +#define SKF_AD_NLATTR_NEST 16 +#define SKF_AD_MAX 20 #define SKF_NET_OFF (-0x100000) #define SKF_LL_OFF (-0x200000) -- cgit v1.2.3 From 0c19b0adb8dd33dbd10ff48e41971231c486855c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 20 Nov 2008 04:08:29 -0800 Subject: netlink: avoid memset of 0 bytes sparse warning A netlink attribute padding of zero triggers this sparse warning: include/linux/netlink.h:245:8: warning: memset with byte count of 0 Avoid the memset when the size parameter is constant and requires no padding. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 9ff1b54908f3..51b09a1f46c3 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -242,7 +242,8 @@ __nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len, int flags) nlh->nlmsg_flags = flags; nlh->nlmsg_pid = pid; nlh->nlmsg_seq = seq; - memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size); + if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) + memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } -- cgit v1.2.3 From 13d2a1d2b032de08d7dcab6a1edcd47802681f96 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 20 Nov 2008 04:10:00 -0800 Subject: pkt_sched: add DRR scheduler Add classful DRR scheduler as a more flexible replacement for SFQ. The main difference to the algorithm described in "Efficient Fair Queueing using Deficit Round Robin" is that this implementation doesn't drop packets from the longest queue on overrun because its classful and limits are handled by each individual child qdisc. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 5d921fa91a5b..e3f133adba78 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -500,4 +500,20 @@ struct tc_netem_corrupt #define NETEM_DIST_SCALE 8192 +/* DRR */ + +enum +{ + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats +{ + u32 deficit; +}; + #endif -- cgit v1.2.3 From 018a7bf1e55000dd792194238c9043918d24d3dd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 20 Nov 2008 15:59:56 +0100 Subject: netfilter: ip{,6}t_policy.h should include xp_policy.h It seems that all of the include/netfilter_{ipv4,ipv6}/{ipt,ip6t}_*.h which share constants include the corresponding include/netfilter/xp_*.h files. Neither ipt_policy.h not ip6t_policy.h do. Make these consistant with the norm. Signed-off-by: Andy Whitcroft Signed-off-by: Patrick McHardy --- include/linux/netfilter_ipv4/ipt_policy.h | 2 ++ include/linux/netfilter_ipv6/ip6t_policy.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h index b9478a255301..1037fb2cd206 100644 --- a/include/linux/netfilter_ipv4/ipt_policy.h +++ b/include/linux/netfilter_ipv4/ipt_policy.h @@ -1,6 +1,8 @@ #ifndef _IPT_POLICY_H #define _IPT_POLICY_H +#include + #define IPT_POLICY_MAX_ELEM XT_POLICY_MAX_ELEM /* ipt_policy_flags */ diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h index 6bab3163d2fb..b1c449d7ec89 100644 --- a/include/linux/netfilter_ipv6/ip6t_policy.h +++ b/include/linux/netfilter_ipv6/ip6t_policy.h @@ -1,6 +1,8 @@ #ifndef _IP6T_POLICY_H #define _IP6T_POLICY_H +#include + #define IP6T_POLICY_MAX_ELEM XT_POLICY_MAX_ELEM /* ip6t_policy_flags */ -- cgit v1.2.3 From 008298231abbeb91bc7be9e8b078607b816d1a4a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 20 Nov 2008 20:14:53 -0800 Subject: netdev: add more functions to netdevice ops This patch moves neigh_setup and hard_start_xmit into the network device ops structure. For bisection, fix all the previously converted drivers as well. Bonding driver took the biggest hit on this. Added a prefetch of the hard_start_xmit in the fast path to try and reduce any impact this would have. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 981a089d5149..d8fb23679ee3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -454,8 +454,8 @@ struct netdev_queue { /* * This structure defines the management hooks for network devices. - * The following hooks can bed defined and are optonal (can be null) - * unless otherwise noted. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. * * int (*ndo_init)(struct net_device *dev); * This function is called once when network device is registered. @@ -475,6 +475,15 @@ struct netdev_queue { * This function is called when network device transistions to the down * state. * + * int (*ndo_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); + * Called when a packet needs to be transmitted. + * Must return NETDEV_TX_OK , NETDEV_TX_BUSY, or NETDEV_TX_LOCKED, + * Required can not be NULL. + * + * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb); + * Called to decide which queue to when device supports multiple + * transmit queues. + * * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); * This function is called to allow device receiver to make * changes to configuration when multicast or promiscious is enabled. @@ -508,7 +517,7 @@ struct netdev_queue { * of a device. If not defined, any request to change MTU will * will return an error. * - * void (*ndo_tx_timeout) (struct net_device *dev); + * void (*ndo_tx_timeout)(struct net_device *dev); * Callback uses when the transmitter has not made any progress * for dev->watchdog ticks. * @@ -538,6 +547,10 @@ struct net_device_ops { void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); + int (*ndo_start_xmit) (struct sk_buff *skb, + struct net_device *dev); + u16 (*ndo_select_queue)(struct net_device *dev, + struct sk_buff *skb); #define HAVE_CHANGE_RX_FLAGS void (*ndo_change_rx_flags)(struct net_device *dev, int flags); @@ -557,8 +570,10 @@ struct net_device_ops { int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); #define HAVE_CHANGE_MTU - int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); - + int (*ndo_change_mtu)(struct net_device *dev, + int new_mtu); + int (*ndo_neigh_setup)(struct net_device *dev, + struct neigh_parms *); #define HAVE_TX_TIMEOUT void (*ndo_tx_timeout) (struct net_device *dev); @@ -761,18 +776,12 @@ struct net_device /* Number of TX queues currently active in device */ unsigned int real_num_tx_queues; - /* Map buffer to appropriate transmit queue */ - u16 (*select_queue)(struct net_device *dev, - struct sk_buff *skb); - unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; /* * One part is mostly used on xmit path (device) */ void *priv; /* pointer to private data */ - int (*hard_start_xmit) (struct sk_buff *skb, - struct net_device *dev); /* These may be needed for future network-power-down code. */ unsigned long trans_start; /* Time (in jiffies) of last Tx */ @@ -800,8 +809,6 @@ struct net_device /* Called from unregister, can be used to call free_netdev */ void (*destructor)(struct net_device *dev); - int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); - #ifdef CONFIG_NETPOLL struct netpoll_info *npinfo; #endif @@ -842,6 +849,10 @@ struct net_device void (*uninit)(struct net_device *dev); int (*open)(struct net_device *dev); int (*stop)(struct net_device *dev); + int (*hard_start_xmit) (struct sk_buff *skb, + struct net_device *dev); + u16 (*select_queue)(struct net_device *dev, + struct sk_buff *skb); void (*change_rx_flags)(struct net_device *dev, int flags); void (*set_rx_mode)(struct net_device *dev); @@ -854,6 +865,8 @@ struct net_device int (*set_config)(struct net_device *dev, struct ifmap *map); int (*change_mtu)(struct net_device *dev, int new_mtu); + int (*neigh_setup)(struct net_device *dev, + struct neigh_parms *); void (*tx_timeout) (struct net_device *dev); struct net_device_stats* (*get_stats)(struct net_device *dev); void (*vlan_rx_register)(struct net_device *dev, -- cgit v1.2.3 From 145186a39570244aead77dc2efc559e5cac90548 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 20 Nov 2008 20:29:48 -0800 Subject: fddi: convert to new network device ops Similar to ethernet. Convert infrastructure and the one lone FDDI driver (for the one lone user of that hardware??). Compile tested only. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/fddidevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fddidevice.h b/include/linux/fddidevice.h index e61e42dfd317..155bafd9e886 100644 --- a/include/linux/fddidevice.h +++ b/include/linux/fddidevice.h @@ -27,6 +27,7 @@ #ifdef __KERNEL__ extern __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev); +extern int fddi_change_mtu(struct net_device *dev, int new_mtu); extern struct net_device *alloc_fddidev(int sizeof_priv); #endif -- cgit v1.2.3 From 748ff68fad9600593c6abe47856037602bd5d133 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 20 Nov 2008 20:32:15 -0800 Subject: hippi: convert driver to net_device_ops Convert the HIPPI infrastructure for use with net_device_ops. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/hippidevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h index bab303dafd6e..f148e4908410 100644 --- a/include/linux/hippidevice.h +++ b/include/linux/hippidevice.h @@ -32,7 +32,9 @@ struct hippi_cb { }; extern __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev); - +extern int hippi_change_mtu(struct net_device *dev, int new_mtu); +extern int hippi_mac_addr(struct net_device *dev, void *p); +extern int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p); extern struct net_device *alloc_hippi_dev(int sizeof_priv); #endif -- cgit v1.2.3 From 2f90b8657ec942d1880f720e0177ee71df7c8e3c Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 20 Nov 2008 20:52:10 -0800 Subject: ixgbe: this patch adds support for DCB to the kernel and ixgbe driver This adds support for Data Center Bridging (DCB) features in the ixgbe driver and adds an rtnetlink interface for configuring DCB to the kernel. The DCB feature support included are Priority Grouping (PG) - which allows bandwidth guarantees to be allocated to groups to traffic based on the 802.1q priority, and Priority Based Flow Control (PFC) - which introduces a new MAC control PAUSE frame which works at granularity of the 802.1p priority instead of the link (IEEE 802.3x). Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: David S. Miller --- include/linux/dcbnl.h | 230 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/netdevice.h | 8 ++ include/linux/rtnetlink.h | 5 + 3 files changed, 243 insertions(+) create mode 100644 include/linux/dcbnl.h (limited to 'include/linux') diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h new file mode 100644 index 000000000000..32d32c1ee410 --- /dev/null +++ b/include/linux/dcbnl.h @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Lucy Liu + */ + +#ifndef __LINUX_DCBNL_H__ +#define __LINUX_DCBNL_H__ + +#define DCB_PROTO_VERSION 1 + +struct dcbmsg { + unsigned char dcb_family; + __u8 cmd; + __u16 dcb_pad; +}; + +/** + * enum dcbnl_commands - supported DCB commands + * + * @DCB_CMD_UNDEFINED: unspecified command to catch errors + * @DCB_CMD_GSTATE: request the state of DCB in the device + * @DCB_CMD_SSTATE: set the state of DCB in the device + * @DCB_CMD_PGTX_GCFG: request the priority group configuration for Tx + * @DCB_CMD_PGTX_SCFG: set the priority group configuration for Tx + * @DCB_CMD_PGRX_GCFG: request the priority group configuration for Rx + * @DCB_CMD_PGRX_SCFG: set the priority group configuration for Rx + * @DCB_CMD_PFC_GCFG: request the priority flow control configuration + * @DCB_CMD_PFC_SCFG: set the priority flow control configuration + * @DCB_CMD_SET_ALL: apply all changes to the underlying device + * @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying + * device. Only useful when using bonding. + */ +enum dcbnl_commands { + DCB_CMD_UNDEFINED, + + DCB_CMD_GSTATE, + DCB_CMD_SSTATE, + + DCB_CMD_PGTX_GCFG, + DCB_CMD_PGTX_SCFG, + DCB_CMD_PGRX_GCFG, + DCB_CMD_PGRX_SCFG, + + DCB_CMD_PFC_GCFG, + DCB_CMD_PFC_SCFG, + + DCB_CMD_SET_ALL, + DCB_CMD_GPERM_HWADDR, + + __DCB_CMD_ENUM_MAX, + DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1, +}; + + +/** + * enum dcbnl_attrs - DCB top-level netlink attributes + * + * @DCB_ATTR_UNDEFINED: unspecified attribute to catch errors + * @DCB_ATTR_IFNAME: interface name of the underlying device (NLA_STRING) + * @DCB_ATTR_STATE: enable state of DCB in the device (NLA_U8) + * @DCB_ATTR_PFC_STATE: enable state of PFC in the device (NLA_U8) + * @DCB_ATTR_PFC_CFG: priority flow control configuration (NLA_NESTED) + * @DCB_ATTR_NUM_TC: number of traffic classes supported in the device (NLA_U8) + * @DCB_ATTR_PG_CFG: priority group configuration (NLA_NESTED) + * @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8) + * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED) + */ +enum dcbnl_attrs { + DCB_ATTR_UNDEFINED, + + DCB_ATTR_IFNAME, + DCB_ATTR_STATE, + DCB_ATTR_PFC_STATE, + DCB_ATTR_PFC_CFG, + DCB_ATTR_NUM_TC, + DCB_ATTR_PG_CFG, + DCB_ATTR_SET_ALL, + DCB_ATTR_PERM_HWADDR, + + __DCB_ATTR_ENUM_MAX, + DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1, +}; + +/** + * enum dcbnl_pfc_attrs - DCB Priority Flow Control user priority nested attrs + * + * @DCB_PFC_UP_ATTR_UNDEFINED: unspecified attribute to catch errors + * @DCB_PFC_UP_ATTR_0: Priority Flow Control value for User Priority 0 (NLA_U8) + * @DCB_PFC_UP_ATTR_1: Priority Flow Control value for User Priority 1 (NLA_U8) + * @DCB_PFC_UP_ATTR_2: Priority Flow Control value for User Priority 2 (NLA_U8) + * @DCB_PFC_UP_ATTR_3: Priority Flow Control value for User Priority 3 (NLA_U8) + * @DCB_PFC_UP_ATTR_4: Priority Flow Control value for User Priority 4 (NLA_U8) + * @DCB_PFC_UP_ATTR_5: Priority Flow Control value for User Priority 5 (NLA_U8) + * @DCB_PFC_UP_ATTR_6: Priority Flow Control value for User Priority 6 (NLA_U8) + * @DCB_PFC_UP_ATTR_7: Priority Flow Control value for User Priority 7 (NLA_U8) + * @DCB_PFC_UP_ATTR_MAX: highest attribute number currently defined + * @DCB_PFC_UP_ATTR_ALL: apply to all priority flow control attrs (NLA_FLAG) + * + */ +enum dcbnl_pfc_up_attrs { + DCB_PFC_UP_ATTR_UNDEFINED, + + DCB_PFC_UP_ATTR_0, + DCB_PFC_UP_ATTR_1, + DCB_PFC_UP_ATTR_2, + DCB_PFC_UP_ATTR_3, + DCB_PFC_UP_ATTR_4, + DCB_PFC_UP_ATTR_5, + DCB_PFC_UP_ATTR_6, + DCB_PFC_UP_ATTR_7, + DCB_PFC_UP_ATTR_ALL, + + __DCB_PFC_UP_ATTR_ENUM_MAX, + DCB_PFC_UP_ATTR_MAX = __DCB_PFC_UP_ATTR_ENUM_MAX - 1, +}; + +/** + * enum dcbnl_pg_attrs - DCB Priority Group attributes + * + * @DCB_PG_ATTR_UNDEFINED: unspecified attribute to catch errors + * @DCB_PG_ATTR_TC_0: Priority Group Traffic Class 0 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_1: Priority Group Traffic Class 1 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_2: Priority Group Traffic Class 2 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_3: Priority Group Traffic Class 3 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_4: Priority Group Traffic Class 4 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_5: Priority Group Traffic Class 5 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_6: Priority Group Traffic Class 6 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_7: Priority Group Traffic Class 7 configuration (NLA_NESTED) + * @DCB_PG_ATTR_TC_MAX: highest attribute number currently defined + * @DCB_PG_ATTR_TC_ALL: apply to all traffic classes (NLA_NESTED) + * @DCB_PG_ATTR_BW_ID_0: Percent of link bandwidth for Priority Group 0 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_1: Percent of link bandwidth for Priority Group 1 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_2: Percent of link bandwidth for Priority Group 2 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_3: Percent of link bandwidth for Priority Group 3 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_4: Percent of link bandwidth for Priority Group 4 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_5: Percent of link bandwidth for Priority Group 5 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_6: Percent of link bandwidth for Priority Group 6 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_7: Percent of link bandwidth for Priority Group 7 (NLA_U8) + * @DCB_PG_ATTR_BW_ID_MAX: highest attribute number currently defined + * @DCB_PG_ATTR_BW_ID_ALL: apply to all priority groups (NLA_FLAG) + * + */ +enum dcbnl_pg_attrs { + DCB_PG_ATTR_UNDEFINED, + + DCB_PG_ATTR_TC_0, + DCB_PG_ATTR_TC_1, + DCB_PG_ATTR_TC_2, + DCB_PG_ATTR_TC_3, + DCB_PG_ATTR_TC_4, + DCB_PG_ATTR_TC_5, + DCB_PG_ATTR_TC_6, + DCB_PG_ATTR_TC_7, + DCB_PG_ATTR_TC_MAX, + DCB_PG_ATTR_TC_ALL, + + DCB_PG_ATTR_BW_ID_0, + DCB_PG_ATTR_BW_ID_1, + DCB_PG_ATTR_BW_ID_2, + DCB_PG_ATTR_BW_ID_3, + DCB_PG_ATTR_BW_ID_4, + DCB_PG_ATTR_BW_ID_5, + DCB_PG_ATTR_BW_ID_6, + DCB_PG_ATTR_BW_ID_7, + DCB_PG_ATTR_BW_ID_MAX, + DCB_PG_ATTR_BW_ID_ALL, + + __DCB_PG_ATTR_ENUM_MAX, + DCB_PG_ATTR_MAX = __DCB_PG_ATTR_ENUM_MAX - 1, +}; + +/** + * enum dcbnl_tc_attrs - DCB Traffic Class attributes + * + * @DCB_TC_ATTR_PARAM_UNDEFINED: unspecified attribute to catch errors + * @DCB_TC_ATTR_PARAM_PGID: (NLA_U8) Priority group the traffic class belongs to + * Valid values are: 0-7 + * @DCB_TC_ATTR_PARAM_UP_MAPPING: (NLA_U8) Traffic class to user priority map + * Some devices may not support changing the + * user priority map of a TC. + * @DCB_TC_ATTR_PARAM_STRICT_PRIO: (NLA_U8) Strict priority setting + * 0 - none + * 1 - group strict + * 2 - link strict + * @DCB_TC_ATTR_PARAM_BW_PCT: optional - (NLA_U8) If supported by the device and + * not configured to use link strict priority, + * this is the percentage of bandwidth of the + * priority group this traffic class belongs to + * @DCB_TC_ATTR_PARAM_ALL: (NLA_FLAG) all traffic class parameters + * + */ +enum dcbnl_tc_attrs { + DCB_TC_ATTR_PARAM_UNDEFINED, + + DCB_TC_ATTR_PARAM_PGID, + DCB_TC_ATTR_PARAM_UP_MAPPING, + DCB_TC_ATTR_PARAM_STRICT_PRIO, + DCB_TC_ATTR_PARAM_BW_PCT, + DCB_TC_ATTR_PARAM_ALL, + + __DCB_TC_ATTR_PARAM_ENUM_MAX, + DCB_TC_ATTR_PARAM_MAX = __DCB_TC_ATTR_PARAM_ENUM_MAX - 1, +}; + +/** + * enum dcb_general_attr_values - general DCB attribute values + * + * @DCB_ATTR_UNDEFINED: value used to indicate an attribute is not supported + * + */ +enum dcb_general_attr_values { + DCB_ATTR_VALUE_UNDEFINED = 0xff +}; + + +#endif /* __LINUX_DCBNL_H__ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8fb23679ee3..6095af572dfd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -43,6 +43,9 @@ #include #include +#ifdef CONFIG_DCBNL +#include +#endif struct vlan_group; struct ethtool_ops; @@ -843,6 +846,11 @@ struct net_device #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#ifdef CONFIG_DCBNL + /* Data Center Bridging netlink ops */ + struct dcbnl_rtnl_ops *dcbnl_ops; +#endif + #ifdef CONFIG_COMPAT_NET_DEV_OPS struct { int (*init)(struct net_device *dev); diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 2b3d51c6ec9c..e88f7058b3a1 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -107,6 +107,11 @@ enum { RTM_GETADDRLABEL, #define RTM_GETADDRLABEL RTM_GETADDRLABEL + RTM_GETDCB = 78, +#define RTM_GETDCB RTM_GETDCB + RTM_SETDCB, +#define RTM_SETDCB RTM_SETDCB + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; -- cgit v1.2.3 From 46132188bf72e22ef097f16ed5c969ee8cea1e8b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 20 Nov 2008 21:05:08 -0800 Subject: DCB: Add interface to query for the DCB capabilities of an device. Adds to the netlink interface for Data Center Bridging (DCB), allowing the DCB capabilities supported by a device to be queried. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: David S. Miller --- include/linux/dcbnl.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h index 32d32c1ee410..13f0c638a695 100644 --- a/include/linux/dcbnl.h +++ b/include/linux/dcbnl.h @@ -43,6 +43,7 @@ struct dcbmsg { * @DCB_CMD_SET_ALL: apply all changes to the underlying device * @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying * device. Only useful when using bonding. + * @DCB_CMD_GCAP: request the DCB capabilities of the device */ enum dcbnl_commands { DCB_CMD_UNDEFINED, @@ -60,6 +61,7 @@ enum dcbnl_commands { DCB_CMD_SET_ALL, DCB_CMD_GPERM_HWADDR, + DCB_CMD_GCAP, __DCB_CMD_ENUM_MAX, DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1, @@ -78,6 +80,7 @@ enum dcbnl_commands { * @DCB_ATTR_PG_CFG: priority group configuration (NLA_NESTED) * @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8) * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED) + * @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED) */ enum dcbnl_attrs { DCB_ATTR_UNDEFINED, @@ -90,6 +93,7 @@ enum dcbnl_attrs { DCB_ATTR_PG_CFG, DCB_ATTR_SET_ALL, DCB_ATTR_PERM_HWADDR, + DCB_ATTR_CAP, __DCB_ATTR_ENUM_MAX, DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1, @@ -216,6 +220,39 @@ enum dcbnl_tc_attrs { DCB_TC_ATTR_PARAM_MAX = __DCB_TC_ATTR_PARAM_ENUM_MAX - 1, }; +/** + * enum dcbnl_cap_attrs - DCB Capability attributes + * + * @DCB_CAP_ATTR_UNDEFINED: unspecified attribute to catch errors + * @DCB_CAP_ATTR_ALL: (NLA_FLAG) all capability parameters + * @DCB_CAP_ATTR_PG: (NLA_U8) device supports Priority Groups + * @DCB_CAP_ATTR_PFC: (NLA_U8) device supports Priority Flow Control + * @DCB_CAP_ATTR_UP2TC: (NLA_U8) device supports user priority to + * traffic class mapping + * @DCB_CAP_ATTR_PG_TCS: (NLA_U8) bitmap where each bit represents a + * number of traffic classes the device + * can be configured to use for Priority Groups + * @DCB_CAP_ATTR_PFC_TCS: (NLA_U8) bitmap where each bit represents a + * number of traffic classes the device can be + * configured to use for Priority Flow Control + * @DCB_CAP_ATTR_GSP: (NLA_U8) device supports group strict priority + * @DCB_CAP_ATTR_BCN: (NLA_U8) device supports Backwards Congestion + * Notification + */ +enum dcbnl_cap_attrs { + DCB_CAP_ATTR_UNDEFINED, + DCB_CAP_ATTR_ALL, + DCB_CAP_ATTR_PG, + DCB_CAP_ATTR_PFC, + DCB_CAP_ATTR_UP2TC, + DCB_CAP_ATTR_PG_TCS, + DCB_CAP_ATTR_PFC_TCS, + DCB_CAP_ATTR_GSP, + DCB_CAP_ATTR_BCN, + + __DCB_CAP_ATTR_ENUM_MAX, + DCB_CAP_ATTR_MAX = __DCB_CAP_ATTR_ENUM_MAX - 1, +}; /** * enum dcb_general_attr_values - general DCB attribute values * -- cgit v1.2.3 From 33dbabc4a7f7bd72313c73a3c199f31f3900336f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 20 Nov 2008 21:08:19 -0800 Subject: DCB: Add interface to query # of TCs supported by device Adds interface for Data Center Bridging (DCB) to query (and set if supported) the number of traffic classes currently supported by the device for the two (DCB) features: priority groups (PG) and priority flow control (PFC). Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: David S. Miller --- include/linux/dcbnl.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h index 13f0c638a695..1077fba1dadc 100644 --- a/include/linux/dcbnl.h +++ b/include/linux/dcbnl.h @@ -44,6 +44,8 @@ struct dcbmsg { * @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying * device. Only useful when using bonding. * @DCB_CMD_GCAP: request the DCB capabilities of the device + * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported + * @DCB_CMD_SNUMTCS: set the number of traffic classes */ enum dcbnl_commands { DCB_CMD_UNDEFINED, @@ -62,6 +64,8 @@ enum dcbnl_commands { DCB_CMD_SET_ALL, DCB_CMD_GPERM_HWADDR, DCB_CMD_GCAP, + DCB_CMD_GNUMTCS, + DCB_CMD_SNUMTCS, __DCB_CMD_ENUM_MAX, DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1, @@ -81,6 +85,7 @@ enum dcbnl_commands { * @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8) * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED) * @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED) + * @DCB_ATTR_NUMTCS: number of traffic classes supported (NLA_NESTED) */ enum dcbnl_attrs { DCB_ATTR_UNDEFINED, @@ -94,6 +99,7 @@ enum dcbnl_attrs { DCB_ATTR_SET_ALL, DCB_ATTR_PERM_HWADDR, DCB_ATTR_CAP, + DCB_ATTR_NUMTCS, __DCB_ATTR_ENUM_MAX, DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1, @@ -253,6 +259,27 @@ enum dcbnl_cap_attrs { __DCB_CAP_ATTR_ENUM_MAX, DCB_CAP_ATTR_MAX = __DCB_CAP_ATTR_ENUM_MAX - 1, }; + +/** + * enum dcbnl_numtcs_attrs - number of traffic classes + * + * @DCB_NUMTCS_ATTR_UNDEFINED: unspecified attribute to catch errors + * @DCB_NUMTCS_ATTR_ALL: (NLA_FLAG) all traffic class attributes + * @DCB_NUMTCS_ATTR_PG: (NLA_U8) number of traffic classes used for + * priority groups + * @DCB_NUMTCS_ATTR_PFC: (NLA_U8) number of traffic classes which can + * support priority flow control + */ +enum dcbnl_numtcs_attrs { + DCB_NUMTCS_ATTR_UNDEFINED, + DCB_NUMTCS_ATTR_ALL, + DCB_NUMTCS_ATTR_PG, + DCB_NUMTCS_ATTR_PFC, + + __DCB_NUMTCS_ATTR_ENUM_MAX, + DCB_NUMTCS_ATTR_MAX = __DCB_NUMTCS_ATTR_ENUM_MAX - 1, +}; + /** * enum dcb_general_attr_values - general DCB attribute values * -- cgit v1.2.3 From 0eb3aa9bab20217fb42244ccdcb5bf8a002f504c Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 20 Nov 2008 21:09:23 -0800 Subject: DCB: Add interface to query the state of PFC feature. Adds a netlink interface for Data Center Bridging (DCB) to get and set the enable state of the Priority Flow Control (PFC) feature. Primarily, this is a way to turn off PFC in the driver while DCB remains enabled. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: David S. Miller --- include/linux/dcbnl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h index 1077fba1dadc..6cc4560bc376 100644 --- a/include/linux/dcbnl.h +++ b/include/linux/dcbnl.h @@ -66,6 +66,8 @@ enum dcbnl_commands { DCB_CMD_GCAP, DCB_CMD_GNUMTCS, DCB_CMD_SNUMTCS, + DCB_CMD_PFC_GSTATE, + DCB_CMD_PFC_SSTATE, __DCB_CMD_ENUM_MAX, DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1, -- cgit v1.2.3 From 859ee3c43812051e21816c6d6d4cc04fb7ce9b2e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 20 Nov 2008 21:10:23 -0800 Subject: DCB: Add support for DCB BCN Adds an interface to configure the Backward Congestion Notification (BCN) feature. In a BCN capabale network, congestion notifications from congested points out in the network can cause the end station limit the rate of a given traffic flow. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: David S. Miller --- include/linux/dcbnl.h | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h index 6cc4560bc376..e73a61449ad6 100644 --- a/include/linux/dcbnl.h +++ b/include/linux/dcbnl.h @@ -46,6 +46,8 @@ struct dcbmsg { * @DCB_CMD_GCAP: request the DCB capabilities of the device * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported * @DCB_CMD_SNUMTCS: set the number of traffic classes + * @DCB_CMD_GBCN: set backward congestion notification configuration + * @DCB_CMD_SBCN: get backward congestion notification configration. */ enum dcbnl_commands { DCB_CMD_UNDEFINED, @@ -62,18 +64,24 @@ enum dcbnl_commands { DCB_CMD_PFC_SCFG, DCB_CMD_SET_ALL, + DCB_CMD_GPERM_HWADDR, + DCB_CMD_GCAP, + DCB_CMD_GNUMTCS, DCB_CMD_SNUMTCS, + DCB_CMD_PFC_GSTATE, DCB_CMD_PFC_SSTATE, + DCB_CMD_BCN_GCFG, + DCB_CMD_BCN_SCFG, + __DCB_CMD_ENUM_MAX, DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1, }; - /** * enum dcbnl_attrs - DCB top-level netlink attributes * @@ -88,6 +96,7 @@ enum dcbnl_commands { * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED) * @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED) * @DCB_ATTR_NUMTCS: number of traffic classes supported (NLA_NESTED) + * @DCB_ATTR_BCN: backward congestion notification configuration (NLA_NESTED) */ enum dcbnl_attrs { DCB_ATTR_UNDEFINED, @@ -102,6 +111,7 @@ enum dcbnl_attrs { DCB_ATTR_PERM_HWADDR, DCB_ATTR_CAP, DCB_ATTR_NUMTCS, + DCB_ATTR_BCN, __DCB_ATTR_ENUM_MAX, DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1, @@ -282,6 +292,38 @@ enum dcbnl_numtcs_attrs { DCB_NUMTCS_ATTR_MAX = __DCB_NUMTCS_ATTR_ENUM_MAX - 1, }; +enum dcbnl_bcn_attrs{ + DCB_BCN_ATTR_UNDEFINED = 0, + + DCB_BCN_ATTR_RP_0, + DCB_BCN_ATTR_RP_1, + DCB_BCN_ATTR_RP_2, + DCB_BCN_ATTR_RP_3, + DCB_BCN_ATTR_RP_4, + DCB_BCN_ATTR_RP_5, + DCB_BCN_ATTR_RP_6, + DCB_BCN_ATTR_RP_7, + DCB_BCN_ATTR_RP_ALL, + + DCB_BCN_ATTR_ALPHA, + DCB_BCN_ATTR_BETA, + DCB_BCN_ATTR_GD, + DCB_BCN_ATTR_GI, + DCB_BCN_ATTR_TMAX, + DCB_BCN_ATTR_TD, + DCB_BCN_ATTR_RMIN, + DCB_BCN_ATTR_W, + DCB_BCN_ATTR_RD, + DCB_BCN_ATTR_RU, + DCB_BCN_ATTR_WRTT, + DCB_BCN_ATTR_RI, + DCB_BCN_ATTR_C, + DCB_BCN_ATTR_ALL, + + __DCB_BCN_ATTR_ENUM_MAX, + DCB_BCN_ATTR_MAX = __DCB_BCN_ATTR_ENUM_MAX - 1, +}; + /** * enum dcb_general_attr_values - general DCB attribute values * -- cgit v1.2.3 From 2baf8a2daab65cdd3f20bfeb4676a2f6aff7c3bf Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Fri, 21 Nov 2008 16:34:18 -0800 Subject: netdevice hdlc: Convert directly reference of netdev->priv For killing directly reference of netdev->priv, use netdev->ml_priv to replace it. Because the private pvc data comes from add_pvc() and can't be allocated in alloc_netdev(). Signed-off-by: Wang Chen Acked-by: Krzysztof Halasa Signed-off-by: David S. Miller --- include/linux/hdlc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index c59769693bee..e960faac609d 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -80,7 +80,7 @@ struct net_device *alloc_hdlcdev(void *priv); static inline struct hdlc_device* dev_to_hdlc(struct net_device *dev) { - return dev->priv; + return netdev_priv(dev); } static __inline__ void debug_frame(const struct sk_buff *skb) -- cgit v1.2.3 From e262a7ba31f0cd4dae225e6d2e9037e5ac6108e8 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Sun, 23 Nov 2008 14:34:43 +0000 Subject: irq.h: remove padding from irq_desc on 64bits Impact: reduce struct irq_desc size struct irq_desc: reorder to remove padding on 64bits shrinks irq_desc to 128 bytes which saves data space & cache lines On a generic x86_64/SMP build this reduces the reported data size by 64k. Signed-off-by: Richard Kennedy Signed-off-by: Ingo Molnar --- include/linux/irq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index d058c57be02d..838a977885e1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -142,8 +142,8 @@ struct irq_chip { * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs - * @irqs_unhandled: stats field for spurious unhandled interrupts * @last_unhandled: aging timer for unhandled count + * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@ -165,8 +165,8 @@ struct irq_desc { unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ - unsigned int irqs_unhandled; unsigned long last_unhandled; /* Aging timer for unhandled count */ + unsigned int irqs_unhandled; spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; -- cgit v1.2.3 From b20a9c24d5c5d466d7e4a25c6f1bedbd2d16ad4f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 23 Nov 2008 16:02:31 -0800 Subject: dccp: Set per-connection CCIDs via socket options With this patch, TX/RX CCIDs can now be changed on a per-connection basis, which overrides the defaults set by the global sysctl variables for TX/RX CCIDs. To make full use of this facility, the remaining patches of this patch set are needed, which track dependencies and activate negotiated feature values. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- include/linux/dccp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eda389ce04f4..6a72ff52a8a4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -168,6 +168,8 @@ enum { DCCPO_MIN_CCID_SPECIFIC = 128, DCCPO_MAX_CCID_SPECIFIC = 255, }; +/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ +#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -203,6 +205,9 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 +#define DCCP_SOCKOPT_CCID 13 +#define DCCP_SOCKOPT_TX_CCID 14 +#define DCCP_SOCKOPT_RX_CCID 15 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 -- cgit v1.2.3 From 1f87e235e6fb92c2968b52b9191de04f1aff8e77 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Nov 2008 23:24:32 -0800 Subject: eth: Declare an optimized compare_ether_addr_64bits() function Linus mentioned we could try to perform long word operations, even on potentially unaligned addresses, on x86 at least. David mentioned the HAVE_EFFICIENT_UNALIGNED_ACCESS test to handle this on all arches that have efficient unailgned accesses. I tried this idea and got nice assembly on 32 bits: 158: 33 82 38 01 00 00 xor 0x138(%edx),%eax 15e: 33 8a 34 01 00 00 xor 0x134(%edx),%ecx 164: c1 e0 10 shl $0x10,%eax 167: 09 c1 or %eax,%ecx 169: 74 0b je 176 And very nice assembly on 64 bits of course (one xor, one shl) Nice oprofile improvement in eth_type_trans(), 0.17 % instead of 0.41 %, expected since we remove 8 instructions on a fast path. This patch implements a compare_ether_addr_64bits() function, that uses the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ifdef to efficiently perform the 6 bytes comparison on all capable arches. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 0e5e97060034..1cb0f0b90926 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef __KERNEL__ extern __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); @@ -140,6 +141,47 @@ static inline unsigned compare_ether_addr(const u8 *addr1, const u8 *addr2) BUILD_BUG_ON(ETH_ALEN != 6); return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) != 0; } + +static inline unsigned long zap_last_2bytes(unsigned long value) +{ +#ifdef __BIG_ENDIAN + return value >> 16; +#else + return value << 16; +#endif +} + +/** + * compare_ether_addr_64bits - Compare two Ethernet addresses + * @addr1: Pointer to an array of 8 bytes + * @addr2: Pointer to an other array of 8 bytes + * + * Compare two ethernet addresses, returns 0 if equal. + * Same result than "memcmp(addr1, addr2, ETH_ALEN)" but without conditional + * branches, and possibly long word memory accesses on CPU allowing cheap + * unaligned memory reads. + * arrays = { byte1, byte2, byte3, byte4, byte6, byte7, pad1, pad2} + * + * Please note that alignment of addr1 & addr2 is only guaranted to be 16 bits. + */ + +static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2], + const u8 addr2[6+2]) +{ +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + unsigned long fold = ((*(unsigned long *)addr1) ^ + (*(unsigned long *)addr2)); + + if (sizeof(fold) == 8) + return zap_last_2bytes(fold) != 0; + + fold |= zap_last_2bytes((*(unsigned long *)(addr1 + 4)) ^ + (*(unsigned long *)(addr2 + 4))); + return fold != 0; +#else + return compare_ether_addr(addr1, addr2); +#endif +} #endif /* __KERNEL__ */ #endif /* _LINUX_ETHERDEVICE_H */ -- cgit v1.2.3 From 1acdac104668a0834cfa267de9946fac7764d486 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Nov 2008 10:02:53 -0800 Subject: futex: make clock selectable for FUTEX_WAIT_BITSET FUTEX_WAIT_BITSET could be used instead of FUTEX_WAIT by setting the bit set to FUTEX_BITSET_MATCH_ANY, but FUTEX_WAIT uses CLOCK_REALTIME while FUTEX_WAIT_BITSET uses CLOCK_MONOTONIC. Add a flag to select CLOCK_REALTIME for FUTEX_WAIT_BITSET so glibc can replace the FUTEX_WAIT logic which needs to do gettimeofday() calls before and after the syscall to convert the absolute timeout to a relative timeout for FUTEX_WAIT. Signed-off-by: Thomas Gleixner Cc: Ulrich Drepper --- include/linux/futex.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 8f627b9ae2b1..3bf5bb5a34f9 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -25,7 +25,8 @@ union ktime; #define FUTEX_WAKE_BITSET 10 #define FUTEX_PRIVATE_FLAG 128 -#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG +#define FUTEX_CLOCK_REALTIME 256 +#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) -- cgit v1.2.3 From 18b6e0414e42d95183f07d8177e3ff0241abd825 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Wed, 15 Oct 2008 16:38:45 -0500 Subject: User namespaces: set of cleanups (v2) The user_ns is moved from nsproxy to user_struct, so that a struct cred by itself is sufficient to determine access (which it otherwise would not be). Corresponding ecryptfs fixes (by David Howells) are here as well. Fix refcounting. The following rules now apply: 1. The task pins the user struct. 2. The user struct pins its user namespace. 3. The user namespace pins the struct user which created it. User namespaces are cloned during copy_creds(). Unsharing a new user_ns is no longer possible. (We could re-add that, but it'll cause code duplication and doesn't seem useful if PAM doesn't need to clone user namespaces). When a user namespace is created, its first user (uid 0) gets empty keyrings and a clean group_info. This incorporates a previous patch by David Howells. Here is his original patch description: >I suggest adding the attached incremental patch. It makes the following >changes: > > (1) Provides a current_user_ns() macro to wrap accesses to current's user > namespace. > > (2) Fixes eCryptFS. > > (3) Renames create_new_userns() to create_user_ns() to be more consistent > with the other associated functions and because the 'new' in the name is > superfluous. > > (4) Moves the argument and permission checks made for CLONE_NEWUSER to the > beginning of do_fork() so that they're done prior to making any attempts > at allocation. > > (5) Calls create_user_ns() after prepare_creds(), and gives it the new creds > to fill in rather than have it return the new root user. I don't imagine > the new root user being used for anything other than filling in a cred > struct. > > This also permits me to get rid of a get_uid() and a free_uid(), as the > reference the creds were holding on the old user_struct can just be > transferred to the new namespace's creator pointer. > > (6) Makes create_user_ns() reset the UIDs and GIDs of the creds under > preparation rather than doing it in copy_creds(). > >David >Signed-off-by: David Howells Changelog: Oct 20: integrate dhowells comments 1. leave thread_keyring alone 2. use current_user_ns() in set_user() Signed-off-by: Serge Hallyn --- include/linux/cred.h | 2 ++ include/linux/init_task.h | 1 - include/linux/nsproxy.h | 1 - include/linux/sched.h | 1 + include/linux/user_namespace.h | 13 ++++--------- 5 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 26c1ab179946..3282ee4318e7 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -60,6 +60,7 @@ do { \ } while (0) extern struct group_info *groups_alloc(int); +extern struct group_info init_groups; extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); extern int set_groups(struct cred *, struct group_info *); @@ -315,6 +316,7 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) +#define current_user_ns() (current_cred_xxx(user)->user_ns) #define current_security() (current_cred_xxx(security)) #define current_uid_gid(_uid, _gid) \ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2597858035cd..959f5522d10a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -57,7 +57,6 @@ extern struct nsproxy init_nsproxy; .mnt_ns = NULL, \ INIT_NET_NS(net_ns) \ INIT_IPC_NS(ipc_ns) \ - .user_ns = &init_user_ns, \ } #define INIT_SIGHAND(sighand) { \ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index c8a768e59640..afad7dec1b36 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -27,7 +27,6 @@ struct nsproxy { struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns; - struct user_namespace *user_ns; struct net *net_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2036e9f26020..7f8015a3082e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -638,6 +638,7 @@ struct user_struct { /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + struct user_namespace *user_ns; #ifdef CONFIG_USER_SCHED struct task_group *tg; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index b5f41d4c2eec..315bcd375224 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -12,7 +12,7 @@ struct user_namespace { struct kref kref; struct hlist_head uidhash_table[UIDHASH_SZ]; - struct user_struct *root_user; + struct user_struct *creator; }; extern struct user_namespace init_user_ns; @@ -26,8 +26,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) return ns; } -extern struct user_namespace *copy_user_ns(int flags, - struct user_namespace *old_ns); +extern int create_user_ns(struct cred *new); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -43,13 +42,9 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) return &init_user_ns; } -static inline struct user_namespace *copy_user_ns(int flags, - struct user_namespace *old_ns) +static inline int create_user_ns(struct cred *new) { - if (flags & CLONE_NEWUSER) - return ERR_PTR(-EINVAL); - - return old_ns; + return -EINVAL; } static inline void put_user_ns(struct user_namespace *ns) -- cgit v1.2.3 From 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:20:15 -0800 Subject: tcp: Try to restore large SKBs while SACK processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During SACK processing, most of the benefits of TSO are eaten by the SACK blocks that one-by-one fragment SKBs to MSS sized chunks. Then we're in problems when cleanup work for them has to be done when a large cumulative ACK comes. Try to return back to pre-split state already while more and more SACK info gets discovered by combining newly discovered SACK areas with the previous skb if that's SACKed as well. This approach has a number of benefits: 1) The processing overhead is spread more equally over the RTT 2) Write queue has less skbs to process (affect everything which has to walk in the queue past the sacked areas) 3) Write queue is consistent whole the time, so no other parts of TCP has to be aware of this (this was not the case with some other approach that was, well, quite intrusive all around). 4) Clean_rtx_queue can release most of the pages using single put_page instead of previous PAGE_SIZE/mss+1 calls In case a hole is fully filled by the new SACK block, we attempt to combine the next skb too which allows construction of skbs that are even larger than what tso split them to and it handles hole per on every nth patterns that often occur during slow start overshoot pretty nicely. Though this to be really useful also a retransmission would have to get lost since cumulative ACKs advance one hole at a time in the most typical case. TODO: handle upwards only merging. That should be rather easy when segment is fully sacked but I'm leaving that as future work item (it won't make very large difference anyway since this current approach already covers quite a lot of normal cases). I was earlier thinking of some sophisticated way of tracking timestamps of the first and the last segment but later on realized that it won't be that necessary at all to store the timestamp of the last segment. The cases that can occur are basically either: 1) ambiguous => no sensible measurement can be taken anyway 2) non-ambiguous is due to reordering => having the timestamp of the last segment there is just skewing things more off than does some good since the ack got triggered by one of the holes (besides some substle issues that would make determining right hole/skb even harder problem). Anyway, it has nothing to do with this change then. I choose to route some abnormal looking cases with goto noop, some could be handled differently (eg., by stopping the walking at that skb but again). In general, they either shouldn't happen at all or are rare enough to make no difference in practice. In theory this change (as whole) could cause some macroscale regression (global) because of cache misses that are taken over the round-trip time but it gets very likely better because of much less (local) cache misses per other write queue walkers and the big recovery clearing cumulative ack. Worth to note that these benefits would be very easy to get also without TSO/GSO being on as long as the data is in pages so that we can merge them. Currently I won't let that happen because DSACK splitting at fragment that would mess up pcounts due to sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets avoided, we have some conditions that can be made less strict. TODO: I will probably have to convert the excessive pointer passing to struct sacktag_state... :-) My testing revealed that considerable amount of skbs couldn't be shifted because they were cloned (most likely still awaiting tx reclaim)... [The rest is considering future work instead since I got repeatably EFAULT to tcpdump's recvfrom when I added pskb_expand_head to deal with clones, so I separated that into another, later patch] ...To counter that, I gave up on the fifth advantage: 5) When growing previous SACK block, less allocs for new skbs are done, basically a new alloc is needed only when new hole is detected and when the previous skb runs out of frags space ...which now only happens of if reclaim is fast enough to dispose the clone before the SACK block comes in (the window is RTT long), otherwise we'll have to alloc some. With clones being handled I got these numbers (will be somewhat worse without that), taken with fine-grained mibs: TCPSackShifted 398 TCPSackMerged 877 TCPSackShiftFallback 320 TCPSACKCOLLAPSEFALLBACKGSO 0 TCPSACKCOLLAPSEFALLBACKSKBBITS 0 TCPSACKCOLLAPSEFALLBACKSKBDATA 0 TCPSACKCOLLAPSEFALLBACKBELOW 0 TCPSACKCOLLAPSEFALLBACKFIRST 1 TCPSACKCOLLAPSEFALLBACKPREVBITS 318 TCPSACKCOLLAPSEFALLBACKMSS 1 TCPSACKCOLLAPSEFALLBACKNOHEAD 0 TCPSACKCOLLAPSEFALLBACKSHIFT 0 TCPSACKCOLLAPSENOOPSEQ 0 TCPSACKCOLLAPSENOOPSMALLPCOUNT 0 TCPSACKCOLLAPSENOOPSMALLLEN 0 TCPSACKCOLLAPSEHOLE 12 Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/skbuff.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a01b6f84e3bc..acf17af45af9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -492,6 +492,19 @@ static inline bool skb_queue_is_last(const struct sk_buff_head *list, return (skb->next == (struct sk_buff *) list); } +/** + * skb_queue_is_first - check if skb is the first entry in the queue + * @list: queue head + * @skb: buffer + * + * Returns true if @skb is the first buffer on the list. + */ +static inline bool skb_queue_is_first(const struct sk_buff_head *list, + const struct sk_buff *skb) +{ + return (skb->prev == (struct sk_buff *) list); +} + /** * skb_queue_next - return the next packet in the queue * @list: queue head @@ -510,6 +523,24 @@ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, return skb->next; } +/** + * skb_queue_prev - return the prev packet in the queue + * @list: queue head + * @skb: current buffer + * + * Return the prev packet in @list before @skb. It is only valid to + * call this if skb_queue_is_first() evaluates to false. + */ +static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, + const struct sk_buff *skb) +{ + /* This BUG_ON may seem severe, but if we just return then we + * are going to dereference garbage. + */ + BUG_ON(skb_queue_is_first(list, skb)); + return skb->prev; +} + /** * skb_get - reference buffer * @skb: buffer to reference @@ -1652,6 +1683,8 @@ extern int skb_splice_bits(struct sk_buff *skb, extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); +extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, + int shiftlen); extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); -- cgit v1.2.3 From 111cc8b913b42ef07793648b1699288332f273e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:27:22 -0800 Subject: tcp: add some mibs to track collapsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/snmp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 7a6e6bba4a71..aee3f1e1d1ce 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -216,6 +216,9 @@ enum LINUX_MIB_TCPSPURIOUSRTOS, /* TCPSpuriousRTOs */ LINUX_MIB_TCPMD5NOTFOUND, /* TCPMD5NotFound */ LINUX_MIB_TCPMD5UNEXPECTED, /* TCPMD5Unexpected */ + LINUX_MIB_SACKSHIFTED, + LINUX_MIB_SACKMERGED, + LINUX_MIB_SACKSHIFTFALLBACK, __LINUX_MIB_MAX }; -- cgit v1.2.3 From 47fd5b8373ecc6bf5473e4139b62b06425448252 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 25 Nov 2008 00:20:43 -0800 Subject: netdev: add HAVE_NET_DEVICE_OPS As a concession to vendors who have to deal with one source for different kernel versions, add a HAVE_NET_DEVICE_OPS so they don't end up hard coding ifdef against kernel version. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6095af572dfd..76a89f8e6a19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -545,6 +545,7 @@ struct netdev_queue { * * void (*ndo_poll_controller)(struct net_device *dev); */ +#define HAVE_NET_DEVICE_OPS struct net_device_ops { int (*ndo_init)(struct net_device *dev); void (*ndo_uninit)(struct net_device *dev); -- cgit v1.2.3 From 7a6b6f515f77d1c62a2f383b6dce18cb0af0cf4f Mon Sep 17 00:00:00 2001 From: Jeff Kirsher Date: Tue, 25 Nov 2008 01:02:08 -0800 Subject: DCB: fix kconfig option Since the netlink option for DCB is necessary to actually be useful, simplified the Kconfig option. In addition, added useful help text for the Kconfig option. Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 76a89f8e6a19..0df0db068ac3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -43,7 +43,7 @@ #include #include -#ifdef CONFIG_DCBNL +#ifdef CONFIG_DCB #include #endif @@ -847,7 +847,7 @@ struct net_device #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; -#ifdef CONFIG_DCBNL +#ifdef CONFIG_DCB /* Data Center Bridging netlink ops */ struct dcbnl_rtnl_ops *dcbnl_ops; #endif -- cgit v1.2.3 From ca109491f612aab5c8152207631c0444f63da97f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Nov 2008 12:43:51 +0100 Subject: hrtimer: removing all ur callback modes Impact: cleanup, move all hrtimer processing into hardirq context This is an attempt at removing some of the hrtimer complexity by reducing the number of callback modes to 1. This means that all hrtimer callback functions will be ran from HARD-irq context. I went through all the 30 odd hrtimer callback functions in the kernel and saw only one that I'm not quite sure of, which is the one in net/can/bcm.c - hence I'm CC-ing the folks responsible for that code. Furthermore, the hrtimer core now calls callbacks directly with IRQs disabled in case you try to enqueue an expired timer. If this timer is a periodic timer (which should use hrtimer_forward() to advance its time) then it might be possible to end up in an inf. recursive loop due to the fact that hrtimer_forward() doesn't round up to the next timer granularity, and therefore keeps on calling the callback - obviously this needs a fix. Aside from that, this seems to compile and actually boot on my dual core test box - although I'm sure there are some bugs in, me not hitting any makes me certain :-) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 34 ++-------------------------------- include/linux/interrupt.h | 3 --- 2 files changed, 2 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3eba43878dcb..bd37078c2d7d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -42,26 +42,6 @@ enum hrtimer_restart { HRTIMER_RESTART, /* Timer must be restarted */ }; -/* - * hrtimer callback modes: - * - * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context - * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context - * Special mode for tick emulation and - * scheduler timer. Such timers are per - * cpu and not allowed to be migrated on - * cpu unplug. - * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context - * with timer->base lock unlocked - * used for timers which call wakeup to - * avoid lock order problems with rq->lock - */ -enum hrtimer_cb_mode { - HRTIMER_CB_SOFTIRQ, - HRTIMER_CB_IRQSAFE_PERCPU, - HRTIMER_CB_IRQSAFE_UNLOCKED, -}; - /* * Values to track state of the timer * @@ -70,7 +50,6 @@ enum hrtimer_cb_mode { * 0x00 inactive * 0x01 enqueued into rbtree * 0x02 callback function running - * 0x04 callback pending (high resolution mode) * * Special cases: * 0x03 callback function running and enqueued @@ -92,8 +71,7 @@ enum hrtimer_cb_mode { #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_PENDING 0x04 -#define HRTIMER_STATE_MIGRATE 0x08 +#define HRTIMER_STATE_MIGRATE 0x04 /** * struct hrtimer - the basic hrtimer structure @@ -109,8 +87,6 @@ enum hrtimer_cb_mode { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @cb_mode: high resolution timer feature to select the callback execution - * mode * @cb_entry: list head to enqueue an expired timer into the callback list * @start_site: timer statistics field to store the site where the timer * was started @@ -129,7 +105,6 @@ struct hrtimer { struct hrtimer_clock_base *base; unsigned long state; struct list_head cb_entry; - enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -188,15 +163,11 @@ struct hrtimer_clock_base { * @check_clocks: Indictator, when set evaluate time source and clock * event devices whether high resolution mode can be * activated. - * @cb_pending: Expired timers are moved from the rbtree to this - * list in the timer interrupt. The list is processed - * in the softirq. * @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { spinlock_t lock; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; - struct list_head cb_pending; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; @@ -404,8 +375,7 @@ static inline int hrtimer_active(const struct hrtimer *timer) */ static inline int hrtimer_is_queued(struct hrtimer *timer) { - return timer->state & - (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING); + return timer->state & HRTIMER_STATE_ENQUEUED; } /* diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf8929a..d6210a97a8ca 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -251,9 +251,6 @@ enum BLOCK_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, -#ifdef CONFIG_HIGH_RES_TIMERS - HRTIMER_SOFTIRQ, -#endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS -- cgit v1.2.3 From 3f2355cb9111ac04e7ae06a4d7044da2ae813863 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Nov 2008 14:22:02 -0800 Subject: cfg80211/mac80211: Add 802.11d support This adds country IE parsing to mac80211 and enables its usage within the new regulatory infrastructure in cfg80211. We parse the country IEs only on management beacons for the BSSID you are associated to and disregard the IEs when the country and environment (indoor, outdoor, any) matches the already processed country IE. To avoid following misinformed or outdated APs we build and use a regulatory domain out of the intersection between what the AP provides us on the country IE and what CRDA is aware is allowed on the same country. A secondary device is allowed to follow only the same country IE as it make no sense for two devices on a system to be in two different countries. In the case the AP is using country IEs for an incorrect country the user may help compliance further by setting the regulatory domain before or after the IE is parsed and in that case another intersection will be performed. CONFIG_WIRELESS_OLD_REGULATORY is supported but requires CRDA present. Signed-off-by: Luis R. Rodriguez Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 56b0eb25d927..a6ec928186ad 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1042,6 +1042,68 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; +/* + * IEEE 802.11-2007 7.3.2.9 Country information element + * + * Minimum length is 8 octets, ie len must be evenly + * divisible by 2 + */ + +/* Although the spec says 8 I'm seeing 6 in practice */ +#define IEEE80211_COUNTRY_IE_MIN_LEN 6 + +/* + * For regulatory extension stuff see IEEE 802.11-2007 + * Annex I (page 1141) and Annex J (page 1147). Also + * review 7.3.2.9. + * + * When dot11RegulatoryClassesRequired is true and the + * first_channel/reg_extension_id is >= 201 then the IE + * compromises of the 'ext' struct represented below: + * + * - Regulatory extension ID - when generating IE this just needs + * to be monotonically increasing for each triplet passed in + * the IE + * - Regulatory class - index into set of rules + * - Coverage class - index into air propagation time (Table 7-27), + * in microseconds, you can compute the air propagation time from + * the index by multiplying by 3, so index 10 yields a propagation + * of 10 us. Valid values are 0-31, values 32-255 are not defined + * yet. A value of 0 inicates air propagation of <= 1 us. + * + * See also Table I.2 for Emission limit sets and table + * I.3 for Behavior limit sets. Table J.1 indicates how to map + * a reg_class to an emission limit set and behavior limit set. + */ +#define IEEE80211_COUNTRY_EXTENSION_ID 201 + +/* + * Channels numbers in the IE must be monotonically increasing + * if dot11RegulatoryClassesRequired is not true. + * + * If dot11RegulatoryClassesRequired is true consecutive + * subband triplets following a regulatory triplet shall + * have monotonically increasing first_channel number fields. + * + * Channel numbers shall not overlap. + * + * Note that max_power is signed. + */ +struct ieee80211_country_ie_triplet { + union { + struct { + u8 first_channel; + u8 num_channels; + s8 max_power; + } __attribute__ ((packed)) chans; + struct { + u8 reg_extension_id; + u8 reg_class; + u8 coverage_class; + } __attribute__ ((packed)) ext; + }; +} __attribute__ ((packed)); + /* BACK action code */ enum ieee80211_back_actioncode { WLAN_ACTION_ADDBA_REQ = 0, -- cgit v1.2.3 From ce71e27c6fdc43c29f36d307b9100bde70c947fc Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:25 +0300 Subject: SLUB: Replace __builtin_return_address(0) with _RET_IP_. This patch replaces __builtin_return_address(0) with _RET_IP_, since a previous patch moved _RET_IP_ and _THIS_IP_ to include/linux/kernel.h and they're widely available now. This makes for shorter and easier to read code. [penberg@cs.helsinki.fi: remove _RET_IP_ casts to void pointer] Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 000da12b5cf0..c97ed28559ec 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -253,9 +253,9 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep, * request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_track_caller(size_t, gfp_t, void*); +extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, __builtin_return_address(0)) + __kmalloc_track_caller(size, flags, _RET_IP_) #else #define kmalloc_track_caller(size, flags) \ __kmalloc(size, flags) @@ -271,10 +271,10 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*); * allocation request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); +extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ - __builtin_return_address(0)) + _RET_IP_) #else #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node(size, flags, node) -- cgit v1.2.3 From e2f367f269fe19375f10e63efe0f2a6d3ddef8e6 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Fri, 21 Nov 2008 19:01:30 +0200 Subject: nl80211: Report max TX power in NL80211_BAND_ATTR_FREQS This is useful information to provide for userspace (e.g., hostapd needs this to generate Country IE). Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 79827345351d..54d6ebe38e39 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -508,6 +508,7 @@ enum nl80211_band_attr { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_RADAR: Radar detection is mandatory * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in dBm. */ enum nl80211_frequency_attr { __NL80211_FREQUENCY_ATTR_INVALID, @@ -516,12 +517,15 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_PASSIVE_SCAN, NL80211_FREQUENCY_ATTR_NO_IBSS, NL80211_FREQUENCY_ATTR_RADAR, + NL80211_FREQUENCY_ATTR_MAX_TX_POWER, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, NL80211_FREQUENCY_ATTR_MAX = __NL80211_FREQUENCY_ATTR_AFTER_LAST - 1 }; +#define NL80211_FREQUENCY_ATTR_MAX_TX_POWER NL80211_FREQUENCY_ATTR_MAX_TX_POWER + /** * enum nl80211_bitrate_attr - bitrate attributes * @NL80211_BITRATE_ATTR_RATE: Bitrate in units of 100 kbps -- cgit v1.2.3 From f80b5e99c7dac5a9a0d72496cec5075a12cd1476 Mon Sep 17 00:00:00 2001 From: Henrique de Moraes Holschuh Date: Fri, 21 Nov 2008 20:40:09 -0200 Subject: rfkill: preserve state across suspend The rfkill class API requires that the driver connected to a class call rfkill_force_state() on resume to update the real state of the rfkill controller, OR that it provides a get_state() hook. This means there is potentially a hidden call in the resume code flow that changes rfkill->state (i.e. rfkill_force_state()), so the previous state of the transmitter was being lost. The simplest and most future-proof way to fix this is to explicitly store the pre-sleep state on the rfkill structure, and restore from that on resume. Signed-off-by: Henrique de Moraes Holschuh Acked-by: Ivo van Doorn Cc: Matthew Garrett Cc: Alan Jenkins Signed-off-by: John W. Linville --- include/linux/rfkill.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 4cd64b0d9825..f376a93927f7 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -108,6 +108,7 @@ struct rfkill { struct device dev; struct list_head node; + enum rfkill_state state_for_resume; }; #define to_rfkill(d) container_of(d, struct rfkill, dev) -- cgit v1.2.3 From bf8c1ac6d81ba8c0e4dc2215f84f5e2a3c8227e8 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 22 Nov 2008 22:00:31 +0200 Subject: nl80211: Change max TX power to be in mBm instead of dBm In order to be consistent with NL80211_ATTR_POWER_RULE_MAX_EIRP, change NL80211_FREQUENCY_ATTR_MAX_TX_POWER to use mBm and U32 instead of dBm and U8. This is a userspace interface change, but the previous version had not yet been pushed upstream and there are no userspace programs using this yet, so there is justification to get this change in as long as it goes in before the previous version gets out. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 54d6ebe38e39..e08c8bcfb78d 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -508,7 +508,8 @@ enum nl80211_band_attr { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_RADAR: Radar detection is mandatory * on this channel in current regulatory domain. - * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in dBm. + * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in mBm + * (100 * dBm). */ enum nl80211_frequency_attr { __NL80211_FREQUENCY_ATTR_INVALID, -- cgit v1.2.3 From d211af055d0c12dc3416c2886e6fbdc6eb74a381 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 24 Nov 2008 15:38:45 +0100 Subject: i386: get rid of the use of KPROBE_ENTRY / KPROBE_END entry_32.S is now the only user of KPROBE_ENTRY / KPROBE_END, treewide. This patch reorders entry_64.S and explicitly generates a separate section for functions that need the protection. The generated code before and after the patch is equal. The KPROBE_ENTRY and KPROBE_END macro's are removed too. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- include/linux/linkage.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 9fd1f859021b..fee9e59649c1 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -64,14 +64,6 @@ name: #endif -#define KPROBE_ENTRY(name) \ - .pushsection .kprobes.text, "ax"; \ - ENTRY(name) - -#define KPROBE_END(name) \ - END(name); \ - .popsection - #ifndef END #define END(name) \ .size name, .-name -- cgit v1.2.3 From 8b752e3ef6e3f5cde87afc649dd51d92b1e549c1 Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 28 Nov 2008 09:52:40 +0800 Subject: softirq: remove useless function __local_bh_enable Impact: remove unused code __local_bh_enable has been replaced with _local_bh_enable. As comments says "it always nests inside local_bh_enable() sections" has not been valid now. Also there is no reason to use __local_bh_enable anywhere, so we can remove this useless function. Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 777dbf695d44..27b1bcffe408 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H extern void local_bh_disable(void); -extern void __local_bh_enable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); -- cgit v1.2.3 From 0f0ca340e57bd7446855fefd07a64249acf81223 Mon Sep 17 00:00:00 2001 From: Giuseppe Cavallaro Date: Fri, 28 Nov 2008 16:24:56 -0800 Subject: phy: power management support This patch adds the power management support into the physical abstraction layer. Suspend and resume functions respectively turns on/off the bit 11 into the PHY Basic mode control register. Generic PHY device starts supporting PM. In order to support the wake-on LAN and avoid to put in power down the PHY device, the MDIO is aware of what the Ethernet device wants to do. Voluntary, no CONFIG_PM defines were added into the sources. Also generic suspend/resume functions are exported to allow other drivers use them (such as genphy_config_aneg etc.). Within the phy_driver_register function, we need to remove the memset. It overrides the device driver owner and it is not good. Signed-off-by: Giuseppe Cavallaro Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 77c4ed60b982..d7e54d98869f 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -467,6 +467,8 @@ int genphy_restart_aneg(struct phy_device *phydev); int genphy_config_aneg(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); +int genphy_suspend(struct phy_device *phydev); +int genphy_resume(struct phy_device *phydev); void phy_driver_unregister(struct phy_driver *drv); int phy_driver_register(struct phy_driver *new_driver); void phy_prepare_link(struct phy_device *phydev, -- cgit v1.2.3 From b908b53d580c3e9aba81ebe3339c5b7b4fa8031d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 1 Dec 2008 06:30:04 +0000 Subject: of/gpio: Implement of_get_gpio_flags() This adds a new function, of_get_gpio_flags, which is like of_get_gpio(), but accepts a new "flags" argument. This new function will be used by the drivers that need to retrieve additional GPIO information, such as active-low flag. Also, this changes the default ("simple") .xlate routine to warn about bogus (< 2) #gpio-cells usage: the second cell should always be present for GPIO flags. Signed-off-by: Anton Vorontsov Signed-off-by: Paul Mackerras --- include/linux/of_gpio.h | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 67db101d0eb8..e25abf610cb6 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -14,9 +14,22 @@ #ifndef __LINUX_OF_GPIO_H #define __LINUX_OF_GPIO_H +#include +#include #include #include +struct device_node; + +/* + * This is Linux-specific flags. By default controllers' and Linux' mapping + * match, but GPIO controllers are free to translate their own flags to + * Linux-specific in their .xlate callback. Though, 1:1 mapping is recommended. + */ +enum of_gpio_flags { + OF_GPIO_ACTIVE_LOW = 0x1, +}; + #ifdef CONFIG_OF_GPIO /* @@ -26,7 +39,7 @@ struct of_gpio_chip { struct gpio_chip gc; int gpio_cells; int (*xlate)(struct of_gpio_chip *of_gc, struct device_node *np, - const void *gpio_spec); + const void *gpio_spec, enum of_gpio_flags *flags); }; static inline struct of_gpio_chip *to_of_gpio_chip(struct gpio_chip *gc) @@ -50,20 +63,37 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc) return container_of(of_gc, struct of_mm_gpio_chip, of_gc); } -extern int of_get_gpio(struct device_node *np, int index); +extern int of_get_gpio_flags(struct device_node *np, int index, + enum of_gpio_flags *flags); + extern int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc); extern int of_gpio_simple_xlate(struct of_gpio_chip *of_gc, struct device_node *np, - const void *gpio_spec); + const void *gpio_spec, + enum of_gpio_flags *flags); #else /* Drivers may not strictly depend on the GPIO support, so let them link. */ -static inline int of_get_gpio(struct device_node *np, int index) +static inline int of_get_gpio_flags(struct device_node *np, int index, + enum of_gpio_flags *flags) { return -ENOSYS; } #endif /* CONFIG_OF_GPIO */ +/** + * of_get_gpio - Get a GPIO number to use with GPIO API + * @np: device node to get GPIO from + * @index: index of the GPIO + * + * Returns GPIO number to use with Linux generic GPIO API, or one of the errno + * value on the error condition. + */ +static inline int of_get_gpio(struct device_node *np, int index) +{ + return of_get_gpio_flags(np, index, NULL); +} + #endif /* __LINUX_OF_GPIO_H */ -- cgit v1.2.3 From 8865c418caf4e9dd2c24bdfae3a5a4106e143e60 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 3 Dec 2008 22:12:38 -0800 Subject: atm: 32-bit ioctl compatibility We lack compat ioctl support through most of the ATM code. This patch deals with most of it, and I can now at least use BR2684 and PPPoATM with 32-bit userspace. I haven't added a .compat_ioctl method to struct atm_ioctl, because AFAICT none of the current users need any conversion -- so we can just call the ->ioctl() method in every case. I looked at br2684, clip, lec, mpc, pppoatm and atmtcp. In svc_compat_ioctl() the only mangling which is needed is to change COMPAT_ATM_ADDPARTY to ATM_ADDPARTY. Although it's defined as _IOW('a', ATMIOC_SPECIAL+4,struct atm_iobuf) it doesn't actually _take_ a struct atm_iobuf as an argument -- it takes a struct sockaddr_atmsvc, which _is_ the same between 32-bit and 64-bit code, so doesn't need conversion. Almost all of vcc_ioctl() would have been identical, so I converted that into a core do_vcc_ioctl() function with an 'int compat' argument. I've done the same with atm_dev_ioctl(), where there _are_ a few differences, but still it's relatively contained and there would otherwise have been a lot of duplication. I haven't done any of the actual device-specific ioctls, although I've added a compat_ioctl method to struct atmdev_ops. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- include/linux/atm.h | 17 ++++++++++++++--- include/linux/atmdev.h | 15 +++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atm.h b/include/linux/atm.h index c791ddd96939..d3b292174aeb 100644 --- a/include/linux/atm.h +++ b/include/linux/atm.h @@ -231,10 +231,21 @@ static __inline__ int atmpvc_addr_in_use(struct sockaddr_atmpvc addr) */ struct atmif_sioc { - int number; - int length; - void __user *arg; + int number; + int length; + void __user *arg; }; +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +#include +struct compat_atmif_sioc { + int number; + int length; + compat_uptr_t arg; +}; +#endif +#endif + typedef unsigned short atm_backend_t; #endif diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index a3d07c29d16c..086e5c362d3a 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -100,6 +100,10 @@ struct atm_dev_stats { /* use backend to make new if */ #define ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL+4,struct atm_iobuf) /* add party to p2mp call */ +#ifdef CONFIG_COMPAT +/* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */ +#define COMPAT_ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL+4,struct compat_atm_iobuf) +#endif #define ATM_DROPPARTY _IOW('a', ATMIOC_SPECIAL+5,int) /* drop party from p2mp call */ @@ -224,6 +228,13 @@ struct atm_cirange { extern struct proc_dir_entry *atm_proc_root; #endif +#ifdef CONFIG_COMPAT +#include +struct compat_atm_iobuf { + int length; + compat_uptr_t buffer; +}; +#endif struct k_atm_aal_stats { #define __HANDLE_ITEM(i) atomic_t i @@ -379,6 +390,10 @@ struct atmdev_ops { /* only send is required */ int (*open)(struct atm_vcc *vcc); void (*close)(struct atm_vcc *vcc); int (*ioctl)(struct atm_dev *dev,unsigned int cmd,void __user *arg); +#ifdef CONFIG_COMPAT + int (*compat_ioctl)(struct atm_dev *dev,unsigned int cmd, + void __user *arg); +#endif int (*getsockopt)(struct atm_vcc *vcc,int level,int optname, void __user *optval,int optlen); int (*setsockopt)(struct atm_vcc *vcc,int level,int optname, -- cgit v1.2.3 From 00ef9f7348dfd2fc223ec42aceb30836e86b367f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Dec 2008 09:00:17 +0100 Subject: lockdep: change a held lock's class Impact: introduce new lockdep API Allow to change a held lock's class. Basically the same as the existing code to change a subclass therefore reuse all that. The XFS code will be able to use this to annotate their inode locking. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 8956daf64abd..37a0361f4685 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -314,8 +314,15 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); -extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, - unsigned long ip); +extern void lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip); + +static inline void lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + lock_set_class(lock, lock->name, lock->key, subclass, ip); +} # define INIT_LOCKDEP .lockdep_recursion = 0, @@ -333,6 +340,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) -- cgit v1.2.3 From c9bb6003dd096ad190e1594a7d835ae1c39fae8f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 29 Oct 2008 23:46:09 -0700 Subject: of: Fix comment, sparc no longer uses of_device objects on special busses. It only uses of_platform_bus_type. Signed-off-by: David S. Miller --- include/linux/of_platform.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index a8efcfeea732..3d327b67d7e2 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -26,8 +26,7 @@ extern struct bus_type of_platform_bus_type; /* * An of_platform_driver driver is attached to a basic of_device on - * the "platform bus" (of_platform_bus_type) (or ISA, EBUS and SBUS - * busses on sparc). + * the "platform bus" (of_platform_bus_type). */ struct of_platform_driver { -- cgit v1.2.3 From 72bdcf34380917260da41e3c49e10edee04bc5cd Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 26 Nov 2008 16:15:24 +0200 Subject: nl80211: Add frequency configuration (including HT40) This patch adds new NL80211_CMD_SET_WIPHY attributes NL80211_ATTR_WIPHY_FREQ and NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET to allow userspace to set the operating channel (e.g., hostapd for AP mode). Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index e08c8bcfb78d..92f79d2bdd8c 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -26,8 +26,9 @@ * @NL80211_CMD_GET_WIPHY: request information about a wiphy or dump request * to get a list of all present wiphys. * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or - * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME - * and/or %NL80211_ATTR_WIPHY_TXQ_PARAMS. + * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, + * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, and/or + * %NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -180,6 +181,14 @@ enum nl80211_commands { * /sys/class/ieee80211//index * @NL80211_ATTR_WIPHY_NAME: wiphy name (used for renaming) * @NL80211_ATTR_WIPHY_TXQ_PARAMS: a nested array of TX queue parameters + * @NL80211_ATTR_WIPHY_FREQ: frequency of the selected channel in MHz + * @NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET: included with NL80211_ATTR_WIPHY_FREQ + * if HT20 or HT40 are allowed (i.e., 802.11n disabled if not included): + * NL80211_SEC_CHAN_NO_HT = HT not allowed (i.e., same as not including + * this attribute) + * NL80211_SEC_CHAN_DISABLED = HT20 only + * NL80211_SEC_CHAN_BELOW = secondary channel is below the primary channel + * NL80211_SEC_CHAN_ABOVE = secondary channel is above the primary channel * * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on * @NL80211_ATTR_IFNAME: network interface name @@ -315,6 +324,8 @@ enum nl80211_attrs { NL80211_ATTR_BSS_BASIC_RATES, NL80211_ATTR_WIPHY_TXQ_PARAMS, + NL80211_ATTR_WIPHY_FREQ, + NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET, /* add attributes here, update the policy in nl80211.c */ @@ -329,6 +340,8 @@ enum nl80211_attrs { #define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY #define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES #define NL80211_ATTR_WIPHY_TXQ_PARAMS NL80211_ATTR_WIPHY_TXQ_PARAMS +#define NL80211_ATTR_WIPHY_FREQ NL80211_ATTR_WIPHY_FREQ +#define NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 @@ -742,4 +755,10 @@ enum nl80211_txq_q { NL80211_TXQ_Q_BK }; +enum nl80211_sec_chan_offset { + NL80211_SEC_CHAN_NO_HT /* No HT */, + NL80211_SEC_CHAN_DISABLED /* HT20 only */, + NL80211_SEC_CHAN_BELOW /* HT40- */, + NL80211_SEC_CHAN_ABOVE /* HT40+ */ +}; #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From 10ec4f1d0851eb97cd53db66150835dd7f64829d Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 26 Nov 2008 13:03:08 -0800 Subject: nl80211: relicense nl80211.h under the ISC We have a few BSD/ISC licensed userspace applications which include nl80211.h from the kernel. To avoid legal ambiguity for usage of the header file in these projects we rather simply relicense the header file under the ISC. We've received consent from all contributors to it. Signed-off-by: Luis R. Rodriguez Acked-by: Johannes Berg Acked-by: Michael Wu Acked-by: Luis Carlos Cobo Acked-by: Michael Buesch Acked-by: Jouni Malinen Acked-by: Colin McCabe Acked-by: Javier Cardona Cc: johannes@sipsolutions.net Cc: altape@eden.rutgers.edu Cc: luisca@cozybit.com Cc: mb@bu3sch.de Cc: jouni.malinen@atheros.com Cc: colin@cozybit.com Cc: javier@cozybit.com Signed-off-by: John W. Linville --- include/linux/nl80211.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 92f79d2bdd8c..04d4516f9c71 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -3,7 +3,26 @@ /* * 802.11 netlink interface public header * - * Copyright 2006, 2007 Johannes Berg + * Copyright 2006, 2007, 2008 Johannes Berg + * Copyright 2008 Michael Wu + * Copyright 2008 Luis Carlos Cobo + * Copyright 2008 Michael Buesch + * Copyright 2008 Luis R. Rodriguez + * Copyright 2008 Jouni Malinen + * Copyright 2008 Colin McCabe + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * */ /** -- cgit v1.2.3 From b74ca3a896b9ab5f952bc440154758e708c48884 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 8 Dec 2008 01:14:16 -0800 Subject: netdevice: Kill netdev->priv This is the last shoot of this series. After I removing all directly reference of netdev->priv, I am killing "priv" of "struct net_device" and fixing relative comments/docs. Anyone will not be allowed to reference netdev->priv directly. If you want to reference the memory of private data, use netdev_priv() instead. If the private data is not allocted when alloc_netdev(), use netdev->ml_priv to point that memory after you creating that private data. Signed-off-by: Wang Chen Signed-off-by: David S. Miller --- include/linux/hdlc.h | 2 +- include/linux/netdevice.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index e960faac609d..fd47a151665e 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -43,7 +43,7 @@ struct hdlc_proto { }; -/* Pointed to by dev->priv */ +/* Pointed to by netdev_priv(dev) */ typedef struct hdlc_device { /* used by HDLC layer to take control over HDLC device from hw driver*/ int (*attach)(struct net_device *dev, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0df0db068ac3..47e731528315 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -785,7 +785,6 @@ struct net_device /* * One part is mostly used on xmit path (device) */ - void *priv; /* pointer to private data */ /* These may be needed for future network-power-down code. */ unsigned long trans_start; /* Time (in jiffies) of last Tx */ -- cgit v1.2.3 From 0049bab5e765aa74cf767a834fa336e19453fc5e Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 8 Dec 2008 01:18:05 -0800 Subject: dccp: Remove obsolete parts of the old CCID interface The TX/RX CCIDs of the minisock are now redundant: similar to the Ack Vector case, their value equals initially that of the sysctl, but at the end of feature negotiation may be something different. The old interface removed by this patch thus has been replaced by the newer interface to dynamically query the currently loaded CCIDs. Also removed are the constructors for the TX CCID and the RX CCID, since the switch "rx <-> non-rx" is done by the handler in minisocks.c (and the handler is the only place in the code where CCIDs are loaded). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6a72ff52a8a4..46daea312d92 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -370,7 +370,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated @@ -378,8 +377,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_rx_ccid; - __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; -- cgit v1.2.3 From 4098dce5be537a157eed4a326efd464109825b8b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 8 Dec 2008 01:18:37 -0800 Subject: dccp: Remove manual influence on NDP Count feature Updating the NDP count feature is handled automatically now: * for CCID-2 it is disabled, since the code does not use NDP counts; * for CCID-3 it is enabled, as NDP counts are used to determine loss lengths. Allowing the user to change NDP values leads to unpredictable and failing behaviour, since it is then possible to disable NDP counts even when they are needed (e.g. in CCID-3). This means that only those user settings are sensible that agree with the values for Send NDP Count implied by the choice of CCID. But those settings are already activated by the feature negotiation (CCID dependency tracking), hence this form of support is redundant. At startup the initialisation of the NDP count feature uses the default value of 0, which is done implicitly by the zeroing-out of the socket when it is allocated. If the choice of CCID or feature negotiation enables NDP count, this will then be updated via the NDP activation handler. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 46daea312d92..60e94438eadd 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -371,14 +371,12 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) - * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; __u8 dccpms_send_ack_vector; - __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; struct list_head dccpms_conf; }; @@ -490,6 +488,7 @@ struct dccp_ackvec; * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) + * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) @@ -529,6 +528,7 @@ struct dccp_sock { __u16 dccps_r_ack_ratio; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; + __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; -- cgit v1.2.3 From 6fdd34d43bff8be9bb925b49d87a0ee144d2ab07 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 8 Dec 2008 01:19:06 -0800 Subject: dccp ccid-2: Phase out the use of boolean Ack Vector sysctl This removes the use of the sysctl and the minisock variable for the Send Ack Vector feature, as it now is handled fully dynamically via feature negotiation (i.e. when CCID-2 is enabled, Ack Vectors are automatically enabled as per RFC 4341, 4.). Using a sysctl in parallel to this implementation would open the door to crashes, since much of the code relies on tests of the boolean minisock / sysctl variable. Thus, this patch replaces all tests of type if (dccp_msk(sk)->dccpms_send_ack_vector) /* ... */ with if (dp->dccps_hc_rx_ackvec != NULL) /* ... */ The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature negotiation concluded that Ack Vectors are to be used on the half-connection. Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child), so that the test is a valid one. The activation handler for Ack Vectors is called as soon as the feature negotiation has concluded at the * server when the Ack marking the transition RESPOND => OPEN arrives; * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN. Adding the sequence number of the Response packet to the Ack Vector has been removed, since (a) connection establishment implies that the Response has been received; (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e. this entry will always be ignored; (c) it can not be used for anything useful - to detect loss for instance, only packets received after the loss can serve as pseudo-dupacks. There was a FIXME to change the error code when dccp_ackvec_add() fails. I removed this after finding out that: * the check whether ackno < ISN is already made earlier, * this Response is likely the 1st packet with an Ackno that the client gets, * so when dccp_ackvec_add() fails, the reason is likely not a packet error. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 60e94438eadd..61734e27abb7 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -360,7 +360,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 #define DCCPF_INITIAL_ACK_RATIO 2 #define DCCPF_INITIAL_CCID DCCPC_CCID2 -#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 @@ -370,13 +369,11 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_send_ack_vector; struct list_head dccpms_pending; struct list_head dccpms_conf; }; -- cgit v1.2.3 From 1e641743f055f075ed9a4edd75f1fb1e05669ddc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Dec 2008 09:23:33 +0000 Subject: Audit: Log TIOCSTI AUDIT_TTY records currently log all data read by processes marked for TTY input auditing, even if the data was "pushed back" using the TIOCSTI ioctl, not typed by the user. This patch records all TIOCSTI calls to disambiguate the input. It generates one audit message per character pushed back; considering TIOCSTI is used very rarely, this simple solution is probably good enough. (The only program I could find that uses TIOCSTI is mailx/nail in "header editing" mode, e.g. using the ~h escape. mailx is used very rarely, and the escapes are used even rarer.) Signed-Off-By: Miloslav Trmac Signed-off-by: Al Viro Signed-off-by: James Morris --- include/linux/tty.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 3b8121d4e36f..580700f20a1c 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -442,6 +442,7 @@ extern void tty_audit_add_data(struct tty_struct *tty, unsigned char *data, size_t size); extern void tty_audit_exit(void); extern void tty_audit_fork(struct signal_struct *sig); +extern void tty_audit_tiocsti(struct tty_struct *tty, char ch); extern void tty_audit_push(struct tty_struct *tty); extern void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid, u32 sessionid); @@ -450,6 +451,9 @@ static inline void tty_audit_add_data(struct tty_struct *tty, unsigned char *data, size_t size) { } +static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch) +{ +} static inline void tty_audit_exit(void) { } -- cgit v1.2.3 From 58494487581cb143a0d763e3056a894d5009d60a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 26 Nov 2008 12:02:53 +0100 Subject: oprofile: update comment for oprofile_add_sample() The cpu argument is no longer part of the parameter list. Signed-off-by: Robert Richter --- include/linux/oprofile.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 5231861f357d..1ce9fe572e51 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -86,8 +86,7 @@ int oprofile_arch_init(struct oprofile_operations * ops); void oprofile_arch_exit(void); /** - * Add a sample. This may be called from any context. Pass - * smp_processor_id() as cpu. + * Add a sample. This may be called from any context. */ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event); -- cgit v1.2.3 From e09373f22e76cc048ca5fe10a9ff9012f5d64309 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 26 Nov 2008 14:04:19 +0100 Subject: ring_buffer: add remaining cpu functions to ring_buffer.h These functions are not yet in ring_buffer.h though they seems to be part of the API. Cc: Steven Rostedt Signed-off-by: Robert Richter --- include/linux/ring_buffer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e097c2e6b6dc..de9d8c12e5ec 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -116,6 +116,8 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); -- cgit v1.2.3 From cdc693643271b2e6a693cf8f6afb258cce01f058 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 10 Dec 2008 13:55:49 +0000 Subject: ALSA: Add support for mechanical jack insertion Some systems support both mechanical and electrical jack detection, allowing them to report that a jack is physically present but does not have any functioning connections. Add a new jack type for these, allowing user space to report faulty connections. Thanks to Guillem Jover for the suggestion. Signed-off-by: Mark Brown Signed-off-by: Takashi Iwai --- include/linux/input.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 7323d2ff5151..abd223b0f586 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -645,6 +645,7 @@ struct input_absinfo { #define SW_MICROPHONE_INSERT 0x04 /* set = inserted */ #define SW_DOCK 0x05 /* set = plugged into dock */ #define SW_LINEOUT_INSERT 0x06 /* set = inserted */ +#define SW_JACK_PHYSICAL_INSERT 0x07 /* set = mechanical switch set */ #define SW_MAX 0x0f #define SW_CNT (SW_MAX+1) -- cgit v1.2.3 From 2107fb8b5bf018be691afdd4c6ffaecf0c3307be Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Wed, 5 Nov 2008 00:35:38 +0000 Subject: smsc911x: add dynamic bus configuration Convert the driver to select 16-bit or 32-bit bus access at runtime, at a small performance cost. Signed-off-by: Steve Glendinning Acked-by: Catalin Marinas Signed-off-by: David S. Miller --- include/linux/smsc911x.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h index 47c4ffd10dbb..1cbf0313adde 100644 --- a/include/linux/smsc911x.h +++ b/include/linux/smsc911x.h @@ -28,6 +28,7 @@ struct smsc911x_platform_config { unsigned int irq_polarity; unsigned int irq_type; + unsigned int flags; phy_interface_t phy_interface; }; @@ -39,4 +40,8 @@ struct smsc911x_platform_config { #define SMSC911X_IRQ_TYPE_OPEN_DRAIN 0 #define SMSC911X_IRQ_TYPE_PUSH_PULL 1 +/* Constants for flags */ +#define SMSC911X_USE_16BIT (BIT(0)) +#define SMSC911X_USE_32BIT (BIT(1)) + #endif /* __LINUX_SMSC911X_H__ */ -- cgit v1.2.3 From bd91b8bf372911c1e4d66d6bb44fe409349a6791 Mon Sep 17 00:00:00 2001 From: Benjamin Thery Date: Wed, 10 Dec 2008 16:07:08 -0800 Subject: netns: ip6mr: allocate mroute6_socket per-namespace. Preliminary work to make IPv6 multicast forwarding netns-aware. Make IPv6 multicast forwarding mroute6_socket per-namespace, moves it into struct netns_ipv6. At the moment, mroute6_socket is only referenced in init_net. Signed-off-by: Benjamin Thery Signed-off-by: David S. Miller --- include/linux/mroute6.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 6f4c180179e2..2cd9901ee5c7 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -117,6 +117,7 @@ struct sioc_mif_req6 #include #include /* for struct sk_buff_head */ +#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) @@ -232,10 +233,13 @@ struct rtmsg; extern int ip6mr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait); #ifdef CONFIG_IPV6_MROUTE -extern struct sock *mroute6_socket; +static inline struct sock *mroute6_socket(struct net *net) +{ + return net->ipv6.mroute6_sk; +} extern int ip6mr_sk_done(struct sock *sk); #else -#define mroute6_socket NULL +static inline struct sock *mroute6_socket(struct net *net) { return NULL; } static inline int ip6mr_sk_done(struct sock *sk) { return 0; } #endif #endif -- cgit v1.2.3 From 58701ad41105638baa0b38ffe9ac5b10469c1fd3 Mon Sep 17 00:00:00 2001 From: Benjamin Thery Date: Wed, 10 Dec 2008 16:22:34 -0800 Subject: netns: ip6mr: store netns in struct mfc6_cache This patch stores into struct mfc6_cache the network namespace each mfc6_cache belongs to. The new member is mfc6_net. mfc6_net is assigned at cache allocation and doesn't change during the rest of the cache entry life. This will help to retrieve the current netns around the IPv6 multicast forwarding code. At the moment, all mfc6_cache are allocated in init_net. Changelog: ========== * Use write_pnet()/read_pnet() to set and get mfc6_net. Signed-off-by: Benjamin Thery Signed-off-by: David S. Miller --- include/linux/mroute6.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 2cd9901ee5c7..15d85fe12bbd 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -188,6 +188,9 @@ struct mif_device struct mfc6_cache { struct mfc6_cache *next; /* Next entry on cache line */ +#ifdef CONFIG_NET_NS + struct net *mfc6_net; +#endif struct in6_addr mf6c_mcastgrp; /* Group the entry belongs to */ struct in6_addr mf6c_origin; /* Source of packet */ mifi_t mf6c_parent; /* Source interface */ @@ -210,6 +213,18 @@ struct mfc6_cache } mfc_un; }; +static inline +struct net *mfc6_net(const struct mfc6_cache *mfc) +{ + return read_pnet(&mfc->mfc6_net); +} + +static inline +void mfc6_net_set(struct mfc6_cache *mfc, struct net *net) +{ + write_pnet(&mfc->mfc6_net, hold_net(net)); +} + #define MFC_STATIC 1 #define MFC_NOTIFY 2 -- cgit v1.2.3 From 8229efdaef1e7913ae1712c0ba752f267e5fcd5e Mon Sep 17 00:00:00 2001 From: Benjamin Thery Date: Wed, 10 Dec 2008 16:30:15 -0800 Subject: netns: ip6mr: enable namespace support in ipv6 multicast forwarding code This last patch makes the appropriate changes to use and propagate the network namespace where needed in IPv6 multicast forwarding code. This consists mainly in replacing all the remaining init_net occurences with current netns pointer retrieved from sockets, net devices or mfc6_caches depending on the routines' contexts. Some routines receive a new 'struct net' parameter to propagate the current netns: * ip6mr_get_route * ip6mr_cache_report * ip6mr_cache_find * ip6mr_cache_unresolved * mif6_add/mif6_delete * ip6mr_mfc_add/ip6mr_mfc_delete * ip6mr_reg_vif All the IPv6 multicast forwarding variables moved to struct netns_ipv6 by the previous patches are now referenced in the correct namespace. Changelog: ========== * Take into account the net associated to mfc6_cache when matching entries in mfc_unres_queue list. * Call mroute_clean_tables() in ip6mr_net_exit() to free memory allocated per-namespace. * Call dev_net_set() in ip6mr_reg_vif() to initialize dev->nd_net correctly. Signed-off-by: Benjamin Thery Signed-off-by: David S. Miller --- include/linux/mroute6.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 15d85fe12bbd..5375faca1f72 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -245,7 +245,8 @@ void mfc6_net_set(struct mfc6_cache *mfc, struct net *net) #ifdef __KERNEL__ struct rtmsg; -extern int ip6mr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait); +extern int ip6mr_get_route(struct net *net, struct sk_buff *skb, + struct rtmsg *rtm, int nowait); #ifdef CONFIG_IPV6_MROUTE static inline struct sock *mroute6_socket(struct net *net) -- cgit v1.2.3 From 4d4be482a4d78ca906f45e99fd9fdb91e907f5ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Dec 2008 04:47:33 -0500 Subject: [XFS] add a FMODE flag to make XFS invisible I/O less hacky XFS has a mode called invisble I/O that doesn't update any of the timestamps. It's used for HSM-style applications and exposed through the nasty open by handle ioctl. Instead of doing directly assignment of file operations that set an internal flag for it add a new FMODE_NOCMTIME flag that we can check in the normal file operations. (addition of the generic VFS flag has been ACKed by Al as an interims solution) Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- include/linux/fs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 51bd9370d437..965b9ba3865d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -81,6 +81,14 @@ extern int dir_notify_enable; #define FMODE_WRITE_IOCTL ((__force fmode_t)128) #define FMODE_NDELAY_NOW ((__force fmode_t)256) +/* + * Don't update ctime and mtime. + * + * Currently a special hack for the XFS open_by_handle ioctl, but we'll + * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. + */ +#define FMODE_NOCMTIME ((__force fmode_t)2048) + #define RW_MASK 1 #define RWA_MASK 2 #define READ 0 -- cgit v1.2.3 From c2724775ce57c98b8af9694857b941dc61056516 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:49:59 +0100 Subject: x86, bts: provide in-kernel branch-trace interface Impact: cleanup Move the BTS bits from ptrace.c into ds.c. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b81fc5f7731..dc5ea65dc716 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1176,6 +1176,7 @@ struct task_struct { * The buffer to hold the BTS data. */ void *bts_buffer; + size_t bts_size; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ -- cgit v1.2.3 From bcbc4f20b52c2c40c43a4d2337707dcdfe81bc3a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:54:20 +0100 Subject: tracing/function-graph-tracer: annotate do_IRQ and smp_apic_timer_interrupt Impact: move most important x86 irq entry-points to a separate subsection Annotate do_IRQ and smp_apic_timer_interrupt to put them into the .irqentry.text subsection. These function will so be recognized as hardirq entrypoints for the function-graph-tracer. We could also annotate other irq entries but the others are far less important but they can be added on request. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 11cac81eed08..44020f31bd81 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -377,6 +377,16 @@ struct ftrace_graph_ret { */ #define __notrace_funcgraph notrace +/* + * We want to which function is an entrypoint of a hardirq. + * That will help us to put a signal on output. + */ +#define __irq_entry __attribute__((__section__(".irqentry.text"))) + +/* Limits of hardirq entrypoints */ +extern char __irqentry_text_start[]; +extern char __irqentry_text_end[]; + #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ @@ -414,6 +424,7 @@ static inline void unpause_graph_tracing(void) #else #define __notrace_funcgraph +#define __irq_entry static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } -- cgit v1.2.3 From 27af4245b6ce99e08c6a7c38825383bf51119cc9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Dec 2008 14:18:13 -0800 Subject: posix-timers: use "struct pid*" instead of "struct task_struct*" Impact: restructure, clean up code k_itimer holds the ref to the ->it_process until sys_timer_delete(). This allows to pin up to RLIMIT_SIGPENDING dead task_struct's. Change the code to use "struct pid *" instead. The patch doesn't kill ->it_process, it places ->it_pid into the union. ->it_process is still used by do_cpu_nanosleep() as before. It would be trivial to change the nanosleep code as well, but since it uses it_process in a special way I think it is better to keep this field for grep. The patch bloats the kernel by 104 bytes and it also adds the new pointer, ->it_signal, to k_itimer. It is used by lock_timer() to verify that the found timer was not created by another process. It is not clear why do we use the global database (and thus the global idr_lock) for posix timers. We still need the signal_struct->posix_timers which contains all useable timers, perhaps it is better to use some form of per-process array instead. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a7c721355549..4f71bf4e628c 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -45,7 +45,11 @@ struct k_itimer { int it_requeue_pending; /* waiting to requeue this timer */ #define REQUEUE_PENDING 1 int it_sigev_notify; /* notify word of sigevent struct */ - struct task_struct *it_process; /* process to send signal to */ + struct signal_struct *it_signal; + union { + struct pid *it_pid; /* pid of process to send signal to */ + struct task_struct *it_process; /* for clock_nanosleep */ + }; struct sigqueue *sigq; /* signal queue entry. */ union { struct { -- cgit v1.2.3 From c29541b24fb2c6301021637229ae5347c877330a Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 1 Dec 2008 14:18:11 -0800 Subject: linux/timex.h: cleanup for userspace Impact: fix user-space exported use Move all the kernel-specific defines and includes into the __KERNEL__ section so that they don't get leaked into userspace. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 73 ++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 9007313b5b71..998a55d80acf 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -53,46 +53,10 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H -#include #include -#include - #define NTP_API 4 /* NTP API version */ -/* - * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen - * for a slightly underdamped convergence characteristic. SHIFT_KH - * establishes the damping of the FLL and is chosen by wisdom and black - * art. - * - * MAXTC establishes the maximum time constant of the PLL. With the - * SHIFT_KG and SHIFT_KF values given and a time constant range from - * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours, - * respectively. - */ -#define SHIFT_PLL 4 /* PLL frequency factor (shift) */ -#define SHIFT_FLL 2 /* FLL frequency factor (shift) */ -#define MAXTC 10 /* maximum time constant (shift) */ - -/* - * SHIFT_USEC defines the scaling (shift) of the time_freq and - * time_tolerance variables, which represent the current frequency - * offset and maximum frequency tolerance. - */ -#define SHIFT_USEC 16 /* frequency offset scale (shift) */ -#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) -#define PPM_SCALE_INV_SHIFT 19 -#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ - PPM_SCALE + 1) - -#define MAXPHASE 500000000l /* max phase error (ns) */ -#define MAXFREQ 500000 /* max frequency error (ns/s) */ -#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT) -#define MINSEC 256 /* min interval between updates (s) */ -#define MAXSEC 2048 /* max interval between updates (s) */ -#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ - /* * syscall interface - used (mainly by NTP daemon) * to discipline kernel clock oscillator @@ -199,8 +163,45 @@ struct timex { #define TIME_BAD TIME_ERROR /* bw compat */ #ifdef __KERNEL__ +#include +#include +#include + #include +/* + * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen + * for a slightly underdamped convergence characteristic. SHIFT_KH + * establishes the damping of the FLL and is chosen by wisdom and black + * art. + * + * MAXTC establishes the maximum time constant of the PLL. With the + * SHIFT_KG and SHIFT_KF values given and a time constant range from + * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours, + * respectively. + */ +#define SHIFT_PLL 4 /* PLL frequency factor (shift) */ +#define SHIFT_FLL 2 /* FLL frequency factor (shift) */ +#define MAXTC 10 /* maximum time constant (shift) */ + +/* + * SHIFT_USEC defines the scaling (shift) of the time_freq and + * time_tolerance variables, which represent the current frequency + * offset and maximum frequency tolerance. + */ +#define SHIFT_USEC 16 /* frequency offset scale (shift) */ +#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) +#define PPM_SCALE_INV_SHIFT 19 +#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ + PPM_SCALE + 1) + +#define MAXPHASE 500000000l /* max phase error (ns) */ +#define MAXFREQ 500000 /* max frequency error (ns/s) */ +#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT) +#define MINSEC 256 /* min interval between updates (s) */ +#define MAXSEC 2048 /* max interval between updates (s) */ +#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ + /* * kernel variables * Note: maximum error = NTP synch distance = dispersion + delay / 2; -- cgit v1.2.3 From bb608e9db7d29616fb6e0d856c23434610d4a1bd Mon Sep 17 00:00:00 2001 From: Senthil Balasubramanian Date: Thu, 4 Dec 2008 20:38:13 +0530 Subject: wireless: Incorrect LEAP authentication algorithm identifier. This patch fixes a regression introduced by "wireless: avoid some net/ieee80211.h vs. linux/ieee80211.h conflicts" LEAP authentication algorithm identifier should be 128. Signed-off-by: Senthil Balasubramanian Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a6ec928186ad..c4e6ca1a6306 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -836,7 +836,7 @@ struct ieee80211_ht_info { /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 -#define WLAN_AUTH_LEAP 2 +#define WLAN_AUTH_LEAP 128 #define WLAN_AUTH_CHALLENGE_LEN 128 -- cgit v1.2.3 From 4dec9b807be757780ca3611a959ac22c28d292a7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Dec 2008 17:48:48 +0100 Subject: rfkill: strip pointless notifier chain No users, so no reason to have it. Signed-off-by: Johannes Berg Acked-by: Ivo van Doorn Signed-off-by: John W. Linville --- include/linux/rfkill.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index f376a93927f7..164332cbb77c 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -149,11 +149,4 @@ static inline char *rfkill_get_led_name(struct rfkill *rfkill) #endif } -/* rfkill notification chain */ -#define RFKILL_STATE_CHANGED 0x0001 /* state of a normal rfkill - switch has changed */ - -int register_rfkill_notifier(struct notifier_block *nb); -int unregister_rfkill_notifier(struct notifier_block *nb); - #endif /* RFKILL_H */ -- cgit v1.2.3 From b690ace50be7d10d77cb7a6d5ef1bd9de649852f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 21 Oct 2008 14:07:03 +0100 Subject: [ARM] S3C6400: serial support for S3C6400 and S3C6410 SoCs Add support to the Samsung serial driver for the S3C6400 and S3C6410 serial ports. Signed-off-by: Ben Dooks --- include/linux/serial_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 4e4f1277f3bf..feb3b939ec4b 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -158,6 +158,8 @@ /* SH-SCI */ #define PORT_SCIFA 83 +#define PORT_S3C6400 84 + #ifdef __KERNEL__ #include -- cgit v1.2.3 From b53c7583e26746ef6f66c866841e10450150ed8e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 4 Dec 2008 10:01:52 -0800 Subject: rapidio: struct device - replace bus_id with dev_name(), dev_set_name() Cc: Matt Porter Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman Signed-off-by: Paul Mackerras --- include/linux/rio_drv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index 90987b7bcc1b..32c0547ffafc 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -427,9 +427,9 @@ void rio_dev_put(struct rio_dev *); * Get the unique RIO device identifier. Returns the device * identifier string. */ -static inline char *rio_name(struct rio_dev *rdev) +static inline const char *rio_name(struct rio_dev *rdev) { - return rdev->dev.bus_id; + return dev_name(&rdev->dev); } /** -- cgit v1.2.3 From 1a881f27c50b4fbd6858a8696a189263621136b0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:27:47 -0800 Subject: net: Add frag_list support to GSO This patch allows GSO to handle frag_list in a limited way for the purposes of allowing packets merged by GRO to be refragmented on output. Most hardware won't (and aren't expected to) support handling GRO frag_list packets directly. Therefore we will perform GSO in software for those cases. However, for drivers that can support it (such as virtual NICs) we may not have to segment the packets at all. Whether the added overhead of GRO/GSO is worthwhile for bridges and routers when weighed against the benefit of potentially increasing the MTU within the host is still an open question. However, for the case of host nodes this is undoubtedly a win. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b60c26b7d31c..bdf5465deb91 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1858,6 +1858,8 @@ static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { return skb_is_gso(skb) && (!skb_gso_ok(skb, dev->features) || + (skb_shinfo(skb)->frag_list && + !(dev->features & NETIF_F_FRAGLIST)) || unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); } -- cgit v1.2.3 From d565b0a1a9b6ee7dff46e1f68b26b526ac11ae50 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:38:52 -0800 Subject: net: Add Generic Receive Offload infrastructure This patch adds the top-level GRO (Generic Receive Offload) infrastructure. This is pretty similar to LRO except that this is protocol-independent. Instead of holding packets in an lro_mgr structure, they're now held in napi_struct. For drivers that intend to use this, they can set the NETIF_F_GRO bit and call napi_gro_receive instead of netif_receive_skb or just call netif_rx. The latter will call napi_receive_skb automatically. When napi_gro_receive is used, the driver must either call napi_complete/napi_rx_complete, or call napi_gro_flush in softirq context if the driver uses the primitives __napi_complete/__napi_rx_complete. Protocols will set the gro_receive and gro_complete function pointers in order to participate in this scheme. In addition to the packet, gro_receive will get a list of currently held packets. Each packet in the list has a same_flow field which is non-zero if it is a potential match for the new packet. For each packet that may match, they also have a flush field which is non-zero if the held packet must not be merged with the new packet. Once gro_receive has determined that the new skb matches a held packet, the held packet may be processed immediately if the new skb cannot be merged with it. In this case gro_receive should return the pointer to the existing skb in gro_list. Otherwise the new skb should be merged into the existing packet and NULL should be returned, unless the new skb makes it impossible for any further merges to be made (e.g., FIN packet) where the merged skb should be returned. Whenever the skb is merged into an existing entry, the gro_receive function should set NAPI_GRO_CB(skb)->same_flow. Note that if an skb merely matches an existing entry but can't be merged with it, then this shouldn't be set. If gro_receive finds it pointless to hold the new skb for future merging, it should set NAPI_GRO_CB(skb)->flush. Held packets will be flushed by napi_gro_flush which is called by napi_complete and napi_rx_complete. Currently held packets are stored in a singly liked list just like LRO. The list is limited to a maximum of 8 entries. In future, this may be expanded to use a hash table to allow more flows to be held for merging. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 80 +++++++++++++++++------------------------------ include/linux/netpoll.h | 5 --- 2 files changed, 28 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bdf5465deb91..58856b6737fb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -314,8 +314,9 @@ struct napi_struct { spinlock_t poll_lock; int poll_owner; struct net_device *dev; - struct list_head dev_list; #endif + struct list_head dev_list; + struct sk_buff *gro_list; }; enum @@ -376,22 +377,8 @@ static inline int napi_reschedule(struct napi_struct *napi) * * Mark NAPI processing as complete. */ -static inline void __napi_complete(struct napi_struct *n) -{ - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - list_del(&n->poll_list); - smp_mb__before_clear_bit(); - clear_bit(NAPI_STATE_SCHED, &n->state); -} - -static inline void napi_complete(struct napi_struct *n) -{ - unsigned long flags; - - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); -} +extern void __napi_complete(struct napi_struct *n); +extern void napi_complete(struct napi_struct *n); /** * napi_disable - prevent NAPI from scheduling @@ -640,9 +627,7 @@ struct net_device unsigned long state; struct list_head dev_list; -#ifdef CONFIG_NETPOLL struct list_head napi_list; -#endif /* Net device features */ unsigned long features; @@ -661,6 +646,7 @@ struct net_device #define NETIF_F_LLTX 4096 /* LockLess TX - deprecated. Please */ /* do not use LLTX in new drivers */ #define NETIF_F_NETNS_LOCAL 8192 /* Does not change network namespaces */ +#define NETIF_F_GRO 16384 /* Generic receive offload */ #define NETIF_F_LRO 32768 /* large receive offload */ /* Segmentation offload features */ @@ -984,22 +970,8 @@ static inline void *netdev_priv(const struct net_device *dev) * netif_napi_add() must be used to initialize a napi context prior to calling * *any* of the other napi related functions. */ -static inline void netif_napi_add(struct net_device *dev, - struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), - int weight) -{ - INIT_LIST_HEAD(&napi->poll_list); - napi->poll = poll; - napi->weight = weight; -#ifdef CONFIG_NETPOLL - napi->dev = dev; - list_add(&napi->dev_list, &dev->napi_list); - spin_lock_init(&napi->poll_lock); - napi->poll_owner = -1; -#endif - set_bit(NAPI_STATE_SCHED, &napi->state); -} +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight); /** * netif_napi_del - remove a napi context @@ -1007,12 +979,20 @@ static inline void netif_napi_add(struct net_device *dev, * * netif_napi_del() removes a napi context from the network device napi list */ -static inline void netif_napi_del(struct napi_struct *napi) -{ -#ifdef CONFIG_NETPOLL - list_del(&napi->dev_list); -#endif -} +void netif_napi_del(struct napi_struct *napi); + +struct napi_gro_cb { + /* This is non-zero if the packet may be of the same flow. */ + int same_flow; + + /* This is non-zero if the packet cannot be merged with the new skb. */ + int flush; + + /* Number of segments aggregated. */ + int count; +}; + +#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) struct packet_type { __be16 type; /* This is really htons(ether_type). */ @@ -1024,6 +1004,9 @@ struct packet_type { struct sk_buff *(*gso_segment)(struct sk_buff *skb, int features); int (*gso_send_check)(struct sk_buff *skb); + struct sk_buff **(*gro_receive)(struct sk_buff **head, + struct sk_buff *skb); + int (*gro_complete)(struct sk_buff *skb); void *af_packet_priv; struct list_head list; }; @@ -1377,6 +1360,9 @@ extern int netif_rx(struct sk_buff *skb); extern int netif_rx_ni(struct sk_buff *skb); #define HAVE_NETIF_RECEIVE_SKB 1 extern int netif_receive_skb(struct sk_buff *skb); +extern void napi_gro_flush(struct napi_struct *napi); +extern int napi_gro_receive(struct napi_struct *napi, + struct sk_buff *skb); extern void netif_nit_deliver(struct sk_buff *skb); extern int dev_valid_name(const char *name); extern int dev_ioctl(struct net *net, unsigned int cmd, void __user *); @@ -1621,17 +1607,7 @@ static inline void __netif_rx_complete(struct net_device *dev, static inline void netif_rx_complete(struct net_device *dev, struct napi_struct *napi) { - unsigned long flags; - - /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu - */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &napi->state))) - return; - local_irq_save(flags); - __netif_rx_complete(dev, napi); - local_irq_restore(flags); + napi_complete(napi); } static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e3d79593fb3a..e38d3c9dccda 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -94,11 +94,6 @@ static inline void netpoll_poll_unlock(void *have) rcu_read_unlock(); } -static inline void netpoll_netdev_init(struct net_device *dev) -{ - INIT_LIST_HEAD(&dev->napi_list); -} - #else static inline int netpoll_rx(struct sk_buff *skb) { -- cgit v1.2.3 From 71d93b39e52e92aea35f1058d957cf12250d0b75 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:42:33 -0800 Subject: net: Add skb_gro_receive This patch adds the helper skb_gro_receive to merge packets for GRO. The current method is to allocate a new header skb and then chain the original packets to its frag_list. This is done to make it easier to integrate into the existing GSO framework. In future as GSO is moved into the drivers, we can undo this and simply chain the original packets together. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acf17af45af9..cf2cb50f77d1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1687,6 +1687,8 @@ extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); +extern int skb_gro_receive(struct sk_buff **head, + struct sk_buff *skb); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) -- cgit v1.2.3 From b240a0e5644eb817c4a397098a40e1ad42a615bc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:44:31 -0800 Subject: ethtool: Add GGRO and SGRO ops This patch adds the ethtool ops to enable and disable GRO. It also makes GRO depend on RX checksum offload much the same as how TSO depends on SG support. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index b4b038b89ee6..27c67a542235 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -467,6 +467,8 @@ struct ethtool_ops { #define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ #define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ +#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ +#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET -- cgit v1.2.3 From 092cab7e2cd868cb0b30209a0337689c3ffd6133 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 16 Dec 2008 01:19:41 -0800 Subject: netfilter: ctnetlink: fix missing CTA_NAT_SEQ_UNSPEC This patch fixes an inconsistency in nfnetlink_conntrack.h that I introduced myself. The problem is that CTA_NAT_SEQ_UNSPEC is missing from enum ctattr_natseq. This inconsistency may lead to problems in the message parsing in userspace (if the message contains the CTA_NAT_SEQ_* attributes, of course). This patch breaks backward compatibility, however, the only known client of this code is libnetfilter_conntrack which indeed crashes because it assumes the existence of CTA_NAT_SEQ_UNSPEC to do the parsing. The CTA_NAT_SEQ_* attributes were introduced in 2.6.25. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink_conntrack.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index c19595c89304..29fe9ea1d346 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -141,6 +141,7 @@ enum ctattr_protonat { #define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1) enum ctattr_natseq { + CTA_NAT_SEQ_UNSPEC, CTA_NAT_SEQ_CORRECTION_POS, CTA_NAT_SEQ_OFFSET_BEFORE, CTA_NAT_SEQ_OFFSET_AFTER, -- cgit v1.2.3 From e18ce3465477502108187c6c08b6423fb784a313 Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Tue, 16 Dec 2008 02:00:00 -0800 Subject: net: Move flow control definitions to mii.h flags used within drivers for indicating tx and rx flow control are defined in 4 drivers (and probably more), move these constants to mii.h. The 3 SMSC drivers use the same constants (FLOW_CTRL_TX), but TG3 uses TG3_FLOW_CTRL_TX, so this patch also renames the constants within TG3. Signed-off-by: Steve Glendinning Signed-off-by: David S. Miller --- include/linux/mii.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 151b7e0182c7..4a376e0816fd 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -135,6 +135,10 @@ #define LPA_1000FULL 0x0800 /* Link partner 1000BASE-T full duplex */ #define LPA_1000HALF 0x0400 /* Link partner 1000BASE-T half duplex */ +/* Flow control flags */ +#define FLOW_CTRL_TX 0x01 +#define FLOW_CTRL_RX 0x02 + /* This structure is used in all SIOCxMIIxxx ioctl calls */ struct mii_ioctl_data { __u16 phy_id; -- cgit v1.2.3 From bc02ff95fe4ebd3e5ee7455c0aa6f76ebe39ebca Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Tue, 16 Dec 2008 02:00:48 -0800 Subject: net: Refactor full duplex flow control resolution These 4 drivers have identical full duplex flow control resolution functions. This patch changes them all to use one common function. The function in question decides whether a device should enable TX and RX flow control in a standard way (IEEE 802.3-2005 table 28B-3), so this should also be useful for other drivers. Signed-off-by: Steve Glendinning Signed-off-by: David S. Miller --- include/linux/mii.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 4a376e0816fd..ad748588faf1 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -239,5 +239,34 @@ static inline unsigned int mii_duplex (unsigned int duplex_lock, return 0; } +/** + * mii_resolve_flowctrl_fdx + * @lcladv: value of MII ADVERTISE register + * @rmtadv: value of MII LPA register + * + * Resolve full duplex flow control as per IEEE 802.3-2005 table 28B-3 + */ +static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv) +{ + u8 cap = 0; + + if (lcladv & ADVERTISE_PAUSE_CAP) { + if (lcladv & ADVERTISE_PAUSE_ASYM) { + if (rmtadv & LPA_PAUSE_CAP) + cap = FLOW_CTRL_TX | FLOW_CTRL_RX; + else if (rmtadv & LPA_PAUSE_ASYM) + cap = FLOW_CTRL_RX; + } else { + if (rmtadv & LPA_PAUSE_CAP) + cap = FLOW_CTRL_TX | FLOW_CTRL_RX; + } + } else if (lcladv & ADVERTISE_PAUSE_ASYM) { + if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM)) + cap = FLOW_CTRL_TX; + } + + return cap; +} + #endif /* __KERNEL__ */ #endif /* __LINUX_MII_H__ */ -- cgit v1.2.3 From b24a2516d10751d7ed5afb58420df25370c9dffb Mon Sep 17 00:00:00 2001 From: Yang Hongyang Date: Tue, 16 Dec 2008 02:06:23 -0800 Subject: ipv6: Add IPV6_PKTINFO sticky option support to setsockopt() There are three reasons for me to add this support: 1.When no interface is specified in an IPV6_PKTINFO ancillary data item, the interface specified in an IPV6_PKTINFO sticky optionis is used. RFC3542: 6.7. Summary of Outgoing Interface Selection This document and [RFC-3493] specify various methods that affect the selection of the packet's outgoing interface. This subsection summarizes the ordering among those in order to ensure deterministic behavior. For a given outgoing packet on a given socket, the outgoing interface is determined in the following order: 1. if an interface is specified in an IPV6_PKTINFO ancillary data item, the interface is used. 2. otherwise, if an interface is specified in an IPV6_PKTINFO sticky option, the interface is used. 2.When no IPV6_PKTINFO ancillary data is received,getsockopt() should return the sticky option value which set with setsockopt(). RFC 3542: Issuing getsockopt() for the above options will return the sticky option value i.e., the value set with setsockopt(). If no sticky option value has been set getsockopt() will return the following values: 3.Make the setsockopt implementation POSIX compliant. Signed-off-by: Yang Hongyang Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 641e026eee8f..0b816cae533e 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -278,6 +278,7 @@ struct ipv6_pinfo { struct in6_addr saddr; struct in6_addr rcv_saddr; struct in6_addr daddr; + struct in6_pktinfo sticky_pktinfo; struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES struct in6_addr *saddr_cache; -- cgit v1.2.3 From 8c5df16bec8a60bb8589fc232b9e26cac0ed4b2c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:17:26 -0800 Subject: swiotlb: allow architectures to override swiotlb pool allocation Impact: generalize swiotlb allocation code Architectures may need to allocate memory specially for use with the swiotlb. Create the weak function swiotlb_alloc_boot() and swiotlb_alloc() defaulting to the current behaviour. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b18ec5533e8c..b8c5fc766a56 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -10,6 +10,9 @@ struct scatterlist; extern void swiotlb_init(void); +extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs); +extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); -- cgit v1.2.3 From 0016fdee927f7aa0f428494bcf11ae60c7470a02 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:27 -0800 Subject: swiotlb: move some definitions to header Impact: cleanup Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b8c5fc766a56..58b996a642f9 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -7,6 +7,20 @@ struct device; struct dma_attrs; struct scatterlist; +/* + * Maximum allowable number of contiguous slabs to map, + * must be a power of 2. What is the appropriate value ? + * The complexity of {map,unmap}_single is linearly dependent on this value. + */ +#define IO_TLB_SEGSIZE 128 + + +/* + * log of the size of each IO TLB slab. The number of slabs is command line + * controllable. + */ +#define IO_TLB_SHIFT 11 + extern void swiotlb_init(void); -- cgit v1.2.3 From b31a1d8b41513b96e9c7ec2f68c5734cef0b26a4 Mon Sep 17 00:00:00 2001 From: Andy Fleming Date: Tue, 16 Dec 2008 15:29:15 -0800 Subject: gianfar: Convert gianfar to an of_platform_driver Does the same for the accompanying MDIO driver, and then modifies the TBI configuration method. The old way used fields in einfo, which no longer exists. The new way is to create an MDIO device-tree node for each instance of gianfar, and create a tbi-handle property to associate ethernet controllers with the TBI PHYs they are connected to. Signed-off-by: Andy Fleming Signed-off-by: David S. Miller --- include/linux/fsl_devices.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index 708bab58d8d0..d9051d717d27 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -47,12 +47,7 @@ struct gianfar_platform_data { /* device specific information */ u32 device_flags; - /* board specific information */ - u32 board_flags; - int mdio_bus; /* Bus controlled by us */ - char bus_id[MII_BUS_ID_SIZE]; /* Bus PHY is on */ - u32 phy_id; - u8 mac_addr[6]; + char bus_id[BUS_ID_SIZE]; phy_interface_t interface; }; @@ -61,17 +56,6 @@ struct gianfar_mdio_data { int irq[32]; }; -/* Flags related to gianfar device features */ -#define FSL_GIANFAR_DEV_HAS_GIGABIT 0x00000001 -#define FSL_GIANFAR_DEV_HAS_COALESCE 0x00000002 -#define FSL_GIANFAR_DEV_HAS_RMON 0x00000004 -#define FSL_GIANFAR_DEV_HAS_MULTI_INTR 0x00000008 -#define FSL_GIANFAR_DEV_HAS_CSUM 0x00000010 -#define FSL_GIANFAR_DEV_HAS_VLAN 0x00000020 -#define FSL_GIANFAR_DEV_HAS_EXTENDED_HASH 0x00000040 -#define FSL_GIANFAR_DEV_HAS_PADDING 0x00000080 -#define FSL_GIANFAR_DEV_HAS_MAGIC_PACKET 0x00000100 - /* Flags in gianfar_platform_data */ #define FSL_GIANFAR_BRD_HAS_PHY_INTR 0x00000001 /* set or use a timer */ #define FSL_GIANFAR_BRD_IS_REDUCED 0x00000002 /* Set if RGMII, RMII */ -- cgit v1.2.3 From e08e1f7adba522378e8d2ae941bf25443866136d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:30 -0800 Subject: swiotlb: allow architectures to override phys<->bus<->phys conversions Impact: generalize phys<->bus<->phys conversions in the swiotlb code Architectures may need to override these conversions. Implement a __weak hook point containing the default implementation. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 58b996a642f9..694f1839cbc0 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -27,6 +27,9 @@ swiotlb_init(void); extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs); extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); +extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address); +extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); -- cgit v1.2.3 From b81ea27b2329bf44b30c427800954f845896d476 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:31 -0800 Subject: swiotlb: add arch hook to force mapping Impact: generalize the sw-IOTLB range checks Some architectures require special rules to determine whether a range needs mapping or not. This adds a weak function for architectures to override. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 694f1839cbc0..325af1de0351 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -30,6 +30,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address); extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); +extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); -- cgit v1.2.3 From 2d91d78b68606ff7ce52ea70e187dee7831aa2f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 17 Dec 2008 15:47:29 -0800 Subject: Phonet: allocate a non-Ethernet ARP type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also leave some room for more 802.11 types. Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- include/linux/if_arp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 4d3401812e6c..11df77ab2dbb 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -87,6 +87,8 @@ #define ARPHRD_IEEE80211_PRISM 802 /* IEEE 802.11 + Prism2 header */ #define ARPHRD_IEEE80211_RADIOTAP 803 /* IEEE 802.11 + radiotap header */ +#define ARPHRD_PHONET 820 /* PhoNet media type */ + #define ARPHRD_VOID 0xFFFF /* Void type, nothing is known */ #define ARPHRD_NONE 0xFFFE /* zero header length */ -- cgit v1.2.3 From 57c81fffc863fb4c1804bc963bcbfb82d736c6df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 17 Dec 2008 15:47:48 -0800 Subject: Phonet: allocate separate ARP type for GPRS over a Phonet pipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A separate xmit lock class supports GPRS over a Phonet pipe over a TUN device (type ARPHRD_NONE). Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- include/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 11df77ab2dbb..5ff89809a581 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -88,6 +88,7 @@ #define ARPHRD_IEEE80211_RADIOTAP 803 /* IEEE 802.11 + radiotap header */ #define ARPHRD_PHONET 820 /* PhoNet media type */ +#define ARPHRD_PHONET_PIPE 821 /* PhoNet pipe header */ #define ARPHRD_VOID 0xFFFF /* Void type, nothing is known */ #define ARPHRD_NONE 0xFFFE /* zero header length */ -- cgit v1.2.3 From f38f1d2aa5a3520cf05da7cd6bd12fe2b0c509b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Dec 2008 23:06:40 -0500 Subject: trace: add a way to enable or disable the stack tracer Impact: enhancement to stack tracer The stack tracer currently is either on when configured in or off when it is not. It can not be disabled when it is configured on. (besides disabling the function tracer that it uses) This patch adds a way to enable or disable the stack tracer at run time. It defaults off on bootup, but a kernel parameter 'stacktrace' has been added to enable it on bootup. A new sysctl has been added "kernel.stack_tracer_enabled" to let the user enable or disable the stack tracer at run time. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 44020f31bd81..6b0db53caa7d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -86,6 +86,14 @@ static inline void ftrace_stop(void) { } static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_STACK_TRACER +extern int stack_tracer_enabled; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include -- cgit v1.2.3 From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 23:41:22 -0800 Subject: schedstat: consolidate per-task cpu runtime stats Impact: simplify code When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time. These two stats are exactly the same. Given that task->se.sum_exec_runtime is always accumulated by the core scheduler, sched_info can reuse that data instead of duplicate the accounting. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cccd6dc5d66..2d1e840ddd35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -670,8 +670,7 @@ struct reclaim_state; struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long long run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ -- cgit v1.2.3 From 74c8a6130486bed224e960790f4aa72dd09c061e Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 17 Dec 2008 19:40:33 +0900 Subject: locking, irq: enclose irq_desc_lock_class in CONFIG_LOCKDEP Impact: simplify code commit "08678b0: generic: sparse irqs: use irq_desc() [...]" introduced the irq_desc_lock_class variable. But it is used only if CONFIG_SPARSE_IRQ=Y or CONFIG_TRACE_IRQFLAGS=Y. Otherwise, following warnings happen: CC kernel/irq/handle.o kernel/irq/handle.c:26: warning: 'irq_desc_lock_class' defined but not used Actually, current early_init_irq_lock_class has a bit strange and messy ifdef. In addition, it is not valueable. 1. this function is protected by !CONFIG_SPARSE_IRQ, but that is not necessary. if CONFIG_SPARSE_IRQ=Y, desc of all irq number are initialized by NULL at first - then this function calling is safe. 2. this function protected by CONFIG_TRACE_IRQFLAGS too. but it is not necessary either, because lockdep_set_class() doesn't have bad side effect even if CONFIG_TRACE_IRQFLAGS=n. This patch bloat kernel size a bit on CONFIG_TRACE_IRQFLAGS=n and CONFIG_SPARSE_IRQ=Y - but that's ok. early_init_irq_lock_class() is not a fastpatch at all. To avoid messy ifdefs is more important than a few bytes diet. Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 29aec6e10020..9dba554c802c 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -377,7 +377,7 @@ do { \ #endif /* CONFIG_LOCK_STAT */ -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) +#ifdef CONFIG_GENERIC_HARDIRQS extern void early_init_irq_lock_class(void); #else static inline void early_init_irq_lock_class(void) -- cgit v1.2.3 From 40aa4a30d0fd075fb934de4ee8163056827052ab Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 16 Dec 2008 10:15:12 +0000 Subject: ASoC: Add WM8350 AudioPlus codec driver The WM8350 is an integrated audio and power management subsystem which provides a single-chip solution for portable audio and multimedia systems. The integrated audio CODEC provides all the necessary functions for high-quality stereo recording and playback. Programmable on-chip amplifiers allow for the direct connection of headphones and microphones with a minimum of external components. A programmable low-noise bias voltage is available to feed one or more electret microphones. Additional audio features include programmable high-pass filter in the ADC input path. This driver was originally written by Liam Girdwood with further updates from me. Signed-off-by: Mark Brown --- include/linux/mfd/wm8350/audio.h | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8350/audio.h b/include/linux/mfd/wm8350/audio.h index 217bb22ebb8e..af95a1d2f3a1 100644 --- a/include/linux/mfd/wm8350/audio.h +++ b/include/linux/mfd/wm8350/audio.h @@ -1,7 +1,7 @@ /* * audio.h -- Audio Driver for Wolfson WM8350 PMIC * - * Copyright 2007 Wolfson Microelectronics PLC + * Copyright 2007, 2008 Wolfson Microelectronics PLC * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -70,9 +70,9 @@ #define WM8350_CODEC_ISEL_0_5 3 /* x0.5 */ #define WM8350_VMID_OFF 0 -#define WM8350_VMID_500K 1 -#define WM8350_VMID_100K 2 -#define WM8350_VMID_10K 3 +#define WM8350_VMID_300K 1 +#define WM8350_VMID_50K 2 +#define WM8350_VMID_5K 3 /* * R40 (0x28) - Clock Control 1 @@ -591,8 +591,38 @@ #define WM8350_IRQ_CODEC_MICSCD 41 #define WM8350_IRQ_CODEC_MICD 42 +/* + * WM8350 Platform data. + * + * This must be initialised per platform for best audio performance. + * Please see WM8350 datasheet for information. + */ +struct wm8350_audio_platform_data { + int vmid_discharge_msecs; /* VMID --> OFF discharge time */ + int drain_msecs; /* OFF drain time */ + int cap_discharge_msecs; /* Cap ON (from OFF) discharge time */ + int vmid_charge_msecs; /* vmid power up time */ + u32 vmid_s_curve:2; /* vmid enable s curve speed */ + u32 dis_out4:2; /* out4 discharge speed */ + u32 dis_out3:2; /* out3 discharge speed */ + u32 dis_out2:2; /* out2 discharge speed */ + u32 dis_out1:2; /* out1 discharge speed */ + u32 vroi_out4:1; /* out4 tie off */ + u32 vroi_out3:1; /* out3 tie off */ + u32 vroi_out2:1; /* out2 tie off */ + u32 vroi_out1:1; /* out1 tie off */ + u32 vroi_enable:1; /* enable tie off */ + u32 codec_current_on:2; /* current level ON */ + u32 codec_current_standby:2; /* current level STANDBY */ + u32 codec_current_charge:2; /* codec current @ vmid charge */ +}; + +struct snd_soc_codec; + struct wm8350_codec { struct platform_device *pdev; + struct snd_soc_codec *codec; + struct wm8350_audio_platform_data *platform_data; }; #endif -- cgit v1.2.3 From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2008 21:55:32 +0100 Subject: "Tree RCU": scalable classic RCU implementation This patch fixes a long-standing performance bug in classic RCU that results in massive internal-to-RCU lock contention on systems with more than a few hundred CPUs. Although this patch creates a separate flavor of RCU for ease of review and patch maintenance, it is intended to replace classic RCU. This patch still handles stress better than does mainline, so I am still calling it ready for inclusion. This patch is against the -tip tree. Nevertheless, experience on an actual 1000+ CPU machine would still be most welcome. Most of the changes noted below were found while creating an rcutiny (which should permit ejecting the current rcuclassic) and while doing detailed line-by-line documentation. Updates from v9 (http://lkml.org/lkml/2008/12/2/334): o Fixes from remainder of line-by-line code walkthrough, including comment spelling, initialization, undesirable narrowing due to type conversion, removing redundant memory barriers, removing redundant local-variable initialization, and removing redundant local variables. I do not believe that any of these fixes address the CPU-hotplug issues that Andi Kleen was seeing, but please do give it a whirl in case the machine is smarter than I am. A writeup from the walkthrough may be found at the following URL, in case you are suffering from terminal insomnia or masochism: http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf o Made rcutree tracing use seq_file, as suggested some time ago by Lai Jiangshan. o Added a .csv variant of the rcudata debugfs trace file, to allow people having thousands of CPUs to drop the data into a spreadsheet. Tested with oocalc and gnumeric. Updated documentation to suit. Updates from v8 (http://lkml.org/lkml/2008/11/15/139): o Fix a theoretical race between grace-period initialization and force_quiescent_state() that could occur if more than three jiffies were required to carry out the grace-period initialization. Which it might, if you had enough CPUs. o Apply Ingo's printk-standardization patch. o Substitute local variables for repeated accesses to global variables. o Fix comment misspellings and redundant (but harmless) increments of ->n_rcu_pending (this latter after having explicitly added it). o Apply checkpatch fixes. Updates from v7 (http://lkml.org/lkml/2008/10/10/291): o Fixed a number of problems noted by Gautham Shenoy, including the cpu-stall-detection bug that he was having difficulty convincing me was real. ;-) o Changed cpu-stall detection to wait for ten seconds rather than three in order to reduce false positive, as suggested by Ingo Molnar. o Produced a design document (http://lwn.net/Articles/305782/). The act of writing this document uncovered a number of both theoretical and "here and now" bugs as noted below. o Fix dynticks_nesting accounting confusion, simplify WARN_ON() condition, fix kerneldoc comments, and add memory barriers in dynticks interface functions. o Add more data to tracing. o Remove unused "rcu_barrier" field from rcu_data structure. o Count calls to rcu_pending() from scheduling-clock interrupt to use as a surrogate timebase should jiffies stop counting. o Fix a theoretical race between force_quiescent_state() and grace-period initialization. Yes, initialization does have to go on for some jiffies for this race to occur, but given enough CPUs... Updates from v6 (http://lkml.org/lkml/2008/9/23/448): o Fix a number of checkpatch.pl complaints. o Apply review comments from Ingo Molnar and Lai Jiangshan on the stall-detection code. o Fix several bugs in !CONFIG_SMP builds. o Fix a misspelled config-parameter name so that RCU now announces at boot time if stall detection is configured. o Run tests on numerous combinations of configurations parameters, which after the fixes above, now build and run correctly. Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line): o Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a changeset some time ago, and finally got around to retesting this option). o Fix some tracing bugs in rcupreempt that caused incorrect totals to be printed. o I now test with a more brutal random-selection online/offline script (attached). Probably more brutal than it needs to be on the people reading it as well, but so it goes. o A number of optimizations and usability improvements: o Make rcu_pending() ignore the grace-period timeout when there is no grace period in progress. o Make force_quiescent_state() avoid going for a global lock in the case where there is no grace period in progress. o Rearrange struct fields to improve struct layout. o Make call_rcu() initiate a grace period if RCU was idle, rather than waiting for the next scheduling clock interrupt. o Invoke rcu_irq_enter() and rcu_irq_exit() only when idle, as suggested by Andi Kleen. I still don't completely trust this change, and might back it out. o Make CONFIG_RCU_TRACE be the single config variable manipulated for all forms of RCU, instead of the prior confusion. o Document tracing files and formats for both rcupreempt and rcutree. Updates from v4 for those missing v5 given its bad subject line: o Separated dynticks interface so that NMIs and irqs call separate functions, greatly simplifying it. In particular, this code no longer requires a proof of correctness. ;-) o Separated dynticks state out into its own per-CPU structure, avoiding the duplicated accounting. o The case where a dynticks-idle CPU runs an irq handler that invokes call_rcu() is now correctly handled, forcing that CPU out of dynticks-idle mode. o Review comments have been applied (thank you all!!!). For but one example, fixed the dynticks-ordering issue that Manfred pointed out, saving me much debugging. ;-) o Adjusted rcuclassic and rcupreempt to handle dynticks changes. Attached is an updated patch to Classic RCU that applies a hierarchy, greatly reducing the contention on the top-level lock for large machines. This passes 10-hour concurrent rcutorture and online-offline testing on 128-CPU ppc64 without dynticks enabled, and exposes some timekeeping bugs in presence of dynticks (exciting working on a system where "sleep 1" hangs until interrupted...), which were fixed in the 2.6.27 kernel. It is getting more reliable than mainline by some measures, so the next version will be against -tip for inclusion. See also Manfred Spraul's recent patches (or his earlier work from 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2). We will converge onto a common patch in the fullness of time, but are currently exploring different regions of the design space. That said, I have already gratefully stolen quite a few of Manfred's ideas. This patch provides CONFIG_RCU_FANOUT, which controls the bushiness of the RCU hierarchy. Defaults to 32 on 32-bit machines and 64 on 64-bit machines. If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT, there is no hierarchy. By default, the RCU initialization code will adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable this balancing, allowing the hierarchy to be exactly aligned to the underlying hardware. Up to two levels of hierarchy are permitted (in addition to the root node), allowing up to 16,384 CPUs on 32-bit systems and up to 262,144 CPUs on 64-bit systems. I just know that I am going to regret saying this, but this seems more than sufficient for the foreseeable future. (Some architectures might wish to set CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs. If this becomes a real problem, additional levels can be added, but I doubt that it will make a significant difference on real hardware.) In the common case, a given CPU will manipulate its private rcu_data structure and the rcu_node structure that it shares with its immediate neighbors. This can reduce both lock and memory contention by multiple orders of magnitude, which should eliminate the need for the strange manipulations that are reported to be required when running Linux on very large systems. Some shortcomings: o More bugs will probably surface as a result of an ongoing line-by-line code inspection. Patches will be provided as required. o There are probably hangs, rcutorture failures, &c. Seems quite stable on a 128-CPU machine, but that is kind of small compared to 4096 CPUs. However, seems to do better than mainline. Patches will be provided as required. o The memory footprint of this version is several KB larger than rcuclassic. A separate UP-only rcutiny patch will be provided, which will reduce the memory footprint significantly, even compared to the old rcuclassic. One such patch passes light testing, and has a memory footprint smaller even than rcuclassic. Initial reaction from various embedded guys was "it is not worth it", so am putting it aside. Credits: o Manfred Spraul for ideas, review comments, and bugs spotted, as well as some good friendly competition. ;-) o Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton for reviews and comments. o Thomas Gleixner for much-needed help with some timer issues (see patches below). o Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos, Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines alive despite my heavy abuse^Wtesting. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 14 +- include/linux/rcupdate.h | 10 +- include/linux/rcutree.h | 329 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 344 insertions(+), 9 deletions(-) create mode 100644 include/linux/rcutree.h (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006cc94a0..9b70b9231693 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk) } #endif -#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ) +#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); +extern void rcu_nmi_enter(void); +extern void rcu_nmi_exit(void); #else # define rcu_irq_enter() do { } while (0) # define rcu_irq_exit() do { } while (0) -#endif /* CONFIG_PREEMPT_RCU */ +# define rcu_nmi_enter() do { } while (0) +# define rcu_nmi_exit() do { } while (0) +#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */ /* * It is safe to do non-atomic ops on ->hardirq_context, @@ -134,7 +138,6 @@ extern void rcu_irq_exit(void); */ #define __irq_enter() \ do { \ - rcu_irq_enter(); \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ @@ -153,7 +156,6 @@ extern void irq_enter(void); trace_hardirq_exit(); \ account_system_vtime(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ - rcu_irq_exit(); \ } while (0) /* @@ -161,7 +163,7 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) +#define nmi_enter() do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0) +#define nmi_exit() do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 86f1f5e43e33..bfd289aff576 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,11 +52,15 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -#ifdef CONFIG_CLASSIC_RCU +#if defined(CONFIG_CLASSIC_RCU) #include -#else /* #ifdef CONFIG_CLASSIC_RCU */ +#elif defined(CONFIG_TREE_RCU) +#include +#elif defined(CONFIG_PREEMPT_RCU) #include -#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ +#else +#error "Unknown RCU implementation specified to kernel configuration" +#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */ #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h new file mode 100644 index 000000000000..d4368b7975c3 --- /dev/null +++ b/include/linux/rcutree.h @@ -0,0 +1,329 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Dipankar Sarma + * Paul E. McKenney Hierarchical algorithm + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ + +#ifndef __LINUX_RCUTREE_H +#define __LINUX_RCUTREE_H + +#include +#include +#include +#include +#include +#include + +/* + * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this has not been tested, so there is probably some + * bug somewhere. + */ +#define MAX_RCU_LVLS 3 +#define RCU_FANOUT (CONFIG_RCU_FANOUT) +#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) +#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT +# define NUM_RCU_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (NR_CPUS) +# define NUM_RCU_LVL_2 0 +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_SQ +# define NUM_RCU_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_2 (NR_CPUS) +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_CUBE +# define NUM_RCU_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_3 NR_CPUS +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT */ + +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) + +/* + * Dynticks per-CPU state. + */ +struct rcu_dynticks { + int dynticks_nesting; /* Track nesting level, sort of. */ + int dynticks; /* Even value for dynticks-idle, else odd. */ + int dynticks_nmi; /* Even value for either dynticks-idle or */ + /* not in nmi handler, else odd. So this */ + /* remains even for nmi from irq handler. */ +}; + +/* + * Definition for node within the RCU grace-period-detection hierarchy. + */ +struct rcu_node { + spinlock_t lock; + unsigned long qsmask; /* CPUs or groups that need to switch in */ + /* order for current grace period to proceed.*/ + unsigned long qsmaskinit; + /* Per-GP initialization for qsmask. */ + unsigned long grpmask; /* Mask to apply to parent qsmask. */ + int grplo; /* lowest-numbered CPU or group here. */ + int grphi; /* highest-numbered CPU or group here. */ + u8 grpnum; /* CPU/group number for next level up. */ + u8 level; /* root is at level 0. */ + struct rcu_node *parent; +} ____cacheline_internodealigned_in_smp; + +/* Index values for nxttail array in struct rcu_data. */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_NEXT_SIZE 4 + +/* Per-CPU data for read-copy update. */ +struct rcu_data { + /* 1) quiescent-state and grace-period handling : */ + long completed; /* Track rsp->completed gp number */ + /* in order to detect GP end. */ + long gpnum; /* Highest gp number that this CPU */ + /* is aware of having started. */ + long passed_quiesc_completed; + /* Value of completed at time of qs. */ + bool passed_quiesc; /* User-mode/idle loop etc. */ + bool qs_pending; /* Core waits for quiesc state. */ + bool beenonline; /* CPU online at least once. */ + struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ + unsigned long grpmask; /* Mask to apply to leaf qsmask. */ + + /* 2) batch handling */ + /* + * If nxtlist is not NULL, it is partitioned as follows. + * Any of the partitions might be empty, in which case the + * pointer to that partition will be equal to the pointer for + * the following partition. When the list is empty, all of + * the nxttail elements point to nxtlist, which is NULL. + * + * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [nxtlist, *nxttail[RCU_DONE_TAIL]): + * Entries that batch # <= ->completed + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail[RCU_NEXT_SIZE]; + long qlen; /* # of queued callbacks */ + long blimit; /* Upper limit on a processed batch */ + +#ifdef CONFIG_NO_HZ + /* 3) dynticks interface. */ + struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ + int dynticks_snap; /* Per-GP tracking for dynticks. */ + int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ +#endif /* #ifdef CONFIG_NO_HZ */ + + /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ +#ifdef CONFIG_NO_HZ + unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ +#endif /* #ifdef CONFIG_NO_HZ */ + unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long resched_ipi; /* Sent a resched IPI. */ + + /* 5) state to allow this CPU to force_quiescent_state on others */ + long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rcu_pending_force_qs; /* when to force quiescent states. */ + + int cpu; +}; + +/* Values for signaled field in struct rcu_state. */ +#define RCU_GP_INIT 0 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#ifdef CONFIG_NO_HZ +#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +#else /* #ifdef CONFIG_NO_HZ */ +#define RCU_SIGNAL_INIT RCU_FORCE_QS +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * RCU global state, including node hierarchy. This hierarchy is + * represented in "heap" form in a dense array. The root (first level) + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), + * and the third level in ->node[m+1] and following (->node[m+1] referenced + * by ->level[2]). The number of levels is determined by the number of + * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" + * consisting of a single rcu_node. + */ +struct rcu_state { + struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ + struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + + /* The following fields are guarded by the root rcu_node's lock. */ + + u8 signaled ____cacheline_internodealigned_in_smp; + /* Force QS state. */ + long gpnum; /* Current gp number. */ + long completed; /* # of last completed gp. */ + spinlock_t onofflock; /* exclude on/offline and */ + /* starting new GP. */ + spinlock_t fqslock; /* Only one task forcing */ + /* quiescent states. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ + /* force_quiescent_state(). */ + unsigned long n_force_qs; /* Number of calls to */ + /* force_quiescent_state(). */ + unsigned long n_force_qs_lh; /* ~Number of calls leaving */ + /* due to lock unavailable. */ + unsigned long n_force_qs_ngp; /* Number of calls leaving */ + /* due to no GP active. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started, */ + /* but in jiffies. */ + unsigned long jiffies_stall; /* Time at which to check */ + /* for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_NO_HZ + long dynticks_completed; /* Value of completed @ snap. */ +#endif /* #ifdef CONFIG_NO_HZ */ +}; + +extern struct rcu_state rcu_state; +DECLARE_PER_CPU(struct rcu_data, rcu_data); + +extern struct rcu_state rcu_bh_state; +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +static inline void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} +static inline void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + +extern int rcu_pending(int cpu); +extern int rcu_needs_cpu(int cpu); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern struct lockdep_map rcu_lock_map; +# define rcu_read_acquire() \ + lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) +# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) +#else +# define rcu_read_acquire() do { } while (0) +# define rcu_read_release() do { } while (0) +#endif + +static inline void __rcu_read_lock(void) +{ + preempt_disable(); + __acquire(RCU); + rcu_read_acquire(); +} +static inline void __rcu_read_unlock(void) +{ + rcu_read_release(); + __release(RCU); + preempt_enable(); +} +static inline void __rcu_read_lock_bh(void) +{ + local_bh_disable(); + __acquire(RCU_BH); + rcu_read_acquire(); +} +static inline void __rcu_read_unlock_bh(void) +{ + rcu_read_release(); + __release(RCU_BH); + local_bh_enable(); +} + +#define __synchronize_sched() synchronize_rcu() + +#define call_rcu_sched(head, func) call_rcu(head, func) + +static inline void rcu_init_sched(void) +{ +} + +extern void __rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_restart_cpu(int cpu); + +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); + +#ifdef CONFIG_NO_HZ +void rcu_enter_nohz(void); +void rcu_exit_nohz(void); +#else /* CONFIG_NO_HZ */ +static inline void rcu_enter_nohz(void) +{ +} +static inline void rcu_exit_nohz(void) +{ +} +#endif /* CONFIG_NO_HZ */ + +#endif /* __LINUX_RCUTREE_H */ -- cgit v1.2.3 From 3c8bb73ace6249bd089b70c941440441940e3365 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:27 -0800 Subject: x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3 Impact: Code transformation, new functions added should have no effect. Drivers use mmap followed by pgprot_* and remap_pfn_range or vm_insert_pfn, in order to export reserved memory to userspace. Currently, such mappings are not tracked and hence not kept consistent with other mappings (/dev/mem, pci resource, ioremap) for the sme memory, that may exist in the system. The following patchset adds x86 PAT attribute tracking and untracking for pfnmap related APIs. First three patches in the patchset are changing the generic mm code to fit in this tracking. Last four patches are x86 specific to make things work with x86 PAT code. The patchset aso introduces pgprot_writecombine interface, which gives writecombine mapping when enabled, falling back to pgprot_noncached otherwise. This patch: While working on x86 PAT, we faced some hurdles with trackking remap_pfn_range() regions, as we do not have any information to say whether that PFNMAP mapping is linear for the entire vma range or it is smaller granularity regions within the vma. A simple solution to this is to use vm_pgoff as an indicator for linear mapping over the vma region. Currently, remap_pfn_range only sets vm_pgoff for COW mappings. Below patch changes the logic and sets the vm_pgoff irrespective of COW. This will still not be enough for the case where pfn is zero (vma region mapped to physical address zero). But, for all the other cases, we can look at pfnmap VMAs and say whether the mappng is for the entire vma region or not. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ffee2f743418..2be8d9b5e46f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -145,6 +145,15 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) +{ + return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); +} + +static inline int is_pfn_mapping(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_PFNMAP); +} /* * vm_fault is filled by the the pagefault handler and passed to the vma's -- cgit v1.2.3 From e121e418441525b5636321fe03d16f0193ad218e Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:28 -0800 Subject: x86: PAT: add follow_pfnmp_pte routine to help tracking pfnmap pages - v3 Impact: New currently unused interface. Add a generic interface to follow pfn in a pfnmap vma range. This is used by one of the subsequent x86 PAT related patch to keep track of memory types for vma regions across vma copy and free. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2be8d9b5e46f..a25024ff9c11 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1223,6 +1223,9 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ +int follow_pfnmap_pte(struct vm_area_struct *vma, + unsigned long address, pte_t *ret_ptep); + typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From 2ab640379a0ab4cef746ced1d7e04a0941774bcb Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:29 -0800 Subject: x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3 Impact: Introduces new hooks, which are currently null. Introduce generic hooks in remap_pfn_range and vm_insert_pfn and corresponding copy and free routines with reserve and free tracking. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a25024ff9c11..87ecb40e11a0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,6 +155,12 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma) return (vma->vm_flags & VM_PFNMAP); } +extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size); +extern int track_pfn_vma_copy(struct vm_area_struct *vma); +extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); + /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask -- cgit v1.2.3 From 078a55db075bf4dcc03115dc94875b3dd83f69d4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 18 Dec 2008 16:57:52 -0800 Subject: sparseirq: add kernel-doc notation for new member in irq_desc, -v2 Signed-off-by: Yinghai Lu Acked-by: Randy Dunlap Signed-off-by: Ingo Molnar --- include/linux/irq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 36a015746788..7fa7f30fccb6 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -134,6 +134,9 @@ struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor + * @timer_rand_state: pointer to timer rand state struct + * @kstat_irqs: irq stats per cpu + * @irq_2_iommu: iommu with this irq * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @chip: low level interrupt hardware access * @msi_desc: MSI descriptor -- cgit v1.2.3 From 7b4967c532045a1983d6d4af5c69cc7c5109f62b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 19 Dec 2008 16:56:37 +1030 Subject: cpumask: Add alloc_cpumask_var_node() Impact: New API This will be needed in x86 code to allocate the domain and old_domain cpumasks on the same node as where the containing irq_cfg struct is allocated. (Also fixes double-dump_stack on rare CONFIG_DEBUG_PER_CPU_MAPS case) Signed-off-by: Mike Travis Signed-off-by: Rusty Russell (re-impl alloc_cpumask_var) --- include/linux/cpumask.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d4bf52603e6b..b5ad19a6f43f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1025,6 +1025,7 @@ static inline size_t cpumask_size(void) #ifdef CONFIG_CPUMASK_OFFSTACK typedef struct cpumask *cpumask_var_t; +bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); @@ -1038,6 +1039,12 @@ static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) return true; } +static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) +{ + return true; +} + static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } -- cgit v1.2.3 From e057d7aea9d8f2a46cd440d8bfb72245d4e72d79 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 15 Dec 2008 20:26:48 -0800 Subject: cpumask: add sysfs displays for configured and disabled cpu maps Impact: add new sysfs files. Add sysfs files "kernel_max" and "offline" to display the max CPU index allowed (NR_CPUS-1), and the map of cpus that are offline. Cpus can be offlined via HOTPLUG, disabled by the BIOS ACPI tables, or if they exceed the number of cpus allowed by the NR_CPUS config option, or the "maxcpus=NUM" kernel start parameter. The "possible_cpus=NUM" parameter can also extend the number of possible cpus allowed, in which case the cpus not present at startup will be in the offline state. (These cpus can be HOTPLUGGED ON after system startup [pending a follow-on patch to provide the capability via the /sys/devices/sys/cpu/cpuN/online mechanism to bring them online.]) By design, the "offlined cpus > possible cpus" display will always use the following formats: * all possible cpus online: "x$" or "x-y$" * some possible cpus offline: ".*,x$" or ".*,x-y$" where: x == number of possible cpus (nr_cpu_ids); and y == number of cpus >= NR_CPUS or maxcpus (if y > x). One use of this feature is for distros to select (or configure) the appropriate kernel to install for the resident system. Notes: * cpus offlined <= possible cpus will be printed for all architectures. * cpus offlined > possible cpus will only be printed for arches that set 'total_cpus' [X86 only in this patch]. Based on tip/cpus4096 + .../rusty/linux-2.6-for-ingo.git/master + x86-only-patches sent 12/15. Signed-off-by: Mike Travis Signed-off-by: Rusty Russell --- include/linux/smp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 3f9a60043a97..0d5770c2e43a 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -21,6 +21,9 @@ struct call_single_data { u16 priv; }; +/* total number of cpus in this system (may exceed NR_CPUS) */ +extern unsigned int total_cpus; + #ifdef CONFIG_SMP #include -- cgit v1.2.3 From 420e7fabd9c6d907280ed6b3e40eef425c5d8d8d Mon Sep 17 00:00:00 2001 From: Henning Rogge Date: Thu, 11 Dec 2008 22:04:19 +0100 Subject: nl80211: Add signal strength and bandwith to nl80211station info This patch adds signal strength and transmission bitrate to the station_info of nl80211. Signed-off-by: Henning Rogge Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 04d4516f9c71..7501acfcfdc4 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -424,6 +424,32 @@ enum nl80211_sta_flags { NL80211_STA_FLAG_MAX = __NL80211_STA_FLAG_AFTER_LAST - 1 }; +/** + * enum nl80211_rate_info - bitrate information + * + * These attribute types are used with %NL80211_STA_INFO_TXRATE + * when getting information about the bitrate of a station. + * + * @__NL80211_RATE_INFO_INVALID: attribute number 0 is reserved + * @NL80211_RATE_INFO_BITRATE: total bitrate (u16, 100kbit/s) + * @NL80211_RATE_INFO_MCS: mcs index for 802.11n (u8) + * @NL80211_RATE_INFO_40_MHZ_WIDTH: 40 Mhz dualchannel bitrate + * @NL80211_RATE_INFO_SHORT_GI: 400ns guard interval + * @NL80211_RATE_INFO_MAX: highest rate_info number currently defined + * @__NL80211_RATE_INFO_AFTER_LAST: internal use + */ +enum nl80211_rate_info { + __NL80211_RATE_INFO_INVALID, + NL80211_RATE_INFO_BITRATE, + NL80211_RATE_INFO_MCS, + NL80211_RATE_INFO_40_MHZ_WIDTH, + NL80211_RATE_INFO_SHORT_GI, + + /* keep last */ + __NL80211_RATE_INFO_AFTER_LAST, + NL80211_RATE_INFO_MAX = __NL80211_RATE_INFO_AFTER_LAST - 1 +}; + /** * enum nl80211_sta_info - station information * @@ -436,6 +462,9 @@ enum nl80211_sta_flags { * @NL80211_STA_INFO_TX_BYTES: total transmitted bytes (u32, to this station) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute + * @NL80211_STA_INFO_SIGNAL: signal strength of last received PPDU (u8, dBm) + * @NL80211_STA_INFO_TX_BITRATE: current unicast tx rate, nested attribute + * containing info as possible, see &enum nl80211_sta_info_txrate. */ enum nl80211_sta_info { __NL80211_STA_INFO_INVALID, @@ -445,6 +474,8 @@ enum nl80211_sta_info { NL80211_STA_INFO_LLID, NL80211_STA_INFO_PLID, NL80211_STA_INFO_PLINK_STATE, + NL80211_STA_INFO_SIGNAL, + NL80211_STA_INFO_TX_BITRATE, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, -- cgit v1.2.3 From 094d05dc32fc2930e381189a942016e5561775d9 Mon Sep 17 00:00:00 2001 From: Sujith Date: Fri, 12 Dec 2008 11:57:43 +0530 Subject: mac80211: Fix HT channel selection HT management is done differently for AP and STA modes, unify to just the ->config() callback since HT is fundamentally a PHY property and cannot be per-BSS. Rename enum nl80211_sec_chan_offset as nl80211_channel_type to denote the channel type ( NO_HT, HT20, HT40+, HT40- ). Signed-off-by: Johannes Berg Signed-off-by: Sujith Signed-off-by: John W. Linville --- include/linux/nl80211.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 7501acfcfdc4..e86ed59f9ad5 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -201,13 +201,13 @@ enum nl80211_commands { * @NL80211_ATTR_WIPHY_NAME: wiphy name (used for renaming) * @NL80211_ATTR_WIPHY_TXQ_PARAMS: a nested array of TX queue parameters * @NL80211_ATTR_WIPHY_FREQ: frequency of the selected channel in MHz - * @NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET: included with NL80211_ATTR_WIPHY_FREQ + * @NL80211_ATTR_WIPHY_CHANNEL_TYPE: included with NL80211_ATTR_WIPHY_FREQ * if HT20 or HT40 are allowed (i.e., 802.11n disabled if not included): - * NL80211_SEC_CHAN_NO_HT = HT not allowed (i.e., same as not including + * NL80211_CHAN_NO_HT = HT not allowed (i.e., same as not including * this attribute) - * NL80211_SEC_CHAN_DISABLED = HT20 only - * NL80211_SEC_CHAN_BELOW = secondary channel is below the primary channel - * NL80211_SEC_CHAN_ABOVE = secondary channel is above the primary channel + * NL80211_CHAN_HT20 = HT20 only + * NL80211_CHAN_HT40MINUS = secondary channel is below the primary channel + * NL80211_CHAN_HT40PLUS = secondary channel is above the primary channel * * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on * @NL80211_ATTR_IFNAME: network interface name @@ -344,7 +344,7 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_TXQ_PARAMS, NL80211_ATTR_WIPHY_FREQ, - NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET, + NL80211_ATTR_WIPHY_CHANNEL_TYPE, /* add attributes here, update the policy in nl80211.c */ @@ -805,10 +805,10 @@ enum nl80211_txq_q { NL80211_TXQ_Q_BK }; -enum nl80211_sec_chan_offset { - NL80211_SEC_CHAN_NO_HT /* No HT */, - NL80211_SEC_CHAN_DISABLED /* HT20 only */, - NL80211_SEC_CHAN_BELOW /* HT40- */, - NL80211_SEC_CHAN_ABOVE /* HT40+ */ +enum nl80211_channel_type { + NL80211_CHAN_NO_HT, + NL80211_CHAN_HT20, + NL80211_CHAN_HT40MINUS, + NL80211_CHAN_HT40PLUS }; #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From 12204e24b1330428c3062faee10a0d80b8a5cb61 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 19 Dec 2008 10:44:42 +1100 Subject: security: pass mount flags to security_sb_kern_mount() Pass mount flags to security_sb_kern_mount(), so security modules can determine if a mount operation is being performed by the kernel. Signed-off-by: James Morris Acked-by: Stephen Smalley --- include/linux/security.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 6423abf1ac0f..3416cb85e77b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1308,7 +1308,7 @@ struct security_operations { int (*sb_alloc_security) (struct super_block *sb); void (*sb_free_security) (struct super_block *sb); int (*sb_copy_data) (char *orig, char *copy); - int (*sb_kern_mount) (struct super_block *sb, void *data); + int (*sb_kern_mount) (struct super_block *sb, int flags, void *data); int (*sb_show_options) (struct seq_file *m, struct super_block *sb); int (*sb_statfs) (struct dentry *dentry); int (*sb_mount) (char *dev_name, struct path *path, @@ -1575,7 +1575,7 @@ int security_bprm_secureexec(struct linux_binprm *bprm); int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); int security_sb_copy_data(char *orig, char *copy); -int security_sb_kern_mount(struct super_block *sb, void *data); +int security_sb_kern_mount(struct super_block *sb, int flags, void *data); int security_sb_show_options(struct seq_file *m, struct super_block *sb); int security_sb_statfs(struct dentry *dentry); int security_sb_mount(char *dev_name, struct path *path, @@ -1850,7 +1850,7 @@ static inline int security_sb_copy_data(char *orig, char *copy) return 0; } -static inline int security_sb_kern_mount(struct super_block *sb, void *data) +static inline int security_sb_kern_mount(struct super_block *sb, int flags, void *data) { return 0; } -- cgit v1.2.3 From 6bd9cd50c830eb88d571c492ec370a30bf999e15 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:26 -0800 Subject: x86: PAT: clarify is_linear_pfn_mapping() interface Impact: Documentation only Incremental patches to address the review comments from Nick Piggin for v3 version of x86 PAT pfnmap changes patchset here http://lkml.indiana.edu/hypermail/linux/kernel/0812.2/01330.html This patch: Clarify is_linear_pfn_mapping() and its usage. It is used by x86 PAT code for performance reasons. Identifying pfnmap as linear over entire vma helps speedup reserve and free of memtype for the region. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 87ecb40e11a0..35f811b0cd69 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -145,6 +145,14 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +/* + * This interface is used by x86 PAT code to identify a pfn mapping that is + * linear over entire vma. This is to optimize PAT code that deals with + * marking the physical region with a particular prot. This is not for generic + * mm use. Note also that this check will not work if the pfn mapping is + * linear for a vma starting at physical address 0. In which case PAT code + * falls back to slow path of reserving physical range page by page. + */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); -- cgit v1.2.3 From d87fe6607c31944f7572f965c1507ae77026c133 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:27 -0800 Subject: x86: PAT: modify follow_phys to return phys_addr prot and return value Impact: Changes and globalizes an existing static interface. Follow_phys does similar things as follow_pfnmap_pte. Make a minor change to follow_phys so that it can be used in place of follow_pfnmap_pte. Physical address return value with 0 as error return does not work in follow_phys as the actual physical address 0 mapping may exist in pte. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35f811b0cd69..2f6e2f886d4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -804,6 +804,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_phys(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -- cgit v1.2.3 From 982d789ab76c8a11426852fec2fdf2f412e21c0c Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:28 -0800 Subject: x86: PAT: remove follow_pfnmap_pte in favor of follow_phys Impact: Cleanup - removes a new function in favor of a recently modified older one. Replace follow_pfnmap_pte in pat code with follow_phys. follow_phys lso returns protection eliminating the need of pte_pgprot call. Using follow_phys also eliminates the need for pte_pa. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f6e2f886d4b..36f9b3fa5e15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,9 +1239,6 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ -int follow_pfnmap_pte(struct vm_area_struct *vma, - unsigned long address, pte_t *ret_ptep); - typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From 34801ba9bf0381fcf0e2b08179d2c07f2c6ede74 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:29 -0800 Subject: x86: PAT: move track untrack pfnmap stubs to asm-generic Impact: Cleanup and branch hints only. Move the track and untrack pfn stub routines from memory.c to asm-generic. Also add unlikely to pfnmap related calls in fork and exit path. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 36f9b3fa5e15..d3ddd735e375 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -163,12 +163,6 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma) return (vma->vm_flags & VM_PFNMAP); } -extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, - unsigned long pfn, unsigned long size); -extern int track_pfn_vma_copy(struct vm_area_struct *vma); -extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); - /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask -- cgit v1.2.3 From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:10:24 +0100 Subject: x86, bts: add fork and exit handling Impact: introduce new ptrace facility Add arch_ptrace_untrace() function that is called when the tracer detaches (either voluntarily or when the tracing task dies); ptrace_disable() is only called on a voluntary detach. Add ptrace_fork() and arch_ptrace_fork(). They are called when a traced task is forked. Clear DS and BTS related fields on fork. Release DS resources and reclaim memory in ptrace_untrace(). This releases resources already when the tracing task dies. We used to do that when the traced task dies. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- include/linux/ptrace.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 22641d5d45df..98b93ca4db06 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_stop(code, info) do { } while (0) #endif +#ifndef arch_ptrace_untrace +/* + * Do machine-specific work before untracing child. + * + * This is called for a normal detach as well as from ptrace_exit() + * when the tracing task dies. + * + * Called with write_lock(&tasklist_lock) held. + */ +#define arch_ptrace_untrace(task) do { } while (0) +#endif + +#ifndef arch_ptrace_fork +/* + * Do machine-specific work to initialize a new task. + * + * This is called from copy_process(). + */ +#define arch_ptrace_fork(child, clone_flags) do { } while (0) +#endif + extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); -- cgit v1.2.3 From c5dee6177f4bd2095aab7d9be9f6ebdddd6deee9 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:17:02 +0100 Subject: x86, bts: memory accounting Impact: move the BTS buffer accounting to the mlock bucket Add alloc_locked_buffer() and free_locked_buffer() functions to mm/mlock.c to kalloc a buffer and account the locked memory to current. Account the memory for the BTS buffer to the tracer. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ffee2f743418..9979d3fab6e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1286,5 +1286,7 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); +extern void *alloc_locked_buffer(size_t size); +extern void free_locked_buffer(void *buffer, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From 749820928a2fd47ff536773d869d2c3f8038b7d1 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 5 Dec 2008 08:15:54 +0000 Subject: of/gpio: Implement of_gpio_count() This function is used to count how many GPIOs are specified for a device node. Signed-off-by: Anton Vorontsov Signed-off-by: Paul Mackerras --- include/linux/of_gpio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index e25abf610cb6..fc2472c3c254 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -65,6 +65,7 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc) extern int of_get_gpio_flags(struct device_node *np, int index, enum of_gpio_flags *flags); +extern unsigned int of_gpio_count(struct device_node *np); extern int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc); @@ -81,6 +82,11 @@ static inline int of_get_gpio_flags(struct device_node *np, int index, return -ENOSYS; } +static inline unsigned int of_gpio_count(struct device_node *np) +{ + return 0; +} + #endif /* CONFIG_OF_GPIO */ /** -- cgit v1.2.3 From 3ddeb912f41801fd1968c7880d031702a396e4d0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 20 Dec 2008 17:15:14 +0800 Subject: ftrace: enable format arguments checking Impact: broaden gcc printf format checks for ftrace_printk() format arguments checking for ftrace_printk() is __printf(1, 2), not __printf(1, 0). Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 04b52e6ebc66..677432b9cb7e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -303,7 +303,7 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); +ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } -- cgit v1.2.3 From f4314e815e87b4ab1c9b1115dd5853cd20ca999c Mon Sep 17 00:00:00 2001 From: Don Skidmore Date: Sun, 21 Dec 2008 20:10:29 -0800 Subject: net: add DCNA attribute to the BCN interface for DCB Adds the Backward Congestion Notification Address (BCNA) attribute to the Backward Congestion Notification (BCN) interface for Data Center Bridging (DCB), which was missing. Receive the BCNA attribute in the ixgbe driver. The BCNA attribute is for a switch to inform the endstation about the physical port identification in order to support BCN on aggregated links. Signed-off-by: Don Skidmore Signed-off-by: Eric W Multanen Signed-off-by: Jeff Kirsher --- include/linux/dcbnl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h index e73a61449ad6..b0ef274e0031 100644 --- a/include/linux/dcbnl.h +++ b/include/linux/dcbnl.h @@ -305,6 +305,8 @@ enum dcbnl_bcn_attrs{ DCB_BCN_ATTR_RP_7, DCB_BCN_ATTR_RP_ALL, + DCB_BCN_ATTR_BCNA_0, + DCB_BCN_ATTR_BCNA_1, DCB_BCN_ATTR_ALPHA, DCB_BCN_ATTR_BETA, DCB_BCN_ATTR_GD, -- cgit v1.2.3 From 209aa4fdc39eacc145a7f9c32a4b9ffcc68912c6 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 12 Dec 2008 16:35:40 +0900 Subject: fb: SH-5 uses __raw I/O accessors now also, drop the special casing. Signed-off-by: Paul Mundt --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 75a81eaf3430..1ee63df5be92 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -888,7 +888,7 @@ struct fb_info { #define fb_writeq sbus_writeq #define fb_memset sbus_memset_io -#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) || defined(__avr32__) +#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || defined(__avr32__) #define fb_readb __raw_readb #define fb_readw __raw_readw -- cgit v1.2.3 From b8dd786f9417e5885929bfe33a235c76a9c1c569 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Mon, 22 Dec 2008 07:15:03 -0800 Subject: mlx4_core: Add support for multiple completion event vectors When using MSI-X mode, create a completion event queue for each CPU. Report the number of completion EQs in a new struct mlx4_caps member, num_comp_vectors, and extend the mlx4_cq_alloc() interface with a vector parameter so that consumers can specify which completion EQ should be used to report events for the CQ being created. Signed-off-by: Yevgeny Petrilin Signed-off-by: Roland Dreier --- include/linux/mlx4/device.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 371086fd946f..8f659cc29960 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -206,6 +206,7 @@ struct mlx4_caps { int reserved_cqs; int num_eqs; int reserved_eqs; + int num_comp_vectors; int num_mpts; int num_mtt_segs; int fmr_reserved_mtts; @@ -328,6 +329,7 @@ struct mlx4_cq { int arm_sn; int cqn; + unsigned vector; atomic_t refcount; struct completion free; @@ -437,7 +439,7 @@ void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres, int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, - int collapsed); + unsigned vector, int collapsed); void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base); -- cgit v1.2.3 From 908a7a16b852ffd618a9127be8d62432182d81b4 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 22 Dec 2008 20:43:12 -0800 Subject: net: Remove unused netdev arg from some NAPI interfaces. When the napi api was changed to separate its 1:1 binding to the net_device struct, the netif_rx_[prep|schedule|complete] api failed to remove the now vestigual net_device structure parameter. This patch cleans up that api by properly removing it.. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 58856b6737fb..41e1224651cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1555,8 +1555,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) } /* Test if receive needs to be scheduled but only if up */ -static inline int netif_rx_schedule_prep(struct net_device *dev, - struct napi_struct *napi) +static inline int netif_rx_schedule_prep(struct napi_struct *napi) { return napi_schedule_prep(napi); } @@ -1564,27 +1563,24 @@ static inline int netif_rx_schedule_prep(struct net_device *dev, /* Add interface to tail of rx poll list. This assumes that _prep has * already been called and returned 1. */ -static inline void __netif_rx_schedule(struct net_device *dev, - struct napi_struct *napi) +static inline void __netif_rx_schedule(struct napi_struct *napi) { __napi_schedule(napi); } /* Try to reschedule poll. Called by irq handler. */ -static inline void netif_rx_schedule(struct net_device *dev, - struct napi_struct *napi) +static inline void netif_rx_schedule(struct napi_struct *napi) { - if (netif_rx_schedule_prep(dev, napi)) - __netif_rx_schedule(dev, napi); + if (netif_rx_schedule_prep(napi)) + __netif_rx_schedule(napi); } /* Try to reschedule poll. Called by dev->poll() after netif_rx_complete(). */ -static inline int netif_rx_reschedule(struct net_device *dev, - struct napi_struct *napi) +static inline int netif_rx_reschedule(struct napi_struct *napi) { if (napi_schedule_prep(napi)) { - __netif_rx_schedule(dev, napi); + __netif_rx_schedule(napi); return 1; } return 0; @@ -1593,8 +1589,7 @@ static inline int netif_rx_reschedule(struct net_device *dev, /* same as netif_rx_complete, except that local_irq_save(flags) * has already been issued */ -static inline void __netif_rx_complete(struct net_device *dev, - struct napi_struct *napi) +static inline void __netif_rx_complete(struct napi_struct *napi) { __napi_complete(napi); } @@ -1604,8 +1599,7 @@ static inline void __netif_rx_complete(struct net_device *dev, * it completes the work. The device cannot be out of poll list at this * moment, it is BUG(). */ -static inline void netif_rx_complete(struct net_device *dev, - struct napi_struct *napi) +static inline void netif_rx_complete(struct napi_struct *napi) { napi_complete(napi); } -- cgit v1.2.3 From 88a9fe8cae3bb52e82489447f45e8d7ba1409ca8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:31 -0500 Subject: SUNRPC: Remove the last remnant of the BKL... Somehow, this escaped the previous purge. There should be no need to keep any extra locks in the XDR callbacks. The NFS client XDR code only writes into private objects, whereas all reads of shared objects are confined to fields that do not change, such as filehandles... Ditto for lockd, the NFSv2/v3 client mount code, and rpcbind. The nfsd XDR code may require the BKL, but since it does a synchronous RPC call from a thread that already holds the lock, that issue is moot. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index e4057d729f03..49e1eb454465 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -36,21 +36,6 @@ struct xdr_netobj { */ typedef int (*kxdrproc_t)(void *rqstp, __be32 *data, void *obj); -/* - * We're still requiring the BKL in the xdr code until it's been - * more carefully audited, at which point this wrapper will become - * unnecessary. - */ -static inline int rpc_call_xdrproc(kxdrproc_t xdrproc, void *rqstp, __be32 *data, void *obj) -{ - int ret; - - lock_kernel(); - ret = xdrproc(rqstp, data, obj); - unlock_kernel(); - return ret; -} - /* * Basic structure for transmission/reception of a client XDR message. * Features a header (for a linear buffer containing RPC headers -- cgit v1.2.3 From 146ec944bbd31d241a44a00518b054fb01921d22 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 23 Dec 2008 15:21:34 -0500 Subject: NFS: Move declaration of nfs_mount() to fs/nfs/internal.h Clean up: The nfs_mount() function is not to be used outside of the NFS client. Move its public declaration to fs/nfs/internal.h. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4eaa8347a0d9..f11077285a6c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -532,12 +532,6 @@ static inline void nfs3_forget_cached_acls(struct inode *inode) } #endif /* CONFIG_NFS_V3_ACL */ -/* - * linux/fs/mount_clnt.c - */ -extern int nfs_mount(struct sockaddr *, size_t, char *, char *, - int, int, struct nfs_fh *); - /* * inline functions */ -- cgit v1.2.3 From d740351bf0960e89ce1aef45cfe00167cb0f9e5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 23 Dec 2008 15:21:37 -0500 Subject: NFS: add "[no]resvport" mount option The standard default security setting for NFS is AUTH_SYS. An NFS client connects to NFS servers via a privileged source port and a fixed standard destination port (2049). The client sends raw uid and gid numbers to identify users making NFS requests, and the server assumes an appropriate authority on the client has vetted these values because the source port is privileged. On Linux, by default in-kernel RPC services use a privileged port in the range between 650 and 1023 to avoid using source ports of well- known IP services. Using such a small range limits the number of NFS mount points and the number of unique NFS servers to which a client can connect concurrently. An NFS client can use unprivileged source ports to expand the range of source port numbers, allowing more concurrent server connections and more NFS mount points. Servers must explicitly allow NFS connections from unprivileged ports for this to work. In the past, bumping the value of the sunrpc.max_resvport sysctl on the client would permit the NFS client to use unprivileged ports. Bumping this setting also changes the maximum port number used by other in-kernel RPC services, some of which still required a port number less than 1023. This is exacerbated by the way source port numbers are chosen by the Linux RPC client, which starts at the top of the range and works downwards. It means that bumping the maximum means all RPC services requesting a source port will likely get an unprivileged port instead of a privileged one. Changing this setting effects all NFS mount points on a client. A sysadmin could not selectively choose which mount points would use non-privileged ports and which could not. Lastly, this mechanism of expanding the limit on the number of NFS mount points was entirely undocumented. To address the need for the NFS client to use a large range of source ports without interfering with the activity of other in-kernel RPC services, we introduce a new NFS mount option. This option explicitly tells only the NFS client to use a non-privileged source port when communicating with the NFS server for one specific mount point. This new mount option is called "resvport," like the similar NFS mount option on FreeBSD and Mac OS X. A sister patch for nfs-utils will be submitted that documents this new option in nfs(5). The default setting for this new mount option requires the NFS client to use a privileged port, as before. Explicitly specifying the "noresvport" mount option allows the NFS client to use an unprivileged source port for this mount point when connecting to the NFS server port. This mount option is supported only for text-based NFS mounts. [ Sidebar: it is widely known that security mechanisms based on the use of privileged source ports are ineffective. However, the NFS client can combine the use of unprivileged ports with the use of secure authentication mechanisms, such as Kerberos. This allows a large number of connections and mount points while ensuring a useful level of security. Eventually we may change the default setting for this option depending on the security flavor used for the mount. For example, if the mount is using only AUTH_SYS, then the default setting will be "resvport;" if the mount is using a strong security flavor such as krb5, the default setting will be "noresvport." ] Signed-off-by: Chuck Lever [Trond.Myklebust@netapp.com: Fixed a bug whereby nfs4_init_client() was being called with incorrect arguments.] Signed-off-by: Trond Myklebust --- include/linux/nfs_mount.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index 6549a06ac16e..4499016e6d0d 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -45,7 +45,7 @@ struct nfs_mount_data { char context[NFS_MAX_CONTEXT_LEN + 1]; /* 6 */ }; -/* bits in the flags field */ +/* bits in the flags field visible to user space */ #define NFS_MOUNT_SOFT 0x0001 /* 1 */ #define NFS_MOUNT_INTR 0x0002 /* 1 */ /* now unused, but ABI */ @@ -68,5 +68,6 @@ struct nfs_mount_data { /* The following are for internal use only */ #define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 #define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000 +#define NFS_MOUNT_NORESVPORT 0x40000 #endif -- cgit v1.2.3 From 0cb2659b818eca99235e17c04291cfa9985c14f7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 23 Dec 2008 15:21:38 -0500 Subject: NLM: allow lockd requests from an unprivileged port If the admin has specified the "noresvport" option for an NFS mount point, the kernel's NFS client uses an unprivileged source port for the main NFS transport. The kernel's lockd client should use an unprivileged port in this case as well. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/lockd/bind.h | 1 + include/linux/lockd/lockd.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index e5872dc994c0..fbc48f898521 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -41,6 +41,7 @@ struct nlmclnt_initdata { size_t addrlen; unsigned short protocol; u32 nfs_version; + int noresvport; }; /* diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b56d5aa9b194..23da3fa69efa 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -49,6 +49,7 @@ struct nlm_host { unsigned short h_proto; /* transport proto */ unsigned short h_reclaiming : 1, h_server : 1, /* server side, not client side */ + h_noresvport : 1, h_inuse : 1; wait_queue_head_t h_gracewait; /* wait while reclaiming */ struct rw_semaphore h_rwsem; /* Reboot recovery lock */ @@ -220,7 +221,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const size_t salen, const unsigned short protocol, const u32 version, - const char *hostname); + const char *hostname, + int noresvport); struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, const size_t hostname_len); -- cgit v1.2.3 From 95d35cb4c473c754824967c0b069bbeb7efa4847 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:45 -0500 Subject: NFSv4: Remove nfs_client->cl_sem Now that we're using the flags to indicate state that needs to be recovered, as well as having implemented proper refcounting and spinlocking on the state and open_owners, we can get rid of nfs_client->cl_sem. The only remaining case that was dubious was the file locking, and that case is now covered by the nfsi->rwsem. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4e477ae58699..9bb81aec91cf 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -42,12 +42,6 @@ struct nfs_client { struct rb_root cl_openowner_id; struct rb_root cl_lockowner_id; - /* - * The following rwsem ensures exclusive access to the server - * while we recover the state following a lease expiration. - */ - struct rw_semaphore cl_sem; - struct list_head cl_delegations; struct rb_root cl_state_owners; spinlock_t cl_lock; -- cgit v1.2.3 From bd7bf9d540c001055fba796ebf146d90e4dd2eb2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:53 -0500 Subject: NFSv4: Convert delegation->type field to fmode_t Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 +- include/linux/nfs_xdr.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f11077285a6c..8d71d7b7c78a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -180,7 +180,7 @@ struct nfs_inode { /* NFSv4 state */ struct list_head open_states; struct nfs_delegation *delegation; - int delegation_state; + fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ struct inode vfs_inode; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c1c31acb8a2b..32c1a0ecdbff 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -126,7 +126,7 @@ struct nfs_openargs { struct iattr * attrs; /* UNCHECKED, GUARDED */ nfs4_verifier verifier; /* EXCLUSIVE */ nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ - int delegation_type; /* CLAIM_PREVIOUS */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ } u; const struct qstr * name; const struct nfs_server *server; /* Needed for ID mapping */ @@ -143,7 +143,7 @@ struct nfs_openres { struct nfs_fattr * dir_attr; struct nfs_seqid * seqid; const struct nfs_server *server; - int delegation_type; + fmode_t delegation_type; nfs4_stateid delegation; __u32 do_recall; __u64 maxsize; -- cgit v1.2.3 From dc0b027dfadfcb8a5504f7d8052754bf8d501ab9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:56 -0500 Subject: NFSv4: Convert the open and close ops to use fmode Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 4 ++-- include/linux/nfs_xdr.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8d71d7b7c78a..b8d9c6dd4f63 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -83,7 +83,7 @@ struct nfs_open_context { struct rpc_cred *cred; struct nfs4_state *state; fl_owner_t lockowner; - int mode; + fmode_t mode; unsigned long flags; #define NFS_CONTEXT_ERROR_WRITE (0) @@ -342,7 +342,7 @@ extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); -extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode); +extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 32c1a0ecdbff..a550b528319f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -120,6 +120,7 @@ struct nfs_openargs { const struct nfs_fh * fh; struct nfs_seqid * seqid; int open_flags; + fmode_t fmode; __u64 clientid; __u64 id; union { @@ -171,7 +172,7 @@ struct nfs_closeargs { struct nfs_fh * fh; nfs4_stateid * stateid; struct nfs_seqid * seqid; - int open_flags; + fmode_t fmode; const u32 * bitmask; }; -- cgit v1.2.3 From 64672d55d93c26fb4035fd1a84a803cbc09cb058 Mon Sep 17 00:00:00 2001 From: Peter Staubach Date: Tue, 23 Dec 2008 15:21:56 -0500 Subject: optimize attribute timeouts for "noac" and "actimeo=0" Hi. I've been looking at a bugzilla which describes a problem where a customer was advised to use either the "noac" or "actimeo=0" mount options to solve a consistency problem that they were seeing in the file attributes. It turned out that this solution did not work reliably for them because sometimes, the local attribute cache was believed to be valid and not timed out. (With an attribute cache timeout of 0, the cache should always appear to be timed out.) In looking at this situation, it appears to me that the problem is that the attribute cache timeout code has an off-by-one error in it. It is assuming that the cache is valid in the region, [read_cache_jiffies, read_cache_jiffies + attrtimeo]. The cache should be considered valid only in the region, [read_cache_jiffies, read_cache_jiffies + attrtimeo). With this change, the options, "noac" and "actimeo=0", work as originally expected. This problem was previously addressed by special casing the attrtimeo == 0 case. However, since the problem is only an off- by-one error, the cleaner solution is address the off-by-one error and thus, not require the special case. Thanx... ps Signed-off-by: Peter Staubach Signed-off-by: Trond Myklebust --- include/linux/jiffies.h | 10 ++++++++++ include/linux/nfs_fs.h | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index abb6ac639e8e..1a9cf78bfce5 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -115,10 +115,20 @@ static inline u64 get_jiffies_64(void) ((long)(a) - (long)(b) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) +/* + * Calculate whether a is in the range of [b, c]. + */ #define time_in_range(a,b,c) \ (time_after_eq(a,b) && \ time_before_eq(a,c)) +/* + * Calculate whether a is in the range of [b, c). + */ +#define time_in_range_open(a,b,c) \ + (time_after_eq(a,b) && \ + time_before(a,c)) + /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64() */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b8d9c6dd4f63..db867b04ac3c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -130,7 +130,10 @@ struct nfs_inode { * * We need to revalidate the cached attrs for this inode if * - * jiffies - read_cache_jiffies > attrtimeo + * jiffies - read_cache_jiffies >= attrtimeo + * + * Please note the comparison is greater than or equal + * so that zero timeout values can be specified. */ unsigned long read_cache_jiffies; unsigned long attrtimeo; -- cgit v1.2.3 From c977a2ef40a38c45537ad03823d0a004f06373f0 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 23 Dec 2008 16:06:13 -0500 Subject: sunrpc: get rid of rpc_rqst.rq_bufsize rq_bufsize is not used. Signed-off-by: Mike Sager Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 4d80a118d538..11fc71d50c1e 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -76,8 +76,7 @@ struct rpc_rqst { struct list_head rq_list; __u32 * rq_buffer; /* XDR encode buffer */ - size_t rq_bufsize, - rq_callsize, + size_t rq_callsize, rq_rcvsize; struct xdr_buf rq_private_buf; /* The receive buffer -- cgit v1.2.3 From c381060869317b3c84430d4f54965d409cbfe65f Mon Sep 17 00:00:00 2001 From: "\\\"J. Bruce Fields\\" Date: Tue, 23 Dec 2008 16:08:32 -0500 Subject: rpc: add an rpc_pipe_open method We want to transition to a new gssd upcall which is text-based and more easily extensible. To simplify upgrades, as well as testing and debugging, it will help if we can upgrade gssd (to a version which understands the new upcall) without having to choose at boot (or module-load) time whether we want the new or the old upcall. We will do this by providing two different pipes: one named, as currently, after the mechanism (normally "krb5"), and supporting the old upcall. One named "gssd" and supporting the new upcall version. We allow gssd to indicate which version it supports by its choice of which pipe to open. As we have no interest in supporting *simultaneous* use of both versions, we'll forbid opening both pipes at the same time. So, add a new pipe_open callback to the rpc_pipefs api, which the gss code can use to track which pipes have been open, and to refuse opens of incompatible pipes. We only need this to be called on the first open of a given pipe. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/rpc_pipe_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 51b977a4ca20..cea764c2359f 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -15,6 +15,7 @@ struct rpc_pipe_ops { ssize_t (*upcall)(struct file *, struct rpc_pipe_msg *, char __user *, size_t); ssize_t (*downcall)(struct file *, const char __user *, size_t); void (*release_pipe)(struct inode *); + int (*open_pipe)(struct inode *); void (*destroy_msg)(struct rpc_pipe_msg *); }; -- cgit v1.2.3 From 68e76ad0baf8f5d5060377c2423ee6eed5c63057 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 23 Dec 2008 16:17:15 -0500 Subject: nfsd: pass client principal name in rsc downcall Two principals are involved in krb5 authentication: the target, who we authenticate *to* (normally the name of the server, like nfs/server.citi.umich.edu@CITI.UMICH.EDU), and the source, we we authenticate *as* (normally a user, like bfields@UMICH.EDU) In the case of NFSv4 callbacks, the target of the callback should be the source of the client's setclientid call, and the source should be the nfs server's own principal. Therefore we allow svcgssd to pass down the name of the principal that just authenticated, so that on setclientid we can store that principal name with the new client, to be used later on callbacks. Signed-off-by: Olga Kornievskaia Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/nfsd/state.h | 1 + include/linux/sunrpc/svcauth_gss.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index d0fe2e378452..ce7cbf4b7c93 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -124,6 +124,7 @@ struct nfs4_client { nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ __be32 cl_addr; /* client ipaddress */ + char *cl_principal; /* setclientid principal name */ struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h index c9165d9771a8..ca7d725861fc 100644 --- a/include/linux/sunrpc/svcauth_gss.h +++ b/include/linux/sunrpc/svcauth_gss.h @@ -20,6 +20,7 @@ int gss_svc_init(void); void gss_svc_shutdown(void); int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); u32 svcauth_gss_flavor(struct auth_domain *dom); +char *svc_gss_principal(struct svc_rqst *); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ -- cgit v1.2.3 From 608207e8884e083ad8b8d33eda868da70f0d63e8 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 23 Dec 2008 16:17:40 -0500 Subject: rpc: pass target name down to rpc level on callbacks The rpc client needs to know the principal that the setclientid was done as, so it can tell gssd who to authenticate to. Signed-off-by: Olga Kornievskaia Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 6f0ee1b84a4f..c39a21040dcb 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -58,6 +58,7 @@ struct rpc_clnt { struct rpc_timeout cl_timeout_default; struct rpc_program * cl_program; char cl_inline_name[32]; + char *cl_principal; /* target to authenticate to */ }; /* @@ -108,6 +109,7 @@ struct rpc_create_args { u32 version; rpc_authflavor_t authflavor; unsigned long flags; + char *client_name; }; /* Values for "flags" field */ -- cgit v1.2.3 From 61054b14d545e257b9415d5ca0cd5f43762b4d0c Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 23 Dec 2008 16:19:00 -0500 Subject: nfsd: support callbacks with gss flavors This patch adds server-side support for callbacks other than AUTH_SYS. Signed-off-by: Olga Kornievskaia Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/nfsd/state.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index ce7cbf4b7c93..128298c0362d 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -124,6 +124,7 @@ struct nfs4_client { nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ __be32 cl_addr; /* client ipaddress */ + u32 cl_flavor; /* setclientid pseudoflavor */ char *cl_principal; /* setclientid principal name */ struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ -- cgit v1.2.3 From 4a7794860ba2b56693b1d89fd485fd08cdc763e3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 13 Sep 2008 18:19:03 -0700 Subject: crypto: api - Move type exit function into crypto_tfm The type exit function needs to undo any allocations done by the type init function. However, the type init function may differ depending on the upper-level type of the transform (e.g., a crypto_blkcipher instantiated as a crypto_ablkcipher). So we need to move the exit function out of the lower-level structure and into crypto_tfm itself. As it stands this is a no-op since nobody uses exit functions at all. However, all cases where a lower-level type is instantiated as a different upper-level type (such as blkcipher as ablkcipher) will be converted such that they allocate the underlying transform and use that instead of casting (e.g., crypto_ablkcipher casted into crypto_blkcipher). That will need to use a different exit function depending on the upper-level type. This patch also allows the type init/exit functions to call (or not) cra_init/cra_exit instead of always calling them from the top level. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 3d2317e4af2e..ea52cd944fd9 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -480,6 +480,8 @@ struct crypto_tfm { struct compress_tfm compress; struct rng_tfm rng; } crt_u; + + void (*exit)(struct crypto_tfm *tfm); struct crypto_alg *__crt_alg; -- cgit v1.2.3 From 7b0bac64cd5b74d6f1147524c26216de13a501fd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Sep 2008 06:52:53 +0900 Subject: crypto: api - Rebirth of crypto_alloc_tfm This patch reintroduces a completely revamped crypto_alloc_tfm. The biggest change is that we now take two crypto_type objects when allocating a tfm, a frontend and a backend. In fact this simply formalises what we've been doing behind the API's back. For example, as it stands crypto_alloc_ahash may use an actual ahash algorithm or a crypto_hash algorithm. Putting this in the API allows us to do this much more cleanly. The existing types will be converted across gradually. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index ea52cd944fd9..ffaaa418cf59 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -546,7 +546,9 @@ struct crypto_attr_u32 { * Transform user interface. */ -struct crypto_tfm *crypto_alloc_tfm(const char *alg_name, u32 tfm_flags); +struct crypto_tfm *crypto_alloc_tfm(const char *alg_name, + const struct crypto_type *frontend, + u32 type, u32 mask); struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask); void crypto_free_tfm(struct crypto_tfm *tfm); -- cgit v1.2.3 From 7b5a080b3c46f0cac71c0d0262634c6517d4ee4f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 31 Aug 2008 15:47:27 +1000 Subject: crypto: hash - Add shash interface The shash interface replaces the current synchronous hash interface. It improves over hash in two ways. Firstly shash is reentrant, meaning that the same tfm may be used by two threads simultaneously as all hashing state is stored in a local descriptor. The other enhancement is that shash no longer takes scatter list entries. This is because shash is specifically designed for synchronous algorithms and as such scatter lists are unnecessary. All existing hash users will be converted to shash once the algorithms have been completely converted. There is also a new finup function that combines update with final. This will be extended to ahash once the algorithm conversion is done. This is also the first time that an algorithm type has their own registration function. Existing algorithm types will be converted to this way in due course. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index ffaaa418cf59..ee95c748695c 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -39,6 +39,7 @@ #define CRYPTO_ALG_TYPE_HASH 0x00000009 #define CRYPTO_ALG_TYPE_AHASH 0x0000000a #define CRYPTO_ALG_TYPE_RNG 0x0000000c +#define CRYPTO_ALG_TYPE_SHASH 0x0000000d #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000c -- cgit v1.2.3 From 3b2f6df08258e2875f42bd630eece7e7241a053b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 31 Aug 2008 18:52:18 +1000 Subject: crypto: hash - Export shash through ahash This patch allows shash algorithms to be used through the ahash interface. This is required before we can convert digest algorithms over to shash. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index ee95c748695c..44c72f0f9b05 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -38,8 +38,8 @@ #define CRYPTO_ALG_TYPE_DIGEST 0x00000008 #define CRYPTO_ALG_TYPE_HASH 0x00000009 #define CRYPTO_ALG_TYPE_AHASH 0x0000000a +#define CRYPTO_ALG_TYPE_SHASH 0x0000000b #define CRYPTO_ALG_TYPE_RNG 0x0000000c -#define CRYPTO_ALG_TYPE_SHASH 0x0000000d #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000c -- cgit v1.2.3 From dec8b78606ebd5f309c38f2fb10196ce996dd18d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 2 Nov 2008 21:38:11 +0800 Subject: crypto: hash - Add import/export interface It is often useful to save the partial state of a hash function so that it can be used as a base for two or more computations. The most prominent example is HMAC where all hashes start from a base determined by the key. Having an import/export interface means that we only have to compute that base once rather than for each message. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 44c72f0f9b05..77a1f3d9416d 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -221,6 +221,7 @@ struct ablkcipher_alg { struct ahash_alg { int (*init)(struct ahash_request *req); + int (*reinit)(struct ahash_request *req); int (*update)(struct ahash_request *req); int (*final)(struct ahash_request *req); int (*digest)(struct ahash_request *req); -- cgit v1.2.3 From 5f7082ed4f482f05db01d84dbf58190492ebf0ad Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 31 Aug 2008 22:21:09 +1000 Subject: crypto: hash - Export shash through hash This patch allows shash algorithms to be used through the old hash interface. This is a transitional measure so we can convert the underlying algorithms to shash before converting the users across. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 77a1f3d9416d..3bacd71509fb 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -36,9 +36,9 @@ #define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006 #define CRYPTO_ALG_TYPE_DIGEST 0x00000008 -#define CRYPTO_ALG_TYPE_HASH 0x00000009 +#define CRYPTO_ALG_TYPE_HASH 0x00000008 +#define CRYPTO_ALG_TYPE_SHASH 0x00000009 #define CRYPTO_ALG_TYPE_AHASH 0x0000000a -#define CRYPTO_ALG_TYPE_SHASH 0x0000000b #define CRYPTO_ALG_TYPE_RNG 0x0000000c #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e -- cgit v1.2.3 From 69c35efcf1576ab5f00cba83e8ca740923afb6c9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 7 Nov 2008 15:11:47 +0800 Subject: libcrc32c: Move implementation to crypto crc32c This patch swaps the role of libcrc32c and crc32c. Previously the implementation was in libcrc32c and crc32c was a wrapper. Now the code is in crc32c and libcrc32c just calls the crypto layer. The reason for the change is to tap into the algorithm selection capability of the crypto API so that optimised implementations such as the one utilising Intel's CRC32C instruction can be used where available. Signed-off-by: Herbert Xu --- include/linux/crc32c.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index 508f512e5a2f..66fa8ff795ec 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -3,9 +3,6 @@ #include -extern u32 crc32c_le(u32 crc, unsigned char const *address, size_t length); -extern u32 crc32c_be(u32 crc, unsigned char const *address, size_t length); - -#define crc32c(seed, data, length) crc32c_le(seed, (unsigned char const *)data, length) +extern u32 crc32c(u32 crc, const void *address, unsigned int length); #endif /* _LINUX_CRC32C_H */ -- cgit v1.2.3 From 0426c166424ea6d3d0412f47879c8ba268f874c4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 11 Nov 2008 12:20:06 +0800 Subject: libcrc32c: Add crc32c_le macro The bnx2x driver actually uses the crc32c_le name so this patch restores the crc32c_le symbol through a macro. Signed-off-by: Herbert Xu --- include/linux/crc32c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index 66fa8ff795ec..bd8b44d96bdc 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -5,4 +5,7 @@ extern u32 crc32c(u32 crc, const void *address, unsigned int length); +/* This macro exists for backwards-compatibility. */ +#define crc32c_le crc32c + #endif /* _LINUX_CRC32C_H */ -- cgit v1.2.3 From 1eca4365be25c540650693e941bc06a66cf38f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Nov 2008 20:03:17 +0900 Subject: libata: beef up iterators There currently are the following looping constructs. * __ata_port_for_each_link() for all available links * ata_port_for_each_link() for edge links * ata_link_for_each_dev() for all devices * ata_link_for_each_dev_reverse() for all devices in reverse order Now there's a need for looping construct which is similar to __ata_port_for_each_link() but iterates over PMP links before the host link. Instead of adding another one with long name, do the following cleanup. * Implement and export ata_link_next() and ata_dev_next() which take @mode parameter and can be used to build custom loop. * Implement ata_for_each_link() and ata_for_each_dev() which take looping mode explicitly. The following iteration modes are implemented. * ATA_LITER_EDGE : loop over edge links * ATA_LITER_HOST_FIRST : loop over all links, host link first * ATA_LITER_PMP_FIRST : loop over all links, PMP links first * ATA_DITER_ENABLED : loop over enabled devices * ATA_DITER_ENABLED_REVERSE : loop over enabled devices in reverse order * ATA_DITER_ALL : loop over all devices * ATA_DITER_ALL_REVERSE : loop over all devices in reverse order This change removes exlicit device enabledness checks from many loops and makes it clear which ones are iterated over in which direction. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 76 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index ed3f26eb5df1..3b2a0c6444ee 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1285,26 +1285,62 @@ static inline int ata_link_active(struct ata_link *link) return ata_tag_valid(link->active_tag) || link->sactive; } -extern struct ata_link *__ata_port_next_link(struct ata_port *ap, - struct ata_link *link, - bool dev_only); - -#define __ata_port_for_each_link(link, ap) \ - for ((link) = __ata_port_next_link((ap), NULL, false); (link); \ - (link) = __ata_port_next_link((ap), (link), false)) - -#define ata_port_for_each_link(link, ap) \ - for ((link) = __ata_port_next_link((ap), NULL, true); (link); \ - (link) = __ata_port_next_link((ap), (link), true)) - -#define ata_link_for_each_dev(dev, link) \ - for ((dev) = (link)->device; \ - (dev) < (link)->device + ata_link_max_devices(link) || ((dev) = NULL); \ - (dev)++) - -#define ata_link_for_each_dev_reverse(dev, link) \ - for ((dev) = (link)->device + ata_link_max_devices(link) - 1; \ - (dev) >= (link)->device || ((dev) = NULL); (dev)--) +/* + * Iterators + * + * ATA_LITER_* constants are used to select link iteration mode and + * ATA_DITER_* device iteration mode. + * + * For a custom iteration directly using ata_{link|dev}_next(), if + * @link or @dev, respectively, is NULL, the first element is + * returned. @dev and @link can be any valid device or link and the + * next element according to the iteration mode will be returned. + * After the last element, NULL is returned. + */ +enum ata_link_iter_mode { + ATA_LITER_EDGE, /* if present, PMP links only; otherwise, + * host link. no slave link */ + ATA_LITER_HOST_FIRST, /* host link followed by PMP or slave links */ + ATA_LITER_PMP_FIRST, /* PMP links followed by host link, + * slave link still comes after host link */ +}; + +enum ata_dev_iter_mode { + ATA_DITER_ENABLED, + ATA_DITER_ENABLED_REVERSE, + ATA_DITER_ALL, + ATA_DITER_ALL_REVERSE, +}; + +extern struct ata_link *ata_link_next(struct ata_link *link, + struct ata_port *ap, + enum ata_link_iter_mode mode); + +extern struct ata_device *ata_dev_next(struct ata_device *dev, + struct ata_link *link, + enum ata_dev_iter_mode mode); + +/* + * Shortcut notation for iterations + * + * ata_for_each_link() iterates over each link of @ap according to + * @mode. @link points to the current link in the loop. @link is + * NULL after loop termination. ata_for_each_dev() works the same way + * except that it iterates over each device of @link. + * + * Note that the mode prefixes ATA_{L|D}ITER_ shouldn't need to be + * specified when using the following shorthand notations. Only the + * mode itself (EDGE, HOST_FIRST, ENABLED, etc...) should be + * specified. This not only increases brevity but also makes it + * impossible to use ATA_LITER_* for device iteration or vice-versa. + */ +#define ata_for_each_link(link, ap, mode) \ + for ((link) = ata_link_next(NULL, (ap), ATA_LITER_##mode); (link); \ + (link) = ata_link_next((link), (ap), ATA_LITER_##mode)) + +#define ata_for_each_dev(dev, link, mode) \ + for ((dev) = ata_dev_next(NULL, (link), ATA_DITER_##mode); (dev); \ + (dev) = ata_dev_next((dev), (link), ATA_DITER_##mode)) /** * ata_ncq_enabled - Test whether NCQ is enabled -- cgit v1.2.3 From ece180d1cfe5fa751eaa85bf796cf28b2150af15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Nov 2008 20:04:37 +0900 Subject: libata: perform port detach in EH ata_port_detach() first made sure EH saw ATA_PFLAG_UNLOADING and then assumed EH context belongs to it and performed detach operation itself. However, UNLOADING doesn't disable all of EH and this could lead to problems including triggering WARN_ON()'s in EH path. This patch makes port detach behave more like other EH actions such that ata_port_detach() requests EH to detach and waits for completion. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 3b2a0c6444ee..3449de597eff 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -213,10 +213,11 @@ enum { ATA_PFLAG_FROZEN = (1 << 2), /* port is frozen */ ATA_PFLAG_RECOVERED = (1 << 3), /* recovery action performed */ ATA_PFLAG_LOADING = (1 << 4), /* boot/loading probe */ - ATA_PFLAG_UNLOADING = (1 << 5), /* module is unloading */ ATA_PFLAG_SCSI_HOTPLUG = (1 << 6), /* SCSI hotplug scheduled */ ATA_PFLAG_INITIALIZING = (1 << 7), /* being initialized, don't touch */ ATA_PFLAG_RESETTING = (1 << 8), /* reset in progress */ + ATA_PFLAG_UNLOADING = (1 << 9), /* driver is being unloaded */ + ATA_PFLAG_UNLOADED = (1 << 10), /* driver is unloaded */ ATA_PFLAG_SUSPENDED = (1 << 17), /* port is suspended (power) */ ATA_PFLAG_PM_PENDING = (1 << 18), /* PM operation pending */ -- cgit v1.2.3 From 88e740f1654bf28565edd528a060695c1f2b5ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Mon, 27 Oct 2008 18:44:46 +0900 Subject: block: add queue flag for paravirt frontend drivers As is the case with SSD devices, we do not want to idle in AS/CFQ when the block device is a paravirt front-end driver. This patch adds a flag (QUEUE_FLAG_VIRT) which should be used by front-end drivers such as virtio_blk and xen-blkfront to indicate a paravirtualized device. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 031a315c0509..482e9600f7a2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -449,6 +449,7 @@ struct request_queue #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ static inline int queue_is_locked(struct request_queue *q) { -- cgit v1.2.3 From 08bafc0341f2f7920e9045bc32c40299cac8c21b Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Tue, 25 Nov 2008 10:24:35 +0100 Subject: block: Supress Buffer I/O errors when SCSI REQ_QUIET flag set Allow the scsi request REQ_QUIET flag to be propagated to the buffer file system layer. The basic ideas is to pass the flag from the scsi request to the bio (block IO) and then to the buffer layer. The buffer layer can then suppress needless printks. This patch declutters the kernel log by removed the 40-50 (per lun) buffer io error messages seen during a boot in my multipath setup . It is a good chance any real errors will be missed in the "noise" it the logs without this patch. During boot I see blocks of messages like " __ratelimit: 211 callbacks suppressed Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242847 Buffer I/O error on device sdm, logical block 1 Buffer I/O error on device sdm, logical block 5242878 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242872 " in my logs. My disk environment is multipath fiber channel using the SCSI_DH_RDAC code and multipathd. This topology includes an "active" and "ghost" path for each lun. IO's to the "ghost" path will never complete and the SCSI layer, via the scsi device handler rdac code, quick returns the IOs to theses paths and sets the REQ_QUIET scsi flag to suppress the scsi layer messages. I am wanting to extend the QUIET behavior to include the buffer file system layer to deal with these errors as well. I have been running this patch for a while now on several boxes without issue. A few runs of bonnie++ show no noticeable difference in performance in my setup. Thanks for John Stultz for the quiet_error finalization. Submitted-by: Keith Mannthey Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + include/linux/buffer_head.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 6a642098e5c3..cf132bfbbacf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -117,6 +117,7 @@ struct bio { #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ +#define BIO_QUIET 11 /* Make BIO Quiet */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3ce64b90118c..8605f8a74df9 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -35,6 +35,7 @@ enum bh_state_bits { BH_Ordered, /* ordered write */ BH_Eopnotsupp, /* operation not supported (barrier) */ BH_Unwritten, /* Buffer is allocated on disk but not written */ + BH_Quiet, /* Buffer Error Prinks to be quiet */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities -- cgit v1.2.3 From 64d01dc9e1927e6535627d73f2336c75d1dd3fe2 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Wed, 3 Dec 2008 12:41:39 +0100 Subject: block: use cancel_work_sync() instead of kblockd_flush_work() After many improvements on kblockd_flush_work, it is now identical to cancel_work_sync, so a direct call to cancel_work_sync is suggested. The only difference is that cancel_work_sync is a GPL symbol, so no non-GPL modules anymore. Signed-off-by: Cheng Renquan Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 482e9600f7a2..e9bb73ff1d64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -978,7 +978,6 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); -void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) -- cgit v1.2.3 From ba744d5e290055d171c68067259fcc1e2721f542 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Wed, 3 Dec 2008 12:41:40 +0100 Subject: block: reorder struct bio to remove padding on 64bit Remove 8 bytes of padding from struct bio which also removes 16 bytes from struct bio_pair to make it 248 bytes. bio_pair then fits into one fewer cache lines & into a smaller slab. Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index cf132bfbbacf..3ed714eb54d9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -90,10 +90,11 @@ struct bio { unsigned int bi_comp_cpu; /* completion CPU */ + atomic_t bi_cnt; /* pin count */ + struct bio_vec *bi_io_vec; /* the actual vec list */ bio_end_io_t *bi_end_io; - atomic_t bi_cnt; /* pin count */ void *bi_private; #if defined(CONFIG_BLK_DEV_INTEGRITY) -- cgit v1.2.3 From 313e42999dbc0f234ca5909a236f78f082cb43b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:02 +0900 Subject: block: reorganize QUEUE_ORDERED_* constants Separate out ordering type (drain,) and action masks (preflush, postflush, fua) from visible ordering mode selectors (QUEUE_ORDERED_*). Ordering types are now named QUEUE_ORDERED_BY_* while action masks are named QUEUE_ORDERED_DO_*. This change is necessary to add QUEUE_ORDERED_DO_BAR and make it optional to improve empty barrier implementation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e9bb73ff1d64..5c92b4432399 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -523,22 +523,29 @@ enum { * TAG_FLUSH : ordering by tag w/ pre and post flushes * TAG_FUA : ordering by tag w/ pre flush and FUA write */ - QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = 0x01, - QUEUE_ORDERED_TAG = 0x02, - - QUEUE_ORDERED_PREFLUSH = 0x10, - QUEUE_ORDERED_POSTFLUSH = 0x20, - QUEUE_ORDERED_FUA = 0x40, - - QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, - QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, + QUEUE_ORDERED_BY_DRAIN = 0x01, + QUEUE_ORDERED_BY_TAG = 0x02, + QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_POSTFLUSH = 0x40, + QUEUE_ORDERED_DO_FUA = 0x80, + + QUEUE_ORDERED_NONE = 0x00, + + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, + + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, /* * Ordered operation sequence -- cgit v1.2.3 From f671620e7d895af221bdfeda751d54fa55ed9546 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:04 +0900 Subject: block: make every barrier action optional In all barrier sequences, the barrier write itself was always assumed to be issued and thus didn't have corresponding control flag. This patch adds QUEUE_ORDERED_DO_BAR and unify action mask handling in start_ordered() such that any barrier action can be skipped. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c92b4432399..b044267009ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -526,12 +526,14 @@ enum { QUEUE_ORDERED_BY_DRAIN = 0x01, QUEUE_ORDERED_BY_TAG = 0x02, QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_BAR = 0x20, QUEUE_ORDERED_DO_POSTFLUSH = 0x40, QUEUE_ORDERED_DO_FUA = 0x80, QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, @@ -539,7 +541,8 @@ enum { QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_FUA, - QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, -- cgit v1.2.3 From 8f11b3e99a1136fcbb67316c3260f085299c0bff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:05 +0900 Subject: block: make barrier completion more robust Barrier completion had the following assumptions. * start_ordered() couldn't finish the whole sequence properly. If all actions are to be skipped, q->ordseq is set correctly but the actual completion was never triggered thus hanging the barrier request. * Drain completion in elv_complete_request() assumed that there's always at least one request in the queue when drain completes. Both assumptions are true but these assumptions need to be removed to improve empty barrier implementation. This patch makes the following changes. * Make start_ordered() use blk_ordered_complete_seq() to mark skipped steps complete and notify __elv_next_request() that it should fetch the next request if the whole barrier has completed inside start_ordered(). * Make drain completion path in elv_complete_request() check whether the queue is empty. Empty queue also indicates drain completion. * While at it, convert 0/1 return from blk_do_ordered() to false/true. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b044267009ed..3c7078e0129d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -866,10 +866,10 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); -extern int blk_do_ordered(struct request_queue *, struct request **); +extern bool blk_do_ordered(struct request_queue *, struct request **); extern unsigned blk_ordered_cur_seq(struct request_queue *); extern unsigned blk_ordered_req_seq(struct request *); -extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); +extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -- cgit v1.2.3 From 58eea927d2de43dc6f03d1ba2c46e55854b31540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:06 +0900 Subject: block: simplify empty barrier implementation Empty barrier required special handling in __elv_next_request() to complete it without letting the low level driver see it. With previous changes, barrier code is now flexible enough to skip the BAR step using the same barrier sequence selection mechanism. Drop the special handling and mask off q->ordered from start_ordered(). Remove blk_empty_barrier() test which now has no user. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3c7078e0129d..41bbadfd17f6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -596,7 +596,6 @@ enum { #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) #define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) -#define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) /* rq->queuelist of dequeued request must be list_empty() */ #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) -- cgit v1.2.3 From 7ff9345ffac56743b5001561bc2dc1e041b79149 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Dec 2008 11:53:43 +0100 Subject: bio: only mempool back the largest bio_vec slab cache We only very rarely need the mempool backing, so it makes sense to get rid of all but one of the mempool in a bio_set. So keep the largest bio_vec count mempool so we can always honor the largest allocation, and "upgrade" callers that fail. Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 3ed714eb54d9..d76e4bf22f29 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -397,13 +397,14 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) */ #define BIO_POOL_SIZE 2 #define BIOVEC_NR_POOLS 6 +#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; #endif - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; + mempool_t *bvec_pool; }; struct biovec_slab { -- cgit v1.2.3 From 1b4344986926da324b5cd10b683e5a1a5e1b7db3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Oct 2008 20:32:58 +0200 Subject: bio: move the slab pointer inside the bio_set In preparation for adding differently sized bios. Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index d76e4bf22f29..9340098d75dc 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -400,6 +400,7 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) #define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { + struct kmem_cache *bio_slab; mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; -- cgit v1.2.3 From bb799ca0202a360fa74d5f17039b9100caebdde7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Dec 2008 15:35:05 +0100 Subject: bio: allow individual slabs in the bio_set Instead of having a global bio slab cache, add a reference to one in each bio_set that is created. This allows for personalized slabs in each bio_set, so that they can have bios of different sizes. This means we can personalize the bios we return. File systems may want to embed the bio inside another structure, to avoid allocation more items (and stuffing them in ->bi_private) after the get a bio. Or we may want to embed a number of bio_vecs directly at the end of a bio, to avoid doing two allocations to return a bio. This is now possible. Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9340098d75dc..4b80d3537f97 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -334,7 +334,7 @@ struct bio_pair { extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); -extern struct bio_set *bioset_create(int, int); +extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern struct bio *bio_alloc(gfp_t, int); @@ -379,6 +379,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); /* @@ -401,6 +402,8 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) struct bio_set { struct kmem_cache *bio_slab; + unsigned int front_pad; + mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; @@ -415,6 +418,7 @@ struct biovec_slab { }; extern struct bio_set *fs_bio_set; +extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; /* * a small number of entries is fine, not going to be performance critical. -- cgit v1.2.3 From 392ddc32982a5c661dd90dd49a3cb37f1c68b782 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Dec 2008 12:42:54 +0100 Subject: bio: add support for inlining a number of bio_vecs inside the bio When we go and allocate a bio for IO, we actually do two allocations. One for the bio itself, and one for the bi_io_vec that holds the actual pages we are interested in. This feature inlines a definable amount of io vecs inside the bio itself, so we eliminate the bio_vec array allocation for IO's up to a certain size. It defaults to 4 vecs, which is typically 16k of IO. Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b80d3537f97..18462c5b8fff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -102,6 +102,13 @@ struct bio { #endif bio_destructor_t *bi_destructor; /* destructor */ + + /* + * We can inline a number of vecs at the end of the bio, to avoid + * double allocations for a small number of bio_vecs. This member + * MUST obviously be kept at the very end of the bio. + */ + struct bio_vec bi_inline_vecs[0]; }; /* @@ -213,6 +220,11 @@ static inline void *bio_data(struct bio *bio) return NULL; } +static inline int bio_has_allocated_vec(struct bio *bio) +{ + return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs; +} + /* * will die */ -- cgit v1.2.3 From abf137dd7712132ee56d5b3143c2ff61a72a5faa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Dec 2008 08:11:22 +0100 Subject: aio: make the lookup_ioctx() lockless The mm->ioctx_list is currently protected by a reader-writer lock, so we always grab that lock on the read side for doing ioctx lookups. As the workload is extremely reader biased, turn this into an rcu hlist so we can make lookup_ioctx() lockless. Get rid of the rwlock and use a spinlock for providing update side exclusion. There's usually only 1 entry on this list, so it doesn't make sense to look into fancier data structures. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/aio.h | 5 ++++- include/linux/mm_types.h | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aio.h b/include/linux/aio.h index f6b8cf99b596..b16a957030f8 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct kioctx { /* This needs improving */ unsigned long user_id; - struct kioctx *next; + struct hlist_node list; wait_queue_head_t wait; @@ -199,6 +200,8 @@ struct kioctx { struct aio_ring_info ring_info; struct delayed_work wq; + + struct rcu_head rcu_head; }; /* prototypes */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fe825471d5aa..9cfc9b627fdd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,8 +232,9 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ /* aio bits */ - rwlock_t ioctx_list_lock; /* aio lock */ - struct kioctx *ioctx_list; + spinlock_t ioctx_lock; + struct hlist_head ioctx_list; + #ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical -- cgit v1.2.3 From b374d18a4bfce705e4a99ae9f501b53e86ecb283 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Oct 2008 10:05:07 +0100 Subject: block: get rid of elevator_t typedef Just use struct elevator_queue everywhere instead. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- include/linux/elevator.h | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 41bbadfd17f6..7035cec583b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -26,7 +26,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -typedef struct elevator_queue elevator_t; struct request_pm_state; struct blk_trace; struct request; @@ -313,7 +312,7 @@ struct request_queue */ struct list_head queue_head; struct request *last_merge; - elevator_t *elevator; + struct elevator_queue *elevator; /* * the queue request freelist, one for reads and one for writes diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 92f6f634e3e6..7a204256b155 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -28,7 +28,7 @@ typedef void (elevator_activate_req_fn) (struct request_queue *, struct request typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); typedef void *(elevator_init_fn) (struct request_queue *); -typedef void (elevator_exit_fn) (elevator_t *); +typedef void (elevator_exit_fn) (struct elevator_queue *); struct elevator_ops { @@ -62,8 +62,8 @@ struct elevator_ops struct elv_fs_entry { struct attribute attr; - ssize_t (*show)(elevator_t *, char *); - ssize_t (*store)(elevator_t *, const char *, size_t); + ssize_t (*show)(struct elevator_queue *, char *); + ssize_t (*store)(struct elevator_queue *, const char *, size_t); }; /* @@ -130,7 +130,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *); extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); -extern void elevator_exit(elevator_t *); +extern void elevator_exit(struct elevator_queue *); extern int elv_rq_merge_ok(struct request *, struct bio *); /* -- cgit v1.2.3 From a6f23657d3072bde6844055bbc2290e497f33fbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Oct 2008 12:52:42 +0200 Subject: block: add one-hit cache for disk partition lookup disk_map_sector_rcu() returns a partition from a sector offset, which we use for IO statistics on a per-partition basis. The lookup itself is an O(N) list lookup, where N is the number of partitions. This actually hurts performance quite a bit, even on the lower end partitions. On higher numbered partitions, it can get pretty bad. Solve this by adding a one-hit cache for partition lookup. This makes the lookup O(1) for the case where we do most IO to one partition. Even for mixed partition workloads, amortized cost is pretty close to O(1) since the natural IO batching makes the one-hit cache last for lots of IOs. Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3df7742ce246..16948eaecae3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -126,6 +126,7 @@ struct blk_scsi_cmd_filter { struct disk_part_tbl { struct rcu_head rcu_head; int len; + struct hd_struct *last_lookup; struct hd_struct *part[]; }; -- cgit v1.2.3 From b3a6ffe16b5cc48abe7db8d04882dc45280eb693 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Dec 2008 09:51:16 +0100 Subject: Get rid of CONFIG_LSF We have two seperate config entries for large devices/files. One is CONFIG_LBD that guards just the devices, the other is CONFIG_LSF that handles large files. This doesn't make a lot of sense, you typically want both or none. So get rid of CONFIG_LSF and change CONFIG_LBD wording to indicate that it covers both. Acked-by: Jean Delvare Signed-off-by: Jens Axboe --- include/linux/types.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 1d98330b1f2c..121f349cb7ec 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -135,19 +135,14 @@ typedef __s64 int64_t; * * Linux always considers sectors to be 512 bytes long independently * of the devices real block size. + * + * blkcnt_t is the type of the inode's block count. */ #ifdef CONFIG_LBD typedef u64 sector_t; -#else -typedef unsigned long sector_t; -#endif - -/* - * The type of the inode's block count. - */ -#ifdef CONFIG_LSF typedef u64 blkcnt_t; #else +typedef unsigned long sector_t; typedef unsigned long blkcnt_t; #endif -- cgit v1.2.3 From f453ba0460742ad027ae0c4c7d61e62817b3e7ef Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 7 Nov 2008 14:05:41 -0800 Subject: DRM: add mode setting support Add mode setting support to the DRM layer. This is a fairly big chunk of work that allows DRM drivers to provide full output control and configuration capabilities to userspace. It was motivated by several factors: - the fb layer's APIs aren't suited for anything but simple configurations - coordination between the fb layer, DRM layer, and various userspace drivers is poor to non-existent (radeonfb excepted) - user level mode setting drivers makes displaying panic & oops messages more difficult - suspend/resume of graphics state is possible in many more configurations with kernel level support This commit just adds the core DRM part of the mode setting APIs. Driver specific commits using these new structure and APIs will follow. Co-authors: Jesse Barnes , Jakob Bornecrantz Contributors: Alan Hourihane , Maarten Maathuis Signed-off-by: Jesse Barnes Signed-off-by: Eric Anholt Signed-off-by: Dave Airlie --- include/linux/console.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 248e6e3b9b73..a67a90cf8268 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,4 +153,8 @@ void vcs_remove_sysfs(struct tty_struct *tty); #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 +#ifdef CONFIG_VGA_CONSOLE +extern bool vgacon_text_force(void); +#endif + #endif /* _LINUX_CONSOLE_H */ -- cgit v1.2.3 From 773ff60e841461cb1f9374a713ffcda029b8c317 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 23 Dec 2008 19:37:01 +0900 Subject: SLUB: failslab support Currently fault-injection capability for SLAB allocator is only available to SLAB. This patch makes it available to SLUB, too. [penberg@cs.helsinki.fi: unify slab and slub implementations] Cc: Christoph Lameter Cc: Matt Mackall Signed-off-by: Akinobu Mita Signed-off-by: Pekka Enberg --- include/linux/fault-inject.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 32368c4f0326..06ca9b21dad2 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -81,4 +81,13 @@ static inline void cleanup_fault_attr_dentries(struct fault_attr *attr) #endif /* CONFIG_FAULT_INJECTION */ +#ifdef CONFIG_FAILSLAB +extern bool should_failslab(size_t size, gfp_t gfpflags); +#else +static inline bool should_failslab(size_t size, gfp_t gfpflags) +{ + return false; +} +#endif /* CONFIG_FAILSLAB */ + #endif /* _LINUX_FAULT_INJECT_H */ -- cgit v1.2.3 From dfcd3610289132a762b7dc0eaf33998262cd9e20 Mon Sep 17 00:00:00 2001 From: Pascal Terjan Date: Tue, 25 Nov 2008 15:08:19 +0100 Subject: slab: Fix comment on #endif This #endif in slab.h is described as closing the inner block while it's for the big CONFIG_NUMA one. That makes reading the code a bit harder. This trivial patch fixes the comment. Signed-off-by: Pascal Terjan Signed-off-by: Pekka Enberg --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 000da12b5cf0..9d8ca14be3c4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -285,7 +285,7 @@ extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); #define kmalloc_node_track_caller(size, flags, node) \ kmalloc_track_caller(size, flags) -#endif /* DEBUG_SLAB */ +#endif /* CONFIG_NUMA */ /* * Shortcuts -- cgit v1.2.3 From d61c72e52b98411d1cfef1fdb3f5a8bb070f8966 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 10 Dec 2008 14:07:21 +0100 Subject: DMI: add dmi_match Add a wrapper for testing system_info which will handle also NULL system infos. This will be used by the ata PIIX driver. Signed-off-by: Jiri Slaby Cc: Alexandru Romanescu Cc: Tejun Heo Cc: Alan Cox Signed-off-by: Jeff Garzik --- include/linux/dmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 2bfda178f274..34161907b2f8 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -47,6 +47,7 @@ extern int dmi_name_in_vendors(const char *str); extern int dmi_name_in_serial(const char *str); extern int dmi_available; extern int dmi_walk(void (*decode)(const struct dmi_header *)); +extern bool dmi_match(enum dmi_field f, const char *str); #else @@ -61,6 +62,8 @@ static inline int dmi_name_in_serial(const char *s) { return 0; } #define dmi_available 0 static inline int dmi_walk(void (*decode)(const struct dmi_header *)) { return -1; } +static inline bool dmi_match(enum dmi_field f, const char *str) + { return false; } #endif -- cgit v1.2.3 From 2a2ca6a96194c4744a2adeefbc09ce881f3c5abe Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:31 +0100 Subject: ide: replace the global ide_lock spinlock by per-hwgroup spinlocks (v2) Now that (almost) all host drivers have been fixed not to abuse ide_lock and core code usage of ide_lock has been sanitized we may safely replace ide_lock by per-hwgroup locks. This patch is partially based on earlier patch from Ravikiran G Thirumalai. While at it: - don't use deprecated HWIF() and HWGROUP() macros - update locking documentation in ide.h v2: Add missing spin_lock_init(&hwgroup->lock). (Noticed by Elias Oltmanns) Cc: Vaibhav V. Nivargi Cc: Alok N. Kataria Cc: Shai Fultheim Signed-off-by: Ravikiran Thirumalai Cc: Elias Oltmanns Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 010fb26a1579..c871d325cedb 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -909,6 +909,8 @@ typedef struct hwgroup_s { int req_gen; int req_gen_timer; + + spinlock_t lock; } ide_hwgroup_t; typedef struct ide_driver_s ide_driver_t; @@ -1610,13 +1612,13 @@ extern struct mutex ide_cfg_mtx; /* * Structure locking: * - * ide_cfg_mtx and ide_lock together protect changes to - * ide_hwif_t->{next,hwgroup} + * ide_cfg_mtx and hwgroup->lock together protect changes to + * ide_hwif_t->next * ide_drive_t->next * - * ide_hwgroup_t->busy: ide_lock - * ide_hwgroup_t->hwif: ide_lock - * ide_hwif_t->mate: constant, no locking + * ide_hwgroup_t->busy: hwgroup->lock + * ide_hwgroup_t->hwif: hwgroup->lock + * ide_hwif_t->{hwgroup,mate}: constant, no locking * ide_drive_t->hwif: constant, no locking */ -- cgit v1.2.3 From 27c01c2db05c3cf8824975e50403cd4fd9356dca Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:32 +0100 Subject: ide-cd: remove obsolete seek optimization It doesn't make much sense nowadays and is problematic on some drives. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index c871d325cedb..0b8af0535d85 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -122,8 +122,6 @@ struct ide_io_ports { #define MAX_DRIVES 2 /* per interface; 2 assumed by lots of code */ #define SECTOR_SIZE 512 -#define IDE_LARGE_SEEK(b1,b2,t) (((b1) > (b2) + (t)) || ((b2) > (b1) + (t))) - /* * Timeouts for various operations: */ @@ -496,8 +494,6 @@ enum { * when more than one interrupt is needed. */ IDE_AFLAG_LIMIT_NFRAMES = (1 << 7), - /* Seeking in progress. */ - IDE_AFLAG_SEEKING = (1 << 8), /* Saved TOC information is current. */ IDE_AFLAG_TOC_VALID = (1 << 9), /* We think that the drive door is locked. */ -- cgit v1.2.3 From 6b5cde3629701258004b94cde75dd1089b556b02 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:32 +0100 Subject: cmd64x: set IDE_HFLAG_SERIALIZE explictly for CMD646 * Set IDE_HFLAG_SERIALIZE explictly for CMD646. * Remove no longer needed ide_cmd646 chipset type (which has a nice side-effect of fixing handling of unexpected IRQs). Cc: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 0b8af0535d85..150e42311ee0 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -171,7 +171,7 @@ enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, ide_rz1000, ide_trm290, - ide_cmd646, ide_cy82c693, ide_4drives, + ide_cy82c693, ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; -- cgit v1.2.3 From f58c1ab8deebc2360cef998f169a6727c288210f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:33 +0100 Subject: ide: always set nIEN on idle devices * Set nIEN for previous port/device in ide_do_request() also if port uses a non-shared IRQ. * Remove no longer needed ide_hwif_t.sharing_irq. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 150e42311ee0..1d28006aec68 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -842,7 +842,6 @@ typedef struct hwif_s { unsigned present : 1; /* this interface exists */ unsigned serialized : 1; /* serialized all channel operation */ - unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ unsigned sg_mapped : 1; /* sg_table and sg_nents are ready */ struct device gendev; -- cgit v1.2.3 From 7f1ac8c4b9dadc55ec656b368f5f470f2cbe3083 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:33 +0100 Subject: rz1000: apply chipset quirks early (v2) * Use pci_name(dev) instead of hwif->name in init_hwif_rz1000(). * init_hwif_rz1000() -> rz1000_init_chipset(). Update rz1000_init_one() to use rz1000_init_chipset() and add now required rz1000_remove(). * Remove superfluous ide_rz1000 chipset type. v2: * unsigned int rz1000_init_chipset() -> int rz1000_disable_readahead() per Sergei's suggestion. Cc: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 1d28006aec68..fc1a966c7b7d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -170,8 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *); enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, - ide_rz1000, ide_trm290, - ide_cy82c693, ide_4drives, + ide_trm290, ide_cy82c693, ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; -- cgit v1.2.3 From 6b4924962c49655494d2c8e9d3faab0e349a3062 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:34 +0100 Subject: ide: add ->max_sectors field to struct ide_port_info * Add ->max_sectors field to struct ide_port_info to allow host drivers to specify value used for hwif->rqsize (if smaller than the default). * Convert pdc202xx_old to use ->max_sectors and remove no longer needed IDE_HFLAG_RQSIZE_256 flag. There should be no functional changes caused by this patch. Acked-by: Sergei Shtyltov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index fc1a966c7b7d..2574dda4a3e7 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1372,8 +1372,6 @@ enum { IDE_HFLAG_LEGACY_IRQS = (1 << 21), /* force use of legacy IRQs */ IDE_HFLAG_FORCE_LEGACY_IRQS = (1 << 22), - /* limit LBA48 requests to 256 sectors */ - IDE_HFLAG_RQSIZE_256 = (1 << 23), /* use 32-bit I/O ops */ IDE_HFLAG_IO_32BIT = (1 << 24), /* unmask IRQs */ @@ -1411,6 +1409,9 @@ struct ide_port_info { ide_pci_enablebit_t enablebits[2]; hwif_chipset_t chipset; + + u16 max_sectors; /* if < than the default one */ + u32 host_flags; u8 pio_mask; u8 swdma_mask; -- cgit v1.2.3 From 1f66019bdf902cb59adf959e462bcd3f4c01f683 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:34 +0100 Subject: trm290: add IDE_HFLAG_TRM290 host flag * Add IDE_HFLAG_TRM290 host flag and use it in ide_build_dmatable(). * Remove no longer needed ide_trm290 chipset type. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 2574dda4a3e7..f62d35a5fb71 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -170,7 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *); enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, - ide_trm290, ide_cy82c693, ide_4drives, + ide_cy82c693, ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; @@ -1372,6 +1372,8 @@ enum { IDE_HFLAG_LEGACY_IRQS = (1 << 21), /* force use of legacy IRQs */ IDE_HFLAG_FORCE_LEGACY_IRQS = (1 << 22), + /* host is TRM290 */ + IDE_HFLAG_TRM290 = (1 << 23), /* use 32-bit I/O ops */ IDE_HFLAG_IO_32BIT = (1 << 24), /* unmask IRQs */ -- cgit v1.2.3 From b7876a6fb6e9bf6cbcf7b40cf034aa4138d7978f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:34 +0100 Subject: cy82c693: remove superfluous ide_cy82c693 chipset type Since CY82C693 doesn't require serialization we may as well use the default ide_pci chipset type. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index f62d35a5fb71..f89d6d69a386 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -170,8 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *); enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, - ide_cy82c693, ide_4drives, - ide_pmac, ide_acorn, + ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; -- cgit v1.2.3 From 702c026be87ef8374ae58122969a4b0b081ce6f2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:36 +0100 Subject: ide: rework handling of serialized ports (v2) * hpt366: set IDE_HFLAG_SERIALIZE in ->host_flags if needed in init_hwif_hpt366(). Remove HPT_SERIALIZE_IO while at it. * Set IDE_HFLAG_SERIALIZE in ->host_flags if needed in ide_init_port(). * Convert init_irq() to use IDE_HFLAG_SERIALIZE together with hwif->host to find out ports which need to be serialized. * Remove no longer needed save_match() and ide_hwif_t.serialized. v2: * Set host's ->host_flags field instead of port's copy. This patch should fix the incorrect grouping of port(s) from host(s) that need serialization with port(s) that happen to use the same IRQ(s) but are from the host(s) that don't need it. Cc: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index f89d6d69a386..9b89cab6d493 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -839,7 +839,6 @@ typedef struct hwif_s { unsigned extra_ports; /* number of extra dma ports */ unsigned present : 1; /* this interface exists */ - unsigned serialized : 1; /* serialized all channel operation */ unsigned sg_mapped : 1; /* sg_table and sg_nents are ready */ struct device gendev; -- cgit v1.2.3 From e2984c628c924442132304ae662da433f41c05c9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:37 +0100 Subject: ide: move Power Management support to ide-pm.c There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 9b89cab6d493..e99c56de7f56 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1116,6 +1116,14 @@ enum { IDE_PM_COMPLETED, }; +int generic_ide_suspend(struct device *, pm_message_t); +int generic_ide_resume(struct device *); + +void ide_complete_power_step(ide_drive_t *, struct request *); +ide_startstop_t ide_start_power_step(ide_drive_t *, struct request *); +void ide_complete_pm_request(ide_drive_t *, struct request *); +void ide_check_pm_state(ide_drive_t *, struct request *); + /* * Subdrivers support. * -- cgit v1.2.3 From 69acdf1e5a9146ec6667f6c4b439acd38c18f5ea Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Tue, 4 Nov 2008 21:59:37 -0300 Subject: V4L/DVB (9530): Add new pixel format VYUY 16 bits wide. There were already 3 YUV formats defined : - YUYV - YVYU - UYVY The only left combination is VYUY, which is added in this patch. Signed-off-by: Robert Jarzmik Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 4669d7e72e75..ec311d4616cd 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -293,6 +293,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_YVU420 v4l2_fourcc('Y', 'V', '1', '2') /* 12 YVU 4:2:0 */ #define V4L2_PIX_FMT_YUYV v4l2_fourcc('Y', 'U', 'Y', 'V') /* 16 YUV 4:2:2 */ #define V4L2_PIX_FMT_UYVY v4l2_fourcc('U', 'Y', 'V', 'Y') /* 16 YUV 4:2:2 */ +#define V4L2_PIX_FMT_VYUY v4l2_fourcc('V', 'Y', 'U', 'Y') /* 16 YUV 4:2:2 */ #define V4L2_PIX_FMT_YUV422P v4l2_fourcc('4', '2', '2', 'P') /* 16 YVU422 planar */ #define V4L2_PIX_FMT_YUV411P v4l2_fourcc('4', '1', '1', 'P') /* 16 YVU411 planar */ #define V4L2_PIX_FMT_Y41P v4l2_fourcc('Y', '4', '1', 'P') /* 12 YUV 4:1:1 */ -- cgit v1.2.3 From 278d1ed65e25d80af7c3a112d707b3f70516ddb4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:12 +1030 Subject: cpumask: make CONFIG_NR_CPUS always valid. Impact: cleanup Currently we have NR_CPUS, which is 1 on UP, and CONFIG_NR_CPUS on SMP. If we make CONFIG_NR_CPUS always valid (and always 1 on !SMP), we can skip the middleman. This also allows us to find and check all the unaudited NR_CPUS usage as we prepare for v. large NR_CPUS. To avoid breaking every arch, we cheat and do this for the moment in the header if the arch doesn't. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- include/linux/threads.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/threads.h b/include/linux/threads.h index 38d1a5d6568e..052b12bec8bd 100644 --- a/include/linux/threads.h +++ b/include/linux/threads.h @@ -8,17 +8,17 @@ */ /* - * Maximum supported processors that can run under SMP. This value is - * set via configure setting. The maximum is equal to the size of the - * bitmasks used on that platform, i.e. 32 or 64. Setting this smaller - * saves quite a bit of memory. + * Maximum supported processors. Setting this smaller saves quite a + * bit of memory. Use nr_cpu_ids instead of this except for static bitmaps. */ -#ifdef CONFIG_SMP -#define NR_CPUS CONFIG_NR_CPUS -#else -#define NR_CPUS 1 +#ifndef CONFIG_NR_CPUS +/* FIXME: This should be fixed in the arch's Kconfig */ +#define CONFIG_NR_CPUS 1 #endif +/* Places which use this should consider cpumask_var_t. */ +#define NR_CPUS CONFIG_NR_CPUS + #define MIN_THREADS_LEFT_FOR_ROOT 4 /* -- cgit v1.2.3 From 4b0bc0bca83f3fb7cf920e2ec80684c15d2269c0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:13 +1030 Subject: bitmap: test for constant as well as small size for inline versions Impact: reduce text size bitmap_zero et al have a fastpath for nbits <= BITS_PER_LONG, but this should really only apply where the nbits is known at compile time. This only saves about 1200 bytes on an allyesconfig kernel, but with cpumasks going variable that number will increase. text data bss dec hex filename 35327852 5035607 6782976 47146435 2cf65c3 vmlinux-before 35326640 5035607 6782976 47145223 2cf6107 vmlinux-after Signed-off-by: Rusty Russell --- include/linux/bitmap.h | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index a08c33a26ca9..2878811c6134 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -137,9 +137,12 @@ extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ ) +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) + static inline void bitmap_zero(unsigned long *dst, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = 0UL; else { int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -150,7 +153,7 @@ static inline void bitmap_zero(unsigned long *dst, int nbits) static inline void bitmap_fill(unsigned long *dst, int nbits) { size_t nlongs = BITS_TO_LONGS(nbits); - if (nlongs > 1) { + if (!small_const_nbits(nbits)) { int len = (nlongs - 1) * sizeof(unsigned long); memset(dst, 0xff, len); } @@ -160,7 +163,7 @@ static inline void bitmap_fill(unsigned long *dst, int nbits) static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = *src; else { int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -171,7 +174,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, static inline void bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = *src1 & *src2; else __bitmap_and(dst, src1, src2, nbits); @@ -180,7 +183,7 @@ static inline void bitmap_and(unsigned long *dst, const unsigned long *src1, static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = *src1 | *src2; else __bitmap_or(dst, src1, src2, nbits); @@ -189,7 +192,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; else __bitmap_xor(dst, src1, src2, nbits); @@ -198,7 +201,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, static inline void bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = *src1 & ~(*src2); else __bitmap_andnot(dst, src1, src2, nbits); @@ -207,7 +210,7 @@ static inline void bitmap_andnot(unsigned long *dst, const unsigned long *src1, static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); else __bitmap_complement(dst, src, nbits); @@ -216,7 +219,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr static inline int bitmap_equal(const unsigned long *src1, const unsigned long *src2, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_equal(src1, src2, nbits); @@ -225,7 +228,7 @@ static inline int bitmap_equal(const unsigned long *src1, static inline int bitmap_intersects(const unsigned long *src1, const unsigned long *src2, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; else return __bitmap_intersects(src1, src2, nbits); @@ -234,7 +237,7 @@ static inline int bitmap_intersects(const unsigned long *src1, static inline int bitmap_subset(const unsigned long *src1, const unsigned long *src2, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_subset(src1, src2, nbits); @@ -242,7 +245,7 @@ static inline int bitmap_subset(const unsigned long *src1, static inline int bitmap_empty(const unsigned long *src, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_empty(src, nbits); @@ -250,7 +253,7 @@ static inline int bitmap_empty(const unsigned long *src, int nbits) static inline int bitmap_full(const unsigned long *src, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_full(src, nbits); @@ -258,7 +261,7 @@ static inline int bitmap_full(const unsigned long *src, int nbits) static inline int bitmap_weight(const unsigned long *src, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight(src, nbits); } @@ -266,7 +269,7 @@ static inline int bitmap_weight(const unsigned long *src, int nbits) static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, int n, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = *src >> n; else __bitmap_shift_right(dst, src, n, nbits); @@ -275,7 +278,7 @@ static inline void bitmap_shift_right(unsigned long *dst, static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, int n, int nbits) { - if (nbits <= BITS_PER_LONG) + if (small_const_nbits(nbits)) *dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits); else __bitmap_shift_left(dst, src, n, nbits); -- cgit v1.2.3 From cb78a0ce69fad2026825f957e24e2d9cda1ec9f1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:14 +1030 Subject: bitmap: fix seq_bitmap and seq_cpumask to take const pointer Impact: cleanup seq_bitmap just calls bitmap_scnprintf on the bits: that arg can be const. Similarly, seq_cpumask just calls seq_bitmap. Signed-off-by: Rusty Russell --- include/linux/seq_file.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index b3dfa72f13b9..952e0187ba16 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -50,8 +50,9 @@ int seq_path(struct seq_file *, struct path *, char *); int seq_dentry(struct seq_file *, struct dentry *, char *); int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *esc); -int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits); -static inline int seq_cpumask(struct seq_file *m, cpumask_t *mask) +int seq_bitmap(struct seq_file *m, const unsigned long *bits, + unsigned int nr_bits); +static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask) { return seq_bitmap(m, mask->bits, NR_CPUS); } -- cgit v1.2.3 From b3199c025d1646e25e7d1d640dd605db251dccf8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:14 +1030 Subject: cpumask: switch over to cpu_online/possible/active/present_mask: core Impact: cleanup This implements the obsolescent cpu_online_map in terms of cpu_online_mask, rather than the other way around. Same for the other maps. The documentation comments are also updated to refer to _mask rather than _map. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- include/linux/cpumask.h | 75 +++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b5ad19a6f43f..db2341beca45 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -416,65 +416,54 @@ int __next_cpu_nr(int n, const cpumask_t *srcp); /* * The following particular system cpumasks and operations manage - * possible, present, active and online cpus. Each of them is a fixed size - * bitmap of size NR_CPUS. + * possible, present, active and online cpus. * - * #ifdef CONFIG_HOTPLUG_CPU - * cpu_possible_map - has bit 'cpu' set iff cpu is populatable - * cpu_present_map - has bit 'cpu' set iff cpu is populated - * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler - * cpu_active_map - has bit 'cpu' set iff cpu available to migration - * #else - * cpu_possible_map - has bit 'cpu' set iff cpu is populated - * cpu_present_map - copy of cpu_possible_map - * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler - * #endif + * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable + * cpu_present_mask - has bit 'cpu' set iff cpu is populated + * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler + * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * - * In either case, NR_CPUS is fixed at compile time, as the static - * size of these bitmaps. The cpu_possible_map is fixed at boot - * time, as the set of CPU id's that it is possible might ever - * be plugged in at anytime during the life of that system boot. - * The cpu_present_map is dynamic(*), representing which CPUs - * are currently plugged in. And cpu_online_map is the dynamic - * subset of cpu_present_map, indicating those CPUs available - * for scheduling. + * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * - * If HOTPLUG is enabled, then cpu_possible_map is forced to have + * The cpu_possible_mask is fixed at boot time, as the set of CPU id's + * that it is possible might ever be plugged in at anytime during the + * life of that system boot. The cpu_present_mask is dynamic(*), + * representing which CPUs are currently plugged in. And + * cpu_online_mask is the dynamic subset of cpu_present_mask, + * indicating those CPUs available for scheduling. + * + * If HOTPLUG is enabled, then cpu_possible_mask is forced to have * all NR_CPUS bits set, otherwise it is just the set of CPUs that * ACPI reports present at boot. * - * If HOTPLUG is enabled, then cpu_present_map varies dynamically, + * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise - * cpu_present_map is just a copy of cpu_possible_map. + * cpu_present_mask is just a copy of cpu_possible_mask. * - * (*) Well, cpu_present_map is dynamic in the hotplug case. If not - * hotplug, it's a copy of cpu_possible_map, hence fixed at boot. + * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not + * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP - * cpu_{online,possible,present}_maps are placebos. Changing them + * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. - * 2) Most SMP arch's #define some of these maps to be some - * other map specific to that arch. Therefore, the following - * must be #define macros, not inlines. To see why, examine - * the assembly code produced by the following. Note that - * set1() writes phys_x_map, but set2() writes x_map: - * int x_map, phys_x_map; - * #define set1(a) x_map = a - * inline void set2(int a) { x_map = a; } - * #define x_map phys_x_map - * main(){ set1(3); set2(5); } */ -extern cpumask_t cpu_possible_map; -extern cpumask_t cpu_online_map; -extern cpumask_t cpu_present_map; -extern cpumask_t cpu_active_map; +extern const struct cpumask *const cpu_possible_mask; +extern const struct cpumask *const cpu_online_mask; +extern const struct cpumask *const cpu_present_mask; +extern const struct cpumask *const cpu_active_mask; + +/* These strip const, as traditionally they weren't const. */ +#define cpu_possible_map (*(cpumask_t *)cpu_possible_mask) +#define cpu_online_map (*(cpumask_t *)cpu_online_mask) +#define cpu_present_map (*(cpumask_t *)cpu_present_mask) +#define cpu_active_map (*(cpumask_t *)cpu_active_mask) #if NR_CPUS > 1 #define num_online_cpus() cpus_weight_nr(cpu_online_map) @@ -1058,12 +1047,6 @@ static inline void free_bootmem_cpumask_var(cpumask_var_t mask) } #endif /* CONFIG_CPUMASK_OFFSTACK */ -/* The pointer versions of the maps, these will become the primary versions. */ -#define cpu_possible_mask ((const struct cpumask *)&cpu_possible_map) -#define cpu_online_mask ((const struct cpumask *)&cpu_online_map) -#define cpu_present_mask ((const struct cpumask *)&cpu_present_map) -#define cpu_active_mask ((const struct cpumask *)&cpu_active_map) - /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); -- cgit v1.2.3 From ae7a47e72e1a0b5e2b46d1596bc2c22942a73023 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:15 +1030 Subject: cpumask: make cpumask.h eat its own dogfood. Changes: 1) cpumask_t to struct cpumask, 2) cpus_weight_nr to cpumask_weight, 3) cpu_isset to cpumask_test_cpu, 4) ->bits to cpumask_bits() 5) cpu_*_map to cpu_*_mask. 6) for_each_cpu_mask_nr to for_each_cpu Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 75 +++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index db2341beca45..e62a67156c53 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -268,6 +268,25 @@ static inline void __cpus_shift_left(cpumask_t *dstp, bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } +/** + * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * + * @bitmap: the bitmap + * + * There are a few places where cpumask_var_t isn't appropriate and + * static cpumasks must be used (eg. very early boot), yet we don't + * expose the definition of 'struct cpumask'. + * + * This does the conversion, and can be used as a constant initializer. + */ +#define to_cpumask(bitmap) \ + ((struct cpumask *)(1 ? (bitmap) \ + : (void *)sizeof(__check_is_bitmap(bitmap)))) + +static inline int __check_is_bitmap(const unsigned long *bitmap) +{ + return 1; +} + /* * Special-case data structure for "single bit set only" constant CPU masks. * @@ -278,11 +297,11 @@ static inline void __cpus_shift_left(cpumask_t *dstp, extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; -static inline const cpumask_t *get_cpu_mask(unsigned int cpu) +static inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; - return (const cpumask_t *)p; + return to_cpumask(p); } /* @@ -466,13 +485,13 @@ extern const struct cpumask *const cpu_active_mask; #define cpu_active_map (*(cpumask_t *)cpu_active_mask) #if NR_CPUS > 1 -#define num_online_cpus() cpus_weight_nr(cpu_online_map) -#define num_possible_cpus() cpus_weight_nr(cpu_possible_map) -#define num_present_cpus() cpus_weight_nr(cpu_present_map) -#define cpu_online(cpu) cpu_isset((cpu), cpu_online_map) -#define cpu_possible(cpu) cpu_isset((cpu), cpu_possible_map) -#define cpu_present(cpu) cpu_isset((cpu), cpu_present_map) -#define cpu_active(cpu) cpu_isset((cpu), cpu_active_map) +#define num_online_cpus() cpumask_weight(cpu_online_mask) +#define num_possible_cpus() cpumask_weight(cpu_possible_mask) +#define num_present_cpus() cpumask_weight(cpu_present_mask) +#define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) +#define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) +#define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) +#define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask) #else #define num_online_cpus() 1 #define num_possible_cpus() 1 @@ -485,10 +504,6 @@ extern const struct cpumask *const cpu_active_mask; #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) -#define for_each_possible_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_possible_map) -#define for_each_online_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_online_map) -#define for_each_present_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_present_map) - /* These are the new versions of the cpumask operators: passed by pointer. * The older versions will be implemented in terms of these, then deleted. */ #define cpumask_bits(maskp) ((maskp)->bits) @@ -676,7 +691,7 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) * No static inline type checking - see Subtlety (1) above. */ #define cpumask_test_cpu(cpu, cpumask) \ - test_bit(cpumask_check(cpu), (cpumask)->bits) + test_bit(cpumask_check(cpu), cpumask_bits((cpumask))) /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask @@ -919,7 +934,7 @@ static inline void cpumask_copy(struct cpumask *dstp, static inline int cpumask_scnprintf(char *buf, int len, const struct cpumask *srcp) { - return bitmap_scnprintf(buf, len, srcp->bits, nr_cpumask_bits); + return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpumask_bits); } /** @@ -933,7 +948,7 @@ static inline int cpumask_scnprintf(char *buf, int len, static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { - return bitmap_parse_user(buf, len, dstp->bits, nr_cpumask_bits); + return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** @@ -948,7 +963,8 @@ static inline int cpumask_parse_user(const char __user *buf, int len, static inline int cpulist_scnprintf(char *buf, int len, const struct cpumask *srcp) { - return bitmap_scnlistprintf(buf, len, srcp->bits, nr_cpumask_bits); + return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp), + nr_cpumask_bits); } /** @@ -961,26 +977,7 @@ static inline int cpulist_scnprintf(char *buf, int len, */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { - return bitmap_parselist(buf, dstp->bits, nr_cpumask_bits); -} - -/** - * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * - * @bitmap: the bitmap - * - * There are a few places where cpumask_var_t isn't appropriate and - * static cpumasks must be used (eg. very early boot), yet we don't - * expose the definition of 'struct cpumask'. - * - * This does the conversion, and can be used as a constant initializer. - */ -#define to_cpumask(bitmap) \ - ((struct cpumask *)(1 ? (bitmap) \ - : (void *)sizeof(__check_is_bitmap(bitmap)))) - -static inline int __check_is_bitmap(const unsigned long *bitmap) -{ - return 1; + return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** @@ -1055,6 +1052,10 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) +#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) +#define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) +#define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) + /* Wrappers for arch boot code to manipulate normally-constant masks */ static inline void set_cpu_possible(unsigned int cpu, bool possible) { -- cgit v1.2.3 From 3fa41520696fec2815e2d88fbcccdda77ba4d693 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: make set_cpu_*/init_cpu_* out-of-line They're only for use in boot/cpu hotplug code anyway, and this avoids the use of deprecated cpu_*_map. Stephen Rothwell points out that gcc 4.2.4 (on powerpc at least) didn't like the cast away of const anyway: include/linux/cpumask.h: In function 'set_cpu_possible': include/linux/cpumask.h:1052: warning: passing argument 2 of 'cpumask_set_cpu' discards qualifiers from pointer target type So this kills two birds with one stone. Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 53 +++++++------------------------------------------ 1 file changed, 7 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e62a67156c53..7c178a6baae3 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1057,50 +1057,11 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ -static inline void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, &cpu_possible_map); - else - cpumask_clear_cpu(cpu, &cpu_possible_map); -} - -static inline void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, &cpu_present_map); - else - cpumask_clear_cpu(cpu, &cpu_present_map); -} - -static inline void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) - cpumask_set_cpu(cpu, &cpu_online_map); - else - cpumask_clear_cpu(cpu, &cpu_online_map); -} - -static inline void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, &cpu_active_map); - else - cpumask_clear_cpu(cpu, &cpu_active_map); -} - -static inline void init_cpu_present(const struct cpumask *src) -{ - cpumask_copy(&cpu_present_map, src); -} - -static inline void init_cpu_possible(const struct cpumask *src) -{ - cpumask_copy(&cpu_possible_map, src); -} - -static inline void init_cpu_online(const struct cpumask *src) -{ - cpumask_copy(&cpu_online_map, src); -} +void set_cpu_possible(unsigned int cpu, bool possible); +void set_cpu_present(unsigned int cpu, bool present); +void set_cpu_online(unsigned int cpu, bool online); +void set_cpu_active(unsigned int cpu, bool active); +void init_cpu_present(const struct cpumask *src); +void init_cpu_possible(const struct cpumask *src); +void init_cpu_online(const struct cpumask *src); #endif /* __LINUX_CPUMASK_H */ -- cgit v1.2.3 From 54b11e6d57a10aa9d0009efd93873e17bffd5d30 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: smp_call_function_many() Impact: Implementation change to remove cpumask_t from stack. Actually change smp_call_function_mask() to smp_call_function_many(). We avoid cpumasks on the stack in this version. (S390 has its own version, but that's going away apparently). We have to do some dancing to figure out if 0 or 1 other cpus are in the mask supplied and the online mask without allocating a tmp cpumask. It's still fairly cheap. We allocate the cpumask at the end of the call_function_data structure: if allocation fails we fallback to smp_call_function_single rather than using the baroque quiescing code (which needs a cpumask on stack). (Thanks to Hiroshi Shimamoto for spotting several bugs in previous versions!) Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Hiroshi Shimamoto Cc: npiggin@suse.de Cc: axboe@kernel.dk --- include/linux/smp.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 2f85f3b04bc4..b82466968101 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -67,15 +67,16 @@ extern void smp_cpus_done(unsigned int max_cpus); * Call a function on all other processors */ int smp_call_function(void(*func)(void *info), void *info, int wait); -/* Deprecated: use smp_call_function_many() which uses a cpumask ptr. */ -int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, - int wait); +void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *info), void *info, bool wait); -static inline void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *info), void *info, - int wait) +/* Deprecated: Use smp_call_function_many which takes a pointer to the mask. */ +static inline int +smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, + int wait) { - smp_call_function_mask(*mask, func, info, wait); + smp_call_function_many(&mask, func, info, wait); + return 0; } int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, -- cgit v1.2.3 From e12f0102ac81d660c9f801d0a0e10ccf4537a9de Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:19 +1030 Subject: cpumask: Use nr_cpu_ids in seq_cpumask Impact: cleanup, futureproof nr_cpu_ids is the (badly named) runtime limit on possible CPU numbers; ie. the variable version of NR_CPUS. With the new cpumask operators, only bits less than this are defined. So we should use it everywhere, rather than NR_CPUS. Eventually this will make it possible to allocate cpumasks of the minimal length at runtime. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar --- include/linux/seq_file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 952e0187ba16..40ea5058c2ec 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -54,7 +54,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits, unsigned int nr_bits); static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask) { - return seq_bitmap(m, mask->bits, NR_CPUS); + return seq_bitmap(m, mask->bits, nr_cpu_ids); } static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask) -- cgit v1.2.3 From 480daab42c4dd74b3c07031ddf9031251c530c77 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:56 -0600 Subject: virtio: Don't use PAGE_SIZE in virtio_pci.c The virtio PCI devices don't depend on the guest page size. This matters now PowerPC virtio is gaining ground (they like 64k pages). Signed-off-by: Rusty Russell --- include/linux/virtio_pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h index cdef35742932..e13d7ebcf576 100644 --- a/include/linux/virtio_pci.h +++ b/include/linux/virtio_pci.h @@ -53,4 +53,8 @@ /* Virtio ABI version, this must match exactly */ #define VIRTIO_PCI_ABI_VERSION 0 + +/* How many bits to shift physical queue address written to QUEUE_PFN. + * 12 is historical, and due to x86 page size. */ +#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12 #endif -- cgit v1.2.3 From 5f0d1d7f2286c8a02dab69f5f0bd51681fab161e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:57 -0600 Subject: virtio: rename 'pagesize' arg to vring_init/vring_size It's really the alignment desired for consumer/producer separation; historically this x86 pagesize, but with PowerPC it'll still be x86 pagesize. And in theory lguest could choose a different value. Signed-off-by: Rusty Russell --- include/linux/virtio_ring.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index c4a598fb3826..01bf3124e312 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -83,7 +83,7 @@ struct vring { * __u16 avail_idx; * __u16 available[num]; * - * // Padding to the next page boundary. + * // Padding to the next align boundary. * char pad[]; * * // A ring of used descriptor heads with free-running index. @@ -93,19 +93,19 @@ struct vring { * }; */ static inline void vring_init(struct vring *vr, unsigned int num, void *p, - unsigned long pagesize) + unsigned long align) { vr->num = num; vr->desc = p; vr->avail = p + num*sizeof(struct vring_desc); - vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + pagesize-1) - & ~(pagesize - 1)); + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + align-1) + & ~(align - 1)); } -static inline unsigned vring_size(unsigned int num, unsigned long pagesize) +static inline unsigned vring_size(unsigned int num, unsigned long align) { return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) - + pagesize - 1) & ~(pagesize - 1)) + + align - 1) & ~(align - 1)) + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num; } -- cgit v1.2.3 From 498af14783935af487d17dbee4ac451783cbc2b7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:57 -0600 Subject: virtio: Don't use PAGE_SIZE for vring alignment in virtio_pci. That doesn't work for non-4k guests which are now appearing. Signed-off-by: Rusty Russell --- include/linux/virtio_pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h index e13d7ebcf576..cd0fd5d181a6 100644 --- a/include/linux/virtio_pci.h +++ b/include/linux/virtio_pci.h @@ -57,4 +57,8 @@ /* How many bits to shift physical queue address written to QUEUE_PFN. * 12 is historical, and due to x86 page size. */ #define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12 + +/* The alignment to use between consumer and producer parts of vring. + * x86 pagesize again. */ +#define VIRTIO_PCI_VRING_ALIGN 4096 #endif -- cgit v1.2.3 From 2966af73e70dee461c256b5eb877b2ff757f8c82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:58 -0600 Subject: virtio: use LGUEST_VRING_ALIGN instead of relying on pagesize This doesn't really matter, since lguest is i386 only at the moment, but we could actually choose a different value. (lguest doesn't have a guarenteed ABI). Signed-off-by: Rusty Russell --- include/linux/lguest_launcher.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index e7217dc58f39..bd0eba760522 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -59,4 +59,8 @@ enum lguest_req LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ }; + +/* The alignment to use between consumer and producer parts of vring. + * x86 pagesize for historical reasons. */ +#define LGUEST_VRING_ALIGN 4096 #endif /* _LINUX_LGUEST_LAUNCHER */ -- cgit v1.2.3 From 87c7d57c17ade5024d95b6ca0da249da49b0672a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:26:03 -0600 Subject: virtio: hand virtio ring alignment as argument to vring_new_virtqueue This allows each virtio user to hand in the alignment appropriate to their virtio_ring structures. Signed-off-by: Rusty Russell Acked-by: Christian Borntraeger --- include/linux/virtio_ring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 01bf3124e312..71e03722fb59 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -115,6 +115,7 @@ struct virtio_device; struct virtqueue; struct virtqueue *vring_new_virtqueue(unsigned int num, + unsigned int vring_align, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *vq), -- cgit v1.2.3 From 1b4aa2faeca1b9922033daf2475b6fc13b0ffea6 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Thu, 13 Nov 2008 15:48:33 -0600 Subject: virtio: avoid implicit use of Linux page size in balloon interface Make the balloon interface always use 4K pages, and convert Linux pfns if necessary. This patch assumes that Linux's PAGE_SHIFT will never be less than 12. Signed-off-by: Hollis Blanchard Signed-off-by: Rusty Russell (modified) --- include/linux/virtio_balloon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h index c30c7bfbf39b..8726ff77763e 100644 --- a/include/linux/virtio_balloon.h +++ b/include/linux/virtio_balloon.h @@ -10,6 +10,9 @@ /* The feature bitmap for virtio balloon */ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ +/* Size of a PFN in the balloon interface. */ +#define VIRTIO_BALLOON_PFN_SHIFT 12 + struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ -- cgit v1.2.3 From c29834584ea4eafccf2f62a0b8a32e64f792044c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Nov 2008 13:36:26 +0100 Subject: virtio_console: support console resizing this patch uses the new hvc callback hvc_resize to set the window size which allows to change the tty size of hvc_console via a hvc_resize function. I have added a new feature bit VIRTIO_CONSOLE_F_SIZE. The driver will change the window size on tty open and via the config_changed callback of the transport. Currently lguest and kvm_s390 have not implemented this callback, but the callback can be implemented at a later point in time. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- include/linux/virtio_console.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h index 19a0da0dba41..7615ffcdd555 100644 --- a/include/linux/virtio_console.h +++ b/include/linux/virtio_console.h @@ -7,6 +7,17 @@ /* The ID for virtio console */ #define VIRTIO_ID_CONSOLE 3 +/* Feature bits */ +#define VIRTIO_CONSOLE_F_SIZE 0 /* Does host provide console size? */ + +struct virtio_console_config { + /* colums of the screens */ + __u16 cols; + /* rows of the screens */ + __u16 rows; +} __attribute__((packed)); + + #ifdef __KERNEL__ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)); #endif /* __KERNEL__ */ -- cgit v1.2.3 From 58a24566449892dda409b9ad92c2e56c76c5670c Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Mon, 29 Sep 2008 01:40:07 -0300 Subject: lguest: move the initial guest page table creation code to the host This patch moves the initial guest page table creation code to the host, so the launcher keeps working with PAE enabled configs. Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- include/linux/lguest_launcher.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index bd0eba760522..a53407a4165c 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -54,7 +54,7 @@ struct lguest_vqconfig { /* Write command first word is a request. */ enum lguest_req { - LHREQ_INITIALIZE, /* + base, pfnlimit, pgdir, start */ + LHREQ_INITIALIZE, /* + base, pfnlimit, start */ LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ -- cgit v1.2.3 From 0877258d98154565def498833ccb208234c85084 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 14 Dec 2008 16:21:16 -0300 Subject: V4L/DVB (9897): v4l2: Add camera zoom controls The zoom controls move the zoom lens group to a an absolute position, as a relative displacement or at a given speed until reaching physical device limits. Positive values move the zoom lens group towards the telephoto direction, negative values towards the wide-angle direction. Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index ec311d4616cd..e450a92a622b 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1118,6 +1118,10 @@ enum v4l2_exposure_auto_type { #define V4L2_CID_FOCUS_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+11) #define V4L2_CID_FOCUS_AUTO (V4L2_CID_CAMERA_CLASS_BASE+12) +#define V4L2_CID_ZOOM_ABSOLUTE (V4L2_CID_CAMERA_CLASS_BASE+13) +#define V4L2_CID_ZOOM_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+14) +#define V4L2_CID_ZOOM_CONTINUOUS (V4L2_CID_CAMERA_CLASS_BASE+15) + /* * T U N I N G */ -- cgit v1.2.3 From 046425f8c4ac431db00c09a6d9fba16560b8e5b9 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 14 Dec 2008 16:22:05 -0300 Subject: V4L/DVB (9898): v4l2: Add privacy control The privacy control prevents video from being acquired by the camera. A true value indicates that no image can be captured. Devices that implement the privacy control must support read access and may support write access. Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index e450a92a622b..6e6743bd0f92 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1122,6 +1122,8 @@ enum v4l2_exposure_auto_type { #define V4L2_CID_ZOOM_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+14) #define V4L2_CID_ZOOM_CONTINUOUS (V4L2_CID_CAMERA_CLASS_BASE+15) +#define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16) + /* * T U N I N G */ -- cgit v1.2.3 From 92f45badbbaccdbc1be25085292a1e258948e221 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sun, 21 Dec 2008 10:35:25 -0300 Subject: V4L/DVB (9932): v4l2-compat32: fix 32-64 compatibility module Added all missing v4l1/2 ioctls and fix several broken conversions. Partially based on work done by Cody Pisto . Tested-by: Brandon Jenkins Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 6e6743bd0f92..e2cfd6add78c 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1465,6 +1465,8 @@ struct v4l2_chip_ident { #define VIDIOC_G_CHIP_IDENT _IOWR('V', 81, struct v4l2_chip_ident) #endif #define VIDIOC_S_HW_FREQ_SEEK _IOW('V', 82, struct v4l2_hw_freq_seek) +/* Reminder: when adding new ioctls please add support for them to + drivers/media/video/v4l2-compat-ioctl32.c as well! */ #ifdef __OLD_VIDIOC_ /* for compatibility, will go away some day */ -- cgit v1.2.3 From 4b00eb25340c1a9b9eedaf0bc5b0f0d18eddb028 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 19 Dec 2008 11:17:56 -0300 Subject: V4L/DVB (9944): videodev2.h: fix typo. The comment said CX2584X instead of CX2341X. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index e2cfd6add78c..754c8d9685a4 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1051,7 +1051,7 @@ enum v4l2_mpeg_video_bitrate_mode { #define V4L2_CID_MPEG_VIDEO_MUTE (V4L2_CID_MPEG_BASE+210) #define V4L2_CID_MPEG_VIDEO_MUTE_YUV (V4L2_CID_MPEG_BASE+211) -/* MPEG-class control IDs specific to the CX2584x driver as defined by V4L2 */ +/* MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */ #define V4L2_CID_MPEG_CX2341X_BASE (V4L2_CTRL_CLASS_MPEG | 0x1000) #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE (V4L2_CID_MPEG_CX2341X_BASE+0) enum v4l2_mpeg_cx2341x_video_spatial_filter_mode { -- cgit v1.2.3 From 531c98e71805b32e9ea35a218119100bbd2b7615 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 22 Dec 2008 13:18:27 -0300 Subject: V4L/DVB (9953): em28xx: Add suport for debugging AC97 anciliary chips The em28xx driver can be coupled to an anciliary AC97 chip. This patch allows read/write AC97 registers directly. Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 754c8d9685a4..e31144d22237 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1376,6 +1376,7 @@ struct v4l2_streamparm { #define V4L2_CHIP_MATCH_HOST 0 /* Match against chip ID on host (0 for the host) */ #define V4L2_CHIP_MATCH_I2C_DRIVER 1 /* Match against I2C driver ID */ #define V4L2_CHIP_MATCH_I2C_ADDR 2 /* Match against I2C 7-bit address */ +#define V4L2_CHIP_MATCH_AC97 3 /* Match against anciliary AC97 chip */ struct v4l2_register { __u32 match_type; /* Match type */ -- cgit v1.2.3 From 91962fa713bd8bf47434b02ac661fdc201365fa5 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 18 Dec 2008 11:45:00 -0300 Subject: V4L/DVB (10078): video: add NV16 and NV61 pixel formats This patch adds support for NV16 and NV61 pixel formats. These pixel formats use two planes; one for 8-bit Y values and one for interleaved 8-bit U and V values. NV16/NV61 formats are very similar to NV12/NV21 with the exception that NV16/NV61 are using the same number of lines for both planes. The difference between NV16 and NV61 is the U and V byte order. The fourcc values are extrapolated from the NV12/NV21 case. Signed-off-by: Magnus Damm Signed-off-by: Guennadi Liakhovetski Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index e31144d22237..1f126e30766c 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -305,6 +305,8 @@ struct v4l2_pix_format { /* two planes -- one Y, one Cr + Cb interleaved */ #define V4L2_PIX_FMT_NV12 v4l2_fourcc('N', 'V', '1', '2') /* 12 Y/CbCr 4:2:0 */ #define V4L2_PIX_FMT_NV21 v4l2_fourcc('N', 'V', '2', '1') /* 12 Y/CrCb 4:2:0 */ +#define V4L2_PIX_FMT_NV16 v4l2_fourcc('N', 'V', '1', '6') /* 16 Y/CbCr 4:2:2 */ +#define V4L2_PIX_FMT_NV61 v4l2_fourcc('N', 'V', '6', '1') /* 16 Y/CrCb 4:2:2 */ /* The following formats are not defined in the V4L2 specification */ #define V4L2_PIX_FMT_YUV410 v4l2_fourcc('Y', 'U', 'V', '9') /* 9 YUV 4:1:0 */ -- cgit v1.2.3 From ab53d472e785e51fdfc08fc1d66252c1153e6c0f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:19 +1030 Subject: bitmap: find_last_bit() Impact: New API As the name suggests. For the moment everyone uses the generic one. Signed-off-by: Rusty Russell --- include/linux/bitops.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 024f2b027244..61829139795a 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -134,9 +134,20 @@ extern unsigned long find_first_bit(const unsigned long *addr, */ extern unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size); - #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ +#ifdef CONFIG_GENERIC_FIND_LAST_BIT +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit number of the first set bit, or size. + */ +extern unsigned long find_last_bit(const unsigned long *addr, + unsigned long size); +#endif /* CONFIG_GENERIC_FIND_LAST_BIT */ + #ifdef CONFIG_GENERIC_FIND_NEXT_BIT /** -- cgit v1.2.3 From 6b954823c24f04ed026a8517f6bab5abda279db8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:25 +1030 Subject: cpumask: convert kernel time functions Impact: Use new APIs Convert kernel/time functions to use struct cpumask *. Note the ugly bitmap declarations in tick-broadcast.c. These should be cpumask_var_t, but there was no obvious initialization function to put the alloc_cpumask_var() calls in. This was safe. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- include/linux/tick.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index b6ec8189ac0c..469b82d88b3b 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -84,10 +84,10 @@ static inline void tick_cancel_sched_timer(int cpu) { } # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern struct tick_device *tick_get_broadcast_device(void); -extern cpumask_t *tick_get_broadcast_mask(void); +extern struct cpumask *tick_get_broadcast_mask(void); # ifdef CONFIG_TICK_ONESHOT -extern cpumask_t *tick_get_broadcast_oneshot_mask(void); +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); # endif # endif /* BROADCAST */ -- cgit v1.2.3 From d036e67b40f52bdd95392390108defbac7e53837 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert kernel/irq Impact: Reduce stack usage, use new cpumask API. ALPHA mod! Main change is that irq_default_affinity becomes a cpumask_var_t, so treat it as a pointer (this effects alpha). Signed-off-by: Rusty Russell --- include/linux/interrupt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index dfaee6bd265b..91f1ef8e5810 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -109,7 +109,7 @@ extern void enable_irq(unsigned int irq); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -extern cpumask_t irq_default_affinity; +extern cpumask_var_t irq_default_affinity; extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); -- cgit v1.2.3 From bd232f97b30f6bb630efa136a777647545db3039 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert RCU implementations Impact: use new cpumask API. rcu_ctrlblk contains a cpumask, and it's highly optimized so I don't want a cpumask_var_t (ie. a pointer) for the CONFIG_CPUMASK_OFFSTACK case. It could use a dangling bitmap, and be allocated in __rcu_init to save memory, but for the moment we use a bitmap. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). We remove on-stack cpumasks, using cpumask_var_t for rcu_torture_shuffle_tasks() and for_each_cpu_and in force_quiescent_state(). Signed-off-by: Rusty Russell --- include/linux/rcuclassic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 301dda829e37..f3f697df1d71 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -59,8 +59,8 @@ struct rcu_ctrlblk { int signaled; spinlock_t lock ____cacheline_internodealigned_in_smp; - cpumask_t cpumask; /* CPUs that need to switch in order */ - /* for current batch to proceed. */ + DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */ + /* current batch to proceed. */ } ____cacheline_internodealigned_in_smp; /* Is batch a before batch b ? */ -- cgit v1.2.3 From 41c7bb9588904eb060a95bcad47bd3804a1ece25 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:28 +1030 Subject: cpumask: convert rest of files in kernel/ Impact: Reduce stack usage, use new cpumask API. Mainly changing cpumask_t to 'struct cpumask' and similar simple API conversion. Two conversions worth mentioning: 1) we use cpumask_any_but to avoid a temporary in kernel/softlockup.c, 2) Use cpumask_var_t in taskstats_user_cmd(). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Balbir Singh Cc: Ingo Molnar --- include/linux/stop_machine.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index faf1519b5adc..74d59a641362 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -23,7 +23,7 @@ * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ -int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); +int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); /** * __stop_machine: freeze the machine on all CPUs and run this function @@ -34,11 +34,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); * Description: This is a special version of the above, which assumes cpus * won't come or go while it's being called. Used by hotplug cpu. */ -int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); +int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); #else static inline int stop_machine(int (*fn)(void *), void *data, - const cpumask_t *cpus) + const struct cpumask *cpus) { int ret; local_irq_disable(); -- cgit v1.2.3 From 8c384cdee3e04d6194a2c2b192b624754f990835 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:30 +1030 Subject: cpumask: CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS Impact: new debug CONFIG options This helps find unconverted code. It currently breaks compile horribly, but we never wanted a flag day so that's expected. Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 7c178a6baae3..9f315382610b 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -144,6 +144,7 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; extern cpumask_t _unused_cpumask_arg_; +#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst)) static inline void __cpu_set(int cpu, volatile cpumask_t *dstp) { @@ -267,6 +268,7 @@ static inline void __cpus_shift_left(cpumask_t *dstp, { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } +#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */ /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * @@ -304,6 +306,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) return to_cpumask(p); } +#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS /* * In cases where we take the address of the cpumask immediately, * gcc optimizes it out (it's a constant) and there's no huge stack @@ -389,19 +392,22 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp, { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } +#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */ #if NR_CPUS == 1 #define nr_cpu_ids 1 +#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS #define first_cpu(src) ({ (void)(src); 0; }) #define next_cpu(n, src) ({ (void)(src); 1; }) #define any_online_cpu(mask) 0 #define for_each_cpu_mask(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) - +#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */ #else /* NR_CPUS > 1 */ extern int nr_cpu_ids; +#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS int __first_cpu(const cpumask_t *srcp); int __next_cpu(int n, const cpumask_t *srcp); int __any_online_cpu(const cpumask_t *mask); @@ -413,8 +419,10 @@ int __any_online_cpu(const cpumask_t *mask); for ((cpu) = -1; \ (cpu) = next_cpu((cpu), (mask)), \ (cpu) < NR_CPUS; ) +#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */ #endif +#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS #if NR_CPUS <= 64 #define next_cpu_nr(n, src) next_cpu(n, src) @@ -432,6 +440,7 @@ int __next_cpu_nr(int n, const cpumask_t *srcp); (cpu) < nr_cpu_ids; ) #endif /* NR_CPUS > 64 */ +#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */ /* * The following particular system cpumasks and operations manage -- cgit v1.2.3