From 814abfabef3ceed390c10d06a0cc69a86454b6cf Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:27:07 -0700 Subject: xdp: add bpf_redirect helper function This adds support for a bpf_redirect helper function to the XDP infrastructure. For now this only supports redirecting to the egress path of a port. In order to support drivers handling a xdp_buff natively this patches uses a new ndo operation ndo_xdp_xmit() that takes pushes a xdp_buff to the specified device. If the program specifies either (a) an unknown device or (b) a device that does not support the operation a BPF warning is thrown and the XDP_ABORTED error code is returned. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e99e3e6f8b37..4dbb7a3f4677 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -717,6 +717,7 @@ enum xdp_action { XDP_DROP, XDP_PASS, XDP_TX, + XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook -- cgit v1.2.3 From 546ac1ffb70d25b56c1126940e5ec639c4dd7413 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:28:56 -0700 Subject: bpf: add devmap, a map for storing net device references Device map (devmap) is a BPF map, primarily useful for networking applications, that uses a key to lookup a reference to a netdevice. The map provides a clean way for BPF programs to build virtual port to physical port maps. Additionally, it provides a scoping function for the redirect action itself allowing multiple optimizations. Future patches will leverage the map to provide batching at the XDP layer. Another optimization/feature, that is not yet implemented, would be to support multiple netdevices per key to support efficient multicast and broadcast support. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4dbb7a3f4677..ecbb0e7e15bc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -104,6 +104,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, }; enum bpf_prog_type { -- cgit v1.2.3 From 97f91a7cf04ff605845c20948b8a80e54cbd3376 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:18 -0700 Subject: bpf: add bpf_redirect_map helper routine BPF programs can use the devmap with a bpf_redirect_map() helper routine to forward packets to netdevice in map. Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ecbb0e7e15bc..1106a8c4cd36 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,11 @@ union bpf_attr { * @flags: bit 0 - if set, redirect to ingress instead of egress * other bits - reserved * Return: TC_ACT_REDIRECT + * int bpf_redirect_map(key, map, flags) + * redirect to endpoint in map + * @key: index in map to lookup + * @map: fd of map to do lookup in + * @flags: -- * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -592,7 +597,8 @@ union bpf_attr { FN(get_socket_uid), \ FN(set_hash), \ FN(setsockopt), \ - FN(skb_adjust_room), + FN(skb_adjust_room), \ + FN(redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From fc60a8b675bd9499c71716d21c238eed5092ddfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20F=C3=A4rber?= Date: Sun, 9 Jul 2017 22:29:42 +0200 Subject: tty: serial: owl: Implement console driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement serial console driver to complement earlycon. Based on LeMaker linux-actions tree. Signed-off-by: Andreas Färber Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index c34a2a3eeff5..38bea3217ead 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -70,6 +70,7 @@ #define PORT_CLPS711X 33 #define PORT_SA1100 34 #define PORT_UART00 35 +#define PORT_OWL 36 #define PORT_21285 37 /* Sparc type numbers. */ -- cgit v1.2.3 From eb0baf8a0d9259d168523b8e7c436b55ade7c546 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 18 Jul 2017 20:13:09 +0800 Subject: perf/core: Define the common branch type classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is often useful to know the branch types while analyzing branch data. For example, a call is very different from a conditional branch. Currently we have to look it up in binary while the binary may later not be available and even the binary is available but user has to take some time. It is very useful for user to check it directly in perf report. Perf already has support for disassembling the branch instruction to get the x86 branch type. To keep consistent on kernel and userspace and make the classification more common, the patch adds the common branch type classification in perf_event.h. The patch only defines a minimum but most common set of branch types. PERF_BR_UNKNOWN : unknown PERF_BR_COND :conditional PERF_BR_UNCOND : unconditional PERF_BR_IND : indirect PERF_BR_CALL : function call PERF_BR_IND_CALL : indirect function call PERF_BR_RET : function return PERF_BR_SYSCALL : syscall PERF_BR_SYSRET : syscall return PERF_BR_COND_CALL : conditional function call PERF_BR_COND_RET : conditional function return The patch also adds a new field type (4 bits) in perf_branch_entry to record the branch type. Since the disassembling of branch instruction needs some overhead, a new PERF_SAMPLE_BRANCH_TYPE_SAVE is introduced to indicate if it needs to disassemble the branch instruction and record the branch type. Change log: v10: Not changed. v9: Not changed. v8: Change PERF_BR_NONE to PERF_BR_UNKNOWN. No other change. v7: Just keep the most common branch types. Others are removed. v6: Not changed. v5: Not changed. The v5 patch series just change the userspace. v4: Comparing to previous version, the major changes are: 1. Remove the PERF_BR_JCC_FWD/PERF_BR_JCC_BWD, they will be computed later in userspace. 2. Remove the "cross" field in perf_branch_entry. The cross page computing will be done later in userspace. Signed-off-by: Yao Jin Acked-by: Jiri Olsa Acked-by: Michael Ellerman Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Andi Kleen Cc: Kan Liang Link: http://lkml.kernel.org/r/1500379995-6449-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b1c0b187acfe..642db5fa3286 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -198,9 +200,30 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1015,6 +1038,7 @@ union perf_mem_data_src { * in_tx: running in a hardware transaction * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) + * type: branch type */ struct perf_branch_entry { __u64 from; @@ -1024,7 +1048,8 @@ struct perf_branch_entry { in_tx:1, /* in transaction */ abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ - reserved:44; + type:4, /* branch type */ + reserved:40; }; #endif /* _UAPI_LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From 727f8914477e4642c7d1ff381667cdc4178b40c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Jul 2017 10:39:26 +0100 Subject: rxrpc: Expose UAPI definitions to userspace Move UAPI definitions from the internal header and place them in a UAPI header file so that userspace can make use of them. Signed-off-by: David Howells --- include/uapi/linux/rxrpc.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 include/uapi/linux/rxrpc.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h new file mode 100644 index 000000000000..08e2fb9c70ae --- /dev/null +++ b/include/uapi/linux/rxrpc.h @@ -0,0 +1,80 @@ +/* Types and definitions for AF_RXRPC. + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_RXRPC_H +#define _UAPI_LINUX_RXRPC_H + +#include +#include +#include + +/* + * RxRPC socket address + */ +struct sockaddr_rxrpc { + sa_family_t srx_family; /* address family */ + u16 srx_service; /* service desired */ + u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ + u16 transport_len; /* length of transport address */ + union { + sa_family_t family; /* transport address family */ + struct sockaddr_in sin; /* IPv4 transport address */ + struct sockaddr_in6 sin6; /* IPv6 transport address */ + } transport; +}; + +/* + * RxRPC socket options + */ +#define RXRPC_SECURITY_KEY 1 /* [clnt] set client security key */ +#define RXRPC_SECURITY_KEYRING 2 /* [srvr] set ring of server security keys */ +#define RXRPC_EXCLUSIVE_CONNECTION 3 /* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */ +#define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ +#define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ +#define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ + +/* + * RxRPC control messages + * - If neither abort or accept are specified, the message is a data message. + * - terminal messages mean that a user call ID tag can be recycled + * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() + */ +enum rxrpc_cmsg_type { + RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ + RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ + RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ + RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ + RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ + RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ + RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ + RXRPC_ACCEPT = 9, /* s-: [Service] accept request */ + RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ + RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ + RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ + RXRPC__SUPPORTED +}; + +/* + * RxRPC security levels + */ +#define RXRPC_SECURITY_PLAIN 0 /* plain secure-checksummed packets only */ +#define RXRPC_SECURITY_AUTH 1 /* authenticated packets */ +#define RXRPC_SECURITY_ENCRYPT 2 /* encrypted packets */ + +/* + * RxRPC security indices + */ +#define RXRPC_SECURITY_NONE 0 /* no security protocol */ +#define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ +#define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ +#define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ + +#endif /* _UAPI_LINUX_RXRPC_H */ -- cgit v1.2.3 From ddc6c70f07bb1f6dd39a2c6c430f7b4fa95199c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Jul 2017 10:07:10 +0100 Subject: rxrpc: Move the packet.h include file into net/rxrpc/ Move the protocol description header file into net/rxrpc/ and rename it to protocol.h. It's no longer necessary to expose it as packets are no longer exposed to kernel services (such as AFS) that use the facility. The abort codes are transferred to the UAPI header instead as we pass these back to userspace and also to kernel services. Signed-off-by: David Howells --- include/uapi/linux/rxrpc.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 08e2fb9c70ae..9656aad8f8f7 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -77,4 +77,48 @@ enum rxrpc_cmsg_type { #define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ #define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ +/* + * RxRPC-level abort codes + */ +#define RX_CALL_DEAD -1 /* call/conn has been inactive and is shut down */ +#define RX_INVALID_OPERATION -2 /* invalid operation requested / attempted */ +#define RX_CALL_TIMEOUT -3 /* call timeout exceeded */ +#define RX_EOF -4 /* unexpected end of data on read op */ +#define RX_PROTOCOL_ERROR -5 /* low-level protocol error */ +#define RX_USER_ABORT -6 /* generic user abort */ +#define RX_ADDRINUSE -7 /* UDP port in use */ +#define RX_DEBUGI_BADTYPE -8 /* bad debugging packet type */ + +/* + * (un)marshalling abort codes (rxgen) + */ +#define RXGEN_CC_MARSHAL -450 +#define RXGEN_CC_UNMARSHAL -451 +#define RXGEN_SS_MARSHAL -452 +#define RXGEN_SS_UNMARSHAL -453 +#define RXGEN_DECODE -454 +#define RXGEN_OPCODE -455 +#define RXGEN_SS_XDRFREE -456 +#define RXGEN_CC_XDRFREE -457 + +/* + * Rx kerberos security abort codes + * - unfortunately we have no generalised security abort codes to say things + * like "unsupported security", so we have to use these instead and hope the + * other side understands + */ +#define RXKADINCONSISTENCY 19270400 /* security module structure inconsistent */ +#define RXKADPACKETSHORT 19270401 /* packet too short for security challenge */ +#define RXKADLEVELFAIL 19270402 /* security level negotiation failed */ +#define RXKADTICKETLEN 19270403 /* ticket length too short or too long */ +#define RXKADOUTOFSEQUENCE 19270404 /* packet had bad sequence number */ +#define RXKADNOAUTH 19270405 /* caller not authorised */ +#define RXKADBADKEY 19270406 /* illegal key: bad parity or weak */ +#define RXKADBADTICKET 19270407 /* security object was passed a bad ticket */ +#define RXKADUNKNOWNKEY 19270408 /* ticket contained unknown key version number */ +#define RXKADEXPIRED 19270409 /* authentication expired */ +#define RXKADSEALEDINCON 19270410 /* sealed data inconsistent */ +#define RXKADDATALEN 19270411 /* user data too long */ +#define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ + #endif /* _UAPI_LINUX_RXRPC_H */ -- cgit v1.2.3 From 784b4e612d42a2b7578d7fab2ed78940e10536bc Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 19 Jul 2017 16:32:23 +0200 Subject: netfilter: nf_tables: Attach process info to NFT_MSG_NEWGEN notifications This is helpful for 'nft monitor' to track which process caused a given change to the ruleset. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 683f6f88fcac..6f0a950e21c3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1221,6 +1221,8 @@ enum nft_objref_attributes { enum nft_gen_attributes { NFTA_GEN_UNSPEC, NFTA_GEN_ID, + NFTA_GEN_PROC_PID, + NFTA_GEN_PROC_NAME, __NFTA_GEN_MAX }; #define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) -- cgit v1.2.3 From ca1136c99b66b1566781ff12ecddc635d570f932 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:53 -0700 Subject: blktrace: export cgroup info in trace Currently blktrace isn't cgroup aware. blktrace prints out task name of current context, but the task of current context isn't always in the cgroup where the BIO comes from. We can't use task name to find out IO cgroup. For example, Writeback BIOs always comes from flusher thread but the BIOs are for different blk cgroups. Request could be requeued and dispatched from completely different tasks. MD/DM are another examples. This patch tries to fix the gap. We print out cgroup fhandle info in blktrace. Userspace can use open_by_handle_at() syscall to find the cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to find fhandle for a cgroup and use a BPF program to filter out blktrace for a specific cgroup. We add a new 'blk_cgroup' trace option for blk tracer. It's default off. Application which doesn't know the new option isn't affected. When it's on, we output fhandle info right after blk_io_trace with an extra bit set in event action. So from application point of view, blktrace with the option will output new actions. I didn't change blk trace event yet, since I'm not sure if changing the trace event output is an ABI issue. If not, I'll do it later. Acked-by: Steven Rostedt (VMware) Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index c590ca6bfbd9..9cdaedeadb84 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -52,6 +52,7 @@ enum blktrace_act { __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ __BLK_TA_DRV_DATA, /* driver-specific binary data */ + __BLK_TA_CGROUP = 1 << 8, /* from a cgroup*/ }; /* @@ -61,6 +62,7 @@ enum blktrace_notify { __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ __BLK_TN_TIMESTAMP, /* include system clock */ __BLK_TN_MESSAGE, /* Character string message */ + __BLK_TN_CGROUP = __BLK_TA_CGROUP, /* from a cgroup */ }; @@ -107,6 +109,7 @@ struct blk_io_trace { __u32 cpu; /* on what cpu did it happen */ __u16 error; /* completion error */ __u16 pdu_len; /* length of data after this trace */ + /* cgroup id will be stored here if exists */ }; /* -- cgit v1.2.3 From 1a5f3da20bd966220931239fbd31e6ac6ff42251 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Ravipati Date: Thu, 27 Jul 2017 16:47:26 -0700 Subject: net: ethtool: add support for forward error correction modes Forward Error Correction (FEC) modes i.e Base-R and Reed-Solomon modes are introduced in 25G/40G/100G standards for providing good BER at high speeds. Various networking devices which support 25G/40G/100G provides ability to manage supported FEC modes and the lack of FEC encoding control and reporting today is a source for interoperability issues for many vendors. FEC capability as well as specific FEC mode i.e. Base-R or RS modes can be requested or advertised through bits D44:47 of base link codeword. This patch set intends to provide option under ethtool to manage and report FEC encoding settings for networking devices as per IEEE 802.3 bj, bm and by specs. set-fec/show-fec option(s) are designed to provide control and report the FEC encoding on the link. SET FEC option: root@tor: ethtool --set-fec swp1 encoding [off | RS | BaseR | auto] Encoding: Types of encoding Off : Turning off any encoding RS : enforcing RS-FEC encoding on supported speeds BaseR : enforcing Base R encoding on supported speeds Auto : IEEE defaults for the speed/medium combination Here are a few examples of what we would expect if encoding=auto: - if autoneg is on, we are expecting FEC to be negotiated as on or off as long as protocol supports it - if the hardware is capable of detecting the FEC encoding on it's receiver it will reconfigure its encoder to match - in absence of the above, the configuration would be set to IEEE defaults. >From our understanding , this is essentially what most hardware/driver combinations are doing today in the absence of a way for users to control the behavior. SHOW FEC option: root@tor: ethtool --show-fec swp1 FEC parameters for swp1: Active FEC encodings: RS Configured FEC encodings: RS | BaseR ETHTOOL DEVNAME output modification: ethtool devname output: root@tor:~# ethtool swp1 Settings for swp1: root@hpe-7712-03:~# ethtool swp18 Settings for swp18: Supported ports: [ FIBRE ] Supported link modes: 40000baseCR4/Full 40000baseSR4/Full 40000baseLR4/Full 100000baseSR4/Full 100000baseCR4/Full 100000baseLR4_ER4/Full Supported pause frame use: No Supports auto-negotiation: Yes Supported FEC modes: [RS | BaseR | None | Not reported] Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: [RS | BaseR | None | Not reported] <<<< One or more FEC modes Speed: 100000Mb/s Duplex: Full Port: FIBRE PHYAD: 106 Transceiver: internal Auto-negotiation: off Link detected: yes This patch includes following changes a) New ETHTOOL_SFECPARAM/SFECPARAM API, handled by the new get_fecparam/set_fecparam callbacks, provides support for configuration of forward error correction modes. b) Link mode bits for FEC modes i.e. None (No FEC mode), RS, BaseR/FC are defined so that users can configure these fec modes for supported and advertising fields as part of link autonegotiation. Signed-off-by: Vidya Sagar Ravipati Signed-off-by: Dustin Byford Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 48 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7d4a594d5d58..9c041dae8e2c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1238,6 +1238,47 @@ struct ethtool_per_queue_op { char data[]; }; +/** + * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters + * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM + * @active_fec: FEC mode which is active on porte + * @fec: Bitmask of supported/configured FEC modes + * @rsvd: Reserved for future extensions. i.e FEC bypass feature. + * + * Drivers should reject a non-zero setting of @autoneg when + * autoneogotiation is disabled (or not supported) for the link. + * + */ +struct ethtool_fecparam { + __u32 cmd; + /* bitmask of FEC modes */ + __u32 active_fec; + __u32 fec; + __u32 reserved; +}; + +/** + * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration + * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported + * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver + * @ETHTOOL_FEC_OFF: No FEC Mode + * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Detection mode + * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Detection mode + */ +enum ethtool_fec_config_bits { + ETHTOOL_FEC_NONE_BIT, + ETHTOOL_FEC_AUTO_BIT, + ETHTOOL_FEC_OFF_BIT, + ETHTOOL_FEC_RS_BIT, + ETHTOOL_FEC_BASER_BIT, +}; + +#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) +#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) +#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) +#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) +#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) + /* CMDs currently supported */ #define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. * Please use ETHTOOL_GLINKSETTINGS @@ -1330,6 +1371,8 @@ struct ethtool_per_queue_op { #define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ #define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ #define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET @@ -1387,6 +1430,9 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* @@ -1395,7 +1441,7 @@ enum ethtool_link_mode_bit_indices { */ __ETHTOOL_LINK_MODE_LAST - = ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + = ETHTOOL_LINK_MODE_FEC_BASER_BIT, }; #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ -- cgit v1.2.3 From 64c83d837329531252a1a0f0dfdd4fd607e1d8e9 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:49 -0400 Subject: net netlink: Add new type NLA_BITFIELD32 Generic bitflags attribute content sent to the kernel by user. With this netlink attr type the user can either set or unset a flag in the kernel. The value is a bitmap that defines the bit values being set The selector is a bitmask that defines which value bit is to be considered. A check is made to ensure the rules that a kernel subsystem always conforms to bitflags the kernel already knows about. i.e if the user tries to set a bit flag that is not understood then the _it will be rejected_. In the most basic form, the user specifies the attribute policy as: [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, where myvalidflags is the bit mask of the flags the kernel understands. If the user _does not_ provide myvalidflags then the attribute will also be rejected. Examples: value = 0x0, and selector = 0x1 implies we are selecting bit 1 and we want to set its value to 0. value = 0x2, and selector = 0x2 implies we are selecting bit 2 and we want to set its value to 1. Suggested-by: Jiri Pirko Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/netlink.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f86127a46cfc..f4fc9c9e123d 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -226,5 +226,22 @@ struct nlattr { #define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) #define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) +/* Generic 32 bitflags attribute content sent to the kernel. + * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. + * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; #endif /* _UAPI__LINUX_NETLINK_H */ -- cgit v1.2.3 From 90825b23a887f06f6c05bdde77b200c5fe9b6217 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:51 -0400 Subject: net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch When you dump hundreds of thousands of actions, getting only 32 per dump batch even when the socket buffer and memory allocations allow is inefficient. With this change, the user will get as many as possibly fitting within the given constraints available to the kernel. The top level action TLV space is extended. An attribute TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON is set by the user indicating the user is capable of processing these large dumps. Older user space which doesnt set this flag doesnt get the large (than 32) batches. The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many actions are put in a single batch. As such user space app knows how long to iterate (independent of the type of action being dumped) instead of hardcoded maximum of 32 thus maintaining backward compat. Some results dumping 1.5M actions below: first an unpatched tc which doesnt understand these features... prompt$ time -p tc actions ls action gact | grep index | wc -l 1500000 real 1388.43 user 2.07 sys 1386.79 Now lets see a patched tc which sets the correct flags when requesting a dump: prompt$ time -p updatedtc actions ls action gact | grep index | wc -l 1500000 real 178.13 user 2.02 sys 176.96 That is about 8x performance improvement for tc app which sets its receive buffer to about 32K. Signed-off-by: Jamal Hadi Salim Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d148505010a7..bfa80a6164d9 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -683,10 +683,28 @@ struct tcamsg { unsigned char tca__pad1; unsigned short tca__pad2; }; + +enum { + TCA_ROOT_UNSPEC, + TCA_ROOT_TAB, +#define TCA_ACT_TAB TCA_ROOT_TAB +#define TCAA_MAX TCA_ROOT_TAB + TCA_ROOT_FLAGS, + TCA_ROOT_COUNT, + __TCA_ROOT_MAX, +#define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) +}; + #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) -#define TCA_ACT_TAB 1 /* attr type must be >=1 */ -#define TCAA_MAX 1 +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS + * + * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO + * actions in a dump. All dump responses will contain the number of actions + * being dumped stored in for user app's consumption in TCA_ROOT_COUNT + * + */ +#define TCA_FLAG_LARGE_DUMP_ON (1 << 0) /* New extended info filters for IFLA_EXT_MASK */ #define RTEXT_FILTER_VF (1 << 0) -- cgit v1.2.3 From e62e484df04964ac947c679ef4f00c54ae5395aa Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:52 -0400 Subject: net sched actions: add time filter for action dumping This patch adds support for filtering based on time since last used. When we are dumping a large number of actions it is useful to have the option of filtering based on when the action was last used to reduce the amount of data crossing to user space. With this patch the user space app sets the TCA_ROOT_TIME_DELTA attribute with the value in milliseconds with "time of interest since now". The kernel converts this to jiffies and does the filtering comparison matching entries that have seen activity since then and returns them to user space. Old kernels and old tc continue to work in legacy mode since they dont specify this attribute. Some example (we have 400 actions bound to 400 filters); at installation time. Using updated when tc setting the time of interest to 120 seconds earlier (we see 400 actions): prompt$ hackedtc actions ls action gact since 120000| grep index | wc -l 400 go get some coffee and wait for > 120 seconds and try again: prompt$ hackedtc actions ls action gact since 120000 | grep index | wc -l 0 Lets see a filter bound to one of these actions: .... filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 2 success 1) match 7f000002/ffffffff at 12 (success 1 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1145 sec used 802 sec Action statistics: Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 .... that coffee took long, no? It was good. Now lets ping -c 1 127.0.0.2, then run the actions again: prompt$ hackedtc actions ls action gact since 120 | grep index | wc -l 1 More details please: prompt$ hackedtc -s actions ls action gact since 120000 action order 0: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1270 sec used 30 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 And the filter? filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 4 success 2) match 7f000002/ffffffff at 12 (success 2 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1324 sec used 84 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 Signed-off-by: Jamal Hadi Salim Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index bfa80a6164d9..dab7dad9e01a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -691,6 +691,7 @@ enum { #define TCAA_MAX TCA_ROOT_TAB TCA_ROOT_FLAGS, TCA_ROOT_COUNT, + TCA_ROOT_TIME_DELTA, /* in msecs */ __TCA_ROOT_MAX, #define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) }; -- cgit v1.2.3 From e46abbcc05aa8a16b0e7f5c94e86d11af9aa2770 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:41 +0200 Subject: netfilter: nf_tables: Allow table names of up to 255 chars Allocate all table names dynamically to allow for arbitrary lengths but introduce NFT_NAME_MAXLEN as an upper sanity boundary. It's value was chosen to allow using a domain name as per RFC 1035. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6f0a950e21c3..0b94e572ef16 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1,7 +1,8 @@ #ifndef _LINUX_NF_TABLES_H #define _LINUX_NF_TABLES_H -#define NFT_TABLE_MAXNAMELEN 32 +#define NFT_NAME_MAXLEN 256 +#define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_SET_MAXNAMELEN 32 #define NFT_OBJ_MAXNAMELEN 32 -- cgit v1.2.3 From b7263e071aba736cea9e71cdf2e76dfa7aebd039 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:42 +0200 Subject: netfilter: nf_tables: Allow chain name of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0b94e572ef16..d9c03a8608ee 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -3,7 +3,7 @@ #define NFT_NAME_MAXLEN 256 #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_CHAIN_MAXNAMELEN 32 +#define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_SET_MAXNAMELEN 32 #define NFT_OBJ_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 -- cgit v1.2.3 From 387454901bd62022ac1b04e15bd8d4fcc60bbed4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:43 +0200 Subject: netfilter: nf_tables: Allow set names of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index d9c03a8608ee..b5e73e80b7b6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -4,7 +4,7 @@ #define NFT_NAME_MAXLEN 256 #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_SET_MAXNAMELEN 32 +#define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_OBJ_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 -- cgit v1.2.3 From 615095752100748e221028fc96163c2b78185ae4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:44 +0200 Subject: netfilter: nf_tables: Allow object names of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b5e73e80b7b6..be25cf69295b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -5,7 +5,7 @@ #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_OBJ_MAXNAMELEN 32 +#define NFT_OBJ_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_USERDATA_MAXLEN 256 /** -- cgit v1.2.3 From 3282e65558b3651e230ee985c174c35cb2fedaf1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:23 +0200 Subject: tcp: remove unused mib counters was used by tcp prequeue and header prediction. TCPFORWARDRETRANS use was removed in january. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index d85693295798..b3f346fb9fe3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -184,14 +184,7 @@ enum LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */ LINUX_MIB_LISTENDROPS, /* ListenDrops */ - LINUX_MIB_TCPPREQUEUED, /* TCPPrequeued */ - LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, /* TCPDirectCopyFromBacklog */ - LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, /* TCPDirectCopyFromPrequeue */ - LINUX_MIB_TCPPREQUEUEDROPPED, /* TCPPrequeueDropped */ - LINUX_MIB_TCPHPHITS, /* TCPHPHits */ - LINUX_MIB_TCPHPHITSTOUSER, /* TCPHPHitsToUser */ LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */ - LINUX_MIB_TCPHPACKS, /* TCPHPAcks */ LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ @@ -208,14 +201,12 @@ enum LINUX_MIB_TCPSACKFAILURES, /* TCPSackFailures */ LINUX_MIB_TCPLOSSFAILURES, /* TCPLossFailures */ LINUX_MIB_TCPFASTRETRANS, /* TCPFastRetrans */ - LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */ LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */ LINUX_MIB_TCPLOSSPROBERECOVERY, /* TCPLossProbeRecovery */ LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ - LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ LINUX_MIB_TCPRCVCOLLAPSED, /* TCPRcvCollapsed */ LINUX_MIB_TCPDSACKOLDSENT, /* TCPDSACKOldSent */ LINUX_MIB_TCPDSACKOFOSENT, /* TCPDSACKOfoSent */ -- cgit v1.2.3 From bb7c19f96012720b895111300b9d9f3f858c3a69 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 28 Jul 2017 10:28:21 -0700 Subject: tcp: add related fields into SCM_TIMESTAMPING_OPT_STATS Add the following stats into SCM_TIMESTAMPING_OPT_STATS control msg: TCP_NLA_PACING_RATE TCP_NLA_DELIVERY_RATE TCP_NLA_SND_CWND TCP_NLA_REORDERING TCP_NLA_MIN_RTT TCP_NLA_RECUR_RETRANS TCP_NLA_DELIVERY_RATE_APP_LMT Signed-off-by: Wei Wang Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index a5507c977497..030e594bab45 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -231,6 +231,14 @@ enum { TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */ TCP_NLA_DATA_SEGS_OUT, /* Data pkts sent including retransmission */ TCP_NLA_TOTAL_RETRANS, /* Data pkts retransmitted */ + TCP_NLA_PACING_RATE, /* Pacing rate in bytes per second */ + TCP_NLA_DELIVERY_RATE, /* Delivery rate in bytes per second */ + TCP_NLA_SND_CWND, /* Sending congestion window */ + TCP_NLA_REORDERING, /* Reordering metric */ + TCP_NLA_MIN_RTT, /* minimum RTT */ + TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ + TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ + }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3 From 61e4d01e16acddadb9723143637a20417fa67ac9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:20 +0200 Subject: ipv6: fib: Add offload indication to routes Allow user space applications to see which routes are offloaded and which aren't by setting the RTNH_F_OFFLOAD flag when dumping them. To be consistent with IPv4, offload indication is provided on a per-nexthop basis. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index d496c02e14bc..33e2a5732bd1 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -35,6 +35,7 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 +#define RTF_OFFLOAD 0x20000000 /* offloaded route */ #define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ #define RTF_LOCAL 0x80000000 -- cgit v1.2.3 From 52267790ef52d7513879238ca9fac22c1733e0e3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:39 -0400 Subject: sock: add MSG_ZEROCOPY The kernel supports zerocopy sendmsg in virtio and tap. Expand the infrastructure to support other socket types. Introduce a completion notification channel over the socket error queue. Notifications are returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid blocking the send/recv path on receiving notifications. Add reference counting, to support the skb split, merge, resize and clone operations possible with SOCK_STREAM and other socket types. The patch does not yet modify any datapaths. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/errqueue.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 07bdce1f444a..78fdf52d6b2f 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -18,10 +18,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP 2 #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 +#define SO_EE_ORIGIN_ZEROCOPY 5 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) +#define SO_EE_CODE_ZEROCOPY_COPIED 1 + /** * struct scm_timestamping - timestamps exposed through cmsg * -- cgit v1.2.3 From 059cf566e123ca7eb7434285c6455d7afafb4e02 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Thu, 16 Feb 2017 09:07:02 +0100 Subject: tee: indicate privileged dev in gen_caps Mirrors the TEE_DESC_PRIVILEGED bit of struct tee_desc:flags into struct tee_ioctl_version_data:gen_caps as TEE_GEN_CAP_PRIVILEGED in tee_ioctl_version() Reviewed-by: Jerome Forissier Signed-off-by: Jens Wiklander --- include/uapi/linux/tee.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 370d8845ab21..688782e90140 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -49,6 +49,7 @@ #define TEE_MAX_ARG_SIZE 1024 #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ +#define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ /* * TEE Implementation ID -- cgit v1.2.3 From 56ce097c1caede1f9c191a7c9699b950e7c36ad9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 4 Aug 2017 08:24:05 -0700 Subject: net: comment fixes against BPF devmap helper calls Update BPF comments to accurately reflect XDP usage. Fixes: 97f91a7cf04ff ("bpf: add bpf_redirect_map helper routine") Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1106a8c4cd36..1d06be1569b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -345,14 +345,20 @@ union bpf_attr { * int bpf_redirect(ifindex, flags) * redirect to another netdev * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT - * int bpf_redirect_map(key, map, flags) + * @flags: + * cls_bpf: + * bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * xdp_bpf: + * all bits - reserved + * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error + * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_redirect_map(map, key, flags) * redirect to endpoint in map + * @map: pointer to dev map * @key: index in map to lookup - * @map: fd of map to do lookup in * @flags: -- + * Return: XDP_REDIRECT on success or XDP_ABORT on error * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid -- cgit v1.2.3 From 472b46c352c9ff0b6fa57dbf85d77c51901a3368 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 6 Aug 2017 18:44:27 +0200 Subject: uapi linux/kfd_ioctl.h: only use __u32 and __u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include instead of which on Linux includes and on non-Linux platforms defines __u32 etc types. Fixes user space compilation errors like: linux/kfd_ioctl.h:33:2: error: unknown type name ‘uint32_t’ uint32_t major_version; /* from KFD */ ^~~~~~~~ Signed-off-by: Mikko Rapeli Acked-by: Arnd Bergmann Signed-off-by: Oded Gabbay --- include/uapi/linux/kfd_ioctl.h | 172 ++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 86 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 7b4567bacfc2..26283fefdf5f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -23,15 +23,15 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include #define KFD_IOCTL_MAJOR_VERSION 1 #define KFD_IOCTL_MINOR_VERSION 1 struct kfd_ioctl_get_version_args { - uint32_t major_version; /* from KFD */ - uint32_t minor_version; /* from KFD */ + __u32 major_version; /* from KFD */ + __u32 minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -43,36 +43,36 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - uint64_t ring_base_address; /* to KFD */ - uint64_t write_pointer_address; /* from KFD */ - uint64_t read_pointer_address; /* from KFD */ - uint64_t doorbell_offset; /* from KFD */ - - uint32_t ring_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t queue_type; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ - uint32_t queue_id; /* from KFD */ - - uint64_t eop_buffer_address; /* to KFD */ - uint64_t eop_buffer_size; /* to KFD */ - uint64_t ctx_save_restore_address; /* to KFD */ - uint64_t ctx_save_restore_size; /* to KFD */ + __u64 ring_base_address; /* to KFD */ + __u64 write_pointer_address; /* from KFD */ + __u64 read_pointer_address; /* from KFD */ + __u64 doorbell_offset; /* from KFD */ + + __u32 ring_size; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 queue_type; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ + __u32 queue_id; /* from KFD */ + + __u64 eop_buffer_address; /* to KFD */ + __u64 eop_buffer_size; /* to KFD */ + __u64 ctx_save_restore_address; /* to KFD */ + __u64 ctx_save_restore_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - uint32_t queue_id; /* to KFD */ - uint32_t pad; + __u32 queue_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_update_queue_args { - uint64_t ring_base_address; /* to KFD */ + __u64 ring_base_address; /* to KFD */ - uint32_t queue_id; /* to KFD */ - uint32_t ring_size; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 ring_size; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -80,13 +80,13 @@ struct kfd_ioctl_update_queue_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - uint64_t alternate_aperture_base; /* to KFD */ - uint64_t alternate_aperture_size; /* to KFD */ + __u64 alternate_aperture_base; /* to KFD */ + __u64 alternate_aperture_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t default_policy; /* to KFD */ - uint32_t alternate_policy; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 default_policy; /* to KFD */ + __u32 alternate_policy; /* to KFD */ + __u32 pad; }; /* @@ -97,26 +97,26 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - uint64_t gpu_clock_counter; /* from KFD */ - uint64_t cpu_clock_counter; /* from KFD */ - uint64_t system_clock_counter; /* from KFD */ - uint64_t system_clock_freq; /* from KFD */ + __u64 gpu_clock_counter; /* from KFD */ + __u64 cpu_clock_counter; /* from KFD */ + __u64 system_clock_counter; /* from KFD */ + __u64 system_clock_freq; /* from KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; #define NUM_OF_SUPPORTED_GPUS 7 struct kfd_process_device_apertures { - uint64_t lds_base; /* from KFD */ - uint64_t lds_limit; /* from KFD */ - uint64_t scratch_base; /* from KFD */ - uint64_t scratch_limit; /* from KFD */ - uint64_t gpuvm_base; /* from KFD */ - uint64_t gpuvm_limit; /* from KFD */ - uint32_t gpu_id; /* from KFD */ - uint32_t pad; + __u64 lds_base; /* from KFD */ + __u64 lds_limit; /* from KFD */ + __u64 scratch_base; /* from KFD */ + __u64 scratch_limit; /* from KFD */ + __u64 gpuvm_base; /* from KFD */ + __u64 gpuvm_limit; /* from KFD */ + __u32 gpu_id; /* from KFD */ + __u32 pad; }; struct kfd_ioctl_get_process_apertures_args { @@ -124,8 +124,8 @@ struct kfd_ioctl_get_process_apertures_args { process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - uint32_t num_of_nodes; - uint32_t pad; + __u32 num_of_nodes; + __u32 pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -133,25 +133,25 @@ struct kfd_ioctl_get_process_apertures_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_unregister_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_address_watch_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; /* Matching HSA_EVENTTYPE */ @@ -172,44 +172,44 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_SIGNAL_EVENT_LIMIT 256 struct kfd_ioctl_create_event_args { - uint64_t event_page_offset; /* from KFD */ - uint32_t event_trigger_data; /* from KFD - signal events only */ - uint32_t event_type; /* to KFD */ - uint32_t auto_reset; /* to KFD */ - uint32_t node_id; /* to KFD - only valid for certain + __u64 event_page_offset; /* from KFD */ + __u32 event_trigger_data; /* from KFD - signal events only */ + __u32 event_type; /* to KFD */ + __u32 auto_reset; /* to KFD */ + __u32 node_id; /* to KFD - only valid for certain event types */ - uint32_t event_id; /* from KFD */ - uint32_t event_slot_index; /* from KFD */ + __u32 event_id; /* from KFD */ + __u32 event_slot_index; /* from KFD */ }; struct kfd_ioctl_destroy_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_set_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_reset_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_memory_exception_failure { - uint32_t NotPresent; /* Page not present or supervisor privilege */ - uint32_t ReadOnly; /* Write access to a read-only page */ - uint32_t NoExecute; /* Execute access to a page marked NX */ - uint32_t pad; + __u32 NotPresent; /* Page not present or supervisor privilege */ + __u32 ReadOnly; /* Write access to a read-only page */ + __u32 NoExecute; /* Execute access to a page marked NX */ + __u32 pad; }; /* memory exception data*/ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - uint64_t va; - uint32_t gpu_id; - uint32_t pad; + __u64 va; + __u32 gpu_id; + __u32 pad; }; /* Event data*/ @@ -217,19 +217,19 @@ struct kfd_event_data { union { struct kfd_hsa_memory_exception_data memory_exception_data; }; /* From KFD */ - uint64_t kfd_event_data_ext; /* pointer to an extension structure + __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_wait_events_args { - uint64_t events_ptr; /* pointed to struct + __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - uint32_t num_events; /* to KFD */ - uint32_t wait_for_all; /* to KFD */ - uint32_t timeout; /* to KFD */ - uint32_t wait_result; /* from KFD */ + __u32 num_events; /* to KFD */ + __u32 wait_for_all; /* to KFD */ + __u32 timeout; /* to KFD */ + __u32 wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { -- cgit v1.2.3 From f02a60924c221985fb8b634734d6610706fa779a Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 6 Aug 2017 18:44:07 +0200 Subject: uapi linux/dlm_netlink.h: include linux/dlmconstants.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes userspace compilation error: error: ‘DLM_RESNAME_MAXLEN’ undeclared here (not in a function) char resource_name[DLM_RESNAME_MAXLEN]; Signed-off-by: Mikko Rapeli Signed-off-by: David Teigland --- include/uapi/linux/dlm_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dlm_netlink.h b/include/uapi/linux/dlm_netlink.h index 647c8ef27227..ef1e2e08769a 100644 --- a/include/uapi/linux/dlm_netlink.h +++ b/include/uapi/linux/dlm_netlink.h @@ -10,6 +10,7 @@ #define _DLM_NETLINK_H #include +#include enum { DLM_STATUS_WAITING = 1, -- cgit v1.2.3 From d1df6fd8a1d22d37cffa0075ab8ad423ce656777 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Sat, 5 Aug 2017 12:38:26 +0200 Subject: ipv6: sr: define core operations for seg6local lightweight tunnel This patch implements a new type of lightweight tunnel named seg6local. A seg6local lwt is defined by a type of action and a set of parameters. The action represents the operation to perform on the packets matching the lwt's route, and is not necessarily an encapsulation. The set of parameters are arguments for the processing function. Each action is defined in a struct seg6_action_desc within seg6_action_table[]. This structure contains the action, mandatory attributes, the processing function, and a static headroom size required by the action. The mandatory attributes are encoded as a bitmask field. The static headroom is set to a non-zero value when the processing function always add a constant number of bytes to the skb (e.g. the header size for encapsulations). To facilitate rtnetlink-related operations such as parsing, fill_encap, and cmp_encap, each type of action parameter is associated to three function pointers, in seg6_action_params[]. All actions defined in seg6_local.h are detailed in [1]. [1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01 Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 1 + include/uapi/linux/seg6_local.h | 68 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 include/uapi/linux/seg6_local.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 92724cba1eba..7fdd19ca7511 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -11,6 +11,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_SEG6, LWTUNNEL_ENCAP_BPF, + LWTUNNEL_ENCAP_SEG6_LOCAL, __LWTUNNEL_ENCAP_MAX, }; diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h new file mode 100644 index 000000000000..ef2d8c3e76c1 --- /dev/null +++ b/include/uapi/linux/seg6_local.h @@ -0,0 +1,68 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_SEG6_LOCAL_H +#define _UAPI_LINUX_SEG6_LOCAL_H + +#include + +enum { + SEG6_LOCAL_UNSPEC, + SEG6_LOCAL_ACTION, + SEG6_LOCAL_SRH, + SEG6_LOCAL_TABLE, + SEG6_LOCAL_NH4, + SEG6_LOCAL_NH6, + SEG6_LOCAL_IIF, + SEG6_LOCAL_OIF, + __SEG6_LOCAL_MAX, +}; +#define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) + +enum { + SEG6_LOCAL_ACTION_UNSPEC = 0, + /* node segment */ + SEG6_LOCAL_ACTION_END = 1, + /* adjacency segment (IPv6 cross-connect) */ + SEG6_LOCAL_ACTION_END_X = 2, + /* lookup of next seg NH in table */ + SEG6_LOCAL_ACTION_END_T = 3, + /* decap and L2 cross-connect */ + SEG6_LOCAL_ACTION_END_DX2 = 4, + /* decap and IPv6 cross-connect */ + SEG6_LOCAL_ACTION_END_DX6 = 5, + /* decap and IPv4 cross-connect */ + SEG6_LOCAL_ACTION_END_DX4 = 6, + /* decap and lookup of DA in v6 table */ + SEG6_LOCAL_ACTION_END_DT6 = 7, + /* decap and lookup of DA in v4 table */ + SEG6_LOCAL_ACTION_END_DT4 = 8, + /* binding segment with insertion */ + SEG6_LOCAL_ACTION_END_B6 = 9, + /* binding segment with encapsulation */ + SEG6_LOCAL_ACTION_END_B6_ENCAP = 10, + /* binding segment with MPLS encap */ + SEG6_LOCAL_ACTION_END_BM = 11, + /* lookup last seg in table */ + SEG6_LOCAL_ACTION_END_S = 12, + /* forward to SR-unaware VNF with static proxy */ + SEG6_LOCAL_ACTION_END_AS = 13, + /* forward to SR-unaware VNF with masquerading */ + SEG6_LOCAL_ACTION_END_AM = 14, + + __SEG6_LOCAL_ACTION_MAX, +}; + +#define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) + +#endif -- cgit v1.2.3 From 92b31a9af73b3a3fc801899335d6c47966351830 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Aug 2017 01:39:55 +0200 Subject: bpf: add BPF_J{LT,LE,SLT,SLE} instructions Currently, eBPF only understands BPF_JGT (>), BPF_JGE (>=), BPF_JSGT (s>), BPF_JSGE (s>=) instructions, this means that particularly *JLT/*JLE counterparts involving immediates need to be rewritten from e.g. X < [IMM] by swapping arguments into [IMM] > X, meaning the immediate first is required to be loaded into a register Y := [IMM], such that then we can compare with Y > X. Note that the destination operand is always required to be a register. This has the downside of having unnecessarily increased register pressure, meaning complex program would need to spill other registers temporarily to stack in order to obtain an unused register for the [IMM]. Loading to registers will thus also affect state pruning since we need to account for that register use and potentially those registers that had to be spilled/filled again. As a consequence slightly more stack space might have been used due to spilling, and BPF programs are a bit longer due to extra code involving the register load and potentially required spill/fills. Thus, add BPF_JLT (<), BPF_JLE (<=), BPF_JSLT (s<), BPF_JSLE (s<=) counterparts to the eBPF instruction set. Modifying LLVM to remove the NegateCC() workaround in a PoC patch at [1] and allowing it to also emit the new instructions resulted in cilium's BPF programs that are injected into the fast-path to have a reduced program length in the range of 2-3% (e.g. accumulated main and tail call sections from one of the object file reduced from 4864 to 4729 insns), reduced complexity in the range of 10-30% (e.g. accumulated sections reduced in one of the cases from 116432 to 88428 insns), and reduced stack usage in the range of 1-5% (e.g. accumulated sections from one of the object files reduced from 824 to 784b). The modification for LLVM will be incorporated in a backwards compatible way. Plan is for LLVM to have i) a target specific option to offer a possibility to explicitly enable the extension by the user (as we have with -m target specific extensions today for various CPU insns), and ii) have the kernel checked for presence of the extensions and enable them transparently when the user is selecting more aggressive options such as -march=native in a bpf target context. (Other frontends generating BPF byte code, e.g. ply can probe the kernel directly for its code generation.) [1] https://github.com/borkmann/llvm/tree/bpf-insns Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d06be1569b1..91da8371a2d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ -- cgit v1.2.3 From 077fbac405bfc6d41419ad6c1725804ad4e9887c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 11 Aug 2017 02:11:33 +0900 Subject: net: xfrm: support setting an output mark. On systems that use mark-based routing it may be necessary for routing lookups to use marks in order for packets to be routed correctly. An example of such a system is Android, which uses socket marks to route packets via different networks. Currently, routing lookups in tunnel mode always use a mark of zero, making routing incorrect on such systems. This patch adds a new output_mark element to the xfrm state and a corresponding XFRMA_OUTPUT_MARK netlink attribute. The output mark differs from the existing xfrm mark in two ways: 1. The xfrm mark is used to match xfrm policies and states, while the xfrm output mark is used to set the mark (and influence the routing) of the packets emitted by those states. 2. The existing mark is constrained to be a subset of the bits of the originating socket or transformed packet, but the output mark is arbitrary and depends only on the state. The use of a separate mark provides additional flexibility. For example: - A packet subject to two transforms (e.g., transport mode inside tunnel mode) can have two different output marks applied to it, one for the transport mode SA and one for the tunnel mode SA. - On a system where socket marks determine routing, the packets emitted by an IPsec tunnel can be routed based on a mark that is determined by the tunnel, not by the marks of the unencrypted packets. - Support for setting the output marks can be introduced without breaking any existing setups that employ both mark-based routing and xfrm tunnel mode. Simply changing the code to use the xfrm mark for routing output packets could xfrm mark could change behaviour in a way that breaks these setups. If the output mark is unspecified or set to zero, the mark is not set or changed. Tested: make allyesconfig; make -j64 Tested: https://android-review.googlesource.com/452776 Signed-off-by: Lorenzo Colitti Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 2b384ff09fa0..5fe7370a2bef 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -304,6 +304,7 @@ enum xfrm_attr_type_t { XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ + XFRMA_OUTPUT_MARK, /* __u32 */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) -- cgit v1.2.3 From 34fc75bfc616f1c1fbab56508c3f48f4b97c97ea Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Fri, 11 Aug 2017 16:24:15 +0200 Subject: uapi/linux/quota.h: Do not include linux/errno.h linux/errno.h is very sensitive to coordination with libc headers. Nothing in linux/quota.h needs it, so this change allows using this header in more contexts. Signed-off-by: Florian Weimer Signed-off-by: Jan Kara --- include/uapi/linux/quota.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index 4d2489ef6f10..f17c9636a859 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -33,7 +33,6 @@ #ifndef _UAPI_LINUX_QUOTA_ #define _UAPI_LINUX_QUOTA_ -#include #include #define __DQUOT_VERSION__ "dquot_6.6.0" -- cgit v1.2.3 From d612b1fd8010d0d67b5287fe146b8b55bcbb8655 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:53 +0000 Subject: seccomp: Operation for checking if an action is available Userspace code that needs to check if the kernel supports a given action may not be able to use the /proc/sys/kernel/seccomp/actions_avail sysctl. The process may be running in a sandbox and, therefore, sufficient filesystem access may not be available. This patch adds an operation to the seccomp(2) syscall that allows userspace code to ask the kernel if a given action is available. If the action is supported by the kernel, 0 is returned. If the action is not supported by the kernel, -1 is returned with errno set to -EOPNOTSUPP. If this check is attempted on a kernel that doesn't support this new operation, -1 is returned with errno set to -EINVAL meaning that userspace code will have the ability to differentiate between the two error cases. Signed-off-by: Tyler Hicks Suggested-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 0f238a43ff1e..aaad61cc46bc 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -11,8 +11,9 @@ #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ /* Valid operations for seccomp syscall. */ -#define SECCOMP_SET_MODE_STRICT 0 -#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_SET_MODE_STRICT 0 +#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_GET_ACTION_AVAIL 2 /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 -- cgit v1.2.3 From e66a39977985b1e69e17c4042cb290768eca9b02 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:56 +0000 Subject: seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW Add a new filter flag, SECCOMP_FILTER_FLAG_LOG, that enables logging for all actions except for SECCOMP_RET_ALLOW for the given filter. SECCOMP_RET_KILL actions are always logged, when "kill" is in the actions_logged sysctl, and SECCOMP_RET_ALLOW actions are never logged, regardless of this flag. This flag can be used to create noisy filters that result in all non-allowed actions to be logged. A process may have one noisy filter, which is loaded with this flag, as well as a quiet filter that's not loaded with this flag. This allows for the actions in a set of filters to be selectively conveyed to the admin. Since a system could have a large number of allocated seccomp_filter structs, struct packing was taken in consideration. On 64 bit x86, the new log member takes up one byte of an existing four byte hole in the struct. On 32 bit x86, the new log member creates a new four byte hole (unavoidable) and consumes one of those bytes. Unfortunately, the tests added for SECCOMP_FILTER_FLAG_LOG are not capable of inspecting the audit log to verify that the actions taken in the filter were logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index aaad61cc46bc..19a611d0712e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -17,6 +17,7 @@ /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 +#define SECCOMP_FILTER_FLAG_LOG 2 /* * All BPF programs must return a 32-bit value. -- cgit v1.2.3 From 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:57 +0000 Subject: seccomp: Action to log before allowing Add a new action, SECCOMP_RET_LOG, that logs a syscall before allowing the syscall. At the implementation level, this action is identical to the existing SECCOMP_RET_ALLOW action. However, it can be very useful when initially developing a seccomp filter for an application. The developer can set the default action to be SECCOMP_RET_LOG, maybe mark any obviously needed syscalls with SECCOMP_RET_ALLOW, and then put the application through its paces. A list of syscalls that triggered the default action (SECCOMP_RET_LOG) can be easily gleaned from the logs and that list can be used to build the syscall whitelist. Finally, the developer can change the default action to the desired value. This provides a more friendly experience than seeing the application get killed, then updating the filter and rebuilding the app, seeing the application get killed due to a different syscall, then updating the filter and rebuilding the app, etc. The functionality is similar to what's supported by the various LSMs. SELinux has permissive mode, AppArmor has complain mode, SMACK has bring-up mode, etc. SECCOMP_RET_LOG is given a lower value than SECCOMP_RET_ALLOW as allow while logging is slightly more restrictive than quietly allowing. Unfortunately, the tests added for SECCOMP_RET_LOG are not capable of inspecting the audit log to verify that the syscall was logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if action == RET_LOG && RET_LOG in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 19a611d0712e..f94433263e4b 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -31,6 +31,7 @@ #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ -- cgit v1.2.3 From fd76875ca289a3d4722f266fd2d5532a27083903 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 12:53:18 -0700 Subject: seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD In preparation for adding SECCOMP_RET_KILL_PROCESS, rename SECCOMP_RET_KILL to the more accurate SECCOMP_RET_KILL_THREAD. The existing selftest values are intentionally left as SECCOMP_RET_KILL just to be sure we're exercising the alias. Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index f94433263e4b..5a03f699eb17 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -27,7 +27,8 @@ * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -- cgit v1.2.3 From 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:01:39 -0700 Subject: seccomp: Introduce SECCOMP_RET_KILL_PROCESS This introduces the BPF return value for SECCOMP_RET_KILL_PROCESS to kill an entire process. This cannot yet be reached by seccomp, but it changes the default-kill behavior (for unknown return values) from kill-thread to kill-process. Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 5a03f699eb17..7e77c92df78a 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -22,18 +22,20 @@ /* * All BPF programs must return a 32-bit value. * The bottom 16-bits are for optional return data. - * The upper 16-bits are ordered from least permissive values to most. + * The upper 16-bits are ordered from least permissive values to most, + * as a signed value (so 0x8000000 is negative). * * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ -#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD -#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ -#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ -#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ -#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ #define SECCOMP_RET_ACTION 0x7fff0000U -- cgit v1.2.3 From 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:12:11 -0700 Subject: seccomp: Implement SECCOMP_RET_KILL_PROCESS action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, SECCOMP_RET_KILL_THREAD (neé SECCOMP_RET_KILL) kills the current thread. There have been a few requests for this to kill the entire process (the thread group). This cannot be just changed (discovered when adding coredump support since coredumping kills the entire process) because there are userspace programs depending on the thread-kill behavior. Instead, implement SECCOMP_RET_KILL_PROCESS, which is 0x80000000, and can be processed as "-1" by the kernel, below the existing RET_KILL that is ABI-set to "0". For userspace, SECCOMP_RET_ACTION_FULL is added to expand the mask to the signed bit. Old userspace using the SECCOMP_RET_ACTION mask will see SECCOMP_RET_KILL_PROCESS as 0 still, but this would only be visible when examining the siginfo in a core dump from a RET_KILL_*, where it will think it was thread-killed instead of process-killed. Attempts to introduce this behavior via other ways (filter flags, seccomp struct flags, masked RET_DATA bits) all come with weird side-effects and baggage. This change preserves the central behavioral expectations of the seccomp filter engine without putting too great a burden on changes needed in userspace to use the new action. The new action is discoverable by userspace through either the new actions_avail sysctl or through the SECCOMP_GET_ACTION_AVAIL seccomp operation. If used without checking for availability, old kernels will treat RET_KILL_PROCESS as RET_KILL_THREAD (since the old mask will produce RET_KILL_THREAD). Cc: Paul Moore Cc: Fabricio Voznika Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 7e77c92df78a..f6bc1dea3247 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -38,6 +38,7 @@ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ +#define SECCOMP_RET_ACTION_FULL 0xffff0000U #define SECCOMP_RET_ACTION 0x7fff0000U #define SECCOMP_RET_DATA 0x0000ffffU -- cgit v1.2.3 From 44dd8a989c787e9077745417140aa132bfe45bf5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 15 Aug 2017 19:07:53 +0800 Subject: include: uapi: usb: Introduce USB charger type and state definition Introducing USB charger type and state definition can help to support USB charging which will be added in USB phy core. Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi --- include/uapi/linux/usb/charger.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/uapi/linux/usb/charger.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/charger.h b/include/uapi/linux/usb/charger.h new file mode 100644 index 000000000000..5f72af35b3ed --- /dev/null +++ b/include/uapi/linux/usb/charger.h @@ -0,0 +1,31 @@ +/* + * This file defines the USB charger type and state that are needed for + * USB device APIs. + */ + +#ifndef _UAPI__LINUX_USB_CHARGER_H +#define _UAPI__LINUX_USB_CHARGER_H + +/* + * USB charger type: + * SDP (Standard Downstream Port) + * DCP (Dedicated Charging Port) + * CDP (Charging Downstream Port) + * ACA (Accessory Charger Adapters) + */ +enum usb_charger_type { + UNKNOWN_TYPE, + SDP_TYPE, + DCP_TYPE, + CDP_TYPE, + ACA_TYPE, +}; + +/* USB charger state */ +enum usb_charger_state { + USB_CHARGER_DEFAULT, + USB_CHARGER_PRESENT, + USB_CHARGER_ABSENT, +}; + +#endif /* _UAPI__LINUX_USB_CHARGER_H */ -- cgit v1.2.3 From 5c1aab1dd5445ed8bdcdbb575abc1b0d7ee5b2e7 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Wed, 9 Aug 2017 19:39:02 -0700 Subject: btrfs: Add zstd support Add zstd compression and decompression support to BtrFS. zstd at its fastest level compresses almost as well as zlib, while offering much faster compression and decompression, approaching lzo speeds. I benchmarked btrfs with zstd compression against no compression, lzo compression, and zlib compression. I benchmarked two scenarios. Copying a set of files to btrfs, and then reading the files. Copying a tarball to btrfs, extracting it to btrfs, and then reading the extracted files. After every operation, I call `sync` and include the sync time. Between every pair of operations I unmount and remount the filesystem to avoid caching. The benchmark files can be found in the upstream zstd source repository under `contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}` [1] [2]. I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, 16 GB of RAM, and a SSD. The first compression benchmark is copying 10 copies of the unzipped Silesia corpus [3] into a BtrFS filesystem mounted with `-o compress-force=Method`. The decompression benchmark times how long it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is measured by comparing the output of `df` and `du`. See the benchmark file [1] for details. I benchmarked multiple zstd compression levels, although the patch uses zstd level 1. | Method | Ratio | Compression MB/s | Decompression speed | |---------|-------|------------------|---------------------| | None | 0.99 | 504 | 686 | | lzo | 1.66 | 398 | 442 | | zlib | 2.58 | 65 | 241 | | zstd 1 | 2.57 | 260 | 383 | | zstd 3 | 2.71 | 174 | 408 | | zstd 6 | 2.87 | 70 | 398 | | zstd 9 | 2.92 | 43 | 406 | | zstd 12 | 2.93 | 21 | 408 | | zstd 15 | 3.01 | 11 | 354 | The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it measures the compression ratio, extracts the tar, and deletes the tar. Then it measures the compression ratio again, and `tar`s the extracted files into `/dev/null`. See the benchmark file [2] for details. | Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) | |--------|-----------|---------------|----------|------------|----------| | None | 0.97 | 0.78 | 0.981 | 5.501 | 8.807 | | lzo | 2.06 | 1.38 | 1.631 | 8.458 | 8.585 | | zlib | 3.40 | 1.86 | 7.750 | 21.544 | 11.744 | | zstd 1 | 3.57 | 1.85 | 2.579 | 11.479 | 9.389 | [1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh [2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh [3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia [4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz zstd source repository: https://github.com/facebook/zstd Signed-off-by: Nick Terrell Signed-off-by: Chris Mason --- include/uapi/linux/btrfs.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9aa74f317747..378230c163d5 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -255,13 +255,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD (1ULL << 4) /* * older kernels tried to do bigger metadata blocks, but the -- cgit v1.2.3 From fe4007999599c02598c17b643e8de43e487d48e8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Aug 2017 09:09:49 +0200 Subject: ipv6: fib: Provide offload indication using nexthop flags IPv6 routes currently lack nexthop flags as in IPv4. This has several implications. In the forwarding path, it requires us to check the carrier state of the nexthop device and potentially ignore a linkdown route, instead of checking for RTNH_F_LINKDOWN. It also requires capable drivers to use the user facing IPv6-specific route flags to provide offload indication, instead of using the nexthop flags as in IPv4. Add nexthop flags to IPv6 routes in the 40 bytes hole and use it to provide offload indication instead of the RTF_OFFLOAD flag, which is removed while it's still not part of any official kernel release. In the near future we would like to use the field for the RTNH_F_{LINKDOWN,DEAD} flags, but this change is more involved and might not be ready in time for the current cycle. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index 33e2a5732bd1..d496c02e14bc 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -35,7 +35,6 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 -#define RTF_OFFLOAD 0x20000000 /* offloaded route */ #define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ #define RTF_LOCAL 0x80000000 -- cgit v1.2.3 From 6a1c9510694fe1e901a3b5b53386eac069adcea6 Mon Sep 17 00:00:00 2001 From: Moses Reuben Date: Tue, 15 Aug 2017 23:00:20 -0400 Subject: drm/amdkfd: Adding new IOCTL for scratch memory v2 v2: * Renamed ALLOC_MEMORY_OF_SCRATCH to SET_SCRATCH_BACKING_VA * Removed size parameter from the ioctl, it was unused * Removed hole in ioctl number space * No more call to write_config_static_mem * Return correct error code from ioctl Signed-off-by: Moses Reuben Signed-off-by: Ben Goz Signed-off-by: Felix Kuehling Signed-off-by: Oded Gabbay --- include/uapi/linux/kfd_ioctl.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index d6833426fdef..1b9c5609d523 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -232,6 +232,12 @@ struct kfd_ioctl_wait_events_args { uint32_t wait_result; /* from KFD */ }; +struct kfd_ioctl_set_scratch_backing_va_args { + uint64_t va_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -286,7 +292,10 @@ struct kfd_ioctl_wait_events_args { #define AMDKFD_IOC_DBG_WAVE_CONTROL \ AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) +#define AMDKFD_IOC_SET_SCRATCH_BACKING_VA \ + AMDKFD_IOWR(0x11, struct kfd_ioctl_set_scratch_backing_va_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x11 +#define AMDKFD_COMMAND_END 0x12 #endif -- cgit v1.2.3 From 5d71dbc3a588690c3d66d76db8cd29973425ce6d Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 15 Aug 2017 23:00:22 -0400 Subject: drm/amdkfd: Implement image tiling mode support v2 v2: Removed hole in ioctl number space Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/linux/kfd_ioctl.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 1b9c5609d523..7b4567bacfc2 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -238,6 +238,29 @@ struct kfd_ioctl_set_scratch_backing_va_args { uint32_t pad; }; +struct kfd_ioctl_get_tile_config_args { + /* to KFD: pointer to tile array */ + uint64_t tile_config_ptr; + /* to KFD: pointer to macro tile array */ + uint64_t macro_tile_config_ptr; + /* to KFD: array size allocated by user mode + * from KFD: array size filled by kernel + */ + uint32_t num_tile_configs; + /* to KFD: array size allocated by user mode + * from KFD: array size filled by kernel + */ + uint32_t num_macro_tile_configs; + + uint32_t gpu_id; /* to KFD */ + uint32_t gb_addr_config; /* from KFD */ + uint32_t num_banks; /* from KFD */ + uint32_t num_ranks; /* from KFD */ + /* struct size can be extended later if needed + * without breaking ABI compatibility + */ +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -295,7 +318,10 @@ struct kfd_ioctl_set_scratch_backing_va_args { #define AMDKFD_IOC_SET_SCRATCH_BACKING_VA \ AMDKFD_IOWR(0x11, struct kfd_ioctl_set_scratch_backing_va_args) +#define AMDKFD_IOC_GET_TILE_CONFIG \ + AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x12 +#define AMDKFD_COMMAND_END 0x13 #endif -- cgit v1.2.3 From b005fd189cec9407b700599e1e80e0552446ee79 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:31:58 -0700 Subject: bpf: introduce new program type for skbs on sockets A class of programs, run from strparser and soon from a new map type called sock map, are used with skb as the context but on established sockets. By creating a specific program type for these we can use bpf helpers that expect full sockets and get the verifier to ensure these helpers are not used out of context. The new type is BPF_PROG_TYPE_SK_SKB. This patch introduces the infrastructure and type. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 91da8371a2d0..2e796e384aeb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -127,6 +127,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, }; enum bpf_attach_type { -- cgit v1.2.3 From 174a79ff9515f400b9a6115643dafd62a635b7e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:32:47 -0700 Subject: bpf: sockmap with sk redirect support Recently we added a new map type called dev map used to forward XDP packets between ports (6093ec2dc313). This patches introduces a similar notion for sockets. A sockmap allows users to add participating sockets to a map. When sockets are added to the map enough context is stored with the map entry to use the entry with a new helper bpf_sk_redirect_map(map, key, flags) This helper (analogous to bpf_redirect_map in XDP) is given the map and an entry in the map. When called from a sockmap program, discussed below, the skb will be sent on the socket using skb_send_sock(). With the above we need a bpf program to call the helper from that will then implement the send logic. The initial site implemented in this series is the recv_sock hook. For this to work we implemented a map attach command to add attributes to a map. In sockmap we add two programs a parse program and a verdict program. The parse program uses strparser to build messages and pass them to the verdict program. The parse programs use the normal strparser semantics. The verdict program is of type SK_SKB. The verdict program returns a verdict SK_DROP, or SK_REDIRECT for now. Additional actions may be added later. When SK_REDIRECT is returned, expected when bpf program uses bpf_sk_redirect_map(), the sockmap logic will consult per cpu variables set by the helper routine and pull the sock entry out of the sock map. This pattern follows the existing redirect logic in cls and xdp programs. This gives the flow, recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock \ -> kfree_skb As an example use case a message based load balancer may use specific logic in the verdict program to select the sock to send on. Sample programs are provided in future patches that hopefully illustrate the user interfaces. Also selftests are in follow-on patches. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e796e384aeb..7f774769e3f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, }; enum bpf_prog_type { @@ -135,11 +136,15 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, + BPF_CGROUP_SMAP_INGRESS, __MAX_BPF_ATTACH_TYPE }; #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */ +#define BPF_SOCKMAP_STRPARSER (1U << 0) + /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -211,6 +216,7 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; + __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -557,6 +563,23 @@ union bpf_attr { * @mode: operation mode (enum bpf_adj_room_mode) * @flags: reserved for future use * Return: 0 on success or negative error code + * + * int bpf_sk_redirect_map(map, key, flags) + * Redirect skb to a sock in map using key as a lookup key for the + * sock in map. + * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_REDIRECT + * + * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * @skops: pointer to bpf_sock_ops + * @map: pointer to sockmap to update + * @key: key to insert/update sock in map + * @flags: same flags as map update elem + * @map_flags: sock map specific flags + * bit 1: Enable strparser + * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -610,7 +633,9 @@ union bpf_attr { FN(set_hash), \ FN(setsockopt), \ FN(skb_adjust_room), \ - FN(redirect_map), + FN(redirect_map), \ + FN(sk_redirect_map), \ + FN(sock_map_update), \ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -747,6 +772,12 @@ struct xdp_md { __u32 data_end; }; +enum sk_action { + SK_ABORTED = 0, + SK_DROP, + SK_REDIRECT, +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { -- cgit v1.2.3 From 8a31db5615667956c513d205cfb06885c3ec6d0b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:33:09 -0700 Subject: bpf: add access to sock fields and pkt data from sk_skb programs Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f774769e3f5..5ecbe812a2cc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -712,6 +712,15 @@ struct __sk_buff { __u32 data; __u32 data_end; __u32 napi_id; + + /* accessed by BPF_PROG_TYPE_sk_skb types */ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; struct bpf_tunnel_key { -- cgit v1.2.3 From 22e4ebb975822833b083533035233d128b30e98f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 28 Jul 2017 16:40:40 -0400 Subject: membarrier: Provide expedited private command Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built from all runqueues for which current thread's mm is the same as the thread calling sys_membarrier. It executes faster than the non-expedited variant (no blocking). It also works on NOHZ_FULL configurations. Scheduler-wise, it requires a memory barrier before and after context switching between processes (which have different mm). The memory barrier before context switch is already present. For the barrier after context switch: * Our TSO archs can do RELEASE without being a full barrier. Look at x86 spin_unlock() being a regular STORE for example. But for those archs, all atomics imply smp_mb and all of them have atomic ops in switch_mm() for mm_cpumask(), and on x86 the CR3 load acts as a full barrier. * From all weakly ordered machines, only ARM64 and PPC can do RELEASE, the rest does indeed do smp_mb(), so there the spin_unlock() is a full barrier and we're good. * ARM64 has a very heavy barrier in switch_to(), which suffices. * PPC just removed its barrier from switch_to(), but appears to be talking about adding something to switch_mm(). So add a smp_mb__after_unlock_lock() for now, until this is settled on the PPC side. Changes since v3: - Properly document the memory barriers provided by each architecture. Changes since v2: - Address comments from Peter Zijlstra, - Add smp_mb__after_unlock_lock() after finish_lock_switch() in finish_task_switch() to add the memory barrier we need after storing to rq->curr. This is much simpler than the previous approach relying on atomic_dec_and_test() in mmdrop(), which actually added a memory barrier in the common case of switching between userspace processes. - Return -EINVAL when MEMBARRIER_CMD_SHARED is used on a nohz_full kernel, rather than having the whole membarrier system call returning -ENOSYS. Indeed, CMD_PRIVATE_EXPEDITED is compatible with nohz_full. Adapt the CMD_QUERY mask accordingly. Changes since v1: - move membarrier code under kernel/sched/ because it uses the scheduler runqueue, - only add the barrier when we switch from a kernel thread. The case where we switch from a user-space thread is already handled by the atomic_dec_and_test() in mmdrop(). - add a comment to mmdrop() documenting the requirement on the implicit memory barrier. CC: Peter Zijlstra CC: Paul E. McKenney CC: Boqun Feng CC: Andrew Hunter CC: Maged Michael CC: gromer@google.com CC: Avi Kivity CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Michael Ellerman Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Tested-by: Dave Watson --- include/uapi/linux/membarrier.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index e0b108bd2624..6d47b3249d8a 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -40,14 +40,33 @@ * (non-running threads are de facto in such a * state). This covers threads from all processes * running on the system. This command returns 0. + * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: + * Execute a memory barrier on each running + * thread belonging to the same process as the current + * thread. Upon return from system call, the + * caller thread is ensured that all its running + * threads siblings have passed through a state + * where all memory accesses to user-space + * addresses match program order between entry + * to and return from the system call + * (non-running threads are de facto in such a + * state). This only covers threads from the + * same processes as the caller thread. This + * command returns 0. The "expedited" commands + * complete faster than the non-expedited ones, + * they never block, but have the downside of + * causing extra overhead. * * Command to be passed to the membarrier system call. The commands need to * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to * the value 0. */ enum membarrier_cmd { - MEMBARRIER_CMD_QUERY = 0, - MEMBARRIER_CMD_SHARED = (1 << 0), + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), + /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ + /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), }; #endif /* _UAPI_LINUX_MEMBARRIER_H */ -- cgit v1.2.3 From 0888e372c37fa31882c8ed89fb2f8188b08b6718 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Thu, 17 Aug 2017 00:35:11 +0000 Subject: net: inet: diag: expose sockets cgroup classid This is useful for directly looking up a task based on class id rather than having to scan through all open file descriptors. Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bbe201047df6..678496897a68 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -142,6 +142,7 @@ enum { INET_DIAG_PAD, INET_DIAG_MARK, INET_DIAG_BBRINFO, + INET_DIAG_CLASS_ID, __INET_DIAG_MAX, }; -- cgit v1.2.3 From 99d1712bc41c7c9a5a473c104a4ad15427757b22 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Aug 2017 15:15:29 +0200 Subject: netfilter: exthdr: tcp option set support This allows setting 2 and 4 byte quantities in the tcp option space. Main purpose is to allow native replacement for xt_TCPMSS to work around pmtu blackholes. Writes to kind and len are now allowed at the moment, it does not seem useful to do this as it causes corruption of the tcp option space. We can always lift this restriction later if a use-case appears. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index be25cf69295b..40fd199f7531 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -732,7 +732,8 @@ enum nft_exthdr_op { * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) - * @NFTA_EXTHDR_OP: option match type (NLA_U8) + * @NFTA_EXTHDR_OP: option match type (NLA_U32) + * @NFTA_EXTHDR_SREG: option match type (NLA_U32) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -742,6 +743,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_LEN, NFTA_EXTHDR_FLAGS, NFTA_EXTHDR_OP, + NFTA_EXTHDR_SREG, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) -- cgit v1.2.3 From 6b5dc98e8fac041a3decfc3186e08c1c570ea691 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Aug 2017 15:48:04 +0200 Subject: netfilter: rt: add support to fetch path mss to be used in combination with tcp option set support to mimic iptables TCPMSS --clamp-mss-to-pmtu. v2: Eric Dumazet points out dst must be initialized. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 40fd199f7531..b49da72efa68 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -811,11 +811,13 @@ enum nft_meta_keys { * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid) * @NFT_RT_NEXTHOP4: routing nexthop for IPv4 * @NFT_RT_NEXTHOP6: routing nexthop for IPv6 + * @NFT_RT_TCPMSS: fetch current path tcp mss */ enum nft_rt_keys { NFT_RT_CLASSID, NFT_RT_NEXTHOP4, NFT_RT_NEXTHOP6, + NFT_RT_TCPMSS, }; /** -- cgit v1.2.3 From 96eabe7a40aa17e613cf3db2c742ee8b1fc764d0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 18 Aug 2017 11:28:00 -0700 Subject: bpf: Allow selecting numa node during map creation The current map creation API does not allow to provide the numa-node preference. The memory usually comes from where the map-creation-process is running. The performance is not ideal if the bpf_prog is known to always run in a numa node different from the map-creation-process. One of the use case is sharding on CPU to different LRU maps (i.e. an array of LRU maps). Here is the test result of map_perf_test on the INNER_LRU_HASH_PREALLOC test if we force the lru map used by CPU0 to be allocated from a remote numa node: [ The machine has 20 cores. CPU0-9 at node 0. CPU10-19 at node 1 ] ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec 4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec 3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec 6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec 2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec 1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec 7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec 0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec #<<< After specifying numa node: ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec 3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec 1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec 6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec 2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec 4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec 7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec 0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec #<<< This patch adds one field, numa_node, to the bpf_attr. Since numa node 0 is a valid node, a new flag BPF_F_NUMA_NODE is also added. The numa_node field is honored if and only if the BPF_F_NUMA_NODE flag is set. Numa node selection is not supported for percpu map. This patch does not change all the kmalloc. F.e. 'htab = kzalloc()' is not changed since the object is small enough to stay in the cache. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5ecbe812a2cc..843818dff96d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_attach_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +/* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list @@ -173,6 +174,8 @@ enum bpf_attach_type { * across different LRU lists. */ #define BPF_F_NO_COMMON_LRU (1U << 1) +/* Specify numa node during map creation */ +#define BPF_F_NUMA_NODE (1U << 2) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ @@ -180,8 +183,13 @@ union bpf_attr { __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ - __u32 map_flags; /* prealloc or not */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit v1.2.3 From 84e54fe0a5eaed696dee4019c396f8396f5a908b Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 22 Aug 2017 09:40:28 -0700 Subject: gre: introduce native tunnel support for ERSPAN The patch adds ERSPAN type II tunnel support. The implementation is based on the draft at [1]. One of the purposes is for Linux box to be able to receive ERSPAN monitoring traffic sent from the Cisco switch, by creating a ERSPAN tunnel device. In addition, the patch also adds ERSPAN TX, so Linux virtual switch can redirect monitored traffic to the ERSPAN tunnel device. The traffic will be encapsulated into ERSPAN and sent out. The implementation reuses tunnel key as ERSPAN session ID, and field 'erspan' as ERSPAN Index fields: ./ip link add dev ers11 type erspan seq key 100 erspan 123 \ local 172.16.1.200 remote 172.16.1.100 To use the above device as ERSPAN receiver, configure Nexus 5000 switch as below: monitor session 100 type erspan-source erspan-id 123 vrf default destination ip 172.16.1.200 source interface Ethernet1/11 both source interface Ethernet1/12 both no shut monitor erspan origin ip-address 172.16.1.100 global [1] https://tools.ietf.org/html/draft-foschiano-erspan-01 [2] iproute2 patch: http://marc.info/?l=linux-netdev&m=150306086924951&w=2 [3] test script: http://marc.info/?l=linux-netdev&m=150231021807304&w=2 Signed-off-by: William Tu Signed-off-by: Meenakshi Vohra Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + include/uapi/linux/if_tunnel.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 5bc9bfd816b7..efeb1190c2ca 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -66,6 +66,7 @@ #define ETH_P_ATALK 0x809B /* Appletalk DDP */ #define ETH_P_AARP 0x80F3 /* Appletalk AARP */ #define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */ +#define ETH_P_ERSPAN 0x88BE /* ERSPAN type II */ #define ETH_P_IPX 0x8137 /* IPX over DIX */ #define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ #define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. See 802.3 31B */ diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 6792d1967d31..2e520883c054 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -134,6 +134,7 @@ enum { IFLA_GRE_COLLECT_METADATA, IFLA_GRE_IGNORE_DF, IFLA_GRE_FWMARK, + IFLA_GRE_ERSPAN_INDEX, __IFLA_GRE_MAX, }; -- cgit v1.2.3 From 5cdcf4c6a638591ec0e98c57404a19e7f9997567 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 28 Jul 2017 13:56:06 +0200 Subject: ANDROID: binder: add padding to binder_fd_array_object. binder_fd_array_object starts with a 4-byte header, followed by a few fields that are 8 bytes when ANDROID_BINDER_IPC_32BIT=N. This can cause alignment issues in a 64-bit kernel with a 32-bit userspace, as on x86_32 an 8-byte primitive may be aligned to a 4-byte address. Pad with a __u32 to fix this. Signed-off-by: Martijn Coenen Cc: stable # 4.11+ Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 51f891fb1b18..7668b5791c91 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -132,6 +132,7 @@ enum { /* struct binder_fd_array_object - object describing an array of fds in a buffer * @hdr: common header structure + * @pad: padding to ensure correct alignment * @num_fds: number of file descriptors in the buffer * @parent: index in offset array to buffer holding the fd array * @parent_offset: start offset of fd array in the buffer @@ -152,6 +153,7 @@ enum { */ struct binder_fd_array_object { struct binder_object_header hdr; + __u32 pad; binder_size_t num_fds; binder_size_t parent; binder_size_t parent_offset; -- cgit v1.2.3 From 1e6ec9ea89d30739b9447c1860fcb07fc29f3aef Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 23 Aug 2017 14:54:59 -0700 Subject: Revert "loop: support 4k physical blocksize" There's some stuff still up in the air, let's not get stuck with a subpar ABI. I'll follow up with something better for 4.14. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/uapi/linux/loop.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index a3960f98679c..c8125ec1f4f2 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -22,7 +22,6 @@ enum { LO_FLAGS_AUTOCLEAR = 4, LO_FLAGS_PARTSCAN = 8, LO_FLAGS_DIRECT_IO = 16, - LO_FLAGS_BLOCKSIZE = 32, }; #include /* for __kernel_old_dev_t */ @@ -60,8 +59,6 @@ struct loop_info64 { __u64 lo_init[2]; }; -#define LO_INFO_BLOCKSIZE(l) (l)->lo_init[0] - /* * Loop filter types */ -- cgit v1.2.3 From ecda85e70277ef24e44a1f6bc00243cebd19f985 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 16 Aug 2017 19:31:57 +0200 Subject: x86/lguest: Remove lguest support Lguest seems to be rather unused these days. It has seen only patches ensuring it still builds the last two years and its official state is "Odd Fixes". Remove it in order to be able to clean up the paravirt code. Signed-off-by: Juergen Gross Acked-by: Rusty Russell Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: lguest@lists.ozlabs.org Cc: rusty@rustcorp.com.au Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20170816173157.8633-3-jgross@suse.com Signed-off-by: Ingo Molnar --- include/uapi/linux/virtio_ring.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index c07295969b7e..6d5d5faa989b 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -1,7 +1,7 @@ #ifndef _UAPI_LINUX_VIRTIO_RING_H #define _UAPI_LINUX_VIRTIO_RING_H -/* An interface for efficient virtio implementation, currently for use by KVM - * and lguest, but hopefully others soon. Do NOT change this since it will +/* An interface for efficient virtio implementation, currently for use by KVM, + * but hopefully others soon. Do NOT change this since it will * break existing servers and clients. * * This header is BSD licensed so anyone can use the definitions to implement -- cgit v1.2.3 From ea5311c7e752dbec9bfbdd79992a8772b37f32fa Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 10 Aug 2017 10:54:31 -0600 Subject: PCI: Fix PCIe capability sizes PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 defines the size of the PCIe capability structure for v1 devices with link, but we also have a need in the vfio code for sizing the capability for devices without link, such as Root Complex Integrated Endpoints. Create a separate define for this ending the structure before the link fields. Additionally, this reveals that PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 is currently incorrect, ending the capability length before the v2 link fields. Rename this to specify an RC Integrated Endpoint (no link) capability length and move PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 to include the link fields as we have for the v1 version. Signed-off-by: Alex Williamson [bhelgaas: add "_" in "PCI_CAP_EXP_RC ENDPOINT_SIZEOF_V2 44"] Signed-off-by: Bjorn Helgaas Reviewed-by: Eric Auger --- include/uapi/linux/pci_regs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index c22d3ebaca20..e185d2d39ea6 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -513,6 +513,7 @@ #define PCI_EXP_DEVSTA_URD 0x0008 /* Unsupported Request Detected */ #define PCI_EXP_DEVSTA_AUXPD 0x0010 /* AUX Power Detected */ #define PCI_EXP_DEVSTA_TRPND 0x0020 /* Transactions Pending */ +#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 /* v1 endpoints without link end here */ #define PCI_EXP_LNKCAP 12 /* Link Capabilities */ #define PCI_EXP_LNKCAP_SLS 0x0000000f /* Supported Link Speeds */ #define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */ @@ -556,7 +557,7 @@ #define PCI_EXP_LNKSTA_DLLLA 0x2000 /* Data Link Layer Link Active */ #define PCI_EXP_LNKSTA_LBMS 0x4000 /* Link Bandwidth Management Status */ #define PCI_EXP_LNKSTA_LABS 0x8000 /* Link Autonomous Bandwidth Status */ -#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 20 /* v1 endpoints end here */ +#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 20 /* v1 endpoints with link end here */ #define PCI_EXP_SLTCAP 20 /* Slot Capabilities */ #define PCI_EXP_SLTCAP_ABP 0x00000001 /* Attention Button Present */ #define PCI_EXP_SLTCAP_PCP 0x00000002 /* Power Controller Present */ @@ -639,7 +640,7 @@ #define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* Enable OBFF Message type B */ #define PCI_EXP_DEVCTL2_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */ #define PCI_EXP_DEVSTA2 42 /* Device Status 2 */ -#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints end here */ +#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints without link end here */ #define PCI_EXP_LNKCAP2 44 /* Link Capabilities 2 */ #define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */ #define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5.0GT/s */ @@ -647,6 +648,7 @@ #define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */ #define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ #define PCI_EXP_LNKSTA2 50 /* Link Status 2 */ +#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */ #define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */ #define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ #define PCI_EXP_SLTSTA2 58 /* Slot Status 2 */ -- cgit v1.2.3 From f20c4ea49ec4708de97248927ac6138c2d14eba9 Mon Sep 17 00:00:00 2001 From: Dongdong Liu Date: Sat, 19 Aug 2017 17:07:20 +0800 Subject: PCI/DPC: Add eDPC support Add eDPC support. Get and print the RP PIO error information when the trigger condition is RP PIO error. For more information on eDPC, please see PCI Express Base Specification Revision 3.1, section 6.2.10.3, or view the PCI-SIG eDPC ECN here: https://pcisig.com/sites/default/files/specification_documents/ECN_Enhanced_DPC_2012-11-19_final.pdf Signed-off-by: Dongdong Liu Signed-off-by: Bjorn Helgaas Reviewed-by: Keith Busch --- include/uapi/linux/pci_regs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index c22d3ebaca20..1ce96275531c 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -967,6 +967,7 @@ #define PCI_EXP_DPC_CAP_RP_EXT 0x20 /* Root Port Extensions for DPC */ #define PCI_EXP_DPC_CAP_POISONED_TLP 0x40 /* Poisoned TLP Egress Blocking Supported */ #define PCI_EXP_DPC_CAP_SW_TRIGGER 0x80 /* Software Triggering Supported */ +#define PCI_EXP_DPC_RP_PIO_LOG_SIZE 0xF00 /* RP PIO log size */ #define PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000 /* ERR_COR signal on DL_Active supported */ #define PCI_EXP_DPC_CTL 6 /* DPC control */ @@ -980,6 +981,15 @@ #define PCI_EXP_DPC_SOURCE_ID 10 /* DPC Source Identifier */ +#define PCI_EXP_DPC_RP_PIO_STATUS 0x0C /* RP PIO Status */ +#define PCI_EXP_DPC_RP_PIO_MASK 0x10 /* RP PIO MASK */ +#define PCI_EXP_DPC_RP_PIO_SEVERITY 0x14 /* RP PIO Severity */ +#define PCI_EXP_DPC_RP_PIO_SYSERROR 0x18 /* RP PIO SysError */ +#define PCI_EXP_DPC_RP_PIO_EXCEPTION 0x1C /* RP PIO Exception */ +#define PCI_EXP_DPC_RP_PIO_HEADER_LOG 0x20 /* RP PIO Header Log */ +#define PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG 0x30 /* RP PIO ImpSpec Log */ +#define PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG 0x34 /* RP PIO TLP Prefix Log */ + /* Precision Time Measurement */ #define PCI_PTM_CAP 0x04 /* PTM Capability */ #define PCI_PTM_CAP_REQ 0x00000001 /* Requester capable */ -- cgit v1.2.3 From 1177009131bee310421f5c04c43d3777cbacbdc8 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:39:59 +0200 Subject: devlink: Add Ethernet header for dpipe This will be used by the IPv4 host table which will be introduced in the following patches. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b0e807ac53bb..a855f8dcc8ee 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -226,4 +226,12 @@ enum devlink_dpipe_action_type { DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY, }; +enum devlink_dpipe_field_ethernet_id { + DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, +}; + +enum devlink_dpipe_header_id { + DEVLINK_DPIPE_HEADER_ETHERNET, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ -- cgit v1.2.3 From 3fb886ecea93605a8ea14e258ff3158b8966781e Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:40:00 +0200 Subject: devlink: Add IPv4 header for dpipe This will be used by the IPv4 host table which will be introduced in the following patches. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a855f8dcc8ee..6c172548589d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -230,8 +230,13 @@ enum devlink_dpipe_field_ethernet_id { DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, }; +enum devlink_dpipe_field_ipv4_id { + DEVLINK_DPIPE_FIELD_IPV4_DST_IP, +}; + enum devlink_dpipe_header_id { DEVLINK_DPIPE_HEADER_ETHERNET, + DEVLINK_DPIPE_HEADER_IPV4, }; #endif /* _UAPI_LINUX_DEVLINK_H_ */ -- cgit v1.2.3 From 6ae5fa61d27dcb055f4198bcf6c8dbbf1bb33f52 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Aug 2017 15:21:54 -0700 Subject: perf/x86: Fix data source decoding for Skylake Skylake changed the encoding of the PEBS data source field. Some combinations are not available anymore, but some new cases e.g. for L4 cache hit are added. Fix up the conversion table for Skylake, similar as had been done for Nehalem. On Skylake server the encoding for L4 actually means persistent memory. Handle this case too. To properly describe it in the abstracted perf format I had to add some new fields. Since a hit can have only one level add a new field that is an enumeration, not a bit field to describe the level. It can describe any level. Some numbers are also used to describe PMEM and LFB. Also add a new generic remote flag that can be combined with the generic level to signify a remote cache. And there is an extension field for the snoop indication to handle the Forward state. I didn't add a generic flag for hops because it's not needed for Skylake. I changed the existing encodings for older CPUs to also fill in the new level and remote fields. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/20170816222156.19953-3-andi@firstfloor.org Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 642db5fa3286..2a37ae925d85 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -954,14 +954,20 @@ union perf_mem_data_src { mem_snoop:5, /* snoop mode */ mem_lock:2, /* lock instr */ mem_dtlb:7, /* tlb access */ - mem_rsvd:31; + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_rsvd:24; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:31, + __u64 mem_rsvd:24, + mem_snoopx:2, /* snoop mode, ext */ + mem_remote:1, /* remote */ + mem_lvl_num:4, /* memory hierarchy level number */ mem_dtlb:7, /* tlb access */ mem_lock:2, /* lock instr */ mem_snoop:5, /* snoop mode */ @@ -998,6 +1004,22 @@ union perf_mem_data_src { #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ #define PERF_MEM_LVL_SHIFT 5 +#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ +/* 5-0xa available */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + /* snoop mode */ #define PERF_MEM_SNOOP_NA 0x01 /* not available */ #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ @@ -1006,6 +1028,10 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ #define PERF_MEM_SNOOP_SHIFT 19 +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ +/* 1 free */ +#define PERF_MEM_SNOOPX_SHIFT 37 + /* locked instruction */ #define PERF_MEM_LOCK_NA 0x01 /* not available */ #define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ -- cgit v1.2.3 From 38ee7f2d47565689f35662d488d25e7afc43477d Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Fri, 25 Aug 2017 09:56:45 +0200 Subject: ipv6: sr: add support for encapsulation of L2 frames This patch implements the L2 frame encapsulation mechanism, referred to as T.Encaps.L2 in the SRv6 specifications [1]. A new type of SRv6 tunnel mode is added (SEG6_IPTUN_MODE_L2ENCAP). It only accepts packets with an existing MAC header (i.e., it will not work for locally generated packets). The resulting packet looks like IPv6 -> SRH -> Ethernet -> original L3 payload. The next header field of the SRH is set to NEXTHDR_NONE. [1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01 Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/uapi/linux/seg6_iptunnel.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index b6e5a0a1afd7..b23df9f58354 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -33,16 +33,26 @@ struct seg6_iptunnel_encap { enum { SEG6_IPTUN_MODE_INLINE, SEG6_IPTUN_MODE_ENCAP, + SEG6_IPTUN_MODE_L2ENCAP, }; #ifdef __KERNEL__ static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { - int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP); - - return ((tuninfo->srh->hdrlen + 1) << 3) + - (encap * sizeof(struct ipv6hdr)); + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return ((tuninfo->srh->hdrlen + 1) << 3) + head; } #endif -- cgit v1.2.3 From ddc088238cd6988bb4ac3776f403d7ff9d3c7a63 Mon Sep 17 00:00:00 2001 From: Pawel Baldysiak Date: Wed, 16 Aug 2017 17:13:45 +0200 Subject: md: Runtime support for multiple ppls Increase PPL area to 1MB and use it as circular buffer to store PPL. The entry with highest generation number is the latest one. If PPL to be written is larger then space left in a buffer, rewind the buffer to the start (don't wrap it). Signed-off-by: Pawel Baldysiak Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- include/uapi/linux/raid/md_p.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index d500bd224979..b9197976b660 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -324,9 +324,10 @@ struct mdp_superblock_1 { #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening * is guided by bitmap. */ -#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ +#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_PPL 1024 /* support PPL */ +#define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -338,6 +339,7 @@ struct mdp_superblock_1 { |MD_FEATURE_CLUSTERED \ |MD_FEATURE_JOURNAL \ |MD_FEATURE_PPL \ + |MD_FEATURE_MULTIPLE_PPLS \ ) struct r5l_payload_header { -- cgit v1.2.3 From 7a14724f54bf9889fcb1a9f1d4aa4e1d2e969d93 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 28 Aug 2017 08:33:20 -0700 Subject: libnvdimm: clean up command definitions Remove the command payloads that do not have an associated libnvdimm ioctl. I.e. remove the payloads that would only ever be carried in the ND_CMD_CALL envelope. This prevents userspace from growing unnecessary dependencies on this kernel header when userspace already has everything it needs to craft and send these commands. Cc: Jerry Hoemann Reported-by: Yasunori Goto Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 6d3c54264d8e..3f03567631cb 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -145,43 +145,6 @@ struct nd_cmd_clear_error { __u64 cleared; } __packed; -struct nd_cmd_trans_spa { - __u64 spa; - __u32 status; - __u8 flags; - __u8 _reserved[3]; - __u64 trans_length; - __u32 num_nvdimms; - struct nd_nvdimm_device { - __u32 nfit_device_handle; - __u32 _reserved; - __u64 dpa; - } __packed devices[0]; - -} __packed; - -struct nd_cmd_ars_err_inj { - __u64 err_inj_spa_range_base; - __u64 err_inj_spa_range_length; - __u8 err_inj_options; - __u32 status; -} __packed; - -struct nd_cmd_ars_err_inj_clr { - __u64 err_inj_clr_spa_range_base; - __u64 err_inj_clr_spa_range_length; - __u32 status; -} __packed; - -struct nd_cmd_ars_err_inj_stat { - __u32 status; - __u32 inj_err_rec_count; - struct nd_error_stat_query_record { - __u64 err_inj_stat_spa_range_base; - __u64 err_inj_stat_spa_range_length; - } __packed record[0]; -} __packed; - enum { ND_CMD_IMPLEMENTED = 0, -- cgit v1.2.3 From 464bc0fd6273d518aee79fbd37211dd9bc35d863 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:04 -0700 Subject: bpf: convert sockmap field attach_bpf_fd2 to type In the initial sockmap API we provided strparser and verdict programs using a single attach command by extending the attach API with a the attach_bpf_fd2 field. However, if we add other programs in the future we will be adding a field for every new possible type, attach_bpf_fd(3,4,..). This seems a bit clumsy for an API. So lets push the programs using two new type fields. BPF_SK_SKB_STREAM_PARSER BPF_SK_SKB_STREAM_VERDICT This has the advantage of having a readable name and can easily be extended in the future. Updates to samples and sockmap included here also generalize tests slightly to support upcoming patch for multiple map support. Signed-off-by: John Fastabend Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 843818dff96d..97227be3690c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,7 +136,8 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, - BPF_CGROUP_SMAP_INGRESS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -224,7 +225,6 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; - __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -580,14 +580,11 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_REDIRECT * - * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem - * @map_flags: sock map specific flags - * bit 1: Enable strparser - * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From 2f857d04601a1bb56958b95a9f180bce0e91e5e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:25 -0700 Subject: bpf: sockmap, remove STRPARSER map_flags and add multi-map support The addition of map_flags BPF_SOCKMAP_STRPARSER flags was to handle a specific use case where we want to have BPF parse program disabled on an entry in a sockmap. However, Alexei found the API a bit cumbersome and I agreed. Lets remove the STRPARSER flag and support the use case by allowing socks to be in multiple maps. This allows users to create two maps one with programs attached and one without. When socks are added to maps they now inherit any programs attached to the map. This is a nice generalization and IMO improves the API. The API rules are less ambiguous and do not need a flag: - When a sock is added to a sockmap we have two cases, i. The sock map does not have any attached programs so we can add sock to map without inheriting bpf programs. The sock may exist in 0 or more other maps. ii. The sock map has an attached BPF program. To avoid duplicate bpf programs we only add the sock entry if it does not have an existing strparser/verdict attached, returning -EBUSY if a program is already attached. Otherwise attach the program and inherit strparser/verdict programs from the sock map. This allows for socks to be in a multiple maps for redirects and inherit a BPF program from a single map. Also this patch simplifies the logic around BPF_{EXIST|NOEXIST|ANY} flags. In the original patch I tried to be extra clever and only update map entries when necessary. Now I've decided the complexity is not worth it. If users constantly update an entry with the same sock for no reason (i.e. update an entry without actually changing any parameters on map or sock) we still do an alloc/release. Using this and allowing multiple entries of a sock to exist in a map the logic becomes much simpler. Note: Now that multiple maps are supported the "maps" pointer called when a socket is closed becomes a list of maps to remove the sock from. To keep the map up to date when a sock is added to the sockmap we must add the map/elem in the list. Likewise when it is removed we must remove it from the list. This results in searching the per psock list on delete operation. On TCP_CLOSE events we walk the list and remove the psock from all map/entry locations. I don't see any perf implications in this because at most I have a psock in two maps. If a psock were to be in many maps its possibly this might be noticeable on delete but I can't think of a reason to dup a psock in many maps. The sk_callback_lock is used to protect read/writes to the list. This was convenient because in all locations we were taking the lock anyways just after working on the list. Also the lock is per sock so in normal cases we shouldn't see any contention. Suggested-by: Alexei Starovoitov Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97227be3690c..08c206a863e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,9 +143,6 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */ -#define BPF_SOCKMAP_STRPARSER (1U << 0) - /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup -- cgit v1.2.3 From 63e8d4394a2d226803f47abd7287dbb6d21bf8e4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Aug 2017 16:58:20 +0300 Subject: serial: pch_uart: Make port type explicit It used to be a gap in port definitions after PORT_MAX_8250. Since the new drivers are coming the gap become shorter and shorter until the commit a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x") completely removed it. So, while type here is just a formality, make things a little bit more explicit for this driver and move port types to UAPI header. Note, it uses two types for now. Fixes: fddceb8b5399 ("tty: 8250: Add 64byte UART support for FSL platforms") Cc: Priyanka Jain Cc: Poonam Aggrwal Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 38bea3217ead..502aa23c7e15 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -57,7 +57,6 @@ #define PORT_RT2880 29 /* Ralink RT2880 internal UART */ #define PORT_16550A_FSL64 30 /* Freescale 16550 UART with 64 FIFOs */ #define PORT_DA830 31 /* TI DA8xx/66AK2x */ -#define PORT_MAX_8250 31 /* max port ID */ /* * ARM specific type numbers. These are not currently guaranteed @@ -77,6 +76,10 @@ #define PORT_SUNZILOG 38 #define PORT_SUNSAB 39 +/* Intel EG20 */ +#define PORT_PCH_8LINE 44 +#define PORT_PCH_2LINE 45 + /* DEC */ #define PORT_DZ 46 #define PORT_ZS 47 -- cgit v1.2.3 From 3f3dac7e4d815cb7f929c0ed98c3a45a86852e53 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Aug 2017 16:58:21 +0300 Subject: serial: Remove unused port type PORT_MFD is not in use since commit 1bd187de5364 ("x86, intel-mid: remove Intel MID specific serial support") Remove leftover. Fixes: 1bd187de5364 ("x86, intel-mid: remove Intel MID specific serial support") Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 502aa23c7e15..00d335634271 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -209,9 +209,6 @@ /* MAX310X */ #define PORT_MAX310X 94 -/* High Speed UART for Medfield */ -#define PORT_MFD 95 - /* TI OMAP-UART */ #define PORT_OMAP 96 -- cgit v1.2.3 From ee1c90cc2cea80638f559c552371ee6893ca9d9e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Aug 2017 16:58:22 +0300 Subject: serial: Fix port type numbering for TI DA8xx The UAPI has a global list of unique numbers for different port types. The commit a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x") introduced a new port type and brought the collision with two other port types. Reuse 95 for it instead. Fixes: a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x") Cc: David Lechner Cc: Sekhar Nori Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 00d335634271..dc2d7cb766ab 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -56,7 +56,6 @@ #define PORT_ALTR_16550_F128 28 /* Altera 16550 UART with 128 FIFOs */ #define PORT_RT2880 29 /* Ralink RT2880 internal UART */ #define PORT_16550A_FSL64 30 /* Freescale 16550 UART with 64 FIFOs */ -#define PORT_DA830 31 /* TI DA8xx/66AK2x */ /* * ARM specific type numbers. These are not currently guaranteed @@ -209,6 +208,9 @@ /* MAX310X */ #define PORT_MAX310X 94 +/* TI DA8xx/66AK2x */ +#define PORT_DA830 95 + /* TI OMAP-UART */ #define PORT_OMAP 96 -- cgit v1.2.3 From 1c16ae65e2502da05310b2ec56b3a1fd3efe6f4d Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Mon, 21 Aug 2017 01:17:56 +0800 Subject: serial: 8250: of: Add new port type for MediaTek BTIF controller on MT7622/23 SoC MediaTek BTIF controller is the serial interface similar to UART but it works only as the digital device which is mainly used to communicate with the connectivity module called CONNSYS inside the SoC which could be mostly found on those MediaTek SoCs with Bluetooth feature such as MT7622 and MT7623 SoCs. And the controller is made as being compatible with the 8250 register layout with extra registers such as DMA enablement so it tends to be integrated with reusing 8250 OF driver. However, DMA mode is not being supported yet in the current driver. Signed-off-by: Sean Wang Suggested-by: Andy Shevchenko Acked-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index dc2d7cb766ab..50d71c436323 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -274,4 +274,7 @@ /* MPS2 UART */ #define PORT_MPS2UART 116 +/* MediaTek BTIF */ +#define PORT_MTK_BTIF 117 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From fc7ce9c74c3ad232b084d80148654f926d01ece7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 28 Aug 2017 20:52:49 -0400 Subject: perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR For understanding how the workload maps to memory channels and hardware behavior, it's very important to collect address maps with physical addresses. For example, 3D XPoint access can only be found by filtering the physical address. Add a new sample type for physical address. perf already has a facility to collect data virtual address. This patch introduces a function to convert the virtual address to physical address. The function is quite generic and can be extended to any architecture as long as a virtual address is provided. - For kernel direct mapping addresses, virt_to_phys is used to convert the virtual addresses to physical address. - For user virtual addresses, __get_user_pages_fast is used to walk the pages tables for user physical address. - This does not work for vmalloc addresses right now. These are not resolved, but code to do that could be added. The new sample type requires collecting the virtual address. The virtual address will not be output unless SAMPLE_ADDR is applied. For security, the physical address can only be exposed to root or privileged user. Tested-by: Madhavan Srinivasan Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mpe@ellerman.id.au Link: http://lkml.kernel.org/r/1503967969-48278-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2a37ae925d85..140ae638cfd6 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -139,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -814,6 +815,7 @@ enum perf_event_type { * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * }; */ PERF_RECORD_SAMPLE = 9, -- cgit v1.2.3 From 2804fd3af6ba5ae5737705b27146455eabe2e2f8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 28 Aug 2017 15:03:13 -0400 Subject: if_ether: add forces ife lfb type This patch adds the forces IFE lfb type according to IEEE registered ethertypes. See http://standards-oui.ieee.org/ethertype/eth.txt for more information. Since there exists the IFE subsystem it can be used there. This patch also use the correct word "ForCES" instead of "FoRCES" which is a spelling error inside the IEEE ethertype specification. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index efeb1190c2ca..f68f6bf4a253 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -104,6 +104,7 @@ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is less than this value -- cgit v1.2.3 From 155e6f649757c902901e599c268f8b575ddac1f8 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 28 Aug 2017 21:43:21 +0200 Subject: ether: add NSH ethertype The NSH draft says: An IEEE EtherType, 0x894F, has been allocated for NSH. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f68f6bf4a253..61f7ccce5b69 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -99,6 +99,7 @@ #define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ #define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */ #define ETH_P_HSR 0x892F /* IEC 62439-3 HSRv1 */ +#define ETH_P_NSH 0x894F /* Network Service Header */ #define ETH_P_LOOPBACK 0x9000 /* Ethernet loopback packet, per IEEE 802.3 */ #define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ -- cgit v1.2.3 From 31770e34e43d6c8dee129bfee77e56c34e61f0e5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 30 Aug 2017 19:24:58 +0200 Subject: tcp: Revert "tcp: remove header prediction" This reverts commit 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78. Eric Dumazet says: We found at Google a significant regression caused by 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 tcp: remove header prediction In typical RPC (TCP_RR), when a TCP socket receives data, we now call tcp_ack() while we used to not call it. This touches enough cache lines to cause a slowdown. so problem does not seem to be HP removal itself but the tcp_ack() call. Therefore, it might be possible to remove HP after all, provided one finds a way to elide tcp_ack for most cases. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b3f346fb9fe3..758f12b58541 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -184,7 +184,9 @@ enum LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */ LINUX_MIB_LISTENDROPS, /* ListenDrops */ + LINUX_MIB_TCPHPHITS, /* TCPHPHits */ LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */ + LINUX_MIB_TCPHPACKS, /* TCPHPAcks */ LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ -- cgit v1.2.3 From 7373ae7e8f0bf2c0718422481da986db5058b005 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 29 Aug 2017 22:44:16 -0600 Subject: net: ether: Add support for multiplexing and aggregation type Define the Qualcomm multiplexing and aggregation (MAP) ether type 0x00F9. This is needed for receiving data in the MAP protocol like RMNET. This is not an officially registered ID. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 61f7ccce5b69..9037065e23d0 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -140,6 +140,9 @@ #define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ #define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ #define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ +#define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and + * aggregation protocol + */ /* * This is an Ethernet frame header. -- cgit v1.2.3 From cdf4969c42a6c1a376dd03a9e846cf638d3cd4b1 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 29 Aug 2017 22:44:17 -0600 Subject: net: arp: Add support for raw IP device Define the raw IP type. This is needed for raw IP net devices like rmnet. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- include/uapi/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index cf73510b9238..a2a635620600 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -59,6 +59,7 @@ #define ARPHRD_LAPB 516 /* LAPB */ #define ARPHRD_DDCMP 517 /* Digital's DDCMP protocol */ #define ARPHRD_RAWHDLC 518 /* Raw HDLC */ +#define ARPHRD_RAWIP 519 /* Raw IP */ #define ARPHRD_TUNNEL 768 /* IPIP tunnel */ #define ARPHRD_TUNNEL6 769 /* IP6IP6 tunnel */ -- cgit v1.2.3 From e3bfed1df379c18f20feb06427d952b766e2c00f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 25 Aug 2017 19:53:39 +1000 Subject: KVM: PPC: Book3S HV: Report storage key support to userspace This adds information about storage keys to the struct returned by the KVM_PPC_GET_SMMU_INFO ioctl. The new fields replace a pad field, which was zeroed by previous kernel versions. Thus userspace that knows about the new fields will see zeroes when running on an older kernel, indicating that storage keys are not supported. The size of the structure has not changed. The number of keys is hard-coded for the CPUs supported by HV KVM, which is just POWER7, POWER8 and POWER9. Signed-off-by: Paul Mackerras Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- include/uapi/linux/kvm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6cd63c18708a..838887587411 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size { struct kvm_ppc_smmu_info { __u64 flags; __u32 slb_size; - __u32 pad; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; -- cgit v1.2.3 From 8fc614c0ae5cb5df11d6aa9559e63baacf20a840 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 31 Aug 2017 14:12:39 -0500 Subject: PCI/AER: Reformat AER register definitions Reformat so comments fit on same line as definition. No functional change intended. Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index c22d3ebaca20..46632aaee1c0 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -733,23 +733,17 @@ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ #define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */ #define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */ -/* Correctable Err Reporting Enable */ -#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 -/* Non-fatal Err Reporting Enable */ -#define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 -/* Fatal Err Reporting Enable */ -#define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 +#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */ +#define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 /* Non-Fatal Err Reporting Enable */ +#define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 /* Fatal Err Reporting Enable */ #define PCI_ERR_ROOT_STATUS 48 -#define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */ -/* Multi ERR_COR Received */ -#define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 -/* ERR_FATAL/NONFATAL Received */ -#define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 -/* Multi ERR_FATAL/NONFATAL Received */ -#define PCI_ERR_ROOT_MULTI_UNCOR_RCV 0x00000008 -#define PCI_ERR_ROOT_FIRST_FATAL 0x00000010 /* First Fatal */ -#define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */ -#define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ +#define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */ +#define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 /* Multiple ERR_COR */ +#define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 /* ERR_FATAL/NONFATAL */ +#define PCI_ERR_ROOT_MULTI_UNCOR_RCV 0x00000008 /* Multiple FATAL/NONFATAL */ +#define PCI_ERR_ROOT_FIRST_FATAL 0x00000010 /* First UNC is Fatal */ +#define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */ +#define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ #define PCI_ERR_ROOT_ERR_SRC 52 /* Error Source Identification */ /* Virtual Channel */ -- cgit v1.2.3 From 89e4fdecb51cf5535867026274bc97de9480ade5 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:43 -0700 Subject: loop: add ioctl for changing logical block size This is a different approach from the first attempt in f2c6df7dbf9a ("loop: support 4k physical blocksize"). Rather than extending LOOP_{GET,SET}_STATUS, add a separate ioctl just for setting the block size. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/uapi/linux/loop.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index c8125ec1f4f2..23158dbe2424 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -88,6 +88,7 @@ struct loop_info64 { #define LOOP_CHANGE_FD 0x4C06 #define LOOP_SET_CAPACITY 0x4C07 #define LOOP_SET_DIRECT_IO 0x4C08 +#define LOOP_SET_BLOCK_SIZE 0x4C09 /* /dev/loop-control interface */ #define LOOP_CTL_ADD 0x4C80 -- cgit v1.2.3 From ddef7ed2b5cbafae692d1d580bb5a07808926a9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jul 2017 18:58:37 +0200 Subject: annotate RWF_... flags [AV: added missing annotations in syscalls.h/compat.h] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/uapi/linux/aio_abi.h | 21 +++++++++++---------- include/uapi/linux/fs.h | 28 ++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index a2d4a8ac94ca..a04adbc70ddf 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -28,6 +28,7 @@ #define __LINUX__AIO_ABI_H #include +#include #include typedef __kernel_ulong_t aio_context_t; @@ -62,14 +63,6 @@ struct io_event { __s64 res2; /* secondary result */ }; -#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) -#define PADDED(x,y) x, y -#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) -#define PADDED(x,y) y, x -#else -#error edit for your odd byteorder. -#endif - /* * we always use a 64bit off_t when communicating * with userland. its up to libraries to do the @@ -79,8 +72,16 @@ struct io_event { struct iocb { /* these are internal to the kernel/libc. */ __u64 aio_data; /* data to be returned in event's data */ - __u32 PADDED(aio_key, aio_rw_flags); - /* the kernel sets aio_key to the req # */ + +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) + __u32 aio_key; /* the kernel sets aio_key to the req # */ + __kernel_rwf_t aio_rw_flags; /* RWF_* flags */ +#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) + __kernel_rwf_t aio_rw_flags; /* RWF_* flags */ + __u32 aio_key; /* the kernel sets aio_key to the req # */ +#else +#error edit for your odd byteorder. +#endif /* common fields */ __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7495d05e8de..56235dddea7d 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -358,13 +358,25 @@ struct fscrypt_key { #define SYNC_FILE_RANGE_WRITE 2 #define SYNC_FILE_RANGE_WAIT_AFTER 4 -/* flags for preadv2/pwritev2: */ -#define RWF_HIPRI 0x00000001 /* high priority request, poll if possible */ -#define RWF_DSYNC 0x00000002 /* per-IO O_DSYNC */ -#define RWF_SYNC 0x00000004 /* per-IO O_SYNC */ -#define RWF_NOWAIT 0x00000008 /* per-IO, return -EAGAIN if operation would block */ - -#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC |\ - RWF_NOWAIT) +/* + * Flags for preadv2/pwritev2: + */ + +typedef int __bitwise __kernel_rwf_t; + +/* high priority request, poll if possible */ +#define RWF_HIPRI ((__force __kernel_rwf_t)0x00000001) + +/* per-IO O_DSYNC */ +#define RWF_DSYNC ((__force __kernel_rwf_t)0x00000002) + +/* per-IO O_SYNC */ +#define RWF_SYNC ((__force __kernel_rwf_t)0x00000004) + +/* per-IO, return -EAGAIN if operation would block */ +#define RWF_NOWAIT ((__force __kernel_rwf_t)0x00000008) + +/* mask of flags supported by the kernel */ +#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT) #endif /* _UAPI_LINUX_FS_H */ -- cgit v1.2.3 From 1797f5b3cf0b3a73c42b89f7a8fd897417373730 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 31 Aug 2017 17:59:12 +0200 Subject: devlink: Add IPv6 header for dpipe This will be used by the IPv6 host table which will be introduced in the following patches. The fields in the header are added per-use. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6c172548589d..0cbca96c66b9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -234,9 +234,14 @@ enum devlink_dpipe_field_ipv4_id { DEVLINK_DPIPE_FIELD_IPV4_DST_IP, }; +enum devlink_dpipe_field_ipv6_id { + DEVLINK_DPIPE_FIELD_IPV6_DST_IP, +}; + enum devlink_dpipe_header_id { DEVLINK_DPIPE_HEADER_ETHERNET, DEVLINK_DPIPE_HEADER_IPV4, + DEVLINK_DPIPE_HEADER_IPV6, }; #endif /* _UAPI_LINUX_DEVLINK_H_ */ -- cgit v1.2.3 From 482dca939fb7ee35ba20b944b4c2476133dbf0df Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 31 Aug 2017 15:05:44 -0700 Subject: bpf: Add mark and priority to sock options that can be set Add socket mark and priority to fields that can be set by ebpf program when a socket is created. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 08c206a863e1..ba848b761cfb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -758,6 +758,8 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 -- cgit v1.2.3 From abcc61537e3566cae7f1fd225f2dcb82b3595fe3 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Thu, 31 Aug 2017 10:04:24 +0200 Subject: ANDROID: binder: Add BINDER_GET_NODE_DEBUG_INFO ioctl The BINDER_GET_NODE_DEBUG_INFO ioctl will return debug info on a node. Each successive call reusing the previous return value will return the next node. The data will be used by libmemunreachable to mark the pointers with kernel references as reachable. Signed-off-by: Colin Cross Signed-off-by: Martijn Coenen Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 7668b5791c91..84a9a0944e13 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -186,6 +186,19 @@ struct binder_version { #define BINDER_CURRENT_PROTOCOL_VERSION 8 #endif +/* + * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields. + * Set ptr to NULL for the first call to get the info for the first node, and + * then repeat the call passing the previously returned value to get the next + * nodes. ptr will be 0 when there are no more nodes. + */ +struct binder_node_debug_info { + binder_uintptr_t ptr; + binder_uintptr_t cookie; + __u32 has_strong_ref; + __u32 has_weak_ref; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -193,6 +206,7 @@ struct binder_version { #define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32) #define BINDER_THREAD_EXIT _IOW('b', 8, __s32) #define BINDER_VERSION _IOWR('b', 9, struct binder_version) +#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) /* * NOTE: Two special error codes you should check for when calling -- cgit v1.2.3 From 8db6c34f1dbc8e06aa016a9b829b06902c3e1340 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 8 May 2017 13:11:56 -0500 Subject: Introduce v3 namespaced file capabilities Root in a non-initial user ns cannot be trusted to write a traditional security.capability xattr. If it were allowed to do so, then any unprivileged user on the host could map his own uid to root in a private namespace, write the xattr, and execute the file with privilege on the host. However supporting file capabilities in a user namespace is very desirable. Not doing so means that any programs designed to run with limited privilege must continue to support other methods of gaining and dropping privilege. For instance a program installer must detect whether file capabilities can be assigned, and assign them if so but set setuid-root otherwise. The program in turn must know how to drop partial capabilities, and do so only if setuid-root. This patch introduces v3 of the security.capability xattr. It builds a vfs_ns_cap_data struct by appending a uid_t rootid to struct vfs_cap_data. This is the absolute uid_t (that is, the uid_t in user namespace which mounted the filesystem, usually init_user_ns) of the root id in whose namespaces the file capabilities may take effect. When a task asks to write a v2 security.capability xattr, if it is privileged with respect to the userns which mounted the filesystem, then nothing should change. Otherwise, the kernel will transparently rewrite the xattr as a v3 with the appropriate rootid. This is done during the execution of setxattr() to catch user-space-initiated capability writes. Subsequently, any task executing the file which has the noted kuid as its root uid, or which is in a descendent user_ns of such a user_ns, will run the file with capabilities. Similarly when asking to read file capabilities, a v3 capability will be presented as v2 if it applies to the caller's namespace. If a task writes a v3 security.capability, then it can provide a uid for the xattr so long as the uid is valid in its own user namespace, and it is privileged with CAP_SETFCAP over its namespace. The kernel will translate that rootid to an absolute uid, and write that to disk. After this, a task in the writer's namespace will not be able to use those capabilities (unless rootid was 0), but a task in a namespace where the given uid is root will. Only a single security.capability xattr may exist at a time for a given file. A task may overwrite an existing xattr so long as it is privileged over the inode. Note this is a departure from previous semantics, which required privilege to remove a security.capability xattr. This check can be re-added if deemed useful. This allows a simple setxattr to work, allows tar/untar to work, and allows us to tar in one namespace and untar in another while preserving the capability, without risking leaking privilege into a parent namespace. Example using tar: $ cp /bin/sleep sleepx $ mkdir b1 b2 $ lxc-usernsexec -m b:0:100000:1 -m b:1:$(id -u):1 -- chown 0:0 b1 $ lxc-usernsexec -m b:0:100001:1 -m b:1:$(id -u):1 -- chown 0:0 b2 $ lxc-usernsexec -m b:0:100000:1000 -- tar --xattrs-include=security.capability --xattrs -cf b1/sleepx.tar sleepx $ lxc-usernsexec -m b:0:100001:1000 -- tar --xattrs-include=security.capability --xattrs -C b2 -xf b1/sleepx.tar $ lxc-usernsexec -m b:0:100001:1000 -- getcap b2/sleepx b2/sleepx = cap_sys_admin+ep # /opt/ltp/testcases/bin/getv3xattr b2/sleepx v3 xattr, rootid is 100001 A patch to linux-test-project adding a new set of tests for this functionality is in the nsfscaps branch at github.com/hallyn/ltp Changelog: Nov 02 2016: fix invalid check at refuse_fcap_overwrite() Nov 07 2016: convert rootid from and to fs user_ns (From ebiederm: mar 28 2017) commoncap.c: fix typos - s/v4/v3 get_vfs_caps_from_disk: clarify the fs_ns root access check nsfscaps: change the code split for cap_inode_setxattr() Apr 09 2017: don't return v3 cap for caps owned by current root. return a v2 cap for a true v2 cap in non-init ns Apr 18 2017: . Change the flow of fscap writing to support s_user_ns writing. . Remove refuse_fcap_overwrite(). The value of the previous xattr doesn't matter. Apr 24 2017: . incorporate Eric's incremental diff . move cap_convert_nscap to setxattr and simplify its usage May 8, 2017: . fix leaking dentry refcount in cap_inode_getsecurity Signed-off-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- include/uapi/linux/capability.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 6fe14d001f68..230e05d35191 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32_2 2 #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2)) -#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2 -#define VFS_CAP_U32 VFS_CAP_U32_2 -#define VFS_CAP_REVISION VFS_CAP_REVISION_2 +#define VFS_CAP_REVISION_3 0x03000000 +#define VFS_CAP_U32_3 2 +#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3)) + +#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3 +#define VFS_CAP_U32 VFS_CAP_U32_3 +#define VFS_CAP_REVISION VFS_CAP_REVISION_3 struct vfs_cap_data { __le32 magic_etc; /* Little endian */ @@ -72,6 +76,18 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; +/* + * same as vfs_cap_data but with a rootid at the end + */ +struct vfs_ns_cap_data { + __le32 magic_etc; + struct { + __le32 permitted; /* Little endian */ + __le32 inheritable; /* Little endian */ + } data[VFS_CAP_U32]; + __le32 rootid; +}; + #ifndef __KERNEL__ /* -- cgit v1.2.3 From d897246df9fc0a5df97a784bf7b072be4a6ae479 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 31 Aug 2017 13:47:35 -0700 Subject: fsmap: fix documentation of FMR_OF_LAST The FMR_OF_LAST flag is set on the last fsmap record being returned for the dataset requested, contrary to what the header file says. Fix the docs to reflect the behavior of all fsmap implementations. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- include/uapi/linux/fsmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fsmap.h b/include/uapi/linux/fsmap.h index 7e8e5f0bd6d2..e5213c3e38b2 100644 --- a/include/uapi/linux/fsmap.h +++ b/include/uapi/linux/fsmap.h @@ -96,7 +96,7 @@ fsmap_advance( #define FMR_OF_EXTENT_MAP 0x4 /* segment = extent map */ #define FMR_OF_SHARED 0x8 /* segment = shared with another file */ #define FMR_OF_SPECIAL_OWNER 0x10 /* owner is a special value */ -#define FMR_OF_LAST 0x20 /* segment is the last in the FS */ +#define FMR_OF_LAST 0x20 /* segment is the last in the dataset */ /* Each FS gets to define its own special owner codes. */ #define FMR_OWNER(type, code) (((__u64)type << 32) | \ -- cgit v1.2.3 From c03fa9bcacd9ac04595cc13f34f3445f0a5ecf13 Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 31 Aug 2017 09:59:39 -0700 Subject: tcp_diag: report TCP MD5 signing keys and addresses Report TCP MD5 (RFC2385) signing keys, addresses and address prefixes to processes with CAP_NET_ADMIN requesting INET_DIAG_INFO. Currently it is not possible to retrieve these from the kernel once they have been configured on sockets. Signed-off-by: Ivan Delalande Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 1 + include/uapi/linux/tcp.h | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 678496897a68..f52ff62bfabe 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -143,6 +143,7 @@ enum { INET_DIAG_MARK, INET_DIAG_BBRINFO, INET_DIAG_CLASS_ID, + INET_DIAG_MD5SIG, __INET_DIAG_MAX, }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 030e594bab45..15c25eccab2b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -256,4 +256,13 @@ struct tcp_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ }; +/* INET_DIAG_MD5SIG */ +struct tcp_diag_md5sig { + __u8 tcpm_family; + __u8 tcpm_prefixlen; + __u16 tcpm_keylen; + __be32 tcpm_addr[4]; + __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; +}; + #endif /* _UAPI_LINUX_TCP_H */ -- cgit v1.2.3 From bea74641e3786d51dcf1175527cc1781420961c9 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Fri, 18 Aug 2017 20:58:59 +0200 Subject: netfilter: xt_hashlimit: add rate match mode This patch adds a new feature to hashlimit that allows matching on the current packet/byte rate without rate limiting. This can be enabled with a new flag --hashlimit-rate-match. The match returns true if the current rate of packets is above/below the user specified value. The main difference between the existing algorithm and the new one is that the existing algorithm rate-limits the flow whereas the new algorithm does not. Instead it *classifies* the flow based on whether it is above or below a certain rate. I will demonstrate this with an example below. Let us assume this rule: iptables -A INPUT -m hashlimit --hashlimit-above 10/s -j new_chain If the packet rate is 15/s, the existing algorithm would ACCEPT 10 packets every second and send 5 packets to "new_chain". But with the new algorithm, as long as the rate of 15/s is sustained, all packets will continue to match and every packet is sent to new_chain. This new functionality will let us classify different flows based on their current rate, so that further decisions can be made on them based on what the current rate is. This is how the new algorithm works: We divide time into intervals of 1 (sec/min/hour) as specified by the user. We keep track of the number of packets/bytes processed in the current interval. After each interval we reset the counter to 0. When we receive a packet for match, we look at the packet rate during the current interval and the previous interval to make a decision: if [ prev_rate < user and cur_rate < user ] return Below else return Above Where cur_rate is the number of packets/bytes seen in the current interval, prev is the number of packets/bytes seen in the previous interval and 'user' is the rate specified by the user. We also provide flexibility to the user for choosing the time interval using the option --hashilmit-interval. For example the user can keep a low rate like x/hour but still keep the interval as small as 1 second. To preserve backwards compatibility we have to add this feature in a new revision, so I've created revision 3 for hashlimit. The two new options we add are: --hashlimit-rate-match --hashlimit-rate-interval I have updated the help text to add these new options. Also added a few tests for the new options. Suggested-by: Igor Lubashev Reviewed-by: Josh Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_hashlimit.h | 36 ++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index 79da349f1060..aa98573248b1 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -19,12 +19,13 @@ struct xt_hashlimit_htable; enum { - XT_HASHLIMIT_HASH_DIP = 1 << 0, - XT_HASHLIMIT_HASH_DPT = 1 << 1, - XT_HASHLIMIT_HASH_SIP = 1 << 2, - XT_HASHLIMIT_HASH_SPT = 1 << 3, - XT_HASHLIMIT_INVERT = 1 << 4, - XT_HASHLIMIT_BYTES = 1 << 5, + XT_HASHLIMIT_HASH_DIP = 1 << 0, + XT_HASHLIMIT_HASH_DPT = 1 << 1, + XT_HASHLIMIT_HASH_SIP = 1 << 2, + XT_HASHLIMIT_HASH_SPT = 1 << 3, + XT_HASHLIMIT_INVERT = 1 << 4, + XT_HASHLIMIT_BYTES = 1 << 5, + XT_HASHLIMIT_RATE_MATCH = 1 << 6, }; struct hashlimit_cfg { @@ -79,6 +80,21 @@ struct hashlimit_cfg2 { __u8 srcmask, dstmask; }; +struct hashlimit_cfg3 { + __u64 avg; /* Average secs between packets * scale */ + __u64 burst; /* Period multiplier for upper limit. */ + __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ + + /* user specified */ + __u32 size; /* how many buckets */ + __u32 max; /* max number of entries */ + __u32 gc_interval; /* gc interval */ + __u32 expire; /* when do entries expire? */ + + __u32 interval; + __u8 srcmask, dstmask; +}; + struct xt_hashlimit_mtinfo1 { char name[IFNAMSIZ]; struct hashlimit_cfg1 cfg; @@ -95,4 +111,12 @@ struct xt_hashlimit_mtinfo2 { struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); }; +struct xt_hashlimit_mtinfo3 { + char name[NAME_MAX]; + struct hashlimit_cfg3 cfg; + + /* Used internally by the kernel */ + struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_HASHLIMIT_H */ -- cgit v1.2.3 From a691205571723cb0544110ca91653ac4b0eb5b17 Mon Sep 17 00:00:00 2001 From: "Pablo M. Bermudo Garay" Date: Wed, 23 Aug 2017 22:41:25 +0200 Subject: netfilter: nft_limit: add stateful object type Register a new limit stateful object type into the stateful object infrastructure. Signed-off-by: Pablo M. Bermudo Garay Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b49da72efa68..871afa4871bf 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1282,7 +1282,8 @@ enum nft_ct_helper_attributes { #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 #define NFT_OBJECT_CT_HELPER 3 -#define __NFT_OBJECT_MAX 4 +#define NFT_OBJECT_LIMIT 4 +#define __NFT_OBJECT_MAX 5 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** -- cgit v1.2.3 From 2335ba704f32b855651d0cd15dd9b271ec565fb6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 3 Sep 2017 23:55:59 +0200 Subject: netlink: add NLM_F_NONREC flag for deletion requests In the last NFWS in Faro, Portugal, we discussed that netlink is lacking the semantics to request non recursive deletions, ie. do not delete an object iff it has child objects that hang from this parent object that the user requests to be deleted. We need this new flag to solve a problem for the iptables-compat backward compatibility utility, that runs iptables commands using the existing nf_tables netlink interface. Specifically, custom chains in iptables cannot be deleted if there are rules in it, however, nf_tables allows to remove any chain that is populated with content. To sort out this asymmetry, iptables-compat userspace sets this new NLM_F_NONREC flag to obtain the same semantics that iptables provides. This new flag should only be used for deletion requests. Note this new flag value overlaps with the existing: * NLM_F_ROOT for get requests. * NLM_F_REPLACE for new requests. However, those flags should not ever be used in deletion requests. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f4fc9c9e123d..e8af60a7c56d 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -69,6 +69,9 @@ struct nlmsghdr { #define NLM_F_CREATE 0x400 /* Create, if it does not exist */ #define NLM_F_APPEND 0x800 /* Add to end of list */ +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + /* Flags for ACK message */ #define NLM_F_CAPPED 0x100 /* request was capped */ #define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ -- cgit v1.2.3 From aafd4562dfee81a40ba21b5ea3cf5e06664bc7f6 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:23:29 -0700 Subject: mm: arch: consolidate mmap hugetlb size encodings A non-default huge page size can be encoded in the flags argument of the mmap system call. The definitions for these encodings are in arch specific header files. However, all architectures use the same values. Consolidate all the definitions in the primary user header file (uapi/linux/mman.h). Include definitions for all known huge page sizes. Use the generic encoding definitions in hugetlb_encode.h as the basis for these definitions. Link: http://lkml.kernel.org/r/1501527386-10736-3-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Davidlohr Bueso Cc: Matthew Wilcox Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mman.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index ade4acd3a90c..a937480d7cd3 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -2,6 +2,7 @@ #define _UAPI_LINUX_MMAN_H #include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -10,4 +11,25 @@ #define OVERCOMMIT_ALWAYS 1 #define OVERCOMMIT_NEVER 2 +/* + * Huge page size encoding when MAP_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + #endif /* _UAPI_LINUX_MMAN_H */ -- cgit v1.2.3 From 4da243ac1cf6aeb30b7c555d56208982d66d6d33 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:23:33 -0700 Subject: mm: shm: use new hugetlb size encoding definitions Use the common definitions from hugetlb_encode.h header file for encoding hugetlb size definitions in shmget system call flags. In addition, move these definitions from the internal (kernel) to user (uapi) header file. Link: http://lkml.kernel.org/r/1501527386-10736-4-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: Matthew Wilcox Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Davidlohr Bueso Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/shm.h | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 1fbf24ea37fd..cf23c873719d 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -3,6 +3,7 @@ #include #include +#include #ifndef __KERNEL__ #include #endif @@ -40,11 +41,37 @@ struct shmid_ds { /* Include the definition of shmid64_ds and shminfo64 */ #include -/* permission flag for shmget */ +/* + * shmget() shmflg values. + */ +/* The bottom nine bits are the same as open(2) mode flags */ #define SHM_R 0400 /* or S_IRUGO from */ #define SHM_W 0200 /* or S_IWUGO from */ +/* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */ +#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ +#define SHM_NORESERVE 010000 /* don't check for reservations */ + +/* + * Huge page size encoding when SHM_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h + */ +#define SHM_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define SHM_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define SHM_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define SHM_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define SHM_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB -/* mode for attach */ +/* + * shmat() shmflg values + */ #define SHM_RDONLY 010000 /* read-only access */ #define SHM_RND 020000 /* round attach address to SHMLBA boundary */ #define SHM_REMAP 040000 /* take-over region on attach */ -- cgit v1.2.3 From 2d6d6f5a09a96cc1fec7ed992b825e05f64cb50e Mon Sep 17 00:00:00 2001 From: Prakash Sangappa Date: Wed, 6 Sep 2017 16:23:39 -0700 Subject: mm: userfaultfd: add feature to request for a signal delivery In some cases, userfaultfd mechanism should just deliver a SIGBUS signal to the faulting process, instead of the page-fault event. Dealing with page-fault event using a monitor thread can be an overhead in these cases. For example applications like the database could use the signaling mechanism for robustness purpose. Database uses hugetlbfs for performance reason. Files on hugetlbfs filesystem are created and huge pages allocated using fallocate() API. Pages are deallocated/freed using fallocate() hole punching support. These files are mmapped and accessed by many processes as shared memory. The database keeps track of which offsets in the hugetlbfs file have pages allocated. Any access to mapped address over holes in the file, which can occur due to bugs in the application, is considered invalid and expect the process to simply receive a SIGBUS. However, currently when a hole in the file is accessed via the mapped address, kernel/mm attempts to automatically allocate a page at page fault time, resulting in implicitly filling the hole in the file. This may not be the desired behavior for applications like the database that want to explicitly manage page allocations of hugetlbfs files. Using userfaultfd mechanism with this support to get a signal, database application can prevent pages from being allocated implicitly when processes access mapped address over holes in the file. This patch adds UFFD_FEATURE_SIGBUS feature to userfaultfd mechnism to request for a SIGBUS signal. See following for previous discussion about the database requirement leading to this proposal as suggested by Andrea. http://www.spinics.net/lists/linux-mm/msg129224.html Link: http://lkml.kernel.org/r/1501552446-748335-2-git-send-email-prakash.sangappa@oracle.com Signed-off-by: Prakash Sangappa Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..d39d5db56771 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -23,7 +23,8 @@ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ - UFFD_FEATURE_MISSING_SHMEM) + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -153,6 +154,12 @@ struct uffdio_api { * UFFD_FEATURE_MISSING_SHMEM works the same as * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem * (i.e. tmpfs and other shmem based APIs). + * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * The application process can enable this behavior by adding + * it to uffdio_api.features. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -161,6 +168,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) __u64 features; __u64 ioctls; -- cgit v1.2.3 From 9d4ac934829ac58c5109c49e6dfe677300e5e652 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Wed, 6 Sep 2017 16:23:56 -0700 Subject: userfaultfd: provide pid in userfault msg It could be useful for calculating downtime during postcopy live migration per vCPU. Side observer or application itself will be informed about proper task's sleep during userfaultfd processing. Process's thread id is being provided when user requeste it by setting UFFD_FEATURE_THREAD_ID bit into uffdio_api.features. Link: http://lkml.kernel.org/r/20170802165145.22628-6-aarcange@redhat.com Signed-off-by: Alexey Perevalov Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Maxime Coquelin Cc: Mike Kravetz Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index d39d5db56771..2b24c28d99a7 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -24,7 +24,8 @@ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM | \ - UFFD_FEATURE_SIGBUS) + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -79,6 +80,7 @@ struct uffd_msg { struct { __u64 flags; __u64 address; + __u32 ptid; } pagefault; struct { @@ -158,8 +160,9 @@ struct uffdio_api { * UFFD_FEATURE_SIGBUS feature means no page-fault * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead * a SIGBUS signal will be sent to the faulting process. - * The application process can enable this behavior by adding - * it to uffdio_api.features. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -169,6 +172,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) #define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) __u64 features; __u64 ioctls; -- cgit v1.2.3 From a36985d31a65d5c0559fb582719e32eaf0ccec3b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Sep 2017 16:23:59 -0700 Subject: userfaultfd: provide pid in userfault msg - add feat union No ABI change, but this will make it more explicit to software that ptid is only available if requested by passing UFFD_FEATURE_THREAD_ID to UFFDIO_API. The fact it's a union will also self document it shouldn't be taken for granted there's a tpid there. Link: http://lkml.kernel.org/r/20170802165145.22628-7-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Alexey Perevalov Cc: Maxime Coquelin Cc: Mike Kravetz Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 2b24c28d99a7..d6d1f65cb3c3 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -80,7 +80,9 @@ struct uffd_msg { struct { __u64 flags; __u64 address; - __u32 ptid; + union { + __u32 ptid; + } feat; } pagefault; struct { -- cgit v1.2.3 From 749df87bd7bee5a79cef073f5d032ddb2b211de8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:24:16 -0700 Subject: mm/shmem: add hugetlbfs support to memfd_create() This patch came out of discussions in this e-mail thread: http://lkml.kernel.org/r/1499357846-7481-1-git-send-email-mike.kravetz%40oracle.com The Oracle JVM team is developing a new garbage collection model. This new model requires multiple mappings of the same anonymous memory. One straight forward way to accomplish this is with memfd_create. They can use the returned fd to create multiple mappings of the same memory. The JVM today has an option to use (static hugetlb) huge pages. If this option is specified, they would like to use the same garbage collection model requiring multiple mappings to the same memory. Using hugetlbfs, it is possible to explicitly mount a filesystem and specify file paths in order to get an fd that can be used for multiple mappings. However, this introduces additional system admin work and coordination. Ideally they would like to get a hugetlbfs fd without requiring explicit mounting of a filesystem. Today, mmap and shmget can make use of hugetlbfs without explicitly mounting a filesystem. The patch adds this functionality to memfd_create. Add a new flag MFD_HUGETLB to memfd_create() that will specify the file to be created resides in the hugetlbfs filesystem. This is the generic hugetlbfs filesystem not associated with any specific mount point. As with other system calls that request hugetlbfs backed pages, there is the ability to encode huge page size in the flag arguments. hugetlbfs does not support sealing operations, therefore specifying MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL. Of course, the memfd_man page would need updating if this type of functionality moves forward. Link: http://lkml.kernel.org/r/1502149672-7759-2-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/memfd.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 534e364bda92..7f3a722dbd72 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -1,8 +1,32 @@ #ifndef _UAPI_LINUX_MEMFD_H #define _UAPI_LINUX_MEMFD_H +#include + /* flags for memfd_create(2) (unsigned int) */ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U +#define MFD_HUGETLB 0x0004U + +/* + * Huge page size encoding when MFD_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB #endif /* _UAPI_LINUX_MEMFD_H */ -- cgit v1.2.3 From 3dd8f7c3b78b9556582fd64bf5c9986723f9dca1 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Sep 2017 16:16:30 -0700 Subject: autofs: make dev ioctl version and ismountpoint user accessible Some of the autofs miscellaneous device ioctls need to be accessable to user space applications without CAP_SYS_ADMIN to get information about autofs mounts. Link: http://lkml.kernel.org/r/150216642517.11652.2338933266137331637.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Colin Walters Cc: Ondrej Holy Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/auto_dev-ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 744b3d060968..5558db8e6646 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -16,7 +16,7 @@ #define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 -#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 1 #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) -- cgit v1.2.3 From 1f28c5d055032e7e8ee5e48198dca7e125d0eec6 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Fri, 8 Sep 2017 16:16:34 -0700 Subject: autofs: remove unused AUTOFS_IOC_EXPIRE_DIRECT/INDIRECT These are not used by either kernel or userspace, although AUTOFS_IOC_EXPIRE_DIRECT once seems to have been used by userspace in around 2006-2008, which was technically just an alias of the existing ioctl AUTOFS_IOC_EXPIRE_MULTI. ioctls for autofs are already complicated enough that they could be removed unless these are staying here to be able to compile userspace code of certain period of time from a decade ago. Edit: raven@themaw.net Yes, this is indeed very old and anything that still uses must be updated becuase it will be using broken functionality. End edit: raven@themaw.net Link: http://lkml.kernel.org/r/150285067347.4670.11494624644273072003.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/auto_fs4.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 7c6da423d54e..9453e9a07c9d 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -155,8 +155,6 @@ enum { }; #define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int) -#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI -#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI #define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int) #define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int) -- cgit v1.2.3 From a2d818030135c293f878fbb772cf40e7a14c5acc Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Fri, 8 Sep 2017 16:17:19 -0700 Subject: drivers/pps: aesthetic tweaks to PPS-related content Collection of aesthetic adjustments to various PPS-related files, directories and Documentation, some quite minor just for the sake of consistency, including: * Updated example of pps device tree node (courtesy Rodolfo G.) * "PPS-API" -> "PPS API" * "pps_source_info_s" -> "pps_source_info" * "ktimer driver" -> "pps-ktimer driver" * "ppstest /dev/pps0" -> "ppstest /dev/pps1" to match example * Add missing PPS-related entries to MAINTAINERS file * Other trivialities Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1708261048220.8106@localhost.localdomain Signed-off-by: Robert P. J. Day Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/pps.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pps.h b/include/uapi/linux/pps.h index c1cb3825a8bc..c29d6b791c08 100644 --- a/include/uapi/linux/pps.h +++ b/include/uapi/linux/pps.h @@ -95,8 +95,8 @@ struct pps_kparams { #define PPS_CAPTURECLEAR 0x02 /* capture clear events */ #define PPS_CAPTUREBOTH 0x03 /* capture assert and clear events */ -#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert ev. */ -#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear ev. */ +#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert event */ +#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear event */ #define PPS_CANWAIT 0x100 /* can we wait for an event? */ #define PPS_CANPOLL 0x200 /* bit reserved for future use */ -- cgit v1.2.3 From 9beb8bedb05c5f3a353dba62b8fa7cbbb9ec685e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 Sep 2017 01:40:35 +0200 Subject: bpf: make error reporting in bpf_warn_invalid_xdp_action more clear Differ between illegal XDP action code and just driver unsupported one to provide better feedback when we throw a one-time warning here. Reason is that with 814abfabef3c ("xdp: add bpf_redirect helper function") not all drivers support the new XDP return code yet and thus they will fall into their 'default' case when checking for return codes after program return, which then triggers a bpf_warn_invalid_xdp_action() stating that the return code is illegal, but from XDP perspective it's not. I decided not to place something like a XDP_ACT_MAX define into uapi i) given we don't have this either for all other program types, ii) future action codes could have further encoding there, which would render such define unsuitable and we wouldn't be able to rip it out again, and iii) we rarely add new action codes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ba848b761cfb..43ab5c402f98 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -766,8 +766,8 @@ struct bpf_sock { /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. Unknown return codes will result - * in packet drop. + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). */ enum xdp_action { XDP_ABORTED = 0, -- cgit v1.2.3 From 19cab8872692960535aa6d12e3a295ac51d1a648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:13 -0700 Subject: net: ethtool: Add back transceiver type Commit 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") deprecated the ethtool_cmd::transceiver field, which was fine in premise, except that the PHY library was actually using it to report the type of transceiver: internal or external. Use the first word of the reserved field to put this __u8 transceiver field back in. It is made read-only, and we don't expect the ETHTOOL_xLINKSETTINGS API to be doing anything with this anyway, so this is mostly for the legacy path where we do: ethtool_get_settings() -> dev->ethtool_ops->get_link_ksettings() -> convert_link_ksettings_to_legacy_settings() to have no information loss compared to the legacy get_settings API. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9c041dae8e2c..5bd1b1de4ea0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1753,6 +1753,8 @@ enum ethtool_reset_flags { * %ethtool_link_mode_bit_indices for the link modes, and other * link features that the link partner advertised through * autonegotiation; 0 if unknown or not applicable. Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. * * If autonegotiation is disabled, the speed and @duplex represent the * fixed link mode and are writable if the driver supports multiple @@ -1804,7 +1806,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix; __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; - __u32 reserved[8]; + __u8 transceiver; + __u8 reserved1[3]; + __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; -- cgit v1.2.3