diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2025-06-17 18:18:48 -0700 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2025-06-17 18:18:49 -0700 |
| commit | e15962ae74d9d093ecf3cf7f60dc145801d9273e (patch) | |
| tree | a3152a57bd55adb24a514ce957c0b981690eb2cd /include/net | |
| parent | ccde40812ad0525143e333fbf8ff1bba53f782f2 (diff) | |
| parent | e3180379e2df535ac492c24ecc9719bf83b7a0f9 (diff) | |
Merge branch 'ipmr-ip6mr-allow-mc-routing-locally-generated-mc-packets'
Petr Machata says:
====================
ipmr, ip6mr: Allow MC-routing locally-generated MC packets
Multicast routing is today handled in the input path. Locally generated MC
packets don't hit the IPMR code. Thus if a VXLAN remote address is
multicast, the driver needs to set an OIF during route lookup. In practice
that means that MC routing configuration needs to be kept in sync with the
VXLAN FDB and MDB. Ideally, the VXLAN packets would be routed by the MC
routing code instead.
To that end, this patchset adds support to route locally generated
multicast packets.
However, an installation that uses a VXLAN underlay netdevice for which it
also has matching MC routes, would get a different routing with this patch.
Previously, the MC packets would be delivered directly to the underlay
port, whereas now they would be MC-routed. In order to avoid this change in
behavior, introduce an IPCB/IP6CB flag. Unless the flag is set, the new
MC-routing code is skipped.
All this is keyed to a new VXLAN attribute, IFLA_VXLAN_MC_ROUTE. Only when
it is set does any of the above engage.
In addition to that, and as is the case today with MC forwarding,
IPV4_DEVCONF_MC_FORWARDING must be enabled for the netdevice that acts as a
source of MC traffic (i.e. the VXLAN PHYS_DEV), so an MC daemon must be
attached to the netdevice.
When a VXLAN netdevice with a MC remote is brought up, the physical
netdevice joins the indicated MC group. This is important for local
delivery of MC packets, so it is still necessary to configure a physical
netdevice -- the parameter cannot go away. The netdevice would however
typically not be a front panel port, but a dummy. An MC daemon would then
sit on top of that netdevice as well as any front panel ports that it needs
to service, and have routes set up between the two.
A way to configure the VXLAN netdevice to take advantage of the new MC
routing would be:
# ip link add name d up type dummy
# ip link add name vx10 up type vxlan id 1000 dstport 4789 \
local 192.0.2.1 group 225.0.0.1 ttl 16 dev d mrcoute
# ip link set dev vx10 master br # plus vlans etc.
With the following MC routes:
(192.0.2.1, 225.0.0.1) iif=d oil=swp1,swp2 # TX route
(*, 225.0.0.1) iif=swp1 oil=d,swp2 # RX route
(*, 225.0.0.1) iif=swp2 oil=d,swp1 # RX route
The RX path has not changed, with the exception of an extra MC hop. Packets
are delivered to the front panel port and MC-forwarded to the VXLAN
physical port, here "d". Since the port has joined the multicast group, the
packets are locally delivered, and end up being processed by the VXLAN
netdevice.
This patchset is based on earlier patches from Nikolay Aleksandrov and
Roopa Prabhu, though it underwent significant changes. Roopa broadly
presented the topic on LPC 2019 [0].
Patchset progression:
- Patches #1 to #4 add ip_mr_output()
- Patches #5 to #10 add ip6_mr_output()
- Patch #11 adds the VXLAN bits to enable MR engagement
- Patches #12 to #14 prepare selftest libraries
- Patch #15 includes a new test suite
[0] https://www.youtube.com/watch?v=xlReECfi-uo
====================
Link: https://patch.msgid.link/cover.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include/net')
| -rw-r--r-- | include/net/ip.h | 2 | ||||
| -rw-r--r-- | include/net/ip6_tunnel.h | 3 | ||||
| -rw-r--r-- | include/net/ip_tunnels.h | 2 | ||||
| -rw-r--r-- | include/net/udp_tunnel.h | 19 | ||||
| -rw-r--r-- | include/net/vxlan.h | 5 |
5 files changed, 19 insertions, 12 deletions
diff --git a/include/net/ip.h b/include/net/ip.h index 47ed6d23853d..375304bb99f6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -59,6 +59,7 @@ struct inet_skb_parm { #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) +#define IPSKB_MCROUTE BIT(10) u16 frag_max_size; }; @@ -167,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 399592405c72..dd163495f353 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -152,11 +152,12 @@ int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0c3d571a04a1..8cf1380f3656 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -603,7 +603,7 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, - u8 tos, u8 ttl, __be16 df, bool xnet); + u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 2df3b8344eb5..e3c70b579095 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -150,15 +150,16 @@ static inline void udp_tunnel_drop_rx_info(struct net_device *dev) void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); - -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); + bool xnet, bool nocheck, u16 ipcb_flags); + +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags); void udp_tunnel_sock_release(struct socket *sock); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index e2f7ca045d3e..0ee50785f4f1 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -332,6 +332,7 @@ struct vxlan_dev { #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 +#define VXLAN_F_MC_ROUTE 0x100000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -353,7 +354,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ - VXLAN_F_LOCALBYPASS) + VXLAN_F_LOCALBYPASS | \ + VXLAN_F_MC_ROUTE | \ + 0) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); |
