diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2026-03-02 18:49:43 -0800 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2026-03-02 18:49:43 -0800 |
| commit | cd994652225f8758cb1e5c9dd879320dcf7ce4e5 (patch) | |
| tree | 277923ee9b620b511a000d8b5ad27f5296edc5b0 | |
| parent | a0e8c9a5060fbdb72fca767164467a3cf2b8fc30 (diff) | |
| parent | bddafc06ca5ee1be4d10061f7954c6d6be5dc1d8 (diff) | |
Merge branch 'ipmr-no-rtnl-for-rtnl_family_ipmr-rtnetlink'
Kuniyuki Iwashima says:
====================
ipmr: No RTNL for RTNL_FAMILY_IPMR rtnetlink.
This series removes RTNL from ipmr rtnetlink handlers.
After this series, a few RTNL users remain in net/ipv4/ipmr.c;
they will be converted to per-netns RTNL in a separate
series.
Patch 1 adds a selftest intended to exercise most of the RTNL paths
in net/ipv4/ipmr.c.
Patches 2 - 6 convert the RTM_GETLINK / RTM_GETROUTE handlers
to RCU.
Patches 7 - 9 convert ->exit_batch() to ->exit_rtnl() to
save one RTNL acquisition in cleanup_net().
Patches 10 - 11 remove the unnecessary RTNL during setup_net()
failure.
Patch 12 is a random cleanup.
Patches 13 - 15 drop RTNL for RTM_NEWROUTE and RTM_DELROUTE.
====================
Link: https://patch.msgid.link/20260228221800.1082070-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
| -rw-r--r-- | include/linux/mroute_base.h | 9 | ||||
| -rw-r--r-- | include/net/netns/ipv4.h | 6 | ||||
| -rw-r--r-- | include/net/netns/ipv6.h | 2 | ||||
| -rw-r--r-- | net/ipv4/ipmr.c | 265 | ||||
| -rw-r--r-- | net/ipv4/ipmr_base.c | 4 | ||||
| -rw-r--r-- | net/ipv6/ip6mr.c | 4 | ||||
| -rw-r--r-- | tools/testing/selftests/net/forwarding/.gitignore | 1 | ||||
| -rw-r--r-- | tools/testing/selftests/net/forwarding/Makefile | 4 | ||||
| -rw-r--r-- | tools/testing/selftests/net/forwarding/ipmr.c | 455 |
9 files changed, 636 insertions, 114 deletions
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0075f6e5c3da..cf3374580f74 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -76,7 +76,7 @@ static inline int mr_call_vif_notifiers(struct net *net, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { @@ -89,7 +89,7 @@ static inline int mr_call_vif_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } @@ -198,7 +198,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { @@ -208,8 +208,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, .tb_id = tb_id }; - ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8e971c7bf164..4c249aeaf7f1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -279,6 +279,9 @@ struct netns_ipv4 { struct list_head mr_tables; struct fib_rules_ops *mr_rules_ops; #endif + struct fib_notifier_ops *ipmr_notifier_ops; + atomic_t ipmr_seq; + struct mutex mfc_mutex; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; @@ -290,9 +293,6 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* writes protected by rtnl_mutex */ - struct fib_notifier_ops *ipmr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ - atomic_t rt_genid; siphash_key_t ip_id_key; struct hlist_head *inet_addr_lst; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 
34bdb1308e8f..499e4288170f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -118,7 +118,7 @@ struct netns_ipv6 { struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 131382c388e9..8a08d09b4c30 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -102,7 +102,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); -static void ipmr_free_table(struct mr_table *mrt); +static void ipmr_free_table(struct mr_table *mrt, + struct list_head *dev_kill_list); static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, @@ -112,7 +113,8 @@ static int ipmr_cache_report(const struct mr_table *mrt, static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); -static void mroute_clean_tables(struct mr_table *mrt, int flags); +static void mroute_clean_tables(struct mr_table *mrt, int flags, + struct list_head *dev_kill_list); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -250,6 +252,7 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { static int __net_init ipmr_rules_init(struct net *net) { struct fib_rules_ops *ops; + LIST_HEAD(dev_kill_list); struct mr_table *mrt; int err; @@ -273,9 +276,7 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; err2: - rtnl_lock(); - ipmr_free_table(mrt); - rtnl_unlock(); + ipmr_free_table(mrt, &dev_kill_list); err1: fib_rules_unregister(ops); return err; @@ -283,14 +284,18 @@ err1: static void __net_exit 
ipmr_rules_exit(struct net *net) { + fib_rules_unregister(net->ipv4.mr_rules_ops); +} + +static void __net_exit ipmr_rules_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) +{ struct mr_table *mrt, *next; - ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); - ipmr_free_table(mrt); + ipmr_free_table(mrt, dev_kill_list); } - fib_rules_unregister(net->ipv4.mr_rules_ops); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, @@ -348,8 +353,13 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { - ASSERT_RTNL(); - ipmr_free_table(net->ipv4.mrt); +} + +static void __net_exit ipmr_rules_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) +{ + ipmr_free_table(net->ipv4.mrt, dev_kill_list); + net->ipv4.mrt = NULL; } @@ -424,17 +434,22 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) ipmr_expire_process, ipmr_new_table_set); } -static void ipmr_free_table(struct mr_table *mrt) +static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_list) { struct net *net = read_pnet(&mrt->net); + LIST_HEAD(ipmr_dev_kill_list); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | - MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, + &ipmr_dev_kill_list); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); + + WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list)); + list_splice(&ipmr_dev_kill_list, dev_kill_list); } /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ @@ -1196,7 +1211,6 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, 
mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -1223,7 +1237,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -1293,12 +1306,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, } /* Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt, int flags) +static void mroute_clean_tables(struct mr_table *mrt, int flags, + struct list_head *dev_kill_list) { struct net *net = read_pnet(&mrt->net); - struct mr_mfc *c, *tmp; struct mfc_cache *cache; - LIST_HEAD(list); + struct mr_mfc *c, *tmp; int i; /* Shut down all active vif entries */ @@ -1308,13 +1321,14 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) !(flags & MRT_FLUSH_VIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) continue; - vif_delete(mrt, i, 0, &list); + vif_delete(mrt, i, 0, dev_kill_list); } - unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { + mutex_lock(&net->ipv4.mfc_mutex); + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) @@ -1327,6 +1341,8 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) mroute_netlink_event(mrt, cache, RTM_DELROUTE); mr_cache_put(c); } + + mutex_unlock(&net->ipv4.mfc_mutex); } if (flags & MRT_FLUSH_MFC) { @@ -1349,9 +1365,11 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); + LIST_HEAD(dev_kill_list); struct mr_table *mrt; rtnl_lock(); + ipmr_for_each_table(mrt, net) { if (sk == 
rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; @@ -1360,9 +1378,13 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); + mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC, + &dev_kill_list); } } + + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); } @@ -1478,14 +1500,21 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, } if (parent == 0) parent = mfc.mfcc_parent; + + mutex_lock(&net->ipv4.mfc_mutex); + if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); + + mutex_unlock(&net->ipv4.mfc_mutex); break; - case MRT_FLUSH: + case MRT_FLUSH: { + LIST_HEAD(dev_kill_list); + if (optlen != sizeof(val)) { ret = -EINVAL; break; @@ -1494,8 +1523,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, ret = -EFAULT; break; } - mroute_clean_tables(mrt, val); + + mroute_clean_tables(mrt, val, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); break; + } /* Control PIM assert. 
*/ case MRT_ASSERT: if (optlen != sizeof(val)) { @@ -1506,7 +1538,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, ret = -EFAULT; break; } - mrt->mroute_do_assert = val; + WRITE_ONCE(mrt->mroute_do_assert, val); break; case MRT_PIM: if (!ipmr_pimsm_enabled()) { @@ -1525,9 +1557,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { - mrt->mroute_do_pim = val; - mrt->mroute_do_assert = val; - mrt->mroute_do_wrvifwhole = do_wrvifwhole; + WRITE_ONCE(mrt->mroute_do_pim, val); + WRITE_ONCE(mrt->mroute_do_assert, val); + WRITE_ONCE(mrt->mroute_do_wrvifwhole, do_wrvifwhole); } break; case MRT_TABLE: @@ -1610,10 +1642,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, case MRT_PIM: if (!ipmr_pimsm_enabled()) return -ENOPROTOOPT; - val = mrt->mroute_do_pim; + val = READ_ONCE(mrt->mroute_do_pim); break; case MRT_ASSERT: - val = mrt->mroute_do_assert; + val = READ_ONCE(mrt->mroute_do_assert); break; default: return -ENOPROTOOPT; @@ -2037,20 +2069,20 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, atomic_long_inc(&c->_c.mfc_un.res.wrong_if); - if (true_vifi >= 0 && mrt->mroute_do_assert && + if (true_vifi >= 0 && READ_ONCE(mrt->mroute_do_assert) && /* pimsm uses asserts, when switching from RPT to SPT, * so that we cannot check that packet arrived on an oif. * It is bad, but otherwise we would need to move pretty * large chunk of pimd to kernel. Ough... 
--ANK */ - (mrt->mroute_do_pim || + (READ_ONCE(mrt->mroute_do_pim) || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); - if (mrt->mroute_do_wrvifwhole) + if (READ_ONCE(mrt->mroute_do_wrvifwhole)) ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRVIFWHOLE); } @@ -2358,7 +2390,7 @@ int pim_rcv_v1(struct sk_buff *skb) mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; - if (!mrt->mroute_do_pim || + if (!READ_ONCE(mrt->mroute_do_pim) || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; @@ -2510,7 +2542,7 @@ static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, cmd, flags); } -static size_t mroute_msgsize(bool unresolved, int maxvif) +static size_t mroute_msgsize(bool unresolved) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -2523,7 +2555,7 @@ static size_t mroute_msgsize(bool unresolved, int maxvif) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ - + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) + + MAXVIFS * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; @@ -2538,8 +2570,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, - mrt->maxvif), + skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS), GFP_ATOMIC); if (!skb) goto errout; @@ -2681,9 +2712,9 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX + 1]; - struct sk_buff *skb = NULL; struct mfc_cache *cache; struct mr_table *mrt; + struct sk_buff *skb; __be32 src, grp; u32 tableid; int err; @@ -2696,39 +2727,40 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh, grp = nla_get_in_addr_default(tb[RTA_DST], 0); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); + skb = nlmsg_new(mroute_msgsize(false), GFP_KERNEL); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + rcu_read_lock(); + mrt = __ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); if (!mrt) { err = -ENOENT; - goto errout_free; + goto errout_unlock; } - /* entries are added/deleted only under RTNL */ - rcu_read_lock(); cache = ipmr_cache_find(mrt, src, grp); - rcu_read_unlock(); if (!cache) { err = -ENOENT; - goto errout_free; - } - - skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL); - if (!skb) { - err = -ENOBUFS; - goto errout_free; + goto errout_unlock; } err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) - goto errout_free; + goto errout_unlock; - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + rcu_read_unlock(); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; -errout_free: +errout_unlock: + rcu_read_unlock(); kfree_skb(skb); goto errout; } @@ -2736,15 +2768,17 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { - .rtnl_held = true, + .rtnl_held = false, }; int err; + rcu_read_lock(); + if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh, &filter, cb); if (err < 0) - return err; + goto out; } if (filter.table_id) { @@ -2752,19 +2786,28 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { - if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) - return skb->len; + if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) { + err = skb->len; + goto out; + } NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist"); - return -ENOENT; + err = -ENOENT; + goto out; } + err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute, 
&mfc_unres_lock, &filter); - return skb->len ? : err; + err = skb->len ? : err; + goto out; } - return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, - _ipmr_fill_mroute, &mfc_unres_lock, &filter); + err = mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, + _ipmr_fill_mroute, &mfc_unres_lock, &filter); +out: + rcu_read_unlock(); + + return err; } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { @@ -2808,10 +2851,10 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; + int ret, rem, iif = 0; struct mr_table *mrt; struct nlattr *attr; struct rtmsg *rtm; - int ret, rem; ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, extack); @@ -2838,11 +2881,7 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); break; case RTA_IIF: - dev = __dev_get_by_index(net, nla_get_u32(attr)); - if (!dev) { - ret = -ENODEV; - goto out; - } + iif = nla_get_u32(attr); break; case RTA_MULTIPATH: if (ipmr_nla_get_ttls(attr, mfcc) < 0) { @@ -2858,16 +2897,30 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, break; } } + + rcu_read_lock(); + mrt = __ipmr_get_table(net, tblid); if (!mrt) { ret = -ENOENT; - goto out; + goto unlock; } + + if (iif) { + dev = dev_get_by_index_rcu(net, iif); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + + mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); + } + *mrtret = mrt; *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 
1 : 0; - if (dev) - mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); +unlock: + rcu_read_unlock(); out: return ret; } @@ -2877,21 +2930,26 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - int ret, mrtsock, parent; - struct mr_table *tbl; + int ret, mrtsock = 0, parent; + struct mr_table *tbl = NULL; struct mfcctl mfcc; - mrtsock = 0; - tbl = NULL; ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); if (ret < 0) return ret; parent = ret ? mfcc.mfcc_parent : -1; + + mutex_lock(&net->ipv4.mfc_mutex); + if (nlh->nlmsg_type == RTM_NEWROUTE) - return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); + ret = ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); else - return ipmr_mfc_delete(tbl, &mfcc, parent); + ret = ipmr_mfc_delete(tbl, &mfcc, parent); + + mutex_unlock(&net->ipv4.mfc_mutex); + + return ret; } static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) @@ -2901,12 +2959,13 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, - mrt->mroute_reg_vif_num) || + READ_ONCE(mrt->mroute_reg_vif_num)) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, - mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + READ_ONCE(mrt->mroute_do_assert)) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, + READ_ONCE(mrt->mroute_do_pim)) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, - mrt->mroute_do_wrvifwhole)) + READ_ONCE(mrt->mroute_do_wrvifwhole))) return false; return true; @@ -2919,7 +2978,7 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) struct vif_device *vif; vif = &mrt->vif_table[vifid]; - vif_dev = rtnl_dereference(vif->dev); + vif_dev = vif_dev_read(vif); /* if the VIF doesn't exist just continue */ if 
(!vif_dev) return true; @@ -2928,16 +2987,16 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) if (!vif_nest) return false; - if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) || + if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, READ_ONCE(vif_dev->ifindex)) || nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || - nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, READ_ONCE(vif->bytes_in), IPMRA_VIFA_PAD) || - nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out, + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, READ_ONCE(vif->bytes_out), IPMRA_VIFA_PAD) || - nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, READ_ONCE(vif->pkt_in), IPMRA_VIFA_PAD) || - nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, READ_ONCE(vif->pkt_out), IPMRA_VIFA_PAD) || nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) || nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) { @@ -2992,6 +3051,8 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; s_e = cb->args[1]; + rcu_read_lock(); + ipmr_for_each_table(mrt, net) { struct nlattr *vifs, *af; struct ifinfomsg *hdr; @@ -3026,7 +3087,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) nlmsg_end(skb, nlh); goto out; } - for (i = 0; i < mrt->maxvif; i++) { + for (i = 0; i < READ_ONCE(mrt->maxvif); i++) { if (e < s_e) goto skip_entry; if (!ipmr_fill_vif(mrt, i, skb)) { @@ -3048,6 +3109,8 @@ skip_table: } out: + rcu_read_unlock(); + cb->args[1] = e; cb->args[0] = t; @@ -3185,7 +3248,7 @@ static const struct net_protocol pim_protocol = { static unsigned int ipmr_seq_read(const struct net *net) { - return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); + return atomic_read(&net->ipv4.ipmr_seq) + 
ipmr_rules_seq_read(net); } static int ipmr_dump(struct net *net, struct notifier_block *nb, @@ -3206,7 +3269,7 @@ static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv4.ipmr_seq = 0; + atomic_set(&net->ipv4.ipmr_seq, 0); ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); if (IS_ERR(ops)) @@ -3225,8 +3288,11 @@ static void __net_exit ipmr_notifier_exit(struct net *net) /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { + LIST_HEAD(dev_kill_list); int err; + mutex_init(&net->ipv4.mfc_mutex); + err = ipmr_notifier_init(net); if (err) goto ipmr_notifier_fail; @@ -3250,9 +3316,8 @@ static int __net_init ipmr_net_init(struct net *net) proc_cache_fail: remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: - rtnl_lock(); + ipmr_rules_exit_rtnl(net, &dev_kill_list); ipmr_rules_exit(net); - rtnl_unlock(); #endif ipmr_rules_fail: ipmr_notifier_exit(net); @@ -3266,34 +3331,32 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_rules_exit(net); ipmr_notifier_exit(net); } -static void __net_exit ipmr_net_exit_batch(struct list_head *net_list) +static void __net_exit ipmr_net_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { - struct net *net; - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - ipmr_rules_exit(net); - rtnl_unlock(); + ipmr_rules_exit_rtnl(net, dev_kill_list); } static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, - .exit_batch = ipmr_net_exit_batch, + .exit_rtnl = ipmr_net_exit_rtnl, }; static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, - .dumpit = ipmr_rtm_dumplink}, + .dumpit = ipmr_rtm_dumplink, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, - 
.doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, - .doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, - .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute}, + .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; int __init ip_mr_init(void) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 2d62526406ca..b0fd9ffa01a2 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -223,7 +223,7 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, rcu_read_lock(); vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev); - if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) { + if (vif_dev && nla_put_u32(skb, RTA_IIF, READ_ONCE(vif_dev->ifindex)) < 0) { rcu_read_unlock(); return -EMSGSIZE; } @@ -252,7 +252,7 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = vif_dev->ifindex; + nhp->rtnh_ifindex = READ_ONCE(vif_dev->ifindex); nhp->rtnh_len = sizeof(*nhp); } } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e047a4680ab0..85010ff21c98 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1280,7 +1280,7 @@ static int ip6mr_device_event(struct notifier_block *this, static unsigned int ip6mr_seq_read(const struct net *net) { - return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); + return atomic_read(&net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, @@ -1305,7 +1305,7 @@ static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv6.ipmr_seq = 0; + atomic_set(&net->ipv6.ipmr_seq, 0); ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if 
(IS_ERR(ops)) diff --git a/tools/testing/selftests/net/forwarding/.gitignore b/tools/testing/selftests/net/forwarding/.gitignore index 2dea317f12e7..418ff96c52ef 100644 --- a/tools/testing/selftests/net/forwarding/.gitignore +++ b/tools/testing/selftests/net/forwarding/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only forwarding.config +ipmr diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index ff4a00d91a26..bbaf4d937dd8 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -133,6 +133,10 @@ TEST_FILES := \ tc_common.sh \ # end of TEST_FILES +TEST_GEN_PROGS := \ + ipmr +# end of TEST_GEN_PROGS + TEST_INCLUDES := \ $(wildcard ../lib/sh/*.sh) \ ../lib.sh \ diff --git a/tools/testing/selftests/net/forwarding/ipmr.c b/tools/testing/selftests/net/forwarding/ipmr.c new file mode 100644 index 000000000000..df870aad9ead --- /dev/null +++ b/tools/testing/selftests/net/forwarding/ipmr.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include <linux/if.h> +#include <linux/mroute.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/socket.h> +#include <sched.h> +#include <sys/ioctl.h> +#include <sys/socket.h> + +#include "kselftest_harness.h" + +FIXTURE(ipmr) +{ + int netlink_sk; + int raw_sk; + int veth_ifindex; +}; + +FIXTURE_VARIANT(ipmr) +{ + int family; + int protocol; + int level; + int opts[MRT_MAX - MRT_BASE + 1]; +}; + +FIXTURE_VARIANT_ADD(ipmr, ipv4) +{ + .family = AF_INET, + .protocol = IPPROTO_IGMP, + .level = IPPROTO_IP, + .opts = { + MRT_INIT, + MRT_DONE, + MRT_ADD_VIF, + MRT_DEL_VIF, + MRT_ADD_MFC, + MRT_DEL_MFC, + MRT_VERSION, + MRT_ASSERT, + MRT_PIM, + MRT_TABLE, + MRT_ADD_MFC_PROXY, + MRT_DEL_MFC_PROXY, + MRT_FLUSH, + }, +}; + +struct mfc_attr { + int table; + __u32 origin; + __u32 group; + int ifindex; + bool proxy; +}; + +static struct rtattr 
*nl_add_rtattr(struct nlmsghdr *nlmsg, struct rtattr *rta, + int type, const void *data, int len) +{ + int unused = 0; + + rta->rta_type = type; + rta->rta_len = RTA_LENGTH(len); + memcpy(RTA_DATA(rta), data, len); + + nlmsg->nlmsg_len += NLMSG_ALIGN(rta->rta_len); + + return RTA_NEXT(rta, unused); +} + +static int nl_sendmsg_mfc(struct __test_metadata *_metadata, FIXTURE_DATA(ipmr) *self, + __u16 nlmsg_type, struct mfc_attr *mfc_attr) +{ + struct { + struct nlmsghdr nlmsg; + struct rtmsg rtm; + char buf[4096]; + } req = { + .nlmsg = { + .nlmsg_len = NLMSG_LENGTH(sizeof(req.rtm)), + /* ipmr does not care about NLM_F_CREATE and NLM_F_EXCL ... */ + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, + .nlmsg_type = nlmsg_type, + }, + .rtm = { + /* hard requirements in rtm_to_ipmr_mfcc() */ + .rtm_family = RTNL_FAMILY_IPMR, + .rtm_dst_len = 32, + .rtm_type = RTN_MULTICAST, + .rtm_scope = RT_SCOPE_UNIVERSE, + .rtm_protocol = RTPROT_MROUTED, + }, + }; + struct nlmsghdr *nlmsg = &req.nlmsg; + struct nlmsgerr *errmsg; + struct rtattr *rta; + int err; + + rta = (struct rtattr *)&req.buf; + rta = nl_add_rtattr(nlmsg, rta, RTA_TABLE, &mfc_attr->table, sizeof(mfc_attr->table)); + rta = nl_add_rtattr(nlmsg, rta, RTA_SRC, &mfc_attr->origin, sizeof(mfc_attr->origin)); + rta = nl_add_rtattr(nlmsg, rta, RTA_DST, &mfc_attr->group, sizeof(mfc_attr->group)); + if (mfc_attr->ifindex) + rta = nl_add_rtattr(nlmsg, rta, RTA_IIF, &mfc_attr->ifindex, sizeof(mfc_attr->ifindex)); + if (mfc_attr->proxy) + rta = nl_add_rtattr(nlmsg, rta, RTA_PREFSRC, NULL, 0); + + err = send(self->netlink_sk, &req, req.nlmsg.nlmsg_len, 0); + ASSERT_EQ(err, req.nlmsg.nlmsg_len); + + memset(&req, 0, sizeof(req)); + + err = recv(self->netlink_sk, &req, sizeof(req), 0); + ASSERT_TRUE(NLMSG_OK(nlmsg, err)); + ASSERT_EQ(NLMSG_ERROR, nlmsg->nlmsg_type); + + errmsg = (struct nlmsgerr *)NLMSG_DATA(nlmsg); + return errmsg->error; +} + +FIXTURE_SETUP(ipmr) +{ + struct ifreq ifr = { + .ifr_name = "veth0", + }; + int err; + + err = 
unshare(CLONE_NEWNET); + ASSERT_EQ(0, err); + + self->netlink_sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + ASSERT_LE(0, self->netlink_sk); + + self->raw_sk = socket(variant->family, SOCK_RAW, variant->protocol); + ASSERT_LT(0, self->raw_sk); + + err = system("ip link add veth0 type veth peer veth1"); + ASSERT_EQ(0, err); + + err = ioctl(self->raw_sk, SIOCGIFINDEX, &ifr); + ASSERT_EQ(0, err); + + self->veth_ifindex = ifr.ifr_ifindex; +} + +FIXTURE_TEARDOWN(ipmr) +{ + close(self->raw_sk); + close(self->netlink_sk); +} + +TEST_F(ipmr, mrt_init) +{ + int err, val = 0; /* any value is ok, but size must be int for MRT_INIT. */ + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_INIT - MRT_BASE], + &val, sizeof(val)); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DONE - MRT_BASE], + &val, sizeof(val)); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_add_vif_register) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_REGISTER, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_vif | grep -q pimreg"); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_del_vif_unreg) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_vif | grep -q veth0"); + ASSERT_EQ(0, err); + + /* VIF is removed along with its device. */ + err = system("ip link del veth0"); + ASSERT_EQ(0, err); + + /* mrt->vif_table[veth_ifindex]->dev is NULL. 
*/ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(-1, err); + ASSERT_EQ(EADDRNOTAVAIL, errno); +} + +TEST_F(ipmr, mrt_del_vif_netns_dismantle) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + /* Let cleanup_net() remove veth0 and VIF. */ +} + +TEST_F(ipmr, mrt_add_mfc) +{ + struct mfcctl mfc = {}; + int err; + + /* MRT_ADD_MFC / MRT_ADD_MFC_PROXY does not need vif to exist (unlike netlink). */ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE], + &mfc, sizeof(mfc)); + ASSERT_EQ(0, err); + + /* (0.0.0.0 -> 0.0.0.0) */ + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_MFC - MRT_BASE], + &mfc, sizeof(mfc)); +} + +TEST_F(ipmr, mrt_add_mfc_proxy) +{ + struct mfcctl mfc = {}; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_MFC_PROXY - MRT_BASE], + &mfc, sizeof(mfc)); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_MFC_PROXY - MRT_BASE], + &mfc, sizeof(mfc)); +} + +TEST_F(ipmr, mrt_add_mfc_netlink) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = false, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + err = 
nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_add_mfc_netlink_proxy) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = true, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_add_mfc_netlink_no_vif) +{ + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .proxy = false, + }; + int err; + + /* netlink always requires RTA_IIF of an existing vif. */ + mfc_attr.ifindex = 0; + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(-ENFILE, err); + + /* netlink always requires RTA_IIF of an existing vif. 
*/ + mfc_attr.ifindex = self->veth_ifindex; + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(-ENFILE, err); +} + +TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) +{ + struct vifctl vifs[2] = { + { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }, + { + .vifc_vifi = 1, + .vifc_flags = VIFF_REGISTER, + } + }; + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = false, + }; + int i, err; + + for (i = 0; i < 2; i++) { + /* Create 2 VIFs just to avoid -ENFILE later. */ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vifs[i], sizeof(vifs[i])); + ASSERT_EQ(0, err); + } + + /* Create a MFC for mrt->vif_table[0]. */ + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + /* Remove mrt->vif_table[0]. */ + err = system("ip link del veth0"); + ASSERT_EQ(0, err); + + /* MFC entry is NOT removed even if the tied VIF is removed... */ + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + /* ... and netlink is not capable of removing such an entry + * because netlink always requires a valid RTA_IIF ... :/ + */ + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + ASSERT_EQ(-ENODEV, err); + + /* It can be removed by setsockopt(), but let cleanup_net() remove this time. */ +} + +TEST_F(ipmr, mrt_table_flush) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + struct mfc_attr mfc_attr = { + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = false, + }; + int table_id = 92; + int err, flags; + + /* Set a random table id rather than RT_TABLE_DEFAULT. 
+ * Note that /proc/net/ip_mr_{vif,cache} only supports RT_TABLE_DEFAULT. + */ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_TABLE - MRT_BASE], + &table_id, sizeof(table_id)); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + mfc_attr.table = table_id; + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + /* Flush mrt->vif_table[] and all caches. */ + flags = MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC; + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_FLUSH - MRT_BASE], + &flags, sizeof(flags)); + ASSERT_EQ(0, err); +} + +TEST_HARNESS_MAIN |
