From 160ade55607ee5cfd0c0553f02bb2ba1a338e0b2 Mon Sep 17 00:00:00 2001 From: Yue Gao Date: Thu, 18 Jun 2026 14:33:43 -0700 Subject: [PATCH] vpp vlan bvi support - Add sonic-ext plugin - see https://github.com/sonic-net/SONiC/pull/2387 Signed-off-by: Yue Gao --- docker-sonic-vpp/conf/startup.conf.tmpl | 5 +- docker-syncd-vpp/conf/startup.conf.tmpl | 5 +- rules/vpp.mk | 2 +- vppbld/plugins/sonic_ext/CMakeLists.txt | 25 ++ vppbld/plugins/sonic_ext/FEATURE.yaml | 8 + .../sonic_ext/aggr_tap_redirect_node.c | 299 ++++++++++++++++++ .../plugins/sonic_ext/bcast_redirect_node.c | 275 ++++++++++++++++ vppbld/plugins/sonic_ext/capture_node.c | 157 +++++++++ vppbld/plugins/sonic_ext/cli.c | 108 +++++++ vppbld/plugins/sonic_ext/host_xc_node.c | 213 +++++++++++++ vppbld/plugins/sonic_ext/sonic_ext.c | 299 ++++++++++++++++++ vppbld/plugins/sonic_ext/sonic_ext.h | 160 ++++++++++ 12 files changed, 1547 insertions(+), 9 deletions(-) create mode 100644 vppbld/plugins/sonic_ext/CMakeLists.txt create mode 100644 vppbld/plugins/sonic_ext/FEATURE.yaml create mode 100644 vppbld/plugins/sonic_ext/aggr_tap_redirect_node.c create mode 100644 vppbld/plugins/sonic_ext/bcast_redirect_node.c create mode 100644 vppbld/plugins/sonic_ext/capture_node.c create mode 100644 vppbld/plugins/sonic_ext/cli.c create mode 100644 vppbld/plugins/sonic_ext/host_xc_node.c create mode 100644 vppbld/plugins/sonic_ext/sonic_ext.c create mode 100644 vppbld/plugins/sonic_ext/sonic_ext.h diff --git a/docker-sonic-vpp/conf/startup.conf.tmpl b/docker-sonic-vpp/conf/startup.conf.tmpl index 9ba68c5..6513b6d 100644 --- a/docker-sonic-vpp/conf/startup.conf.tmpl +++ b/docker-sonic-vpp/conf/startup.conf.tmpl @@ -252,6 +252,7 @@ plugins { plugin vxlan_plugin.so { enable } plugin tunterm_acl_plugin.so { enable } plugin ip_validate_plugin.so { enable } + plugin sonic_ext_plugin.so { enable } ## Enable all plugins by default and then selectively disable specific plugins # plugin dpdk_plugin.so { disable } @@ -299,7 +300,3 @@ plugins { # class dpdk/cryptodev { rate-limit 100 level debug syslog-level error } # } -# Why not support sub interfaces -linux-cp { - lcp-auto-subint -} diff --git a/docker-syncd-vpp/conf/startup.conf.tmpl b/docker-syncd-vpp/conf/startup.conf.tmpl index edb95f7..1d6f98e 100644 --- a/docker-syncd-vpp/conf/startup.conf.tmpl +++ b/docker-syncd-vpp/conf/startup.conf.tmpl @@ -240,6 +240,7 @@ plugins { plugin vxlan_plugin.so { enable } plugin tunterm_acl_plugin.so { enable } plugin ip_validate_plugin.so { enable } + plugin sonic_ext_plugin.so { enable } ## Enable all plugins by default and then selectively disable specific plugins # plugin dpdk_plugin.so { disable } @@ -287,7 +288,3 @@ plugins { # class dpdk/cryptodev { rate-limit 100 level debug syslog-level error } # } -# Why not support sub interfaces -linux-cp { - lcp-auto-subint -} diff --git a/rules/vpp.mk b/rules/vpp.mk index 3725d99..56ee659 100644 --- a/rules/vpp.mk +++ b/rules/vpp.mk @@ -7,7 +7,7 @@ VPP_VERSION_BASE = 2606 # https://packages.buildkite.com/sonic-vpp/vpp; if the suffix isn't bumped, # downstream sonic-buildimage builds will silently pull stale debs that # pre-date the new patch series and end up with VPP/SAI CRC drift. -VPP_VERSION = $(VPP_VERSION_BASE)-0.2 +VPP_VERSION = $(VPP_VERSION_BASE)-0.3 VPP_VERSION_SONIC = $(VPP_VERSION)+b1sonic1 VPP_SRC_PATH = platform/vpp/vppbld diff --git a/vppbld/plugins/sonic_ext/CMakeLists.txt b/vppbld/plugins/sonic_ext/CMakeLists.txt new file mode 100644 index 0000000..d37f2f4 --- /dev/null +++ b/vppbld/plugins/sonic_ext/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2026 SONiC-VPP contributors +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_vpp_plugin(sonic_ext + SOURCES + sonic_ext.c + capture_node.c + aggr_tap_redirect_node.c + host_xc_node.c + bcast_redirect_node.c + cli.c + + LINK_LIBRARIES + lcp +) diff --git a/vppbld/plugins/sonic_ext/FEATURE.yaml b/vppbld/plugins/sonic_ext/FEATURE.yaml new file mode 100644 index 0000000..d544f3f --- /dev/null +++ b/vppbld/plugins/sonic_ext/FEATURE.yaml @@ -0,0 +1,8 @@ +name: sonic_ext +maintainer: SONiC-VPP contributors +features: + - punt-via-member: redirect BVI-RX punted unicast/ARP to the original member tap + - host-xc: bypass ethernet-input for packets injected from the linux-cp host tap +description: "SONiC VPP extensions for BVI/L2 punt and host-side cross-connect" +state: experimental +properties: [API, CLI] diff --git a/vppbld/plugins/sonic_ext/aggr_tap_redirect_node.c b/vppbld/plugins/sonic_ext/aggr_tap_redirect_node.c new file mode 100644 index 0000000..fbb7dcf --- /dev/null +++ b/vppbld/plugins/sonic_ext/aggr_tap_redirect_node.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2026 SONiC-VPP contributors + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include + +/* + * sonic-ext-aggr-tap-redirect + * + * Single feature node on the `interface-output` arc, enabled per + * sw_if_index on the LCP host tap of every "aggregate" phy -- + * BVI today, bond / port-channel later. All three legacy redirect + * paths (ip4-punt, ip6-punt, arp) converge here: + * + * phy(rx) --> ethernet-input --> { l2-input | ip4-input | ... } + * --> ... --> linux-cp-punt-xc + * --> (linux-cp-punt-xc rewrites VLIB_TX = aggr_host_tap) + * --> interface-output[aggr_host_tap] <-- WE RUN HERE + * + * The capture node (device-input arc, enabled on every phy) stamps + * the buffer with magic + orig_rx_sw_if_index when the packet first + * enters VPP. Here we recover that stamp, look up the LCP pair of + * the original phy, and rewrite VLIB_TX to point at the *member* + * tap so Linux observes the packet on the correct netdev. + * + * Hooking on `interface-output` (rather than the upstream ip4-punt / + * ip6-punt / arp arcs) is the only correct location: by the time + * linux-cp-punt-xc has dispatched, VLIB_TX is already the aggr tap + * and the ip-punt/arp arcs no longer fire. It also unifies the L2 + * flooded ARP path with the L3 punt path -- l2-flood + bvi-to-l3 + * + linux-cp-punt-xc all funnel into interface-output. + * + * Because we re-enter "interface-output" with VLIB_TX changed to the + * member tap, the feature-arc config of the aggregate tap no longer + * applies: the next pass uses the member tap's per-interface + * features, and since this redirect is only enabled on aggregate + * taps (not member taps) there is no recursion. The cleared magic + * cookie is a second line of defence. + */ + +typedef struct +{ + u32 aggr_tap_sw_if_index; /* sw_if_index VLIB_TX was on entry */ + u32 orig_rx_sw_if_index; /* phy/sub-if where the packet entered VPP */ + u32 member_tap_sw_if_index; /* host tap of orig_rx's parent phy */ + u16 pushed_tpid; /* 0x8100 / 0x88a8 / 0x9100 if pushed, else 0 */ + u16 pushed_vlan_id; /* outer vlan id pushed (when pushed_tpid != 0) */ + u32 redirected; +} sonic_ext_aggr_tap_redirect_trace_t; + +static u8 * +format_sonic_ext_aggr_tap_redirect_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + sonic_ext_aggr_tap_redirect_trace_t *t = + va_arg (*args, sonic_ext_aggr_tap_redirect_trace_t *); + s = format (s, + "SONIC-EXT-AGGR-TAP-REDIRECT: aggr-tap %u orig-rx %u " + "member-tap %u", + t->aggr_tap_sw_if_index, t->orig_rx_sw_if_index, + t->member_tap_sw_if_index); + if (t->pushed_tpid) + s = format (s, " vlan-pushed vid %u tpid 0x%04x", t->pushed_vlan_id, + t->pushed_tpid); + s = format (s, " %s", t->redirected ? "REDIRECTED" : "passthru"); + return s; +} + +#define foreach_sonic_ext_aggr_tap_redirect_error \ + _ (REDIRECTED, "aggregate tap punt redirected to member tap") \ + _ (NO_COOKIE, "no capture cookie -- left on aggregate tap") \ + _ (NO_LCP, "no LCP pair for original phy -- left on aggregate tap") \ + _ (DISABLED, "punt-via-member disabled -- left on aggregate tap") + +typedef enum +{ +#define _(sym, str) SONIC_EXT_AGGR_TAP_REDIRECT_ERROR_##sym, + foreach_sonic_ext_aggr_tap_redirect_error +#undef _ + SONIC_EXT_AGGR_TAP_REDIRECT_N_ERROR, +} sonic_ext_aggr_tap_redirect_error_t; + +static char *sonic_ext_aggr_tap_redirect_error_strings[] = { +#define _(sym, string) string, + foreach_sonic_ext_aggr_tap_redirect_error +#undef _ +}; + +typedef enum +{ + SONIC_EXT_AGGR_TAP_REDIRECT_NEXT_INTERFACE_OUTPUT, + SONIC_EXT_AGGR_TAP_REDIRECT_N_NEXT, +} sonic_ext_aggr_tap_redirect_next_t; + +VLIB_NODE_FN (sonic_ext_aggr_tap_redirect_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + vnet_main_t *vnm = vnet_get_main (); + u32 n_left_from, *from; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; + u16 nexts[VLIB_FRAME_SIZE], *next; + u32 n_redirected = 0, n_no_cookie = 0, n_no_lcp = 0, n_disabled = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + vlib_get_buffers (vm, from, bufs, n_left_from); + b = bufs; + next = nexts; + + while (n_left_from > 0) + { + u32 next0 = 0; + sonic_ext_buffer_opaque_t *seb; + vnet_sw_interface_t *swo; + const lcp_itf_pair_t *mlip; + index_t mlipi; + u32 aggr_tap = vnet_buffer (b[0])->sw_if_index[VLIB_TX]; + u32 orig_rx = ~0; + u32 member_tap = ~0; + u32 saved_vlan_tag = 0; + u16 pushed_tpid = 0; + u16 pushed_vlan_id = 0; + int did_redirect = 0; + i32 adv; + + /* Always default to the feature-arc next so non-redirected + * packets continue to the aggregate tap's TX node. */ + vnet_feature_next (&next0, b[0]); + next[0] = (u16) next0; + + if (PREDICT_FALSE (!sem->punt_via_member)) + { + n_disabled++; + goto trace0; + } + + seb = sonic_ext_buffer (b[0]); + if (PREDICT_FALSE (seb->magic != SONIC_EXT_BUFFER_MAGIC)) + { + n_no_cookie++; + goto trace0; + } + orig_rx = seb->orig_rx_sw_if_index; + saved_vlan_tag = seb->orig_vlan_tag; + + /* Clear the cookie before any branch that might let the buffer + * out -- this defends both against re-entering this node and + * against a future packet on a recycled buffer. */ + seb->magic = 0; + + swo = vnet_get_sw_interface (vnm, orig_rx); + { + u32 phy_sw = (swo->type == VNET_SW_INTERFACE_TYPE_SUB) + ? swo->sup_sw_if_index + : orig_rx; + mlipi = lcp_itf_pair_find_by_phy (phy_sw); + } + if (PREDICT_FALSE (mlipi == INDEX_INVALID)) + { + n_no_lcp++; + goto trace0; + } + mlip = lcp_itf_pair_get (mlipi); + member_tap = mlip->lip_host_sw_if_index; + + /* Don't redirect to ourselves -- can happen if linux-cp punt + * already pointed VLIB_TX at the right tap, e.g. in a future + * config where a non-aggregate phy somehow ends up on this + * arc with the feature enabled. Cheap defence. */ + if (PREDICT_FALSE (member_tap == aggr_tap)) + { + n_no_lcp++; /* count under the "no useful redirect" bucket */ + goto trace0; + } + + /* Rewind to the original wire L2 (recovers any vlan tag bytes + * that ethernet-input parsed past). See sonic_ext.h header + * comment on l2_hdr_offset. */ + adv = + (i32) vnet_buffer (b[0])->l2_hdr_offset - (i32) b[0]->current_data; + vlib_buffer_advance (b[0], adv); + + /* Re-push the outer VLAN tag from the wire-time snapshot the + * capture node took. This is the only reliable source: at + * capture time (device-input arc) we are still positioned at + * the head of the ethernet frame, before ethernet-input has + * classified or stripped anything. Mirror the inverse of + * l2_vtr push-1: save dst+src mac, advance the buffer back + * by 4, write dst+src to the new position, write TPID+TCI + * at offset 12. Skip if a tag is already present + * (transparent bridge / no-pop config) or the snapshot was + * empty (untagged ingress). */ + if (saved_vlan_tag && b[0]->current_data >= 4) + { + u8 *cur = vlib_buffer_get_current (b[0]); + u16 cur_etype = clib_net_to_host_u16 (*(u16 *) (cur + 12)); + if (cur_etype != ETHERNET_TYPE_VLAN && + cur_etype != ETHERNET_TYPE_DOT1AD && + cur_etype != ETHERNET_TYPE_VLAN_9100) + { + u8 save_macs[12]; + u8 *new_cur; + clib_memcpy_fast (save_macs, cur, 12); + vlib_buffer_advance (b[0], -4); + new_cur = vlib_buffer_get_current (b[0]); + clib_memcpy_fast (new_cur, save_macs, 12); + clib_memcpy_fast (new_cur + 12, &saved_vlan_tag, 4); + vnet_buffer (b[0])->l2_hdr_offset -= 4; + pushed_tpid = + clib_net_to_host_u16 (*(u16 *) &saved_vlan_tag); + pushed_vlan_id = + clib_net_to_host_u16 (*((u16 *) &saved_vlan_tag + 1)) & + 0x0fff; + } + } + + vnet_buffer (b[0])->sw_if_index[VLIB_TX] = member_tap; + next[0] = SONIC_EXT_AGGR_TAP_REDIRECT_NEXT_INTERFACE_OUTPUT; + did_redirect = 1; + n_redirected++; + + trace0: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b[0]->flags & VLIB_BUFFER_IS_TRACED))) + { + sonic_ext_aggr_tap_redirect_trace_t *t = + vlib_add_trace (vm, node, b[0], sizeof (*t)); + t->aggr_tap_sw_if_index = aggr_tap; + t->orig_rx_sw_if_index = orig_rx; + t->member_tap_sw_if_index = member_tap; + t->pushed_tpid = pushed_tpid; + t->pushed_vlan_id = pushed_vlan_id; + t->redirected = did_redirect; + } + + b += 1; + next += 1; + n_left_from -= 1; + } + + vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors); + + if (n_redirected) + vlib_node_increment_counter (vm, sonic_ext_aggr_tap_redirect_node.index, + SONIC_EXT_AGGR_TAP_REDIRECT_ERROR_REDIRECTED, + n_redirected); + if (n_no_cookie) + vlib_node_increment_counter (vm, sonic_ext_aggr_tap_redirect_node.index, + SONIC_EXT_AGGR_TAP_REDIRECT_ERROR_NO_COOKIE, + n_no_cookie); + if (n_no_lcp) + vlib_node_increment_counter (vm, sonic_ext_aggr_tap_redirect_node.index, + SONIC_EXT_AGGR_TAP_REDIRECT_ERROR_NO_LCP, + n_no_lcp); + if (n_disabled) + vlib_node_increment_counter (vm, sonic_ext_aggr_tap_redirect_node.index, + SONIC_EXT_AGGR_TAP_REDIRECT_ERROR_DISABLED, + n_disabled); + + sem->aggr_tap_redirects += n_redirected; + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (sonic_ext_aggr_tap_redirect_node) = { + .name = "sonic-ext-aggr-tap-redirect", + .vector_size = sizeof (u32), + .format_trace = format_sonic_ext_aggr_tap_redirect_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (sonic_ext_aggr_tap_redirect_error_strings), + .error_strings = sonic_ext_aggr_tap_redirect_error_strings, + .n_next_nodes = SONIC_EXT_AGGR_TAP_REDIRECT_N_NEXT, + .next_nodes = { + [SONIC_EXT_AGGR_TAP_REDIRECT_NEXT_INTERFACE_OUTPUT] = "interface-output", + }, +}; + +VNET_FEATURE_INIT (sonic_ext_aggr_tap_redirect_feat, static) = { + .arc_name = "interface-output", + .node_name = "sonic-ext-aggr-tap-redirect", + /* Run before the actual tap TX; no specific peer ordering required + * since we either redirect (and re-enter the arc) or fall through + * to whatever is configured next on the aggregate-tap interface. */ +}; diff --git a/vppbld/plugins/sonic_ext/bcast_redirect_node.c b/vppbld/plugins/sonic_ext/bcast_redirect_node.c new file mode 100644 index 0000000..a896e88 --- /dev/null +++ b/vppbld/plugins/sonic_ext/bcast_redirect_node.c @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2026 SONiC-VPP contributors + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include +#include + +#define SONIC_EXT_DHCP_BOOTPC 68 +#define SONIC_EXT_DHCP_BOOTPS 67 + +/* + * sonic-ext-bcast-redirect + * + * Feature on the `ip4-unicast` arc. Enabled per sw_if_index on + * every BVI to catch DHCPv4 client → server broadcasts that flooded + * into the BVI from one of the bridge's member ports, and dispatch + * them to `linux-cp-punt` so the frame ends up on the originating + * member's host tap (via aggr-tap-redirect). + * + * Match criteria (all required): + * - IPv4 dst == 255.255.255.255 (limited broadcast) + * - IP protocol == UDP (17) + * - UDP src port == 68 (bootpc) AND dst port == 67 (bootps) + * - sonic_ext cookie magic OK (frame arrived via a stamped wire phy) + * + * Anything else — unicast, subnet-directed broadcast, non-UDP, + * non-DHCP UDP broadcasts (NetBIOS, vendor discovery, RIPv1, WoL, + * etc.) — passes through to `ip4-lookup`. The narrow match avoids + * accidentally hijacking other limited-broadcast traffic that the + * BVI/host might want to handle locally. The set can be widened + * later if other limited-broadcast control protocols need per-member + * delivery; today DHCP is the only realistic case on a SONiC switch. + * + * member-N RX + * -> sonic-ext-capture (stamps cookie {orig_rx, orig_vlan_tag}) + * -> ethernet-input -> l2-input -> ... -> l2-flood + * -> ALL bridge members + * -> BVI <-- this is the path we redirect + * -> ethernet-input -> ip4-input -> ip4-unicast feature arc + * -> sonic-ext-bcast-redirect [HERE] + * if (cookie OK && dst==255.255.255.255 && + * proto==UDP && sport==68 && dport==67): + * next = linux-cp-punt + * else: vnet_feature_next() -> ip4-lookup -> dpo-drop + * -> linux-cp-punt + * rewinds to L2 header (l2_hdr_offset still valid), + * sets VLIB_TX = bvi-host-tap, dispatches to + * interface-output[bvi-host-tap] + * -> bvi-host-tap interface-output + * -> sonic-ext-aggr-tap-redirect + * reads cookie -> orig_rx + orig_vlan_tag, + * sets VLIB_TX = member-N-host-tap, + * re-pushes VLAN, redispatches to + * interface-output[member-N-host-tap] + * -> member-N-host-tap -> kernel sees DHCP on the + * original member's netdev, + * with original src MAC, + * dst = ff:ff:ff:ff:ff:ff, + * VLAN intact. + * + * Why we hand off to linux-cp-punt instead of redirecting directly + * from this node: + * 1. linux-cp-punt knows how to rewind to the L2 header without + * duplicating that logic here. L2 bytes are preserved verbatim + * (no MAC rewrite) — confirmed by reading lcp_node.c. + * 2. The aggr-tap-redirect node already on the bvi-host-tap + * interface-output arc handles the member-pick + VLAN re-push. + * Funneling through linux-cp-punt -> aggr-tap-redirect avoids + * duplicating that machinery here. + * + * Cookie magic check: a frame reaching the BVI's ip4-input that was + * NOT stamped at member device-input (e.g. host-originated, or some + * future internal source) will have cookie->magic != MAGIC and is + * passed through unchanged — the normal ip4-lookup rules then apply + * (which drop limited broadcast against the default 255/32 -> drop + * route, exactly as today). + */ + +typedef struct +{ + u32 rx_sw_if_index; + u32 dst_addr; + u32 cookie_orig_rx; + u16 sport; + u16 dport; + u8 proto; + u8 cookie_ok; + u8 punted; +} sonic_ext_bcast_trace_t; + +static u8 * +format_sonic_ext_bcast_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + sonic_ext_bcast_trace_t *t = + va_arg (*args, sonic_ext_bcast_trace_t *); + s = format (s, + "SONIC-EXT-BCAST-REDIRECT: rx %u dst %U " + "proto %u sport %u dport %u cookie %s orig_rx %u %s", + t->rx_sw_if_index, format_ip4_address, &t->dst_addr, + t->proto, t->sport, t->dport, + t->cookie_ok ? "ok" : "bad", t->cookie_orig_rx, + t->punted ? "PUNTED->linux-cp-punt" : "passthru"); + return s; +} + +#define foreach_sonic_ext_bcast_error \ + _ (PUNTED, "DHCPv4 broadcast redirected to linux-cp-punt") \ + _ (PASSTHRU, "not a redirect candidate, passthru") + +typedef enum +{ +#define _(sym, str) SONIC_EXT_BCAST_ERROR_##sym, + foreach_sonic_ext_bcast_error +#undef _ + SONIC_EXT_BCAST_N_ERROR, +} sonic_ext_bcast_error_t; + +static char *sonic_ext_bcast_error_strings[] = { +#define _(sym, str) str, + foreach_sonic_ext_bcast_error +#undef _ +}; + +typedef enum +{ + SONIC_EXT_BCAST_NEXT_PUNT, + SONIC_EXT_BCAST_N_NEXT, +} sonic_ext_bcast_next_t; + +VLIB_NODE_FN (sonic_ext_bcast_redirect_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + u32 n_left, *from; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; + u16 nexts[VLIB_FRAME_SIZE], *next; + u32 n_punted = 0, n_pass = 0; + + from = vlib_frame_vector_args (frame); + n_left = frame->n_vectors; + vlib_get_buffers (vm, from, bufs, n_left); + b = bufs; + next = nexts; + + while (n_left > 0) + { + u32 next0; + ip4_header_t *ip0; + udp_header_t *udp0; + u32 dst0 = 0; + u32 cookie_orig_rx = ~0; + u16 sport0 = 0, dport0 = 0; + u8 proto0 = 0; + u8 cookie_ok = 0; + u8 punted = 0; + u32 ihl_bytes; + + if (PREDICT_FALSE (!sem->punt_via_member)) + goto pass; + + ip0 = vlib_buffer_get_current (b[0]); + dst0 = ip0->dst_address.as_u32; + proto0 = ip0->protocol; + + /* DHCPv4 client → server only. Match: + * - dst == 255.255.255.255 (limited broadcast) + * - proto == UDP + * - sport == 68, dport == 67 + * Subnet-directed broadcasts, unicast, non-UDP and other UDP + * broadcasts (NetBIOS, RIPv1, vendor discovery, etc.) fall + * through to ip4-lookup. */ + if (dst0 != 0xffffffff) + goto pass; + if (proto0 != IP_PROTOCOL_UDP) + goto pass; + + /* Bounds-check the UDP header is fully present in the buffer + * before reading sport/dport. IHL is in 4-byte units. */ + ihl_bytes = ip4_header_bytes (ip0); + if (PREDICT_FALSE (b[0]->current_length < ihl_bytes + sizeof (*udp0))) + goto pass; + udp0 = (udp_header_t *) ((u8 *) ip0 + ihl_bytes); + sport0 = clib_net_to_host_u16 (udp0->src_port); + dport0 = clib_net_to_host_u16 (udp0->dst_port); + if (sport0 != SONIC_EXT_DHCP_BOOTPC || + dport0 != SONIC_EXT_DHCP_BOOTPS) + goto pass; + + { + sonic_ext_buffer_opaque_t *cookie = sonic_ext_buffer (b[0]); + if (cookie->magic != SONIC_EXT_BUFFER_MAGIC) + goto pass; + cookie_orig_rx = cookie->orig_rx_sw_if_index; + cookie_ok = 1; + } + + next[0] = SONIC_EXT_BCAST_NEXT_PUNT; + punted = 1; + n_punted++; + goto traced; + + pass: + vnet_feature_next (&next0, b[0]); + next[0] = (u16) next0; + n_pass++; + + traced: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b[0]->flags & VLIB_BUFFER_IS_TRACED))) + { + sonic_ext_bcast_trace_t *t = + vlib_add_trace (vm, node, b[0], sizeof (*t)); + t->rx_sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_RX]; + t->dst_addr = dst0; + t->cookie_orig_rx = cookie_orig_rx; + t->proto = proto0; + t->sport = sport0; + t->dport = dport0; + t->cookie_ok = cookie_ok; + t->punted = punted; + } + + b += 1; + next += 1; + n_left -= 1; + } + + vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors); + + if (n_punted) + vlib_node_increment_counter (vm, sonic_ext_bcast_redirect_node.index, + SONIC_EXT_BCAST_ERROR_PUNTED, n_punted); + if (n_pass) + vlib_node_increment_counter (vm, sonic_ext_bcast_redirect_node.index, + SONIC_EXT_BCAST_ERROR_PASSTHRU, n_pass); + sem->bcast_punts += n_punted; + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (sonic_ext_bcast_redirect_node) = { + .name = "sonic-ext-bcast-redirect", + .vector_size = sizeof (u32), + .format_trace = format_sonic_ext_bcast_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (sonic_ext_bcast_error_strings), + .error_strings = sonic_ext_bcast_error_strings, + .n_next_nodes = SONIC_EXT_BCAST_N_NEXT, + .next_nodes = { + [SONIC_EXT_BCAST_NEXT_PUNT] = "linux-cp-punt", + }, +}; + +VNET_FEATURE_INIT (sonic_ext_bcast_redirect_feat, static) = { + .arc_name = "ip4-unicast", + .node_name = "sonic-ext-bcast-redirect", + .runs_before = VNET_FEATURES ("ip4-lookup"), +}; diff --git a/vppbld/plugins/sonic_ext/capture_node.c b/vppbld/plugins/sonic_ext/capture_node.c new file mode 100644 index 0000000..18d6e05 --- /dev/null +++ b/vppbld/plugins/sonic_ext/capture_node.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2026 SONiC-VPP contributors + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include + +/* + * sonic-ext-capture + * + * Runs on the device-input arc before ethernet-input. Stores the + * original phy rx_sw_if_index into a per-buffer overlay + * (sonic_ext_buffer_opaque_t) located inside vnet_buffer2(b)->unused, + * so that downstream punt-ip4/ip6/arp features running on the BVI can + * recover the member that the packet actually came in on. + * + * Storing inside opaque2 means the value is automatically copied into + * every clone produced by vlib_buffer_clone (l2-flood, ip-mcast, ...), + * so broadcast/multicast punt paths see the same orig_rx as unicast. + */ + +typedef struct +{ + u32 sw_if_index; +} sonic_ext_capture_trace_t; + +static u8 * +format_sonic_ext_capture_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + sonic_ext_capture_trace_t *t = + va_arg (*args, sonic_ext_capture_trace_t *); + s = format (s, "SONIC-EXT-CAPTURE: sw_if_index %d", t->sw_if_index); + return s; +} + +#define foreach_sonic_ext_capture_error \ + _ (CAPTURED, "rx captured") + +typedef enum +{ +#define _(sym, str) SONIC_EXT_CAPTURE_ERROR_##sym, + foreach_sonic_ext_capture_error +#undef _ + SONIC_EXT_CAPTURE_N_ERROR, +} sonic_ext_capture_error_t; + +static char *sonic_ext_capture_error_strings[] = { +#define _(sym, string) string, + foreach_sonic_ext_capture_error +#undef _ +}; + +VLIB_NODE_FN (sonic_ext_capture_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + u32 n_left_from, *from; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; + u16 nexts[VLIB_FRAME_SIZE], *next; + u32 n_captured = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + vlib_get_buffers (vm, from, bufs, n_left_from); + b = bufs; + next = nexts; + + while (n_left_from > 0) + { + u32 next0; + sonic_ext_buffer_opaque_t *seb; + + /* Feature-arc next: continue to ethernet-input or whatever runs + * after us on the device-input arc. */ + vnet_feature_next (&next0, b[0]); + next[0] = (u16) next0; + + seb = sonic_ext_buffer (b[0]); + seb->orig_rx_sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_RX]; + + /* Snapshot the outermost VLAN tag (if any) directly off the + * wire frame. At device-input the buffer's current_data is + * still positioned at the start of the ethernet header and + * no tag-rewrite has run yet, so bytes [12..16] are either + * the ethertype (untagged) or TPID+TCI (tagged). Storing + * the raw 4 bytes lets the aggr-tap redirect re-push them + * verbatim if the bridge later pops the tag. */ + { + u8 *p = vlib_buffer_get_current (b[0]); + u16 etype = clib_net_to_host_u16 (*(u16 *) (p + 12)); + if (etype == ETHERNET_TYPE_VLAN /* 0x8100 */ + || etype == ETHERNET_TYPE_DOT1AD /* 0x88a8 */ + || etype == ETHERNET_TYPE_VLAN_9100 /* 0x9100 */) + seb->orig_vlan_tag = *(u32 *) (p + 12); /* TPID|TCI net order */ + else + seb->orig_vlan_tag = 0; + } + + seb->magic = SONIC_EXT_BUFFER_MAGIC; + n_captured++; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b[0]->flags & VLIB_BUFFER_IS_TRACED))) + { + sonic_ext_capture_trace_t *t = + vlib_add_trace (vm, node, b[0], sizeof (*t)); + t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_RX]; + } + + b += 1; + next += 1; + n_left_from -= 1; + } + + vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors); + + if (n_captured) + vlib_node_increment_counter (vm, sonic_ext_capture_node.index, + SONIC_EXT_CAPTURE_ERROR_CAPTURED, n_captured); + sem->captures += n_captured; + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (sonic_ext_capture_node) = { + .name = "sonic-ext-capture", + .vector_size = sizeof (u32), + .format_trace = format_sonic_ext_capture_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (sonic_ext_capture_error_strings), + .error_strings = sonic_ext_capture_error_strings, + /* No fixed next-nodes: this is a feature-arc node and always uses + * vnet_feature_next() to decide where to go. Setting sibling_of + * shares the arc's next-node table with the arc start node. */ + .sibling_of = "device-input", +}; + +VNET_FEATURE_INIT (sonic_ext_capture_feat, static) = { + .arc_name = "device-input", + .node_name = "sonic-ext-capture", + .runs_before = VNET_FEATURES ("ethernet-input"), +}; diff --git a/vppbld/plugins/sonic_ext/cli.c b/vppbld/plugins/sonic_ext/cli.c new file mode 100644 index 0000000..da6bd08 --- /dev/null +++ b/vppbld/plugins/sonic_ext/cli.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2026 SONiC-VPP contributors + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include + +static clib_error_t * +sonic_ext_punt_via_member_command_fn (vlib_main_t *vm, + unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on") || unformat (line_input, "enable")) + sonic_ext_set_punt_via_member (1); + else if (unformat (line_input, "off") || + unformat (line_input, "disable")) + sonic_ext_set_punt_via_member (0); + else + { + unformat_free (line_input); + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + } + } + + unformat_free (line_input); + return 0; +} + +VLIB_CLI_COMMAND (sonic_ext_punt_via_member_command, static) = { + .path = "sonic-ext punt-via-member", + .short_help = "sonic-ext punt-via-member [on|enable|off|disable]", + .function = sonic_ext_punt_via_member_command_fn, +}; + +static clib_error_t * +sonic_ext_host_xc_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on") || unformat (line_input, "enable")) + sonic_ext_set_host_xc (1); + else if (unformat (line_input, "off") || + unformat (line_input, "disable")) + sonic_ext_set_host_xc (0); + else + { + unformat_free (line_input); + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + } + } + + unformat_free (line_input); + return 0; +} + +VLIB_CLI_COMMAND (sonic_ext_host_xc_command, static) = { + .path = "sonic-ext host-xc", + .short_help = "sonic-ext host-xc [on|enable|off|disable]", + .function = sonic_ext_host_xc_command_fn, +}; + +static clib_error_t * +show_sonic_ext_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + vlib_cli_output (vm, "sonic-ext state:"); + vlib_cli_output (vm, " punt-via-member : %s", + sem->punt_via_member ? "on" : "off"); + vlib_cli_output (vm, " host-xc : %s", + sem->host_xc ? "on" : "off"); + vlib_cli_output (vm, " captures : %llu", sem->captures); + vlib_cli_output (vm, " aggr-tap redir : %llu", sem->aggr_tap_redirects); + vlib_cli_output (vm, " host-xc direct : %llu", sem->host_xc_direct); + vlib_cli_output (vm, " bcast punts : %llu", sem->bcast_punts); + return 0; +} + +VLIB_CLI_COMMAND (show_sonic_ext_command, static) = { + .path = "show sonic-ext", + .short_help = "show sonic-ext", + .function = show_sonic_ext_command_fn, +}; diff --git a/vppbld/plugins/sonic_ext/host_xc_node.c b/vppbld/plugins/sonic_ext/host_xc_node.c new file mode 100644 index 0000000..bbcb7bc --- /dev/null +++ b/vppbld/plugins/sonic_ext/host_xc_node.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2026 SONiC-VPP contributors + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include + +/* + * sonic-ext-host-xc + * + * Feature on the device-input arc. When a packet ingresses on an + * LCP host tap (interface created by linux-cp to mirror a VPP phy + * into Linux), this node steers the packet straight out the + * corresponding phy by setting VLIB_TX = phy and dispatching to + * `interface-output`, bypassing ethernet-input / l2-input entirely. + * + * Why bypass l2-input even for bridged phys: + * The kernel has already done the L2 lookup (bridge-fdb, neighbor + * table, routing) when it composed the frame on the host tap. + * For punt-via-member traffic the kernel chose a specific egress + * netdev (`tap_EthernetN`) which we mapped 1:1 to the phy, so + * sending it back through l2-input would either: + * - cause a static-MAC mac-move-violate drop in l2-learn (if + * SONiC has pinned the dst/src MAC against another sw_if), or + * - re-flood the frame through the bridge, breaking the punt- + * via-member promise that the egress is the chosen member. + * For ARP-reply / transit unicast / L3 egress the dst MAC is + * already the right peer's MAC and the right thing on the wire + * is to relay it verbatim out the phy. Packets actually destined + * for VPP's own BVI never reach this node -- they were never + * handed to the host tap in the first place; ip4/ip6/arp punt + * delivers them to the BVI tap, Linux processes locally, and any + * reply Linux generates is again transit unicast addressed to a + * peer (not to the BVI MAC). + * + * If the ingress sw_if_index is not an LCP host tap, the node is a + * no-op (it lets the packet continue on the feature arc). + */ + +typedef struct +{ + u32 rx_sw_if_index; + u32 phy_sw_if_index; + u8 mode; +} sonic_ext_host_xc_trace_t; + +enum +{ + SONIC_EXT_HOST_XC_MODE_PASSTHRU = 0, + SONIC_EXT_HOST_XC_MODE_DIRECT, +}; + +static u8 * +format_sonic_ext_host_xc_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + sonic_ext_host_xc_trace_t *t = + va_arg (*args, sonic_ext_host_xc_trace_t *); + char *mode = "passthru"; + if (t->mode == SONIC_EXT_HOST_XC_MODE_DIRECT) + mode = "DIRECT->interface-output"; + s = format (s, "SONIC-EXT-HOST-XC: rx %u phy %u %s", t->rx_sw_if_index, + t->phy_sw_if_index, mode); + return s; +} + +#define foreach_sonic_ext_host_xc_error \ + _ (DIRECT, "host tap steered directly to phy interface-output") \ + _ (PASSTHRU, "not an LCP host tap, passthru") + +typedef enum +{ +#define _(sym, str) SONIC_EXT_HOST_XC_ERROR_##sym, + foreach_sonic_ext_host_xc_error +#undef _ + SONIC_EXT_HOST_XC_N_ERROR, +} sonic_ext_host_xc_error_t; + +static char *sonic_ext_host_xc_error_strings[] = { +#define _(sym, string) string, + foreach_sonic_ext_host_xc_error +#undef _ +}; + +typedef enum +{ + SONIC_EXT_HOST_XC_NEXT_DROP, + SONIC_EXT_HOST_XC_NEXT_INTF_OUTPUT, + SONIC_EXT_HOST_XC_N_NEXT, +} sonic_ext_host_xc_next_t; + +VLIB_NODE_FN (sonic_ext_host_xc_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + u32 n_left_from, *from; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; + u16 nexts[VLIB_FRAME_SIZE], *next; + u32 n_direct = 0, n_pass = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + vlib_get_buffers (vm, from, bufs, n_left_from); + b = bufs; + next = nexts; + + while (n_left_from > 0) + { + u32 rx0, next0; + u8 mode = SONIC_EXT_HOST_XC_MODE_PASSTHRU; + u32 phy_sw = ~0; + index_t hlipi; + + rx0 = vnet_buffer (b[0])->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE (!sem->host_xc)) + goto passthru; + + hlipi = lcp_itf_pair_find_by_host (rx0); + if (hlipi == INDEX_INVALID) + goto passthru; + + { + const lcp_itf_pair_t *hlip = lcp_itf_pair_get (hlipi); + phy_sw = hlip->lip_phy_sw_if_index; + + /* Direct out the phy regardless of bridged / L3 -- the kernel + * has already chosen the correct egress. Bypass ethernet- + * input and l2-input to avoid (a) static-MAC mac-move-violate + * drops in l2-learn and (b) re-flooding through the bridge. + * See file header for the full rationale. */ + vnet_buffer (b[0])->sw_if_index[VLIB_TX] = phy_sw; + next[0] = SONIC_EXT_HOST_XC_NEXT_INTF_OUTPUT; + mode = SONIC_EXT_HOST_XC_MODE_DIRECT; + n_direct++; + goto traced; + } + + passthru: + vnet_feature_next (&next0, b[0]); + next[0] = (u16) next0; + n_pass++; + + traced: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b[0]->flags & VLIB_BUFFER_IS_TRACED))) + { + sonic_ext_host_xc_trace_t *t = + vlib_add_trace (vm, node, b[0], sizeof (*t)); + t->rx_sw_if_index = rx0; + t->phy_sw_if_index = phy_sw; + t->mode = mode; + } + + b += 1; + next += 1; + n_left_from -= 1; + } + + vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors); + + if (n_direct) + vlib_node_increment_counter (vm, sonic_ext_host_xc_node.index, + SONIC_EXT_HOST_XC_ERROR_DIRECT, n_direct); + if (n_pass) + vlib_node_increment_counter (vm, sonic_ext_host_xc_node.index, + SONIC_EXT_HOST_XC_ERROR_PASSTHRU, n_pass); + sem->host_xc_direct += n_direct; + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (sonic_ext_host_xc_node) = { + .name = "sonic-ext-host-xc", + .vector_size = sizeof (u32), + .format_trace = format_sonic_ext_host_xc_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (sonic_ext_host_xc_error_strings), + .error_strings = sonic_ext_host_xc_error_strings, + .n_next_nodes = SONIC_EXT_HOST_XC_N_NEXT, + .next_nodes = { + [SONIC_EXT_HOST_XC_NEXT_DROP] = "error-drop", + [SONIC_EXT_HOST_XC_NEXT_INTF_OUTPUT] = "interface-output", + }, +}; + +VNET_FEATURE_INIT (sonic_ext_host_xc_feat, static) = { + .arc_name = "device-input", + .node_name = "sonic-ext-host-xc", + /* Must run before ethernet-input so we can choose to bypass it, + * and after sonic-ext-capture so the per-buffer magic cookie / + * orig_rx_sw_if_index / orig_vlan_tag stash is already populated + * before host-xc consults it. Without the runs_after constraint + * VPP's feature ordering is undefined between sibling features + * that only declare runs_before("ethernet-input"), and we have + * observed host-xc dispatching ahead of capture in practice. */ + .runs_before = VNET_FEATURES ("ethernet-input"), + .runs_after = VNET_FEATURES ("sonic-ext-capture"), +}; diff --git a/vppbld/plugins/sonic_ext/sonic_ext.c b/vppbld/plugins/sonic_ext/sonic_ext.c new file mode 100644 index 0000000..6261fd5 --- /dev/null +++ b/vppbld/plugins/sonic_ext/sonic_ext.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2026 SONiC-VPP contributors + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +sonic_ext_main_t sonic_ext_main; + +VLIB_PLUGIN_REGISTER () = { + .version = SONIC_EXT_PLUGIN_BUILD_VER, + .description = "SONiC VPP extensions: punt-via-member, host-xc", +}; + +/* + * Per-interface feature enable helpers. All three live in this file + * (rather than in the per-node files) so that the LCP pair add/del + * callback, the sw_if_index add/del callback and the CLI all share the + * same code path. + */ +void +sonic_ext_capture_enable_disable (u32 sw_if_index, int enable) +{ + vnet_feature_enable_disable ("device-input", "sonic-ext-capture", + sw_if_index, enable, 0, 0); +} + +void +sonic_ext_host_xc_enable_disable (u32 sw_if_index, int enable) +{ + vnet_feature_enable_disable ("device-input", "sonic-ext-host-xc", + sw_if_index, enable, 0, 0); +} + +void +sonic_ext_aggr_tap_redirect_enable_disable (u32 sw_if_index, int enable) +{ + vnet_feature_enable_disable ("interface-output", + "sonic-ext-aggr-tap-redirect", sw_if_index, + enable, 0, 0); +} + +void +sonic_ext_bcast_redirect_enable_disable (u32 sw_if_index, int enable) +{ + vnet_feature_enable_disable ("ip4-unicast", + "sonic-ext-bcast-redirect", sw_if_index, + enable, 0, 0); +} + +/* + * Is `phy_sw_if_index` a BVI (bridge-virtual interface)? Used both + * by the aggregate-detection helper and by the bcast-redirect + * enable path. Distinct from is_aggregate so that future bond + * support can be added to is_aggregate without dragging + * bvi-specific features along for the ride. + */ +int +sonic_ext_phy_is_bvi (u32 phy_sw_if_index) +{ + vnet_main_t *vnm = vnet_get_main (); + l2input_main_t *l2im = &l2input_main; + vnet_sw_interface_t *swi; + + if (phy_sw_if_index == ~0) + return 0; + swi = vnet_get_sw_interface_or_null (vnm, phy_sw_if_index); + if (!swi || swi->type == VNET_SW_INTERFACE_TYPE_SUB) + return 0; + + if (phy_sw_if_index < vec_len (l2im->configs)) + { + l2_input_config_t *cfg = vec_elt_at_index (l2im->configs, + phy_sw_if_index); + if (l2_input_is_bvi (cfg)) + return 1; + } + return 0; +} + +/* + * Is `phy_sw_if_index` an "aggregate" parent whose linux-cp host tap + * should receive aggr-tap-redirect? Today that means BVI (loop / + * bridge-virtual interface); tomorrow it will also mean bond / + * port-channel masters. Sub-interfaces are never aggregates; their + * parent might be, but the parent has its own LCP pair which will + * have produced its own callback. + */ +int +sonic_ext_phy_is_aggregate (u32 phy_sw_if_index) +{ + if (sonic_ext_phy_is_bvi (phy_sw_if_index)) + return 1; + + /* TODO: bond / port-channel detection -- compare hw->dev_class + * name to "bond" so we don't have to link against the bonding + * plugin. */ + + return 0; +} + +/* + * lcp_itf_pair_walk callback: enable host-xc on this pair's host tap. + * Used at toggle-on time to catch every pair that was created before + * the operator flipped host_xc on; subsequent pair add/del go via the + * LCP vft callbacks. + */ +static walk_rc_t +sonic_ext_host_xc_walk_enable_cb (index_t lipi, void *ctx) +{ + const lcp_itf_pair_t *lip = lcp_itf_pair_get (lipi); + if (lip) + sonic_ext_host_xc_enable_disable (lip->lip_host_sw_if_index, 1); + return WALK_CONTINUE; +} + +/* + * lcp_itf_pair_walk callback: enable sonic-ext-capture on this pair's + * phy if (and only if) the phy is a real wire port -- i.e. NOT an + * "aggregate" pseudo-phy (BVI / bond master). Capture's job is to + * stamp the original wire-ingress sw_if_index + VLAN tag into the + * buffer cookie before any L2 bridging mangles VLIB_RX, so it must + * fire on the actual member port, not on synthetic interfaces. + * + * Sub-interfaces share their parent's device-input dispatch -- we + * never enable on the sub directly, only on the parent phy. + */ +static walk_rc_t +sonic_ext_capture_walk_enable_cb (index_t lipi, void *ctx) +{ + const lcp_itf_pair_t *lip = lcp_itf_pair_get (lipi); + if (lip && !sonic_ext_phy_is_aggregate (lip->lip_phy_sw_if_index)) + sonic_ext_capture_enable_disable (lip->lip_phy_sw_if_index, 1); + return WALK_CONTINUE; +} + +void +sonic_ext_set_punt_via_member (u8 is_enable) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + + sem->punt_via_member = (is_enable != 0); + + /* Capture only fires on the wire phy side of LCP pairs (real ports, + * not BVIs/bonds) so the original ingress sw_if_index + VLAN tag is + * recorded before L2 bridging overwrites VLIB_RX with the BVI. We + * leave the capture feature enabled even after disabling + * punt-via-member to avoid the per-interface enable/disable churn + * (the downstream redirect node short-circuits via the cookie magic + * check when the toggle is off). */ + if (is_enable && !sem->capture_enabled) + { + lcp_itf_pair_walk (sonic_ext_capture_walk_enable_cb, NULL); + sem->capture_enabled = 1; + } + + /* The aggr-tap-redirect feature itself is wired per-interface from + * the LCP pair add/del callback (sonic_ext_lcp_pair_add_cb) -- it + * only needs to fire on the host tap of BVI/bond masters, never on + * every phy. No per-interface iteration here. */ +} + +void +sonic_ext_set_host_xc (u8 is_enable) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + + sem->host_xc = (is_enable != 0); + + /* host-xc is only meaningful on LCP host taps -- it steers Linux- + * originated traffic out the corresponding phy. Enable on every + * existing pair's host tap, and let the LCP pair add/del callback + * keep the set in sync going forward. Don't iterate every sw_if: + * on phys / sub-ifs / BVIs the feature would always be a no-op + * (lcp_itf_pair_find_by_host returns INDEX_INVALID) but still + * costs a feature-arc dispatch per packet. */ + if (is_enable && !sem->host_xc_enabled) + { + lcp_itf_pair_walk (sonic_ext_host_xc_walk_enable_cb, NULL); + sem->host_xc_enabled = 1; + } +} + +/* + * On new interface creation, we don't enable any sonic-ext features + * directly: capture, host-xc and aggr-tap-redirect all need LCP pair + * context (we want capture only on real wire phys, and the other two + * only on host taps). The LCP pair add callback is the single point + * that wires every per-interface feature. Keeping this hook around + * (as a no-op stub) leaves space for future bookkeeping that doesn't + * need LCP context. + */ +static clib_error_t * +sonic_ext_sw_interface_add_del (vnet_main_t *vnm, u32 sw_if_index, u32 is_add) +{ + return 0; +} + +VNET_SW_INTERFACE_ADD_DEL_FUNCTION (sonic_ext_sw_interface_add_del); + +/* + * LCP pair add/del: when a new linux-cp pair appears, enable the per- + * interface sonic-ext features that apply. + * + * - sonic-ext-capture on the phy -- only if phy is a real wire + * port (not BVI / bond master) and punt-via-member is enabled. + * - sonic-ext-host-xc on the host -- only if host-xc is enabled. + * - sonic-ext-aggr-tap-redirect on the host -- only if phy is an + * aggregate (BVI today, bond tomorrow) and punt-via-member is on. + * - sonic-ext-bcast-redirect on the phy -- only if phy is a + * BVI and punt-via-member is on; catches limited-broadcast + * IPv4 frames that flooded into the BVI from a bridge member + * and feeds them to linux-cp-punt -> bvi-host-tap + * interface-output -> aggr-tap-redirect -> member-host-tap (so + * the kernel observes the broadcast on the original member's + * netdev). + * + * This is the only place we know both (a) the host tap sw_if_index + * and (b) which phy it shadows. + */ +static void +sonic_ext_lcp_pair_add_cb (lcp_itf_pair_t *lip) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + if (!lip) + return; + if (sem->capture_enabled + && !sonic_ext_phy_is_aggregate (lip->lip_phy_sw_if_index)) + sonic_ext_capture_enable_disable (lip->lip_phy_sw_if_index, 1); + if (sem->host_xc_enabled) + sonic_ext_host_xc_enable_disable (lip->lip_host_sw_if_index, 1); + if (sonic_ext_phy_is_aggregate (lip->lip_phy_sw_if_index)) + sonic_ext_aggr_tap_redirect_enable_disable (lip->lip_host_sw_if_index, 1); + if (sem->capture_enabled + && sonic_ext_phy_is_bvi (lip->lip_phy_sw_if_index)) + sonic_ext_bcast_redirect_enable_disable (lip->lip_phy_sw_if_index, 1); +} + +static void +sonic_ext_lcp_pair_del_cb (lcp_itf_pair_t *lip) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + if (!lip) + return; + if (sem->capture_enabled + && !sonic_ext_phy_is_aggregate (lip->lip_phy_sw_if_index)) + sonic_ext_capture_enable_disable (lip->lip_phy_sw_if_index, 0); + if (sem->host_xc_enabled) + sonic_ext_host_xc_enable_disable (lip->lip_host_sw_if_index, 0); + if (sonic_ext_phy_is_aggregate (lip->lip_phy_sw_if_index)) + sonic_ext_aggr_tap_redirect_enable_disable (lip->lip_host_sw_if_index, 0); + if (sem->capture_enabled + && sonic_ext_phy_is_bvi (lip->lip_phy_sw_if_index)) + sonic_ext_bcast_redirect_enable_disable (lip->lip_phy_sw_if_index, 0); +} + +static clib_error_t * +sonic_ext_init (vlib_main_t *vm) +{ + sonic_ext_main_t *sem = &sonic_ext_main; + lcp_itf_pair_vft_t sonic_ext_lcp_vft = { + .pair_add_fn = sonic_ext_lcp_pair_add_cb, + .pair_del_fn = sonic_ext_lcp_pair_del_cb, + }; + clib_memset (sem, 0, sizeof (*sem)); + lcp_itf_pair_register_vft (&sonic_ext_lcp_vft); + + /* Default-on: capture + aggr-tap-redirect (punt-via-member) and + * host-xc. At init time no LCP pairs exist yet, so the walks + * inside set_*() are no-ops and just flip the global toggles; as + * pairs are subsequently created, the LCP pair add callback wires + * the features per-interface. The CLI ("sonic-ext punt-via-member + * disable" / "sonic-ext host-xc disable") can still flip them off + * at runtime. */ + sonic_ext_set_punt_via_member (1); + sonic_ext_set_host_xc (1); + + return 0; +} + +VLIB_INIT_FUNCTION (sonic_ext_init); diff --git a/vppbld/plugins/sonic_ext/sonic_ext.h b/vppbld/plugins/sonic_ext/sonic_ext.h new file mode 100644 index 0000000..2149fdf --- /dev/null +++ b/vppbld/plugins/sonic_ext/sonic_ext.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2026 SONiC-VPP contributors + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_sonic_ext_h__ +#define __included_sonic_ext_h__ + +#include +#include +#include +#include +#include +#include +#include + +#define SONIC_EXT_PLUGIN_BUILD_VER "1.0" + +/* + * Per-buffer metadata stash, overlaid on vnet_buffer2(b)->unused[]. + * + * The capture node writes orig_rx_sw_if_index (the phy/sub-if the + * packet was actually received on) and a magic cookie; downstream + * redirect nodes only consume orig_rx_sw_if_index when the cookie + * matches. Storing inside opaque2 (rather than a global sidecar + * keyed by buffer_index) gives us four properties for free: + * + * 1. Survives vlib_buffer_clone / vlib_buffer_copy -- VPP memcpys + * opaque2 wholesale into every clone, so the value follows the + * packet through l2-flood, ip-mcast-replicate, mpls-replicate, + * etc. A bi-keyed external sidecar cannot do this without a + * core-VPP patch on every clone caller. + * + * 2. No memory allocation, no per-thread or atomic ops in the hot + * path -- the slot is buffer-local. + * + * 3. Self-cleaning -- when the buffer is freed and recycled, opaque2 + * is reset by vlib_buffer_pool_init, so a stale value cannot be + * mis-attributed to an unrelated future packet. + * + * 4. No conflict with stock VPP plugins. We use only the trailing + * `unused[]` array of vnet_buffer_opaque2_t, never the union with + * the nat / cnat scratch fields. The magic cookie defends + * against the (unlikely) case where another plugin also decides + * to scribble in unused[]: a non-matching cookie causes us to + * ignore the slot rather than redirect to a bogus interface. + */ +#define SONIC_EXT_BUFFER_MAGIC 0x534e4358u /* 'SNCX' */ + +/* + * orig_vlan_tag: outermost 802.1Q (or 802.1ad) tag observed on the + * wire frame at sonic-ext-capture time, stored as raw 4 bytes in + * network byte order: [TPID (2)] [TCI (2)]. Zero means the frame + * was untagged when it entered VPP. + * + * Why a wire-byte snapshot rather than reconstructing from sub-if + * config (vnet_sw_interface_t.sub.eth.outer_vlan_id): + * + * The capture node runs on the device-input arc, before + * ethernet-input has classified the inner ethertype. At that + * point VLIB_RX is the *main* hw_if_index, not the sub-if -- the + * sub-if is only resolved later in ethernet-input. So at capture + * time we cannot look up sub-if vlan config; we can only see the + * wire bytes. The snapshot is also future-proof against bridge + * configurations that don't use a vlan sub-interface at all + * (e.g. bridge group with explicit l2 vtr push/pop on the main + * phy), and against multi-tag stacks where the inner tag is not + * recoverable from a single sub.eth.outer_vlan_id field. + * + * The aggr-tap-redirect node uses this saved tag to re-push the + * VLAN before re-entering interface-output on the member tap, so + * Linux observes the same wire frame on the right netdev. + */ +typedef struct +{ + u32 magic; + u32 orig_rx_sw_if_index; + u32 orig_vlan_tag; +} sonic_ext_buffer_opaque_t; + +STATIC_ASSERT (sizeof (sonic_ext_buffer_opaque_t) <= + sizeof (((vnet_buffer_opaque2_t *) 0)->unused), + "sonic-ext per-buffer metadata too large for opaque2->unused"); + +static_always_inline sonic_ext_buffer_opaque_t * +sonic_ext_buffer (vlib_buffer_t *b) +{ + return (sonic_ext_buffer_opaque_t *) vnet_buffer2 (b)->unused; +} + +typedef struct +{ + /* Global feature toggles. */ + u8 punt_via_member; + u8 host_xc; + + /* Set once capture/host-xc have been enabled on all existing + * interfaces, so that toggling on/off is idempotent. */ + u8 capture_enabled; + u8 host_xc_enabled; + + /* Counters (per-feature, per-thread accounting kept in node + * registrations; these are summary counters for `show sonic-ext`). */ + u64 captures; + u64 aggr_tap_redirects; + u64 host_xc_direct; + u64 bcast_punts; +} sonic_ext_main_t; + +extern sonic_ext_main_t sonic_ext_main; + +extern vlib_node_registration_t sonic_ext_capture_node; +extern vlib_node_registration_t sonic_ext_aggr_tap_redirect_node; +extern vlib_node_registration_t sonic_ext_host_xc_node; +extern vlib_node_registration_t sonic_ext_bcast_redirect_node; + +/* Enable / disable sonic-ext-capture on a given interface. No-op if + * the capture sidecar is not yet initialized. */ +void sonic_ext_capture_enable_disable (u32 sw_if_index, int enable); + +/* Enable / disable sonic-ext-host-xc on a given interface. */ +void sonic_ext_host_xc_enable_disable (u32 sw_if_index, int enable); + +/* Enable / disable sonic-ext-aggr-tap-redirect on a given sw_if_index + * (always the LCP host tap of an aggregate phy -- BVI today, bond + * tomorrow). Driven from the LCP pair add/del callback. */ +void sonic_ext_aggr_tap_redirect_enable_disable (u32 sw_if_index, int enable); + +/* Enable / disable sonic-ext-bcast-redirect on a given sw_if_index. + * Today driven from the LCP pair add/del callback gated on + * sonic_ext_phy_is_bvi(); the node itself is generic and could be + * extended to non-BVI aggregates in the future. */ +void sonic_ext_bcast_redirect_enable_disable (u32 sw_if_index, int enable); + +/* Toggle accessors used by CLI and node fast paths. */ +void sonic_ext_set_punt_via_member (u8 is_enable); +void sonic_ext_set_host_xc (u8 is_enable); + +/* Returns non-zero if phy_sw_if_index is an "aggregate" parent whose + * LCP host tap should have the aggr-tap-redirect feature enabled -- + * today that means BVI; in the future it will also cover bond / + * port-channel master interfaces. Used by the LCP pair add callback. */ +int sonic_ext_phy_is_aggregate (u32 phy_sw_if_index); + +/* Returns non-zero iff phy_sw_if_index is a BVI (bridge-virtual + * interface). Distinct from is_aggregate so future bond support + * can opt out of bvi-specific features (bcast-redirect runs on + * the BVI's own ip4-unicast arc; today only BVIs need it). */ +int sonic_ext_phy_is_bvi (u32 phy_sw_if_index); + +#endif /* __included_sonic_ext_h__ */