1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3
4"""
5Devlink Rate TC Bandwidth Test Suite
6===================================
7
8This test suite verifies the functionality of devlink-rate traffic class (TC)
9bandwidth distribution in a virtualized environment. The tests validate that
10bandwidth can be properly allocated between different traffic classes and
11that TC mapping works as expected.
12
13Test Environment:
14----------------
15- Creates 1 VF
16- Establishes a bridge connecting the VF representor and the uplink representor
17- Sets up 2 VLAN interfaces on the VF with different VLAN IDs (101, 102)
18- Configures different traffic classes (TC3 and TC4) for each VLAN
19
20Test Cases:
21----------
221. test_no_tc_mapping_bandwidth:
23   - Verifies that without TC mapping, bandwidth is NOT distributed according to
24     the configured 80/20 split between TC4 and TC3
25   - This test should fail if bandwidth matches the 80/20 split without TC
26     mapping
27   - Expected: Bandwidth should NOT be distributed as 80/20
28
292. test_tc_mapping_bandwidth:
30   - Configures TC mapping using mqprio qdisc
   - Verifies that with TC mapping, bandwidth IS distributed according to the
     configured 80/20 split between TC4 and TC3
33   - Expected: Bandwidth should be distributed as 80/20
34
35Bandwidth Distribution:
36----------------------
- TC3 (VLAN 101): Configured for 20% of total bandwidth
- TC4 (VLAN 102): Configured for 80% of total bandwidth
39- Total bandwidth: 1Gbps
40- Tolerance: +-12%
41
42Hardware-Specific Behavior (mlx5):
43--------------------------
44mlx5 hardware enforces traffic class separation by ensuring that each transmit
45queue (SQ) is associated with a single TC. If a packet is sent on a queue that
46doesn't match the expected TC (based on DSCP or VLAN priority and hypervisor-set
47mapping), the hardware moves the queue to the correct TC scheduler to preserve
48traffic isolation.
49
50This behavior means that even without explicit TC-to-queue mapping, bandwidth
51enforcement may still appear to work—because the hardware dynamically adjusts
the scheduling context. However, this can lead to performance issues at high
rates and head-of-line (HOL) blocking if traffic from different TCs is mixed
on the same queue.
54"""
55
56import json
57import os
58import subprocess
59import threading
60import time
61
62from lib.py import ksft_pr, ksft_run, ksft_exit
63from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx
64from lib.py import NetDrvEpEnv, DevlinkFamily
65from lib.py import NlError
66from lib.py import cmd, defer, ethtool, ip
67
68
class BandwidthValidator:
    """
    Validates bandwidth totals and per-TC shares against expected values
    with a tolerance.

    The tolerance is relative: a value passes when it lies within
    ``expected +- expected * tolerance_percent / 100`` (bounds inclusive).

    The constructor defaults describe the setup used by this test suite
    (1 Gbps total, TC3 at 20%, TC4 at 80%, 12% tolerance); they can be
    overridden to validate a different split.
    """

    def __init__(self, tolerance_percent=12, expected_total_gbps=1.0,
                 tc_expected_percent=None):
        """
        Args:
            tolerance_percent: Allowed deviation, in percent of the expected
                value.
            expected_total_gbps: Expected aggregate bandwidth in Gbps.
            tc_expected_percent: Mapping of TC index to its expected share
                of the total, in percent. Defaults to ``{3: 20.0, 4: 80.0}``.
        """
        self.tolerance_percent = tolerance_percent
        self.expected_total_gbps = expected_total_gbps
        # Pre-computed bounds for the aggregate bandwidth.
        self.total_min_expected = self.min_expected(self.expected_total_gbps)
        self.total_max_expected = self.max_expected(self.expected_total_gbps)
        if tc_expected_percent is None:
            tc_expected_percent = {3: 20.0, 4: 80.0}
        # Copy so later changes to the caller's mapping do not leak in here.
        self.tc_expected_percent = dict(tc_expected_percent)

    def min_expected(self, value):
        """Calculates the minimum acceptable value based on tolerance."""
        return value - (value * self.tolerance_percent / 100)

    def max_expected(self, value):
        """Calculates the maximum acceptable value based on tolerance."""
        return value + (value * self.tolerance_percent / 100)

    def bound(self, expected, value):
        """Returns True if value is within expected tolerance."""
        return self.min_expected(expected) <= value <= self.max_expected(expected)

    def tc_bandwidth_bound(self, value, tc_ix):
        """
        Returns True if the given bandwidth percentage is within tolerance
        of the share expected for traffic class ``tc_ix``.

        Raises:
            KeyError: If no expected share is configured for ``tc_ix``.
        """
        expected = self.tc_expected_percent[tc_ix]
        return self.bound(expected, value)
104
105
def setup_vf(cfg, set_tc_mapping=True):
    """
    Sets up a VF on the given network interface.

    Enables SR-IOV and switchdev mode, brings the VF interface up,
    and optionally configures TC mapping using mqprio. The commands that
    restore legacy mode and remove the VF are registered with defer().

    Args:
        cfg: Test environment; ``cfg.pci`` and ``cfg.ifname`` are read.
        set_tc_mapping: When True, add an mqprio root qdisc to the VF.

    Returns:
        The name of the VF network interface.

    Raises:
        KsftSkipEx: If switchdev mode or SR-IOV cannot be enabled, or if
            no VF network interface can be found.
    """
    try:
        cmd(f"devlink dev eswitch set pci/{cfg.pci} mode switchdev")
        defer(cmd, f"devlink dev eswitch set pci/{cfg.pci} mode legacy")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc
    try:
        cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
        defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc

    # Presumably gives the kernel time to register the VF netdev.
    time.sleep(2)

    vf_net_dir = f"/sys/class/net/{cfg.ifname}/device/virtfn0/net"
    try:
        vf_names = os.listdir(vf_net_dir)
    except OSError as exc:
        # A missing directory means the same as an empty one: no VF netdev.
        raise KsftSkipEx("VF interface not found") from exc
    if not vf_names:
        raise KsftSkipEx("VF interface not found")

    vf_ifc = vf_names[0]
    ip(f"link set dev {vf_ifc} up")
    if set_tc_mapping:
        cmd(f"tc qdisc add dev {vf_ifc} root handle 5 mqprio mode dcb hw 1 num_tc 8")

    return vf_ifc
135
136
def setup_vlans_on_vf(vf_ifc):
    """
    Creates VLAN interfaces 101 and 102 on top of the given VF.

    Each VLAN device is named ``<vf_ifc>.<vlan_id>``, gets a /29 address,
    is brought up, and has its egress-qos-map set to ``0:<tc>`` so that
    VLAN 101 is tied to TC3 and VLAN 102 to TC4.
    """
    # (VLAN ID, traffic class, local address)
    vlan_table = (
        (101, 3, "198.51.100.2"),
        (102, 4, "198.51.100.10"),
    )

    for vid, tc_ix, addr in vlan_table:
        vlan_dev = f"{vf_ifc}.{vid}"
        ip(f"link add link {vf_ifc} name {vlan_dev} type vlan id {vid}")
        ip(f"addr add {addr}/29 dev {vlan_dev}")
        ip(f"link set dev {vlan_dev} up")
        ip(f"link set dev {vlan_dev} type vlan egress-qos-map 0:{tc_ix}")
        ksft_pr(f"Created VLAN {vlan_dev} on {vf_ifc} with tc {tc_ix} and IP {addr}")
153
154
def get_vf_info(cfg):
    """
    Looks up the representor netdev and devlink port index of VF 0.

    Parses ``devlink -j port show`` and picks the first port that belongs
    to ``cfg.pci`` and reports ``vfnum`` 0. The results are stored in
    ``cfg.vf_representor`` and ``cfg.vf_port_index``; both stay None when
    no matching port exists.
    """
    cfg.vf_representor = None
    cfg.vf_port_index = None

    raw = subprocess.check_output(["devlink", "-j", "port", "show"], encoding="utf-8")
    own_prefix = f"pci/{cfg.pci}/"

    for handle, attrs in json.loads(raw)["port"].items():
        if not handle.startswith(own_prefix):
            continue
        if attrs.get("vfnum") != 0:
            continue
        cfg.vf_representor = attrs.get("netdev")
        # The port handle has the form pci/<address>/<index>.
        cfg.vf_port_index = int(handle.split("/")[-1])
        return
173
174
def setup_bridge(cfg):
    """
    Builds a Linux bridge joining the uplink and the VF representor.

    The bridge is named after the current PID and its removal is registered
    with defer(). Raises KsftSkipEx when ``cfg.vf_representor`` is not set.
    """
    bridge = f"br_{os.getpid()}"
    ip(f"link add name {bridge} type bridge")
    defer(cmd, f"ip link del name {bridge} type bridge")

    ip(f"link set dev {cfg.ifname} master {bridge}")

    representor = cfg.vf_representor
    if not representor:
        raise KsftSkipEx("Could not find representor for the VF")

    ip(f"link set dev {representor} master {bridge}")
    ip(f"link set dev {representor} up")
    ksft_pr(f"Set representor {representor} up and added to bridge")

    ip(f"link set dev {bridge} up")
195
196
def setup_devlink_rate(cfg):
    """
    Applies the devlink rate settings to the VF port.

    Sets rate-tx-max and a bandwidth share for each of the eight traffic
    classes (20 for TC3, 80 for TC4, 0 for the rest) on the port identified
    by ``cfg.vf_port_index``.

    Raises:
        KsftSkipEx: If the port index is unknown or the request is
            rejected with EOPNOTSUPP.
        KsftFailEx: If the request fails with any other netlink error.
    """
    vf_port = cfg.vf_port_index
    if vf_port is None:
        raise KsftSkipEx("Could not find VF port index")

    # Non-zero shares by TC index; the remaining TCs are sent as 0.
    tc_shares = {3: 20, 4: 80}
    rate_attrs = {
        "bus-name": "pci",
        "dev-name": cfg.pci,
        "port-index": vf_port,
        # NOTE(review): presumably bytes per second, i.e. 1 Gbit/s - confirm.
        "rate-tx-max": 125000000,
        "rate-tc-bws": [
            {"index": tc_ix, "bw": tc_shares.get(tc_ix, 0)}
            for tc_ix in range(8)
        ],
    }

    try:
        cfg.devnl.rate_set(rate_attrs)
    except NlError as exc:
        if exc.error == 95:  # EOPNOTSUPP
            raise KsftSkipEx("devlink rate configuration is not supported on the VF") from exc
        raise KsftFailEx(f"rate_set failed on VF port {vf_port}") from exc
225
226
def setup_remote_server(cfg):
    """
    Sets up VLAN interfaces and starts iperf3 servers on the remote side.

    For each of VLAN 101 and 102 a VLAN device is created on
    ``cfg.remote_ifname``, given a /29 address and brought up, and an
    iperf3 server bound to that address is started in the background.
    Removal of each VLAN device is registered with defer().
    """
    remote_dev = cfg.remote_ifname
    # (VLAN ID, remote address)
    vlan_endpoints = (
        (101, "198.51.100.1"),
        (102, "198.51.100.9"),
    )

    for vlan_id, ip_addr in vlan_endpoints:
        vlan_dev = f"{remote_dev}.{vlan_id}"
        cmd(f"ip link add link {remote_dev} name {vlan_dev} "
            f"type vlan id {vlan_id}", host=cfg.remote)
        # Queue the cleanup as soon as the device exists, so it is still
        # registered if one of the following commands raises.
        defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote)
        cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote)
        cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote)
        cmd(f"iperf3 -s -1 -B {ip_addr}", background=True, host=cfg.remote)
243
244
def setup_test_environment(cfg, set_tc_mapping=True):
    """
    Prepares everything a bandwidth test needs.

    Creates the VF and its VLANs, resolves the VF representor, bridges it
    with the uplink, applies the devlink rate settings and finally prepares
    the remote side. ``set_tc_mapping`` is passed through to setup_vf().
    """
    # Local side: the VF and its VLAN interfaces.
    vf_netdev = setup_vf(cfg, set_tc_mapping)
    ksft_pr(f"Created VF interface: {vf_netdev}")
    setup_vlans_on_vf(vf_netdev)

    # get_vf_info() fills in cfg.vf_representor and cfg.vf_port_index,
    # which the bridge and devlink rate steps read.
    get_vf_info(cfg)
    setup_bridge(cfg)
    setup_devlink_rate(cfg)

    # Remote side, followed by a fixed pause before traffic is started.
    setup_remote_server(cfg)
    time.sleep(2)
261
262
def run_iperf_client(server_ip, local_ip, barrier, min_expected_gbps=0.1):
    """
    Runs a single iperf3 client instance, binding to the given local IP.
    Waits on a barrier to synchronize with other threads.

    Args:
        server_ip: Address of the iperf3 server to connect to.
        local_ip: Local address the client binds to.
        barrier: Barrier shared with the other client threads.
        min_expected_gbps: Results below this throughput count as failures.

    Returns:
        The receive-side throughput in Gbps, or None if iperf3 could not
        be run, exited with an error, produced output that could not be
        interpreted, or measured less than ``min_expected_gbps``.

    Raises:
        KsftFailEx: If the barrier wait times out or the barrier is broken.
    """
    try:
        barrier.wait(timeout=10)
    except threading.BrokenBarrierError as exc:
        raise KsftFailEx("iperf3 barrier wait timed out") from exc

    iperf_cmd = ["iperf3", "-c", server_ip, "-B", local_ip, "-J"]
    try:
        result = subprocess.run(iperf_cmd, capture_output=True, text=True,
                                check=True)
    except (OSError, subprocess.CalledProcessError) as exc:
        # Report the failure through the None return value, like every
        # other failure below, so the caller can detect it.
        ksft_pr(f"iperf3 client failed: {exc}")
        return None

    try:
        output = json.loads(result.stdout)
        bits_per_second = output["end"]["sum_received"]["bits_per_second"]
    except json.JSONDecodeError as exc:
        ksft_pr(f"Failed to parse iperf3 JSON output: {exc}")
        return None
    except (KeyError, TypeError) as exc:
        ksft_pr(f"Unexpected iperf3 JSON output: {exc!r}")
        return None

    gbps = bits_per_second / 1e9
    if gbps < min_expected_gbps:
        ksft_pr(
            f"iperf3 bandwidth too low: {gbps:.2f} Gbps "
            f"(expected ≥ {min_expected_gbps} Gbps)"
        )
        return None
    return gbps
291
292
def run_bandwidth_test():
    """
    Launches iperf3 client threads for each VLAN/TC pair and collects results.

    Returns:
        A dict mapping TC index (3 and 4) to the measured bandwidth in Gbps.

    Raises:
        KsftFailEx: If any client reported a failure or produced no result
            at all.
    """
    def _run_iperf_client_thread(server_ip, local_ip, results, barrier, tc_ix):
        results[tc_ix] = run_iperf_client(server_ip, local_ip, barrier)

    vf_vlan_data = [
        # (local_ip, remote_ip, TC)
        ("198.51.100.2",  "198.51.100.1", 3),
        ("198.51.100.10", "198.51.100.9", 4),
    ]

    results = {}
    # All clients wait on this barrier so they start sending together.
    start_barrier = threading.Barrier(len(vf_vlan_data))

    threads = [
        threading.Thread(
            target=_run_iperf_client_thread,
            args=(remote_ip, local_ip, results, start_barrier, tc_ix),
        )
        for local_ip, remote_ip, tc_ix in vf_vlan_data
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # A worker that raised never stored a result, so check every expected
    # TC rather than only the entries that happen to be present.
    if any(results.get(tc_ix) is None for _, _, tc_ix in vf_vlan_data):
        raise KsftFailEx("iperf3 client failed; cannot evaluate bandwidth")

    return results
326
def calculate_bandwidth_percentages(results):
    """
    Calculates the percentage of total bandwidth received by TC3 and TC4.

    Args:
        results: Mapping of TC index to measured bandwidth; must contain
            entries for TC 3 and TC 4.

    Returns:
        A dict with the per-TC bandwidths ('tc3_bw', 'tc4_bw'), their shares
        of the total in percent ('tc3_percentage', 'tc4_percentage') and the
        sum of both bandwidths ('total_bw').

    Raises:
        KsftFailEx: If a TC result is missing or the total bandwidth is not
            positive, in which case no share can be computed.
    """
    if 3 not in results or 4 not in results:
        raise KsftFailEx(f"Missing expected TC results in {results}")

    tc3_bw = results[3]
    tc4_bw = results[4]
    total_bw = tc3_bw + tc4_bw
    if total_bw <= 0:
        # Avoid a ZeroDivisionError below and report a clear failure instead.
        raise KsftFailEx(f"Total bandwidth is not positive in {results}")

    tc3_percentage = (tc3_bw / total_bw) * 100
    tc4_percentage = (tc4_bw / total_bw) * 100

    return {
        'tc3_bw': tc3_bw,
        'tc4_bw': tc4_bw,
        'tc3_percentage': tc3_percentage,
        'tc4_percentage': tc4_percentage,
        'total_bw': total_bw
    }
347
348
def print_bandwidth_results(bw_data, test_name):
    """
    Logs the measured per-TC bandwidth, the total and the per-TC shares.

    ``bw_data`` is expected to carry the 'tc3_bw', 'tc4_bw', 'total_bw',
    'tc3_percentage' and 'tc4_percentage' keys; ``test_name`` is appended
    to the heading line.
    """
    tc_indexes = (3, 4)

    ksft_pr(f"Bandwidth check results {test_name}:")
    for tc_ix in tc_indexes:
        ksft_pr(f"TC {tc_ix}: {bw_data[f'tc{tc_ix}_bw']:.2f} Gbits/sec")
    ksft_pr(f"Total bandwidth: {bw_data['total_bw']:.2f} Gbits/sec")
    for tc_ix in tc_indexes:
        ksft_pr(f"TC {tc_ix} percentage: {bw_data[f'tc{tc_ix}_percentage']:.1f}%")
359
360
def verify_total_bandwidth(bw_data, validator):
    """
    Checks the measured total bandwidth against the validator's tolerance.

    Returns silently when the total is within bounds. A total below the
    lower bound raises KsftSkipEx, since the share cannot be validated
    when tx_max is not reached; any other out-of-bounds total raises
    KsftFailEx.
    """
    measured = bw_data['total_bw']

    if not validator.bound(validator.expected_total_gbps, measured):
        if measured < validator.total_min_expected:
            raise KsftSkipEx(
                f"Total bandwidth {measured:.2f} Gbps < minimum "
                f"{validator.total_min_expected:.2f} Gbps; "
                f"parent tx_max ({validator.expected_total_gbps:.1f} G) "
                f"not reached, cannot validate share"
            )

        raise KsftFailEx(
            f"Total bandwidth {measured:.2f} Gbps exceeds allowed ceiling "
            f"{validator.total_max_expected:.2f} Gbps "
            f"(VF tx_max set to {validator.expected_total_gbps:.1f} G)"
        )
383
384
def check_bandwidth_distribution(bw_data, validator):
    """
    Tells whether the measured TC3 and TC4 shares match the expected split.

    Both percentages from ``bw_data`` are checked with
    ``validator.tc_bandwidth_bound()``.

    Returns:
        bool: True if both TC3 and TC4 percentages are within bounds.
    """
    tc3_ok, tc4_ok = (
        validator.tc_bandwidth_bound(bw_data[f'tc{tc_ix}_percentage'], tc_ix)
        for tc_ix in (3, 4)
    )
    return tc3_ok and tc4_ok
397
398
def run_bandwidth_distribution_test(cfg, set_tc_mapping):
    """
    Sets up the environment, measures both TCs in parallel and evaluates
    the outcome.

    The results are logged and the total bandwidth is verified first.
    The return value tells whether the per-TC shares matched the expected
    split.
    """
    setup_test_environment(cfg, set_tc_mapping)

    bw_data = calculate_bandwidth_percentages(run_bandwidth_test())

    if set_tc_mapping:
        label = "with TC mapping"
    else:
        label = "without TC mapping"
    print_bandwidth_results(bw_data, label)

    verify_total_bandwidth(bw_data, cfg.bw_validator)
    split_matches = check_bandwidth_distribution(bw_data, cfg.bw_validator)
    return split_matches
412
413
def test_no_tc_mapping_bandwidth(cfg):
    """
    Verifies that bandwidth is not split 80/20 without traffic class mapping.

    On mlx5 the expectation is inverted: the split is expected to match
    even without TC mapping, so a match is reported as an expected failure
    (KsftXfailEx) and a mismatch as a failure, because it would mean the
    driver behavior changed.
    """
    pass_bw_msg = "Bandwidth is NOT distributed as 80/20 without TC mapping"
    fail_bw_msg = "Bandwidth matched 80/20 split without TC mapping"
    is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout

    if run_bandwidth_distribution_test(cfg, set_tc_mapping=False):
        if is_mlx5:
            raise KsftXfailEx(fail_bw_msg)
        raise KsftFailEx(fail_bw_msg)
    if is_mlx5:
        # Separate the prefix from the message so the output stays readable.
        raise KsftFailEx("mlx5 behavior changed: " + pass_bw_msg)
    ksft_pr(pass_bw_msg)
429
430
def test_tc_mapping_bandwidth(cfg):
    """
    Checks that, with traffic class mapping configured, the measured
    bandwidth follows the 80/20 split between the two traffic classes.
    """
    split_matches = run_bandwidth_distribution_test(cfg, set_tc_mapping=True)
    if not split_matches:
        raise KsftFailEx("Bandwidth did not match 80/20 split with TC mapping")

    ksft_pr("Bandwidth is distributed as 80/20 with TC mapping")
440
441
def main() -> None:
    """
    Entry point: prepares the shared test configuration and runs both
    test cases.
    """
    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
        cfg.devnl = DevlinkFamily()

        # The PCI address is the last path component of the resolved
        # "device" link of the interface in sysfs.
        device_link = f"/sys/class/net/{cfg.ifname}/device"
        cfg.pci = os.path.basename(os.path.realpath(device_link))
        if not cfg.pci:
            raise KsftSkipEx("Could not get PCI address of the interface")

        cfg.require_cmd("iperf3", local=True, remote=True)
        cfg.bw_validator = BandwidthValidator()

        ksft_run(
            cases=[test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth],
            args=(cfg,),
        )
    ksft_exit()
462
463
# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    main()
466