#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0

"""
Devlink Rate TC Bandwidth Test Suite
====================================

This test suite verifies the functionality of devlink-rate traffic class (TC)
bandwidth distribution in a virtualized environment. The tests validate that
bandwidth can be properly allocated between different traffic classes and
that TC mapping works as expected.

Test Environment:
-----------------
- Creates 1 VF
- Establishes a bridge connecting the VF representor and the uplink representor
- Sets up 2 VLAN interfaces on the VF with different VLAN IDs (101, 102)
- Configures different traffic classes (TC3 and TC4) for each VLAN

Test Cases:
-----------
1. test_no_tc_mapping_bandwidth:
   - Verifies that without TC mapping, bandwidth is NOT distributed according
     to the configured 80/20 split between TC4 and TC3
   - This test should fail if bandwidth matches the 80/20 split without TC
     mapping
   - Expected: Bandwidth should NOT be distributed as 80/20

2. test_tc_mapping_bandwidth:
   - Configures TC mapping using mqprio qdisc
   - Verifies that with TC mapping, bandwidth IS distributed according to the
     configured 80/20 split between TC4 and TC3
   - Expected: Bandwidth should be distributed as 80/20

Bandwidth Distribution:
-----------------------
- TC3 (VLAN 101): Configured for 20% of total bandwidth
- TC4 (VLAN 102): Configured for 80% of total bandwidth
- Total bandwidth: 1Gbps
- Tolerance: +-12%

Hardware-Specific Behavior (mlx5):
----------------------------------
mlx5 hardware enforces traffic class separation by ensuring that each transmit
queue (SQ) is associated with a single TC. If a packet is sent on a queue that
doesn't match the expected TC (based on DSCP or VLAN priority and hypervisor-set
mapping), the hardware moves the queue to the correct TC scheduler to preserve
traffic isolation.

This behavior means that even without explicit TC-to-queue mapping, bandwidth
enforcement may still appear to work—because the hardware dynamically adjusts
the scheduling context. However, this can lead to performance issues in high
rates and HOL blocking if traffic from different TCs is mixed on the same queue.
"""

import errno
import json
import os
import subprocess
import threading
import time

from lib.py import ksft_pr, ksft_run, ksft_exit
from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx
from lib.py import NetDrvEpEnv, DevlinkFamily
from lib.py import NlError
from lib.py import cmd, defer, ethtool, ip


class BandwidthValidator:
    """
    Validates bandwidth totals and per-TC shares against expected values
    with a tolerance.

    The tolerance is relative: a value is accepted when it lies within
    ``tolerance_percent`` percent of the expected value, on either side.
    """

    def __init__(self):
        self.tolerance_percent = 12
        self.expected_total_gbps = 1.0
        self.total_min_expected = self.min_expected(self.expected_total_gbps)
        self.total_max_expected = self.max_expected(self.expected_total_gbps)
        # Expected share of the total, in percent, keyed by TC index.
        self.tc_expected_percent = {
            3: 20.0,
            4: 80.0,
        }

    def min_expected(self, value):
        """Calculates the minimum acceptable value based on tolerance."""
        return value - (value * self.tolerance_percent / 100)

    def max_expected(self, value):
        """Calculates the maximum acceptable value based on tolerance."""
        return value + (value * self.tolerance_percent / 100)

    def bound(self, expected, value):
        """Returns True if value is within expected tolerance."""
        return self.min_expected(expected) <= value <= self.max_expected(expected)

    def tc_bandwidth_bound(self, value, tc_ix):
        """
        Returns True if the given bandwidth value is within tolerance
        for the TC's expected bandwidth.

        Raises KeyError if tc_ix has no configured expectation.
        """
        expected = self.tc_expected_percent[tc_ix]
        return self.bound(expected, value)


def setup_vf(cfg, set_tc_mapping=True):
    """
    Sets up a VF on the given network interface.

    Enables SR-IOV and switchdev mode, brings the VF interface up,
    and optionally configures TC mapping using mqprio.

    Returns the name of the VF network interface.
    Raises KsftSkipEx if switchdev mode or SR-IOV cannot be enabled, or if
    no VF netdev shows up.
    """
    try:
        cmd(f"devlink dev eswitch set pci/{cfg.pci} mode switchdev")
        defer(cmd, f"devlink dev eswitch set pci/{cfg.pci} mode legacy")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc
    try:
        cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
        defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc

    # Give the freshly created VF time to register its netdev.
    time.sleep(2)

    vf_net_dir = f"/sys/class/net/{cfg.ifname}/device/virtfn0/net"
    try:
        vf_names = os.listdir(vf_net_dir)
    except OSError as exc:
        # A missing sysfs directory means the VF netdev never appeared;
        # report it as a skip rather than an unhandled exception.
        raise KsftSkipEx("VF interface not found") from exc
    if not vf_names:
        raise KsftSkipEx("VF interface not found")

    vf_ifc = vf_names[0]
    ip(f"link set dev {vf_ifc} up")

    if set_tc_mapping:
        cmd(f"tc qdisc add dev {vf_ifc} root handle 5 mqprio mode dcb hw 1 num_tc 8")

    return vf_ifc


def setup_vlans_on_vf(vf_ifc):
    """
    Sets up two VLAN interfaces on the given VF, each mapped to a different TC.
    """
    vlan_configs = [
        {"vlan_id": 101, "tc": 3, "ip": "198.51.100.2"},
        {"vlan_id": 102, "tc": 4, "ip": "198.51.100.10"},
    ]

    for config in vlan_configs:
        vlan_id = config["vlan_id"]
        tc_ix = config["tc"]
        ip_addr = config["ip"]
        vlan_dev = f"{vf_ifc}.{vlan_id}"

        ip(f"link add link {vf_ifc} name {vlan_dev} type vlan id {vlan_id}")
        ip(f"addr add {ip_addr}/29 dev {vlan_dev}")
        ip(f"link set dev {vlan_dev} up")
        # Map skb priority 0 to the VLAN priority matching the target TC.
        ip(f"link set dev {vlan_dev} type vlan egress-qos-map 0:{tc_ix}")
        ksft_pr(
            f"Created VLAN {vlan_dev} on {vf_ifc} with tc {tc_ix} "
            f"and IP {ip_addr}"
        )


def get_vf_info(cfg):
    """
    Finds the VF representor interface and devlink port index
    for the given PCI device used in the test environment.

    Stores the result in cfg.vf_representor and cfg.vf_port_index; both are
    left as None when no port of cfg.pci reports vfnum 0.
    """
    cfg.vf_representor = None
    cfg.vf_port_index = None
    out = subprocess.check_output(["devlink", "-j", "port", "show"], encoding="utf-8")
    ports = json.loads(out)["port"]

    port_prefix = f"pci/{cfg.pci}/"
    for port_name, props in ports.items():
        if port_name.startswith(port_prefix) and props.get("vfnum") == 0:
            cfg.vf_representor = props.get("netdev")
            # Port names look like "pci/<address>/<index>".
            cfg.vf_port_index = int(port_name.split("/")[-1])
            break


def setup_bridge(cfg):
    """
    Creates and configures a Linux bridge, with both the uplink
    and VF representor interfaces attached to it.

    Raises KsftSkipEx if cfg.vf_representor is not set.
    """
    bridge_name = f"br_{os.getpid()}"
    ip(f"link add name {bridge_name} type bridge")
    defer(cmd, f"ip link del name {bridge_name} type bridge")

    ip(f"link set dev {cfg.ifname} master {bridge_name}")

    rep_name = cfg.vf_representor
    if not rep_name:
        raise KsftSkipEx("Could not find representor for the VF")

    ip(f"link set dev {rep_name} master {bridge_name}")
    ip(f"link set dev {rep_name} up")
    ksft_pr(f"Set representor {rep_name} up and added to bridge")

    ip(f"link set dev {bridge_name} up")


def setup_devlink_rate(cfg):
    """
    Configures devlink rate tx_max and traffic class bandwidth for the VF.

    Raises KsftSkipEx if the VF port index is unknown or the device reports
    EOPNOTSUPP, and KsftFailEx on any other netlink error.
    """
    port_index = cfg.vf_port_index
    if port_index is None:
        raise KsftSkipEx("Could not find VF port index")

    # Percent share per TC; every TC not listed here gets 0.
    tc_bw_shares = {3: 20, 4: 80}
    tc_bws = [{"index": tc, "bw": tc_bw_shares.get(tc, 0)} for tc in range(8)]

    try:
        cfg.devnl.rate_set({
            "bus-name": "pci",
            "dev-name": cfg.pci,
            "port-index": port_index,
            # NOTE(review): presumably bytes/s, i.e. the 1 Gbit/s total the
            # validator expects - confirm against the devlink netlink spec.
            "rate-tx-max": 125000000,
            "rate-tc-bws": tc_bws,
        })
    except NlError as exc:
        if exc.error == errno.EOPNOTSUPP:
            raise KsftSkipEx("devlink rate configuration is not supported on the VF") from exc
        raise KsftFailEx(f"rate_set failed on VF port {port_index}") from exc


def setup_remote_server(cfg):
    """
    Sets up VLAN interfaces and starts iperf3 servers on the remote side.
    """
    remote_dev = cfg.remote_ifname
    vlan_ids = [101, 102]
    remote_ips = ["198.51.100.1", "198.51.100.9"]

    for vlan_id, ip_addr in zip(vlan_ids, remote_ips):
        vlan_dev = f"{remote_dev}.{vlan_id}"
        cmd(f"ip link add link {remote_dev} name {vlan_dev} "
            f"type vlan id {vlan_id}", host=cfg.remote)
        # Register the cleanup as soon as the device exists so that a failure
        # in any of the following commands does not leak the remote VLAN.
        defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote)
        cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote)
        cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote)
        # "-1" makes the server exit after serving a single client.
        cmd(f"iperf3 -s -1 -B {ip_addr}", background=True, host=cfg.remote)


def setup_test_environment(cfg, set_tc_mapping=True):
    """
    Sets up the complete test environment including VF creation, VLANs,
    bridge configuration, devlink rate setup, and the remote server.
    """
    vf_ifc = setup_vf(cfg, set_tc_mapping)
    ksft_pr(f"Created VF interface: {vf_ifc}")

    setup_vlans_on_vf(vf_ifc)

    get_vf_info(cfg)
    setup_bridge(cfg)

    setup_devlink_rate(cfg)
    setup_remote_server(cfg)
    # Let the remote iperf3 servers and the bridge settle before testing.
    time.sleep(2)


def run_iperf_client(server_ip, local_ip, barrier, min_expected_gbps=0.1):
    """
    Runs a single iperf3 client instance, binding to the given local IP.
    Waits on a barrier to synchronize with other threads.

    Returns the received bandwidth in Gbps, or None when iperf3 fails, its
    output cannot be interpreted, or the bandwidth is below
    min_expected_gbps.
    Raises KsftFailEx if the barrier is broken or times out.
    """
    try:
        barrier.wait(timeout=10)
    except threading.BrokenBarrierError as exc:
        raise KsftFailEx("iperf3 barrier wait timed out") from exc

    iperf_cmd = ["iperf3", "-c", server_ip, "-B", local_ip, "-J"]
    try:
        result = subprocess.run(iperf_cmd, capture_output=True, text=True,
                                check=True)
    except subprocess.CalledProcessError as exc:
        detail = (exc.stderr or exc.stdout or "").strip()
        ksft_pr(f"iperf3 client failed (exit {exc.returncode}): {detail}")
        return None

    try:
        output = json.loads(result.stdout)
        bits_per_second = output["end"]["sum_received"]["bits_per_second"]
    except json.JSONDecodeError as exc:
        ksft_pr(f"Failed to parse iperf3 JSON output: {exc}")
        return None
    except (KeyError, TypeError) as exc:
        # The report parsed as JSON but lacks the expected summary fields.
        ksft_pr(f"Unexpected iperf3 JSON output, missing field: {exc}")
        return None

    gbps = bits_per_second / 1e9
    if gbps < min_expected_gbps:
        ksft_pr(
            f"iperf3 bandwidth too low: {gbps:.2f} Gbps "
            f"(expected ≥ {min_expected_gbps} Gbps)"
        )
        return None
    return gbps


def run_bandwidth_test():
    """
    Launches iperf3 client threads for each VLAN/TC pair and collects results.

    Returns a dict mapping TC index to measured bandwidth in Gbps.
    Raises KsftFailEx if any client raised, returned no measurement, or
    produced no result at all.
    """
    vf_vlan_data = [
        # (local_ip, remote_ip, TC)
        ("198.51.100.2", "198.51.100.1", 3),
        ("198.51.100.10", "198.51.100.9", 4),
    ]

    results = {}
    errors = {}
    start_barrier = threading.Barrier(len(vf_vlan_data))

    def _run_iperf_client_thread(server_ip, local_ip, tc_ix):
        # Exceptions do not propagate out of a thread, so record them for
        # the main thread to report instead of losing them.
        try:
            results[tc_ix] = run_iperf_client(server_ip, local_ip, start_barrier)
        except Exception as exc:  # pylint: disable=broad-except
            errors[tc_ix] = exc

    threads = [
        threading.Thread(target=_run_iperf_client_thread,
                         args=(remote_ip, local_ip, tc_ix))
        for local_ip, remote_ip, tc_ix in vf_vlan_data
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    if errors:
        tc_ix, exc = next(iter(errors.items()))
        raise KsftFailEx(f"iperf3 client for TC {tc_ix} raised: {exc}") from exc

    failed_tcs = [tc_ix for _, _, tc_ix in vf_vlan_data
                  if results.get(tc_ix) is None]
    if failed_tcs:
        ksft_pr(f"No bandwidth measurement for TC(s): {failed_tcs}")
        raise KsftFailEx("iperf3 client failed; cannot evaluate bandwidth")

    return results


def calculate_bandwidth_percentages(results):
    """
    Calculates the percentage of total bandwidth received by TC3 and TC4.

    Raises KsftFailEx if either TC is missing from results or if the total
    bandwidth is not positive.
    """
    if 3 not in results or 4 not in results:
        raise KsftFailEx(f"Missing expected TC results in {results}")

    tc3_bw = results[3]
    tc4_bw = results[4]
    total_bw = tc3_bw + tc4_bw
    if total_bw <= 0:
        # Avoid a ZeroDivisionError and report a meaningful failure instead.
        raise KsftFailEx(f"Total measured bandwidth is not positive in {results}")

    tc3_percentage = (tc3_bw / total_bw) * 100
    tc4_percentage = (tc4_bw / total_bw) * 100

    return {
        'tc3_bw': tc3_bw,
        'tc4_bw': tc4_bw,
        'tc3_percentage': tc3_percentage,
        'tc4_percentage': tc4_percentage,
        'total_bw': total_bw
    }


def print_bandwidth_results(bw_data, test_name):
    """
    Prints bandwidth measurements and TC usage summary for a given test.
    """
    ksft_pr(f"Bandwidth check results {test_name}:")
    ksft_pr(f"TC 3: {bw_data['tc3_bw']:.2f} Gbits/sec")
    ksft_pr(f"TC 4: {bw_data['tc4_bw']:.2f} Gbits/sec")
    ksft_pr(f"Total bandwidth: {bw_data['total_bw']:.2f} Gbits/sec")
    ksft_pr(f"TC 3 percentage: {bw_data['tc3_percentage']:.1f}%")
    ksft_pr(f"TC 4 percentage: {bw_data['tc4_percentage']:.1f}%")


def verify_total_bandwidth(bw_data, validator):
    """
    Ensures the total measured bandwidth falls within the acceptable tolerance.

    Raises KsftSkipEx when the total is below the lower bound (the shares
    cannot be judged if tx_max was never reached) and KsftFailEx when it is
    above the upper bound.
    """
    total = bw_data['total_bw']

    if validator.bound(validator.expected_total_gbps, total):
        return

    if total < validator.total_min_expected:
        raise KsftSkipEx(
            f"Total bandwidth {total:.2f} Gbps < minimum "
            f"{validator.total_min_expected:.2f} Gbps; "
            f"parent tx_max ({validator.expected_total_gbps:.1f} G) "
            f"not reached, cannot validate share"
        )

    raise KsftFailEx(
        f"Total bandwidth {total:.2f} Gbps exceeds allowed ceiling "
        f"{validator.total_max_expected:.2f} Gbps "
        f"(VF tx_max set to {validator.expected_total_gbps:.1f} G)"
    )


def check_bandwidth_distribution(bw_data, validator):
    """
    Checks whether the measured TC3 and TC4 bandwidth percentages
    fall within their expected tolerance ranges.

    Returns:
        bool: True if both TC3 and TC4 percentages are within bounds.
    """
    tc3_valid = validator.tc_bandwidth_bound(bw_data['tc3_percentage'], 3)
    tc4_valid = validator.tc_bandwidth_bound(bw_data['tc4_percentage'], 4)

    return tc3_valid and tc4_valid


def run_bandwidth_distribution_test(cfg, set_tc_mapping):
    """
    Runs parallel iperf3 tests for both TCs and collects results.

    Returns True if the measured shares match the expected distribution.
    """
    setup_test_environment(cfg, set_tc_mapping)
    bandwidths = run_bandwidth_test()
    bw_data = calculate_bandwidth_percentages(bandwidths)
    test_name = "with TC mapping" if set_tc_mapping else "without TC mapping"
    print_bandwidth_results(bw_data, test_name)

    verify_total_bandwidth(bw_data, cfg.bw_validator)

    return check_bandwidth_distribution(bw_data, cfg.bw_validator)


def test_no_tc_mapping_bandwidth(cfg):
    """
    Verifies that bandwidth is not split 80/20 without traffic class mapping.

    On mlx5 the split is expected to match even without mapping, so a match
    is reported as an expected failure and a mismatch as a failure.
    """
    pass_bw_msg = "Bandwidth is NOT distributed as 80/20 without TC mapping"
    fail_bw_msg = "Bandwidth matched 80/20 split without TC mapping"
    is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout

    if run_bandwidth_distribution_test(cfg, set_tc_mapping=False):
        if is_mlx5:
            raise KsftXfailEx(fail_bw_msg)
        raise KsftFailEx(fail_bw_msg)
    if is_mlx5:
        raise KsftFailEx("mlx5 behavior changed: " + pass_bw_msg)
    ksft_pr(pass_bw_msg)


def test_tc_mapping_bandwidth(cfg):
    """
    Verifies that bandwidth is correctly split 80/20 between TC4 and TC3
    when traffic class mapping is set.
    """
    if run_bandwidth_distribution_test(cfg, set_tc_mapping=True):
        ksft_pr("Bandwidth is distributed as 80/20 with TC mapping")
    else:
        raise KsftFailEx("Bandwidth did not match 80/20 split with TC mapping")


def main() -> None:
    """
    Main entry point for running the test cases.
    """
    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
        cfg.devnl = DevlinkFamily()

        # The sysfs "device" link of the netdev resolves to its PCI device
        # directory, whose basename is the PCI address.
        cfg.pci = os.path.basename(
            os.path.realpath(f"/sys/class/net/{cfg.ifname}/device")
        )
        if not cfg.pci:
            raise KsftSkipEx("Could not get PCI address of the interface")
        cfg.require_cmd("iperf3", local=True, remote=True)

        cfg.bw_validator = BandwidthValidator()

        cases = [test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth]

        ksft_run(cases=cases, args=(cfg,))
    ksft_exit()


if __name__ == "__main__":
    main()